diff --git a/CMakeLists.txt b/CMakeLists.txt
index 78d5e0eb6..ead63bff8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4)
 project(OpenBLAS)
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 2)
-set(OpenBLAS_PATCH_VERSION 17)
+set(OpenBLAS_PATCH_VERSION 18)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 
 enable_language(ASM)
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index da56c0758..ebe52ea8a 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -147,5 +147,6 @@ In chronological order:
   * [2016-03-14] Additional functional Assembly Kernels for Cortex-A57
   * [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57
 
-* [Your name or handle] <[email or website]>
-  * [Date] [Brief summary of your changes]
+* theoractice
+  * [2016-03-20] Fix compiler error in VisualStudio with CMake
+  * [2016-03-22] Fix access violation on Windows while static linking
diff --git a/Changelog.txt b/Changelog.txt
index c59166c38..7f82e8e88 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,22 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.2.18
+12-Apr-2016
+common:
+  * If the MAKE_NB_JOBS flag is set to a value less than or equal to zero,
+    make will be run without the -j option.
+
+x86/x86_64:
+  * Support building a Visual Studio static library. (#813, Thanks, theoractice)
+  * Fix bugs to pass the buildbot CI tests (http://build.openblas.net)
+
+ARM:
+  * Provide DGEMM 8x4 kernel for Cortex-A57 (Thanks, Ashwin Sekhar T K)
+
+POWER:
+  * Optimize S and C BLAS3 on Power8
+  * Optimize BLAS2/1 on Power8
+
 ====================================================================
 Version 0.2.17
 20-Mar-2016
diff --git a/Makefile.rule b/Makefile.rule
index 0758a48a8..d8db6102c 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@ #
 #
 
 # This library's version
-VERSION = 0.2.17
+VERSION = 0.2.18
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -112,7 +112,10 @@ NO_AFFINITY = 1
 # NO_PARALLEL_MAKE = 1
 
 # Force number of make jobs. The default is the number of logical CPU of the host.
-# This is particularly useful when using distcc
+# This is particularly useful when using distcc.
+# A negative value disables adding a -j flag to make, allowing a parent
+# make -j value to be used. This is useful when calling the OpenBLAS make
+# from another project's makefile.
 # MAKE_NB_JOBS = 2
 
 # If you would like to know minute performance report of GotoBLAS.
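
The MAKE_NB_JOBS behaviour described in the Makefile.rule comment and the 0.2.18 changelog entry works as follows: only a positive value forces an explicit -j N; zero or a negative value emits no -j at all, so a parent "make -j" (for example from an enclosing project's makefile) keeps control of the job count. The guard itself is the #if MAKE_NB_JOBS > 0 block visible in the getarch.c hunk further down. A minimal stand-alone C sketch of that decision, with the macro defaulted here purely for illustration:

    /* Sketch only -- the real logic lives in getarch.c, which prints
     * Makefile fragments while OpenBLAS configures itself. */
    #include <stdio.h>

    #ifndef MAKE_NB_JOBS
    #define MAKE_NB_JOBS 0          /* assumed value for this illustration */
    #endif

    int main(void) {
    #if MAKE_NB_JOBS > 0
        /* Positive: force an explicit job count. */
        printf("MAKE += -j %d\n", MAKE_NB_JOBS);
    #else
        /* Zero or negative: print nothing, so make runs without -j and any
         * -j passed by a parent make invocation stays in effect. */
    #endif
        return 0;
    }
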
diff --git a/appveyor.yml b/appveyor.yml index 172a49b42..5360a9ef9 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,4 +1,4 @@ -version: 0.2.15.{build} +version: 0.2.18.{build} #environment: diff --git a/benchmark/Makefile b/benchmark/Makefile index 11d3c5bec..8166f3863 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -33,6 +33,10 @@ LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread # Apple vecLib LIBVECLIB = -framework Accelerate +ESSL=/opt/ibm/lib +#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a +LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a + ifeq ($(OSNAME), WINNT) goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ @@ -44,6 +48,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto \ + srot.goto drot.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ sswap.goto dswap.goto cswap.goto zswap.goto \ @@ -151,6 +156,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto cdot.goto zdot.goto \ + srot.goto drot.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ sswap.goto dswap.goto cswap.goto zswap.goto \ @@ -253,7 +259,9 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ endif - +essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ + cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ + slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ @@ -306,6 +314,9 @@ slinpack.mkl : slinpack.$(SUFFIX) slinpack.veclib : slinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +slinpack.essl : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dlinpack #################################################### dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -322,6 +333,9 @@ dlinpack.mkl : dlinpack.$(SUFFIX) dlinpack.veclib : dlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dlinpack.essl : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Clinpack #################################################### clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME) @@ -339,6 +353,9 @@ clinpack.mkl : clinpack.$(SUFFIX) clinpack.veclib : clinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +clinpack.essl : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zlinpack #################################################### zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME) @@ -356,6 +373,9 @@ zlinpack.mkl : zlinpack.$(SUFFIX) zlinpack.veclib : zlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) 
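
The srot.goto and drot.goto entries added to the benchmark target lists above are both built from the single new benchmark/rot.c that appears further down, selected at compile time by -UDOUBLE or -DDOUBLE (see the srot.$(SUFFIX) and drot.$(SUFFIX) rules later in this Makefile). A small self-contained C sketch of that precision-selection pattern; FLOAT and the routine name are stand-ins here, not the real definitions from common.h:

    /* Illustrative stub: one source, two precisions, chosen by -DDOUBLE. */
    #include <stdio.h>

    #ifdef DOUBLE
    typedef double FLOAT;
    #define ROT_NAME "drot"         /* built as: cc -UCOMPLEX -DDOUBLE ... */
    #else
    typedef float  FLOAT;
    #define ROT_NAME "srot"         /* built as: cc -UCOMPLEX -UDOUBLE ... */
    #endif

    int main(void) {
        FLOAT c = 2.0, s = 2.0;     /* the constants rot.c passes to ?rot */
        printf("%s stub: sizeof(FLOAT)=%zu, c=%g, s=%g\n",
               ROT_NAME, sizeof(FLOAT), (double)c, (double)s);
        return 0;
    }
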
+zlinpack.essl : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Scholesky ################################################### scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME) @@ -441,6 +461,9 @@ sgemm.mkl : sgemm.$(SUFFIX) sgemm.veclib : sgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +sgemm.essl : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dgemm #################################################### dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -457,6 +480,9 @@ dgemm.mkl : dgemm.$(SUFFIX) dgemm.veclib : dgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dgemm.essl : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Cgemm #################################################### cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME) @@ -474,6 +500,9 @@ cgemm.mkl : cgemm.$(SUFFIX) cgemm.veclib : cgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +cgemm.essl : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zgemm #################################################### zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME) @@ -491,6 +520,9 @@ zgemm.mkl : zgemm.$(SUFFIX) zgemm.veclib : zgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zgemm.essl : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ssymm #################################################### ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -573,6 +605,9 @@ strmm.mkl : strmm.$(SUFFIX) strmm.veclib : strmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +strmm.essl : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dtrmm #################################################### dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -589,6 +624,9 @@ dtrmm.mkl : dtrmm.$(SUFFIX) dtrmm.veclib : dtrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dtrmm.essl : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ctrmm #################################################### ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME) @@ -606,6 +644,9 @@ ctrmm.mkl : ctrmm.$(SUFFIX) ctrmm.veclib : ctrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ctrmm.essl : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ztrmm #################################################### ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME) @@ -623,6 +664,9 @@ ztrmm.mkl : ztrmm.$(SUFFIX) ztrmm.veclib : ztrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ztrmm.essl : ztrmm.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Strsm #################################################### strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -1413,6 +1457,39 @@ zdot.mkl : zdot-intel.$(SUFFIX) zdot.veclib : zdot-intel.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Srot #################################################### +srot.goto : srot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +srot.acml : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.atlas : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.mkl : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.veclib : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Drot #################################################### +drot.goto : drot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +drot.acml : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.atlas : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.mkl : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.veclib : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + ##################################### Saxpy #################################################### saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2124,6 +2201,13 @@ cgesv.$(SUFFIX) : gesv.c zgesv.$(SUFFIX) : gesv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +srot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +drot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + @@ -2137,7 +2221,7 @@ smallscaling: smallscaling.c ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm clean :: - @rm -f *.goto *.mkl *.acml *.atlas *.veclib + @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl include $(TOPDIR)/Makefile.tail diff --git a/benchmark/rot.c b/benchmark/rot.c new file mode 100644 index 000000000..32322bebb --- /dev/null +++ b/benchmark/rot.c @@ -0,0 +1,197 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef DOT + + +#ifdef DOUBLE +#define ROT BLASFUNC(drot) +#else +#define ROT BLASFUNC(srot) +#endif + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y; + // FLOAT result; + blasint m, i; + blasint inc_x=1,inc_y=1; + FLOAT c[1] = { 2.0 }; + FLOAT s[1] = { 2.0 }; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", 
(int)m); + + + for (l=0; l 0 printf("MAKE += -j %d\n", MAKE_NB_JOBS); + #else + // Let make use parent -j argument or -j1 if there + // is no make parent + #endif #elif NO_PARALLEL_MAKE==1 printf("MAKE += -j 1\n"); #else diff --git a/getarch_2nd.c b/getarch_2nd.c index fad647fed..cf9c578cb 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -64,10 +64,13 @@ int main(int argc, char **argv) { if ((argc >= 2) && (*argv[1] == '1')) { + +#if defined(ARCH_X86) || defined(ARCH_X86_64) printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float))); printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double))); printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float))); printf("#define ZLOCAL_BUFFER_SIZE\t%ld\n", (ZGEMM_DEFAULT_Q * ZGEMM_DEFAULT_UNROLL_N * 2 * 2 * sizeof(double))); +#endif #ifdef USE64BITINT printf("#define USE64BITINT\n"); diff --git a/kernel/arm64/cgemm_kernel_4x4.S b/kernel/arm64/cgemm_kernel_4x4.S index 7a70264ca..7f2ddea07 100644 --- a/kernel/arm64/cgemm_kernel_4x4.S +++ b/kernel/arm64/cgemm_kernel_4x4.S @@ -179,93 +179,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [ppA] add ppA, ppA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - fmul v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] + fmul v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b - fmls v19.4s, v2.4s, v9.4s[0] + fmls v19.4s, v2.4s, 
v9.s[0] #else - fmul v19.4s, v2.4s, v9.4s[0] + fmul v19.4s, v2.4s, v9.s[0] #endif - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - fmul v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] + fmul v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b - fmls v23.4s, v2.4s, v9.4s[1] + fmls v23.4s, v2.4s, v9.s[1] #else - fmul v23.4s, v2.4s, v9.4s[1] + fmul v23.4s, v2.4s, v9.s[1] #endif - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - fmul v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] + fmul v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.4s, v2.4s, v9.4s[2] + fmls v27.4s, v2.4s, v9.s[2] #else - fmul v27.4s, v2.4s, v9.4s[2] + fmul v27.4s, v2.4s, v9.s[2] #endif - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - fmul v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] + fmul v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.4s, v2.4s, v9.4s[3] + fmls v31.4s, v2.4s, v9.s[3] #else - fmul v31.4s, v2.4s, v9.4s[3] + fmul v31.4s, v2.4s, v9.s[3] #endif - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_ir v31.4s, v3.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -276,159 +276,159 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.4s, v13.4s}, [pB] // for next round add pB, pB, #32 - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] ld2 {v4.4s, v5.4s} , [pA] // for next round add pA, pA, #32 - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] prfm PLDL1KEEP, [pA, #512] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] ld2 {v6.4s, v7.4s} , [ppA] // for next round add ppA, ppA, #32 - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] prfm PLDL1KEEP, [ppA, #512] - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii 
v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] + OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] .endm .macro KERNEL8x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.4s, v9.4s}, [pB] // for next round add pB, pB, #32 - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] ld2 {v0.4s, v1.4s}, [pA] // for next round add pA, pA, #32 - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] prfm PLDL1KEEP, [ppA, #512] - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] ld2 {v2.4s, v3.4s}, [ppA] // for next round add ppA, ppA, #32 - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] + OP_rr v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] prfm PLDL1KEEP, [pB, #512] - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] + OP_rr v26.4s, v6.4s, v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] .endm .macro KERNEL8x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii 
v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] + OP_rr v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] + OP_rr v26.4s, v6.4s, v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] .endm .macro KERNEL8x4_SUB @@ -437,48 +437,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v2.4s, v3.4s}, [ppA] add ppA, ppA, #32 - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] + 
OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] .endm .macro SAVE8x4 @@ -578,25 +578,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro SAVE4x4 @@ -658,25 +658,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.4s[0] - OP_ii v16.2s, v1.2s, v9.4s[0] - OP_ri v17.2s, v0.2s, v9.4s[0] - OP_ir v17.2s, v1.2s, v8.4s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.4s[1] - OP_ii v20.2s, v1.2s, v9.4s[1] - OP_ri v21.2s, v0.2s, v9.4s[1] - OP_ir v21.2s, v1.2s, v8.4s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] - OP_rr v24.2s, v0.2s, v8.4s[2] - OP_ii v24.2s, v1.2s, v9.4s[2] - OP_ri v25.2s, v0.2s, v9.4s[2] - OP_ir v25.2s, v1.2s, v8.4s[2] + OP_rr v24.2s, v0.2s, v8.s[2] + OP_ii v24.2s, v1.2s, v9.s[2] + OP_ri v25.2s, v0.2s, v9.s[2] + OP_ir v25.2s, v1.2s, v8.s[2] - OP_rr v28.2s, v0.2s, v8.4s[3] - OP_ii v28.2s, v1.2s, v9.4s[3] - OP_ri v29.2s, v0.2s, v9.4s[3] - OP_ir v29.2s, v1.2s, v8.4s[3] + OP_rr v28.2s, v0.2s, v8.s[3] + OP_ii v28.2s, v1.2s, v9.s[3] + OP_ri v29.2s, v0.2s, v9.s[3] + OP_ir v29.2s, v1.2s, v8.s[3] .endm .macro SAVE2x4 @@ -738,25 +738,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.4s[0] - OP_ii s16, s1, v9.4s[0] - OP_ri s17, s0, v9.4s[0] - OP_ir s17, s1, v8.4s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.4s[1] - OP_ii s20, s1, v9.4s[1] - OP_ri s21, s0, v9.4s[1] - OP_ir s21, s1, v8.4s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] - OP_rr s24, s0, v8.4s[2] - OP_ii s24, s1, v9.4s[2] - OP_ri s25, s0, v9.4s[2] - OP_ir s25, s1, v8.4s[2] + OP_rr s24, s0, v8.s[2] + OP_ii s24, s1, v9.s[2] + OP_ri s25, s0, v9.s[2] + OP_ir s25, s1, v8.s[2] - OP_rr s28, s0, v8.4s[3] - OP_ii s28, s1, v9.4s[3] - OP_ri s29, s0, v9.4s[3] - OP_ir s29, s1, v8.4s[3] + OP_rr s28, s0, v8.s[3] + OP_ii s28, s1, v9.s[3] + OP_ri s29, s0, v9.s[3] + OP_ir s29, s1, v8.s[3] .endm .macro SAVE1x4 @@ -814,15 +814,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] .endm .macro SAVE4x2 @@ -862,15 +862,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.2s[0] - OP_ii v16.2s, v1.2s, v9.2s[0] - OP_ri v17.2s, v0.2s, v9.2s[0] - OP_ir v17.2s, v1.2s, v8.2s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.2s[1] - OP_ii v20.2s, v1.2s, v9.2s[1] - OP_ri v21.2s, v0.2s, v9.2s[1] - OP_ir v21.2s, v1.2s, v8.2s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] .endm .macro SAVE2x2 @@ -910,15 +910,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.2s[0] - OP_ii s16, s1, v9.2s[0] - OP_ri s17, s0, v9.2s[0] - OP_ir s17, s1, v8.2s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.2s[1] - OP_ii s20, s1, v9.2s[1] - OP_ri s21, s0, v9.2s[1] - OP_ir s21, s1, v8.2s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] .endm .macro SAVE1x2 diff --git a/kernel/arm64/cgemm_kernel_8x4.S b/kernel/arm64/cgemm_kernel_8x4.S old mode 100755 new mode 100644 index 40b98cee2..d58cef52d --- a/kernel/arm64/cgemm_kernel_8x4.S +++ b/kernel/arm64/cgemm_kernel_8x4.S @@ -178,93 +178,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] + fmul v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b - fmls v19.4s, v2.4s, v9.4s[0] + fmls v19.4s, v2.4s, v9.s[0] #else - fmul v19.4s, v2.4s, v9.4s[0] + fmul v19.4s, v2.4s, v9.s[0] #endif - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] + fmul v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b - fmls v23.4s, v2.4s, v9.4s[1] + fmls v23.4s, v2.4s, v9.s[1] #else - fmul v23.4s, v2.4s, v9.4s[1] + fmul v23.4s, v2.4s, v9.s[1] #endif - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] + fmul v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.4s, v2.4s, v9.4s[2] + fmls v27.4s, v2.4s, v9.s[2] #else - fmul v27.4s, v2.4s, v9.4s[2] + fmul v27.4s, v2.4s, v9.s[2] #endif - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - fmul v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] + fmul v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || 
\ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.4s, v2.4s, v9.4s[3] + fmls v31.4s, v2.4s, v9.s[3] #else - fmul v31.4s, v2.4s, v9.4s[3] + fmul v31.4s, v2.4s, v9.s[3] #endif - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_ir v31.4s, v3.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -275,45 +275,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] + OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 @@ -324,45 +324,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] + OP_rr v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] + OP_rr v26.4s, v6.4s, v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 @@ -373,45 +373,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] + OP_rr v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] + OP_rr v26.4s, v6.4s, v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] .endm @@ -423,45 +423,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] + OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] .endm @@ -560,49 +560,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -611,85 +611,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v4.4s, v5.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] prfm PLDL1KEEP, [pA, #512] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro KERNEL4x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.4s, v9.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] ld2 {v0.4s, v1.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_SUB @@ -698,25 +698,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro SAVE4x4 @@ -778,25 +778,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.4s[0] - OP_ii v16.2s, v1.2s, v9.4s[0] - OP_ri v17.2s, v0.2s, v9.4s[0] - OP_ir v17.2s, v1.2s, v8.4s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.4s[1] - OP_ii v20.2s, v1.2s, v9.4s[1] - OP_ri v21.2s, v0.2s, v9.4s[1] - OP_ir v21.2s, v1.2s, v8.4s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] - OP_rr v24.2s, v0.2s, v8.4s[2] - OP_ii v24.2s, v1.2s, v9.4s[2] - OP_ri v25.2s, v0.2s, v9.4s[2] - OP_ir v25.2s, v1.2s, v8.4s[2] + OP_rr v24.2s, v0.2s, v8.s[2] + OP_ii v24.2s, v1.2s, v9.s[2] + OP_ri v25.2s, v0.2s, v9.s[2] + OP_ir v25.2s, v1.2s, v8.s[2] - OP_rr v28.2s, v0.2s, v8.4s[3] - OP_ii v28.2s, v1.2s, v9.4s[3] - OP_ri v29.2s, v0.2s, v9.4s[3] - OP_ir v29.2s, v1.2s, v8.4s[3] + OP_rr v28.2s, v0.2s, v8.s[3] + OP_ii v28.2s, v1.2s, v9.s[3] + OP_ri v29.2s, v0.2s, v9.s[3] + OP_ir v29.2s, v1.2s, v8.s[3] .endm .macro SAVE2x4 @@ -858,25 +858,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.4s[0] - OP_ii s16, s1, v9.4s[0] - OP_ri s17, s0, v9.4s[0] - OP_ir s17, s1, v8.4s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.4s[1] - OP_ii s20, s1, v9.4s[1] - OP_ri s21, s0, v9.4s[1] - OP_ir s21, s1, v8.4s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] - OP_rr s24, s0, v8.4s[2] - OP_ii s24, s1, v9.4s[2] - OP_ri s25, s0, v9.4s[2] - OP_ir s25, s1, v8.4s[2] + OP_rr s24, s0, v8.s[2] + OP_ii s24, s1, v9.s[2] + OP_ri s25, s0, v9.s[2] + OP_ir s25, s1, v8.s[2] - OP_rr s28, s0, v8.4s[3] - OP_ii s28, s1, v9.4s[3] - OP_ri s29, s0, v9.4s[3] - OP_ir s29, s1, v8.4s[3] + OP_rr s28, s0, v8.s[3] + OP_ii s28, s1, v9.s[3] + OP_ri s29, s0, v9.s[3] + OP_ir s29, s1, v8.s[3] .endm .macro SAVE1x4 @@ -940,25 +940,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.2s[0] - OP_ii v18.4s, v3.4s, v9.2s[0] - OP_ri v19.4s, v2.4s, v9.2s[0] - OP_ir v19.4s, v3.4s, v8.2s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v22.4s, v2.4s, v8.2s[1] - OP_ii v22.4s, v3.4s, v9.2s[1] - OP_ri v23.4s, v2.4s, v9.2s[1] - OP_ir v23.4s, v3.4s, v8.2s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] .endm .macro SAVE8x2 @@ -1016,15 +1016,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] .endm .macro SAVE4x2 @@ -1064,15 +1064,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.2s[0] - OP_ii v16.2s, v1.2s, v9.2s[0] - OP_ri v17.2s, v0.2s, v9.2s[0] - OP_ir v17.2s, v1.2s, v8.2s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.2s[1] - OP_ii v20.2s, v1.2s, v9.2s[1] - OP_ri v21.2s, v0.2s, v9.2s[1] - OP_ir v21.2s, v1.2s, v8.2s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] .endm .macro SAVE2x2 @@ -1112,15 +1112,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.2s[0] - OP_ii s16, s1, v9.2s[0] - OP_ri s17, s0, v9.2s[0] - OP_ir s17, s1, v8.2s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.2s[1] - OP_ii s20, s1, v9.2s[1] - OP_ri s21, s0, v9.2s[1] - OP_ir s21, s1, v8.2s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] .endm .macro SAVE1x2 @@ -1162,15 +1162,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v8.4s[1] - OP_ri v17.4s, v0.4s, v8.4s[1] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v8.s[1] + OP_ri v17.4s, v0.4s, v8.s[1] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v8.4s[1] - OP_ri v19.4s, v2.4s, v8.4s[1] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v8.s[1] + OP_ri v19.4s, v2.4s, v8.s[1] + OP_ir v19.4s, v3.4s, v8.s[0] .endm .macro SAVE8x1 diff --git a/kernel/arm64/ctrmm_kernel_4x4.S b/kernel/arm64/ctrmm_kernel_4x4.S index be0e9bdef..3de27257a 100644 --- a/kernel/arm64/ctrmm_kernel_4x4.S +++ b/kernel/arm64/ctrmm_kernel_4x4.S @@ -170,49 +170,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -221,85 +221,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v4.4s, v5.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] prfm PLDL1KEEP, [pA, #512] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro KERNEL4x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.4s, v9.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] ld2 {v0.4s, v1.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_SUB @@ -308,25 +308,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro SAVE4x4 @@ -384,25 +384,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.4s[0] - OP_ii v16.2s, v1.2s, v9.4s[0] - OP_ri v17.2s, v0.2s, v9.4s[0] - OP_ir v17.2s, v1.2s, v8.4s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.4s[1] - OP_ii v20.2s, v1.2s, v9.4s[1] - OP_ri v21.2s, v0.2s, v9.4s[1] - OP_ir v21.2s, v1.2s, v8.4s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] - OP_rr v24.2s, v0.2s, v8.4s[2] - OP_ii v24.2s, v1.2s, v9.4s[2] - OP_ri v25.2s, v0.2s, v9.4s[2] - OP_ir v25.2s, v1.2s, v8.4s[2] + OP_rr v24.2s, v0.2s, v8.s[2] + OP_ii v24.2s, v1.2s, v9.s[2] + OP_ri v25.2s, v0.2s, v9.s[2] + OP_ir v25.2s, v1.2s, v8.s[2] - OP_rr v28.2s, v0.2s, v8.4s[3] - OP_ii v28.2s, v1.2s, v9.4s[3] - OP_ri v29.2s, v0.2s, v9.4s[3] - OP_ir v29.2s, v1.2s, v8.4s[3] + OP_rr v28.2s, v0.2s, v8.s[3] + OP_ii v28.2s, v1.2s, v9.s[3] + OP_ri v29.2s, v0.2s, v9.s[3] + OP_ir v29.2s, v1.2s, v8.s[3] .endm .macro SAVE2x4 @@ -460,25 +460,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.4s[0] - OP_ii s16, s1, v9.4s[0] - OP_ri s17, s0, v9.4s[0] - OP_ir s17, s1, v8.4s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.4s[1] - OP_ii s20, s1, v9.4s[1] - OP_ri s21, s0, v9.4s[1] - OP_ir s21, s1, v8.4s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] - OP_rr s24, s0, v8.4s[2] - OP_ii s24, s1, v9.4s[2] - OP_ri s25, s0, v9.4s[2] - OP_ir s25, s1, v8.4s[2] + OP_rr s24, s0, v8.s[2] + OP_ii s24, s1, v9.s[2] + OP_ri s25, s0, v9.s[2] + OP_ir s25, s1, v8.s[2] - OP_rr s28, s0, v8.4s[3] - OP_ii s28, s1, v9.4s[3] - OP_ri s29, s0, v9.4s[3] - OP_ir s29, s1, v8.4s[3] + OP_rr s28, s0, v8.s[3] + OP_ii s28, s1, v9.s[3] + OP_ri s29, s0, v9.s[3] + OP_ir s29, s1, v8.s[3] .endm .macro SAVE1x4 @@ -532,15 +532,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] .endm .macro SAVE4x2 @@ -578,15 +578,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.2s[0] - OP_ii v16.2s, v1.2s, v9.2s[0] - OP_ri v17.2s, v0.2s, v9.2s[0] - OP_ir v17.2s, v1.2s, v8.2s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.2s[1] - OP_ii v20.2s, v1.2s, v9.2s[1] - OP_ri v21.2s, v0.2s, v9.2s[1] - OP_ir v21.2s, v1.2s, v8.2s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] .endm .macro SAVE2x2 @@ -624,15 +624,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.2s[0] - OP_ii s16, s1, v9.2s[0] - OP_ri s17, s0, v9.2s[0] - OP_ir s17, s1, v8.2s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.2s[1] - OP_ii s20, s1, v9.2s[1] - OP_ri s21, s0, v9.2s[1] - OP_ir s21, s1, v8.2s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] .endm .macro SAVE1x2 diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S old mode 100755 new mode 100644 index 3131541d4..ce5cb0406 --- a/kernel/arm64/ctrmm_kernel_8x4.S +++ b/kernel/arm64/ctrmm_kernel_8x4.S @@ -180,93 +180,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] + fmul v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b - fmls v19.4s, v2.4s, v9.4s[0] + fmls v19.4s, v2.4s, v9.s[0] #else - fmul v19.4s, v2.4s, v9.4s[0] + fmul v19.4s, v2.4s, v9.s[0] #endif - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] + fmul v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b - fmls v23.4s, v2.4s, v9.4s[1] + fmls v23.4s, v2.4s, v9.s[1] #else - fmul v23.4s, v2.4s, v9.4s[1] + fmul v23.4s, v2.4s, v9.s[1] #endif - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] + fmul v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.4s, v2.4s, v9.4s[2] + fmls v27.4s, v2.4s, v9.s[2] #else - fmul v27.4s, v2.4s, v9.4s[2] + fmul v27.4s, v2.4s, v9.s[2] #endif - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - fmul v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] + fmul v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || 
\ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.4s, v2.4s, v9.4s[3] + fmls v31.4s, v2.4s, v9.s[3] #else - fmul v31.4s, v2.4s, v9.4s[3] + fmul v31.4s, v2.4s, v9.s[3] #endif - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_ir v31.4s, v3.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -277,45 +277,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] + OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 @@ -326,45 +326,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] + OP_rr v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] + OP_rr v26.4s, v6.4s, v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 @@ -375,45 +375,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] + OP_rr v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] + OP_rr v26.4s, v6.4s, v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] .endm @@ -425,45 +425,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] + OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] .endm @@ -562,49 +562,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -613,85 +613,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v4.4s, v5.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] prfm PLDL1KEEP, [pA, #512] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro KERNEL4x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.4s, v9.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] ld2 {v0.4s, v1.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_SUB @@ -700,25 +700,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro SAVE4x4 @@ -780,25 +780,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.4s[0] - OP_ii v16.2s, v1.2s, v9.4s[0] - OP_ri v17.2s, v0.2s, v9.4s[0] - OP_ir v17.2s, v1.2s, v8.4s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.4s[1] - OP_ii v20.2s, v1.2s, v9.4s[1] - OP_ri v21.2s, v0.2s, v9.4s[1] - OP_ir v21.2s, v1.2s, v8.4s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] - OP_rr v24.2s, v0.2s, v8.4s[2] - OP_ii v24.2s, v1.2s, v9.4s[2] - OP_ri v25.2s, v0.2s, v9.4s[2] - OP_ir v25.2s, v1.2s, v8.4s[2] + OP_rr v24.2s, v0.2s, v8.s[2] + OP_ii v24.2s, v1.2s, v9.s[2] + OP_ri v25.2s, v0.2s, v9.s[2] + OP_ir v25.2s, v1.2s, v8.s[2] - OP_rr v28.2s, v0.2s, v8.4s[3] - OP_ii v28.2s, v1.2s, v9.4s[3] - OP_ri v29.2s, v0.2s, v9.4s[3] - OP_ir v29.2s, v1.2s, v8.4s[3] + OP_rr v28.2s, v0.2s, v8.s[3] + OP_ii v28.2s, v1.2s, v9.s[3] + OP_ri v29.2s, v0.2s, v9.s[3] + OP_ir v29.2s, v1.2s, v8.s[3] .endm .macro SAVE2x4 @@ -860,25 +860,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.4s[0] - OP_ii s16, s1, v9.4s[0] - OP_ri s17, s0, v9.4s[0] - OP_ir s17, s1, v8.4s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.4s[1] - OP_ii s20, s1, v9.4s[1] - OP_ri s21, s0, v9.4s[1] - OP_ir s21, s1, v8.4s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] - OP_rr s24, s0, v8.4s[2] - OP_ii s24, s1, v9.4s[2] - OP_ri s25, s0, v9.4s[2] - OP_ir s25, s1, v8.4s[2] + OP_rr s24, s0, v8.s[2] + OP_ii s24, s1, v9.s[2] + OP_ri s25, s0, v9.s[2] + OP_ir s25, s1, v8.s[2] - OP_rr s28, s0, v8.4s[3] - OP_ii s28, s1, v9.4s[3] - OP_ri s29, s0, v9.4s[3] - OP_ir s29, s1, v8.4s[3] + OP_rr s28, s0, v8.s[3] + OP_ii s28, s1, v9.s[3] + OP_ri s29, s0, v9.s[3] + OP_ir s29, s1, v8.s[3] .endm .macro SAVE1x4 @@ -942,25 +942,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.2s[0] - OP_ii v18.4s, v3.4s, v9.2s[0] - OP_ri v19.4s, v2.4s, v9.2s[0] - OP_ir v19.4s, v3.4s, v8.2s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v22.4s, v2.4s, v8.2s[1] - OP_ii v22.4s, v3.4s, v9.2s[1] - OP_ri v23.4s, v2.4s, v9.2s[1] - OP_ir v23.4s, v3.4s, v8.2s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] .endm .macro SAVE8x2 @@ -1018,15 +1018,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] .endm .macro SAVE4x2 @@ -1066,15 +1066,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.2s[0] - OP_ii v16.2s, v1.2s, v9.2s[0] - OP_ri v17.2s, v0.2s, v9.2s[0] - OP_ir v17.2s, v1.2s, v8.2s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.2s[1] - OP_ii v20.2s, v1.2s, v9.2s[1] - OP_ri v21.2s, v0.2s, v9.2s[1] - OP_ir v21.2s, v1.2s, v8.2s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] .endm .macro SAVE2x2 @@ -1114,15 +1114,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.2s[0] - OP_ii s16, s1, v9.2s[0] - OP_ri s17, s0, v9.2s[0] - OP_ir s17, s1, v8.2s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.2s[1] - OP_ii s20, s1, v9.2s[1] - OP_ri s21, s0, v9.2s[1] - OP_ir s21, s1, v8.2s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] .endm .macro SAVE1x2 @@ -1164,15 +1164,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v8.4s[1] - OP_ri v17.4s, v0.4s, v8.4s[1] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v8.s[1] + OP_ri v17.4s, v0.4s, v8.s[1] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v8.4s[1] - OP_ri v19.4s, v2.4s, v8.4s[1] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v8.s[1] + OP_ri v19.4s, v2.4s, v8.s[1] + OP_ir v19.4s, v3.4s, v8.s[0] .endm .macro SAVE8x1 diff --git a/kernel/arm64/dgemm_kernel_4x4.S b/kernel/arm64/dgemm_kernel_4x4.S index e2ad11492..44b0f7ff2 100644 --- a/kernel/arm64/dgemm_kernel_4x4.S +++ b/kernel/arm64/dgemm_kernel_4x4.S @@ -161,150 +161,150 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldp q0, q1, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v29.2d, v1.2d, v11.2d[0] + fmul v16.2d, v0.2d, v8.d[0] + fmul v29.2d, v1.2d, v11.d[0] ldp q2, q3, [ppA] add ppA, ppA, #32 - fmul v20.2d, v0.2d, v9.2d[0] - fmul v25.2d, v1.2d, v10.2d[0] + fmul v20.2d, v0.2d, v9.d[0] + fmul v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - fmul v18.2d, v2.2d, v8.2d[0] - fmul v31.2d, v3.2d, v11.2d[0] + fmul v18.2d, v2.2d, v8.d[0] + fmul v31.2d, v3.2d, v11.d[0] prfm PLDL1KEEP, [ppA, #A_PRE_SIZE] - fmul v22.2d, v2.2d, v9.2d[0] - fmul v27.2d, v3.2d, v10.2d[0] + fmul v22.2d, v2.2d, v9.d[0] + fmul v27.2d, v3.2d, v10.d[0] ldp d12, d13, [pB] add pB, pB, #16 - fmul v24.2d, v0.2d, v10.2d[0] - fmul v21.2d, v1.2d, v9.2d[0] + fmul v24.2d, v0.2d, v10.d[0] + fmul v21.2d, v1.2d, v9.d[0] ldp q4, q5, [pA] // for next round add pA, pA, #32 - fmul v26.2d, v2.2d, v10.2d[0] - fmul v23.2d, v3.2d, v9.2d[0] + fmul v26.2d, v2.2d, v10.d[0] + fmul v23.2d, v3.2d, v9.d[0] ldp q6, q7, [ppA] // for next round add ppA, ppA, #32 - fmul v28.2d, v0.2d, v11.2d[0] - fmul v17.2d, v1.2d, v8.2d[0] + fmul v28.2d, v0.2d, v11.d[0] + fmul v17.2d, v1.2d, v8.d[0] ldp d14, d15, [pB] add pB, pB, #16 - fmul v30.2d, v2.2d, v11.2d[0] - fmul v19.2d, v3.2d, v8.2d[0] + fmul v30.2d, v2.2d, v11.d[0] + fmul v19.2d, v3.2d, v8.d[0] .endm .macro KERNEL8x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v15.d[0] ldp d8, d9, [pB] add pB, pB, #16 - fmla v18.2d, v6.2d, v12.2d[0] - fmla v31.2d, v7.2d, v15.2d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v31.2d, v7.2d, v15.d[0] ldp d10, d11, [pB] add pB, pB, #16 - fmla v20.2d, v4.2d, v13.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v25.2d, v5.2d, v14.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v22.2d, v6.2d, v13.2d[0] - fmla v27.2d, v7.2d, v14.2d[0] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] + fmla v22.2d, v6.2d, v13.d[0] + fmla v27.2d, v7.2d, v14.d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v21.2d, v5.2d, v13.d[0] ldp q0, q1, [pA] add pA, pA, #32 - fmla v26.2d, v6.2d, v14.2d[0] - fmla v23.2d, v7.2d, v13.2d[0] - fmla v28.2d, v4.2d, v15.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v26.2d, v6.2d, v14.d[0] + fmla v23.2d, v7.2d, v13.d[0] + fmla v28.2d, v4.2d, v15.d[0] + fmla v17.2d, v5.2d, v12.d[0] ldp q2, q3, [ppA] add ppA, ppA, #32 - fmla v30.2d, v6.2d, v15.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] + fmla v30.2d, v6.2d, v15.d[0] + fmla v19.2d, v7.2d, v12.d[0] .endm .macro KERNEL8x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v11.d[0] ldp d12, d13, [pB] add pB, pB, #16 - fmla 
v18.2d, v2.2d, v8.2d[0] - fmla v31.2d, v3.2d, v11.2d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v31.2d, v3.2d, v11.d[0] ldp d14, d15, [pB] add pB, pB, #16 - fmla v20.2d, v0.2d, v9.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] + fmla v20.2d, v0.2d, v9.d[0] + fmla v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - fmla v22.2d, v2.2d, v9.2d[0] - fmla v27.2d, v3.2d, v10.2d[0] + fmla v22.2d, v2.2d, v9.d[0] + fmla v27.2d, v3.2d, v10.d[0] prfm PLDL1KEEP, [ppA, #A_PRE_SIZE] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] + fmla v24.2d, v0.2d, v10.d[0] + fmla v21.2d, v1.2d, v9.d[0] ldp q4, q5, [pA] add pA, pA, #32 - fmla v26.2d, v2.2d, v10.2d[0] - fmla v23.2d, v3.2d, v9.2d[0] + fmla v26.2d, v2.2d, v10.d[0] + fmla v23.2d, v3.2d, v9.d[0] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v11.d[0] + fmla v17.2d, v1.2d, v8.d[0] ldp q6, q7, [ppA] add ppA, ppA, #32 - fmla v30.2d, v2.2d, v11.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v30.2d, v2.2d, v11.d[0] + fmla v19.2d, v3.2d, v8.d[0] .endm .macro KERNEL8x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v18.2d, v6.2d, v12.2d[0] - fmla v27.2d, v7.2d, v14.2d[0] + fmla v16.2d, v4.2d, v12.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v27.2d, v7.2d, v14.d[0] - fmla v20.2d, v4.2d, v13.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v22.2d, v6.2d, v13.2d[0] - fmla v31.2d, v7.2d, v15.2d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v29.2d, v5.2d, v15.d[0] + fmla v22.2d, v6.2d, v13.d[0] + fmla v31.2d, v7.2d, v15.d[0] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v26.2d, v6.2d, v14.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v26.2d, v6.2d, v14.d[0] + fmla v19.2d, v7.2d, v12.d[0] - fmla v28.2d, v4.2d, v15.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v30.2d, v6.2d, v15.2d[0] - fmla v23.2d, v7.2d, v13.2d[0] + fmla v28.2d, v4.2d, v15.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v30.2d, v6.2d, v15.d[0] + fmla v23.2d, v7.2d, v13.d[0] .endm .macro KERNEL8x4_SUB @@ -315,28 +315,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldp q0, q1, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v11.d[0] + fmla v20.2d, v0.2d, v9.d[0] + fmla v25.2d, v1.2d, v10.d[0] ldp q2, q3, [ppA] add ppA, ppA, #32 - fmla v24.2d, v0.2d, v10.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v24.2d, v0.2d, v10.d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v28.2d, v0.2d, v11.d[0] + fmla v17.2d, v1.2d, v8.d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v31.2d, v3.2d, v11.2d[0] - fmla v22.2d, v2.2d, v9.2d[0] - fmla v27.2d, v3.2d, v10.2d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v31.2d, v3.2d, v11.d[0] + fmla v22.2d, v2.2d, v9.d[0] + fmla v27.2d, v3.2d, v10.d[0] - fmla v26.2d, v2.2d, v10.2d[0] - fmla v23.2d, v3.2d, v9.2d[0] - fmla v30.2d, v2.2d, v11.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v26.2d, v2.2d, v10.d[0] + fmla v23.2d, v3.2d, v9.d[0] + fmla v30.2d, v2.2d, v11.d[0] + fmla v19.2d, v3.2d, v8.d[0] .endm .macro SAVE8x4 @@ -422,17 +422,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 @@ -482,10 +482,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 @@ -572,10 +572,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 @@ -610,8 +610,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 @@ -643,7 +643,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 @@ -674,8 +674,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 @@ -705,7 +705,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/dgemm_kernel_4x8.S b/kernel/arm64/dgemm_kernel_4x8.S old mode 100755 new mode 100644 index 88e9a773d..b04dbb5d5 --- a/kernel/arm64/dgemm_kernel_4x8.S +++ b/kernel/arm64/dgemm_kernel_4x8.S @@ -154,25 +154,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v17.2d, v1.2d, v8.2d[0] - fmul v18.2d, v0.2d, v8.2d[1] - fmul v19.2d, v1.2d, v8.2d[1] + fmul v16.2d, v0.2d, v8.d[0] + fmul v17.2d, v1.2d, v8.d[0] + fmul v18.2d, v0.2d, v8.d[1] + fmul v19.2d, v1.2d, v8.d[1] - fmul v20.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v9.2d[0] - fmul v22.2d, v0.2d, v9.2d[1] - fmul v23.2d, v1.2d, v9.2d[1] + fmul v20.2d, v0.2d, v9.d[0] + fmul v21.2d, v1.2d, v9.d[0] + fmul v22.2d, v0.2d, v9.d[1] + fmul v23.2d, v1.2d, v9.d[1] - fmul v24.2d, v0.2d, v10.2d[0] - fmul v25.2d, v1.2d, v10.2d[0] - fmul v26.2d, v0.2d, v10.2d[1] - fmul v27.2d, v1.2d, v10.2d[1] + fmul v24.2d, v0.2d, v10.d[0] + fmul v25.2d, v1.2d, v10.d[0] + fmul v26.2d, v0.2d, v10.d[1] + fmul v27.2d, v1.2d, v10.d[1] - fmul v28.2d, v0.2d, v11.2d[0] - fmul v29.2d, v1.2d, v11.2d[0] - fmul v30.2d, v0.2d, v11.2d[1] - fmul v31.2d, v1.2d, v11.2d[1] + fmul v28.2d, v0.2d, v11.d[0] + fmul v29.2d, v1.2d, v11.d[0] + fmul v30.2d, v0.2d, v11.d[1] + fmul v31.2d, v1.2d, v11.d[1] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -183,25 +183,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] - fmla v19.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] + fmla v19.2d, v1.2d, v8.d[1] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] - fmla v23.2d, v1.2d, v9.2d[1] + fmla v20.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] + fmla v23.2d, v1.2d, v9.d[1] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] - fmla v27.2d, v1.2d, v10.2d[1] + fmla v24.2d, v0.2d, v10.d[0] + fmla v25.2d, v1.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] + fmla v27.2d, v1.2d, v10.d[1] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] - fmla v31.2d, v1.2d, v11.2d[1] + fmla v28.2d, v0.2d, v11.d[0] + fmla v29.2d, v1.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] + fmla v31.2d, v1.2d, v11.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 @@ -214,25 +214,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v4.2d, v12.2d[1] - fmla v19.2d, v5.2d, v12.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v4.2d, v12.d[1] + fmla v19.2d, v5.2d, v12.d[1] - fmla v20.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v22.2d, v4.2d, v13.2d[1] - fmla v23.2d, v5.2d, v13.2d[1] + fmla v20.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v22.2d, v4.2d, v13.d[1] + fmla v23.2d, v5.2d, v13.d[1] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v26.2d, v4.2d, v14.2d[1] - fmla v27.2d, v5.2d, v14.2d[1] + fmla v24.2d, v4.2d, v14.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v26.2d, v4.2d, v14.d[1] + fmla v27.2d, v5.2d, v14.d[1] - fmla v28.2d, v4.2d, v15.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v30.2d, v4.2d, v15.2d[1] - fmla v31.2d, v5.2d, v15.2d[1] + fmla v28.2d, v4.2d, v15.d[0] + fmla v29.2d, v5.2d, v15.d[0] + fmla v30.2d, v4.2d, v15.d[1] + fmla v31.2d, v5.2d, v15.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 @@ -245,25 +245,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x8_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v4.2d, v12.2d[1] - fmla v19.2d, v5.2d, v12.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v4.2d, v12.d[1] + fmla v19.2d, v5.2d, v12.d[1] - fmla v20.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v22.2d, v4.2d, v13.2d[1] - fmla v23.2d, v5.2d, v13.2d[1] + fmla v20.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v22.2d, v4.2d, v13.d[1] + fmla v23.2d, v5.2d, v13.d[1] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v26.2d, v4.2d, v14.2d[1] - fmla v27.2d, v5.2d, v14.2d[1] + fmla v24.2d, v4.2d, v14.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v26.2d, v4.2d, v14.d[1] + fmla v27.2d, v5.2d, v14.d[1] - fmla v28.2d, v4.2d, v15.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v30.2d, v4.2d, v15.2d[1] - fmla v31.2d, v5.2d, v15.2d[1] + fmla v28.2d, v4.2d, v15.d[0] + fmla v29.2d, v5.2d, v15.d[0] + fmla v30.2d, v4.2d, v15.d[1] + fmla v31.2d, v5.2d, v15.d[1] .endm .macro KERNEL4x8_SUB @@ -274,25 +274,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] - fmla v19.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] + fmla v19.2d, v1.2d, v8.d[1] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] - fmla v23.2d, v1.2d, v9.2d[1] + fmla v20.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] + fmla v23.2d, v1.2d, v9.d[1] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] - fmla v27.2d, v1.2d, v10.2d[1] + fmla v24.2d, v0.2d, v10.d[0] + fmla v25.2d, v1.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] + fmla v27.2d, v1.2d, v10.d[1] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] - fmla v31.2d, v1.2d, v11.2d[1] + fmla v28.2d, v0.2d, v11.d[0] + fmla v29.2d, v1.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] + fmla v31.2d, v1.2d, v11.d[1] .endm .macro SAVE4x8 @@ -374,17 +374,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] + fmla v20.2d, v0.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] + fmla v24.2d, v0.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] + fmla v28.2d, v0.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] .endm .macro SAVE2x8 @@ -520,17 +520,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v29.2d, v1.2d, v9.2d[1] + fmul v16.2d, v0.2d, v8.d[0] + fmul v29.2d, v1.2d, v9.d[1] - fmul v20.2d, v0.2d, v8.2d[1] - fmul v25.2d, v1.2d, v9.2d[0] + fmul v20.2d, v0.2d, v8.d[1] + fmul v25.2d, v1.2d, v9.d[0] - fmul v24.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v8.2d[1] + fmul v24.2d, v0.2d, v9.d[0] + fmul v21.2d, v1.2d, v8.d[1] - fmul v28.2d, v0.2d, v9.2d[1] - fmul v17.2d, v1.2d, v8.2d[0] + fmul v28.2d, v0.2d, v9.d[1] + fmul v17.2d, v1.2d, v8.d[0] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -539,61 +539,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] ld1 {v4.2d, v5.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro KERNEL4x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] ld1 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_SUB @@ -602,17 +602,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 @@ -660,10 +660,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 @@ -746,10 +746,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 @@ -782,8 +782,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 @@ -813,7 +813,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 @@ -842,8 +842,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 @@ -871,7 +871,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/dgemm_kernel_8x4.S b/kernel/arm64/dgemm_kernel_8x4.S old mode 100755 new mode 100644 index a607fecc4..f3c3d5c35 --- a/kernel/arm64/dgemm_kernel_8x4.S +++ b/kernel/arm64/dgemm_kernel_8x4.S @@ -52,12 +52,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define alpha0 d10 #define alphaV0 v10.d[0] -#define alpha1 d11 -#define alphaV1 v11.d[0] -#define alpha2 d14 -#define alphaV2 v14.d[0] -#define alpha3 d15 -#define alphaV3 v15.d[0] + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 // 00 origM // 01 origN @@ -74,8 +72,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 -// 15 pA -// 16 +// 15 pCRow3 +// 16 pA // 17 // 18 must save // 19 must save @@ -100,14 +98,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v05 pA1_2, pA1_3 //v06 pA1_4, pA1_5 //v07 pA1_6, pA1_7 -//v08 must save pB0_0, pB0_1 -//v09 must save pB0_2, pB0_3 -//v10 must save ALPHA0 -//v11 must save ALPHA1 -//v12 must save pB1_0, pB1_1 -//v13 must save pB1_2, pB1_3 -//v14 must save ALPHA2 -//v15 must save ALPHA3 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 --> ALPHA0 +//v11 must save pB0_3 +//v12 must save pB1_0 +//v13 must save pB1_1 +//v14 must save pB1_2 +//v15 must save pB1_3 //v16 must save C00, C01 //v17 must save C02, C03 //v18 C04, C05 @@ -149,244 +147,257 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_I - ld1 {v0.2d, v1.2d}, [pA] - add pA, pA, #32 - ld1 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 - ldp d8, d9, [pB] - add pB, pB, #16 - ldp d10, d11, [pB] - add pB, pB, #16 + ldp q0, q1, [pA], #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v17.2d, v1.2d, v8.2d[0] + ldp d8, d9, [pB], #16 - fmul v18.2d, v2.2d, v8.2d[0] - fmul v19.2d, v3.2d, v8.2d[0] + fmul v16.2d, v0.2d, v8.d[0] + fmul v20.2d, v0.2d, v9.d[0] - fmul v20.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v9.2d[0] + ldp d10, d11, [pB], #16 - fmul v22.2d, v2.2d, v9.2d[0] - fmul v23.2d, v3.2d, v9.2d[0] + fmul v17.2d, v1.2d, v8.d[0] + fmul v21.2d, v1.2d, v9.d[0] - fmul v24.2d, v0.2d, v10.2d[0] - fmul v25.2d, v1.2d, v10.2d[0] + ldp q2, q3, [pA], #32 - fmul v26.2d, v2.2d, v10.2d[0] - fmul v27.2d, v3.2d, v10.2d[0] + fmul v24.2d, v0.2d, v10.d[0] + fmul v28.2d, v0.2d, v11.d[0] - fmul v28.2d, v0.2d, v11.2d[0] - fmul v29.2d, v1.2d, v11.2d[0] + ldp q4, q5, [pA], #32 - fmul v30.2d, v2.2d, v11.2d[0] - fmul v31.2d, v3.2d, v11.2d[0] + fmul v25.2d, v1.2d, v10.d[0] + fmul v29.2d, v1.2d, v11.d[0] - ld1 {v4.2d, v5.2d}, [pA] - add pA, pA, #32 - ld1 {v6.2d, v7.2d}, [pA] - add pA, pA, #32 - ldp d12, d13, [pB] - add pB, pB, #16 - ldp d14, d15, [pB] - add pB, pB, #16 + ldp d12, d13, [pB], #16 + + fmul v18.2d, v2.2d, v8.d[0] + fmul v22.2d, v2.2d, v9.d[0] + + ldp d14, d15, [pB], #16 + + fmul v26.2d, v2.2d, v10.d[0] + fmul v30.2d, v2.2d, v11.d[0] + + ldp q6, q7, [pA], #32 + + fmul v19.2d, v3.2d, v8.d[0] + fmul v27.2d, v3.2d, v10.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmul v31.2d, v3.2d, v11.d[0] + fmul v23.2d, v3.2d, v9.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL8x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v26.2d, v2.2d, v10.2d[0] - fmla v31.2d, v3.2d, v11.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v9.d[0] - ld1 {v4.2d}, [pA], #16 + ldp q4, q5, [pA], #32 - fmla v20.2d, v0.2d, v9.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v24.2d, v0.2d, v10.d[0] + fmla v28.2d, v0.2d, v11.d[0] - ld1 {v5.2d}, [pA], #16 + ldp d12, d13, [pB], #16 - fmla v30.2d, v2.2d, v11.2d[0] - fmla v27.2d, v3.2d, v10.2d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v25.2d, v1.2d, v10.d[0] - ldp d12, d13, [pB] - add pB, pB, #16 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v29.2d, v1.2d, v11.d[0] - ldp d14, d15, [pB] - add pB, pB, #16 + ldp d14, d15, [pB], #16 - fmla v18.2d, v2.2d, v8.2d[0] - fmla v23.2d, v3.2d, v9.2d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v22.2d, v2.2d, v9.d[0] - ld1 {v6.2d}, [pA], #16 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] + fmla v26.2d, v2.2d, v10.d[0] + fmla v30.2d, v2.2d, v11.d[0] + fmla v19.2d, v3.2d, v8.d[0] + fmla v23.2d, v3.2d, v9.d[0] - ld1 {v7.2d}, [pA], #16 + ldp q6, q7, [pA], #32 - fmla v22.2d, v2.2d, v9.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] - - prfm PLDL1KEEP, [pA, #224] - prfm PLDL1KEEP, [pA, #224+64] + fmla v27.2d, v3.2d, v10.d[0] + fmla v31.2d, v3.2d, v11.d[0] .endm .macro KERNEL8x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v26.2d, v6.2d, v14.2d[0] - fmla v31.2d, v7.2d, v15.2d[0] + fmla v16.2d, v4.2d, v12.d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v28.2d, v4.2d, v15.d[0] - ld1 {v0.2d}, [pA], #16 + ldp q0, q1, [pA], #32 - fmla v20.2d, v4.2d, v13.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v25.2d, v5.2d, v14.d[0] - ld1 {v1.2d}, [pA], #16 + 
ldp d8, d9, [pB], #16 - fmla v30.2d, v6.2d, v15.2d[0] - fmla v27.2d, v7.2d, v14.2d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v29.2d, v5.2d, v15.d[0] - ldp d8, d9, [pB] - add pB, pB, #16 + ldp d10, d11, [pB], #16 - fmla v28.2d, v4.2d, v15.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v22.2d, v6.2d, v13.d[0] - ldp d10, d11, [pB] - add pB, pB, #16 + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v22.2d, v6.2d, v13.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] + fmla v26.2d, v6.2d, v14.d[0] + fmla v30.2d, v6.2d, v15.d[0] - ld1 {v2.2d}, [pA], #16 + fmla v19.2d, v7.2d, v12.d[0] + fmla v23.2d, v7.2d, v13.d[0] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] + ldp q2, q3, [pA], #32 - ld1 {v3.2d}, [pA], #16 - - fmla v18.2d, v6.2d, v12.2d[0] - fmla v23.2d, v7.2d, v13.2d[0] - - prfm PLDL1KEEP, [pB, #640] + fmla v27.2d, v7.2d, v14.d[0] + fmla v31.2d, v7.2d, v15.d[0] .endm .macro KERNEL8x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v6.2d, v12.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] - fmla v20.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v22.2d, v6.2d, v13.2d[0] - fmla v23.2d, v7.2d, v13.2d[0] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v26.2d, v6.2d, v14.2d[0] - fmla v27.2d, v7.2d, v14.2d[0] - fmla v28.2d, v4.2d, v15.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v30.2d, v6.2d, v15.2d[0] - fmla v31.2d, v7.2d, v15.2d[0] + fmla v16.2d, v4.2d, v12.d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v28.2d, v4.2d, v15.d[0] + + fmla v17.2d, v5.2d, v12.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v29.2d, v5.2d, v15.d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + fmla v18.2d, v6.2d, v12.d[0] + fmla v22.2d, v6.2d, v13.d[0] + fmla v26.2d, v6.2d, v14.d[0] + fmla v30.2d, v6.2d, v15.d[0] + + fmla v19.2d, v7.2d, v12.d[0] + fmla v23.2d, v7.2d, v13.d[0] + fmla v27.2d, v7.2d, v14.d[0] + fmla v31.2d, v7.2d, v15.d[0] .endm .macro KERNEL8x4_SUB - ld1 {v0.2d, v1.2d}, [pA] - add pA, pA, #32 - ld1 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 - ldp d8, d9, [pB] - add pB, pB, #16 - ldp d10, d11, [pB] - add pB, pB, #16 + ldp q0, q1, [pA], #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + ldp d8, d9, [pB], #16 - fmla v20.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v22.2d, v2.2d, v9.2d[0] - fmla v23.2d, v3.2d, v9.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v9.d[0] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] - fmla v26.2d, v2.2d, v10.2d[0] - fmla v27.2d, v3.2d, v10.2d[0] + ldp d10, d11, [pB], #16 - fmla v28.2d, v0.2d, v11.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v30.2d, v2.2d, v11.2d[0] - fmla v31.2d, v3.2d, v11.2d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v21.2d, v1.2d, v9.d[0] + + ldp q2, q3, [pA], #32 + + fmla v24.2d, v0.2d, v10.d[0] + fmla v28.2d, v0.2d, v11.d[0] + + fmla v25.2d, v1.2d, v10.d[0] + fmla v29.2d, v1.2d, v11.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmla v18.2d, v2.2d, v8.d[0] + fmla v22.2d, v2.2d, v9.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + fmla v26.2d, v2.2d, v10.d[0] + fmla v30.2d, v2.2d, v11.d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + fmla v19.2d, v3.2d, v8.d[0] + fmla v27.2d, v3.2d, v10.d[0] + + fmla v31.2d, v3.2d, v11.d[0] + fmla v23.2d, v3.2d, v9.d[0] .endm .macro SAVE8x4 fmov alpha0, alpha - ld1 {v0.2d, v1.2d}, [pCRow0] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ldp q0, q1, [pCRow0] fmla v0.2d, v16.2d, alphaV0 
fmla v1.2d, v17.2d, alphaV0 - st1 {v0.2d, v1.2d}, [pCRow0] + stp q0, q1, [pCRow0] add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld1 {v2.2d, v3.2d}, [pCRow0] + ldp q2, q3, [pCRow0] fmla v2.2d, v18.2d, alphaV0 fmla v3.2d, v19.2d, alphaV0 - st1 {v2.2d, v3.2d}, [pCRow0] + stp q2, q3, [pCRow0] add pCRow0, pCRow0, #32 - ld1 {v4.2d, v5.2d}, [pCRow1] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ldp q4, q5, [pCRow1] fmla v4.2d, v20.2d, alphaV0 fmla v5.2d, v21.2d, alphaV0 - st1 {v4.2d, v5.2d}, [pCRow1] + stp q4, q5, [pCRow1] add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - ld1 {v6.2d, v7.2d}, [pCRow1] + ldp q6, q7, [pCRow1] fmla v6.2d, v22.2d, alphaV0 fmla v7.2d, v23.2d, alphaV0 - st1 {v6.2d, v7.2d}, [pCRow1] + stp q6, q7, [pCRow1] add pCRow1, pCRow1, #32 - ld1 {v0.2d, v1.2d}, [pCRow2] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + ldp q0, q1, [pCRow2] fmla v0.2d, v24.2d, alphaV0 fmla v1.2d, v25.2d, alphaV0 - st1 {v0.2d, v1.2d}, [pCRow2] + stp q0, q1, [pCRow2] add pCRow2, pCRow2, #32 - ld1 {v2.2d, v3.2d}, [pCRow2] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + ldp q2, q3, [pCRow2] fmla v2.2d, v26.2d, alphaV0 fmla v3.2d, v27.2d, alphaV0 - st1 {v2.2d, v3.2d}, [pCRow2] + stp q2, q3, [pCRow2] add pCRow2, pCRow2, #32 - ld1 {v4.2d, v5.2d}, [pCRow3] + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + ldp q4, q5, [pCRow3] fmla v4.2d, v28.2d, alphaV0 fmla v5.2d, v29.2d, alphaV0 - st1 {v4.2d, v5.2d}, [pCRow3] + stp q4, q5, [pCRow3] add pCRow3, pCRow3, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - ld1 {v6.2d, v7.2d}, [pCRow3] + ldp q6, q7, [pCRow3] fmla v6.2d, v30.2d, alphaV0 fmla v7.2d, v31.2d, alphaV0 - st1 {v6.2d, v7.2d}, [pCRow3] + stp q6, q7, [pCRow3] add pCRow3, pCRow3, #32 - - prfm PLDL2KEEP, [pCRow0, #128] - prfm PLDL2KEEP, [pCRow1, #128] - prfm PLDL2KEEP, [pCRow2, #128] - prfm PLDL2KEEP, [pCRow3, #128] .endm /******************************************************************************/ @@ -408,44 +419,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 + fmov alpha0, alpha ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 - fmla v9.2d, v17.2d, alphaV1 + fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2d, v13.2d}, [pCRow1] - fmla v12.2d, v20.2d, alphaV2 - fmla v13.2d, v21.2d, alphaV3 + fmla v12.2d, v20.2d, alphaV0 + fmla v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow2, pCRow1, LDC ld1 {v8.2d, v9.2d}, [pCRow2] fmla v8.2d, v24.2d, alphaV0 - fmla v9.2d, v25.2d, alphaV1 + fmla v9.2d, v25.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow2] add pCRow1, pCRow2, LDC ld1 {v12.2d, v13.2d}, [pCRow1] - fmla v12.2d, v28.2d, alphaV2 - fmla v13.2d, v29.2d, alphaV3 + fmla v12.2d, v28.2d, alphaV0 + fmla v13.2d, v29.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -467,13 +479,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 + fmov alpha0, alpha ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] @@ -481,19 +494,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pCRow1, pCRow0, LDC ld1 {v12.2d}, [pCRow1] - fmla v12.2d, v20.2d, alphaV1 + fmla v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow2, pCRow1, LDC ld1 {v8.2d}, [pCRow2] - fmla v8.2d, v24.2d, alphaV2 + fmla v8.2d, v24.2d, alphaV0 st1 {v8.2d}, [pCRow2] add pCRow1, pCRow2, LDC ld1 {v12.2d}, [pCRow1] - fmla v12.2d, v28.2d, alphaV3 + fmla v12.2d, v28.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 @@ -518,6 +531,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 + fmov alpha0, alpha add pCRow1, pCRow0, LDC ld1 {v8.d}[0], [pCRow0] @@ -531,7 +545,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v12.d}[0], [pCRow2] ld1 {v12.d}[1], [pCRow1] - fmla v12.2d, v20.2d, alphaV1 + fmla v12.2d, v20.2d, alphaV0 st1 {v12.d}[0], [pCRow2] st1 {v12.d}[1], [pCRow1] @@ -559,32 +573,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] - fmla v22.2d, v2.2d, v8.2d[1] - fmla v23.2d, v3.2d, v8.2d[1] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] + fmla v22.2d, v2.2d, v8.d[1] + fmla v23.2d, v3.2d, v8.d[1] .endm .macro SAVE8x2 + fmov alpha0, alpha add pCRow1, pCRow0, LDC ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] fmla v0.2d, v16.2d, alphaV0 - fmla v1.2d, v17.2d, alphaV1 - fmla v2.2d, v18.2d, alphaV2 - fmla v3.2d, v19.2d, alphaV3 + fmla v1.2d, v17.2d, alphaV0 + fmla v2.2d, v18.2d, alphaV0 + fmla v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0 - fmla v5.2d, v21.2d, alphaV1 - fmla v6.2d, v22.2d, alphaV2 - fmla v7.2d, v23.2d, alphaV3 + fmla v5.2d, v21.2d, alphaV0 + fmla v6.2d, v22.2d, alphaV0 + fmla v7.2d, v23.2d, alphaV0 st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] add pCRow0, pCRow0, #64 @@ -605,23 +620,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 + fmov alpha0, alpha ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 - fmla v9.2d, v17.2d, alphaV1 + fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2d, v13.2d}, [pCRow1] - fmla v12.2d, v20.2d, alphaV2 - fmla v13.2d, v21.2d, alphaV3 + fmla v12.2d, v20.2d, alphaV0 + fmla v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -641,11 +657,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 + fmov alpha0, alpha ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] @@ -653,7 +670,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pCRow1 , pCRow0, LDC ld1 {v12.2d}, [pCRow1] - fmla v12.2d, v20.2d, alphaV1 + fmla v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 @@ -672,10 +689,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 + fmov alpha0, alpha add pCRow1 , pCRow0, LDC ld1 {v8.d}[0], [pCRow0] @@ -706,18 +724,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] .endm .macro SAVE8x1 + fmov alpha0, alpha ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] fmla v0.2d, v16.2d, alphaV0 - fmla v1.2d, v17.2d, alphaV1 - fmla v2.2d, v18.2d, alphaV2 - fmla v3.2d, v19.2d, alphaV3 + fmla v1.2d, v17.2d, alphaV0 + fmla v2.2d, v18.2d, alphaV0 + fmla v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] add pCRow0, pCRow0, #64 @@ -738,14 +757,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 + fmov alpha0, alpha ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 - fmla v9.2d, v17.2d, alphaV1 + fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow0, pCRow0, #32 @@ -765,10 +785,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 + fmov alpha0, alpha ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] @@ -793,6 +814,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 + fmov alpha0, alpha ldr d8, [pCRow0] fmadd d8, d16, alpha0, d8 str d8, [pCRow0] @@ -820,6 +842,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + fmov alpha, d0 lsl LDC, LDC, #3 // ldc = ldc * 8 @@ -838,6 +863,7 @@ dgemm_kernel_L4_BEGIN: add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC add pCRow3, pCRow2, LDC + add pC, pCRow3, LDC mov pA, origPA // pA = start of A array @@ -849,6 +875,7 @@ dgemm_kernel_L4_M8_BEGIN: cmp counterI, #0 ble dgemm_kernel_L4_M4_BEGIN + .align 5 dgemm_kernel_L4_M8_20: mov pB, origPB @@ -868,8 +895,8 @@ dgemm_kernel_L4_M8_20: subs counterL, counterL, #2 // subtract 2 ble dgemm_kernel_L4_M8_22a - .align 5 + .align 5 dgemm_kernel_L4_M8_22: KERNEL8x4_M1 @@ -884,7 +911,7 @@ dgemm_kernel_L4_M8_22: subs counterL, counterL, #1 bgt dgemm_kernel_L4_M8_22 - + .align 5 dgemm_kernel_L4_M8_22a: KERNEL8x4_M1 @@ -898,6 +925,7 @@ dgemm_kernel_L4_M8_22a: b dgemm_kernel_L4_M8_44 + .align 5 dgemm_kernel_L4_M8_32: tst counterL, #1 @@ -923,6 +951,7 @@ dgemm_kernel_L4_M8_44: ands counterL , origK, #7 ble dgemm_kernel_L4_M8_100 + .align 5 dgemm_kernel_L4_M8_46: KERNEL8x4_SUB @@ -931,6 +960,9 @@ dgemm_kernel_L4_M8_46: bne dgemm_kernel_L4_M8_46 dgemm_kernel_L4_M8_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] SAVE8x4 diff --git a/kernel/arm64/dtrmm_kernel_4x4.S b/kernel/arm64/dtrmm_kernel_4x4.S index 0d1b12881..34fb8c233 100644 --- a/kernel/arm64/dtrmm_kernel_4x4.S +++ b/kernel/arm64/dtrmm_kernel_4x4.S @@ -147,17 +147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v29.2d, v1.2d, v9.2d[1] + fmul v16.2d, v0.2d, v8.d[0] + fmul v29.2d, v1.2d, v9.d[1] - fmul v20.2d, v0.2d, v8.2d[1] - fmul v25.2d, v1.2d, v9.2d[0] + fmul v20.2d, v0.2d, v8.d[1] + fmul v25.2d, v1.2d, v9.d[0] - fmul v24.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v8.2d[1] + fmul v24.2d, v0.2d, v9.d[0] + fmul v21.2d, v1.2d, v8.d[1] - fmul v28.2d, v0.2d, v9.2d[1] - fmul v17.2d, v1.2d, v8.2d[0] + fmul v28.2d, v0.2d, v9.d[1] + fmul v17.2d, v1.2d, v8.d[0] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -166,61 +166,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] ld1 {v4.2d, v5.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro KERNEL4x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] ld1 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_SUB @@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 @@ -283,10 +283,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 @@ -361,10 +361,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 @@ -395,8 +395,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 @@ -424,7 +424,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 @@ -451,8 +451,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 @@ -479,7 +479,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/dtrmm_kernel_4x8.S b/kernel/arm64/dtrmm_kernel_4x8.S old mode 100755 new mode 100644 index eb7397faa..4aecf28eb --- a/kernel/arm64/dtrmm_kernel_4x8.S +++ b/kernel/arm64/dtrmm_kernel_4x8.S @@ -157,25 +157,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v17.2d, v1.2d, v8.2d[0] - fmul v18.2d, v0.2d, v8.2d[1] - fmul v19.2d, v1.2d, v8.2d[1] + fmul v16.2d, v0.2d, v8.d[0] + fmul v17.2d, v1.2d, v8.d[0] + fmul v18.2d, v0.2d, v8.d[1] + fmul v19.2d, v1.2d, v8.d[1] - fmul v20.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v9.2d[0] - fmul v22.2d, v0.2d, v9.2d[1] - fmul v23.2d, v1.2d, v9.2d[1] + fmul v20.2d, v0.2d, v9.d[0] + fmul v21.2d, v1.2d, v9.d[0] + fmul v22.2d, v0.2d, v9.d[1] + fmul v23.2d, v1.2d, v9.d[1] - fmul v24.2d, v0.2d, v10.2d[0] - fmul v25.2d, v1.2d, v10.2d[0] - fmul v26.2d, v0.2d, v10.2d[1] - fmul v27.2d, v1.2d, v10.2d[1] + fmul v24.2d, v0.2d, v10.d[0] + fmul v25.2d, v1.2d, v10.d[0] + fmul v26.2d, v0.2d, v10.d[1] + fmul v27.2d, v1.2d, v10.d[1] - fmul v28.2d, v0.2d, v11.2d[0] - fmul v29.2d, v1.2d, v11.2d[0] - fmul v30.2d, v0.2d, v11.2d[1] - fmul v31.2d, v1.2d, v11.2d[1] + fmul v28.2d, v0.2d, v11.d[0] + fmul v29.2d, v1.2d, v11.d[0] + fmul v30.2d, v0.2d, v11.d[1] + fmul v31.2d, v1.2d, v11.d[1] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -186,25 +186,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] - fmla v19.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] + fmla v19.2d, v1.2d, v8.d[1] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] - fmla v23.2d, v1.2d, v9.2d[1] + fmla v20.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] + fmla v23.2d, v1.2d, v9.d[1] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] - fmla v27.2d, v1.2d, v10.2d[1] + fmla v24.2d, v0.2d, v10.d[0] + fmla v25.2d, v1.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] + fmla v27.2d, v1.2d, v10.d[1] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] - fmla v31.2d, v1.2d, v11.2d[1] + fmla v28.2d, v0.2d, v11.d[0] + fmla v29.2d, v1.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] + fmla v31.2d, v1.2d, v11.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 @@ -217,25 +217,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x8_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v4.2d, v12.2d[1] - fmla v19.2d, v5.2d, v12.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v4.2d, v12.d[1] + fmla v19.2d, v5.2d, v12.d[1] - fmla v20.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v22.2d, v4.2d, v13.2d[1] - fmla v23.2d, v5.2d, v13.2d[1] + fmla v20.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v22.2d, v4.2d, v13.d[1] + fmla v23.2d, v5.2d, v13.d[1] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v26.2d, v4.2d, v14.2d[1] - fmla v27.2d, v5.2d, v14.2d[1] + fmla v24.2d, v4.2d, v14.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v26.2d, v4.2d, v14.d[1] + fmla v27.2d, v5.2d, v14.d[1] - fmla v28.2d, v4.2d, v15.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v30.2d, v4.2d, v15.2d[1] - fmla v31.2d, v5.2d, v15.2d[1] + fmla v28.2d, v4.2d, v15.d[0] + fmla v29.2d, v5.2d, v15.d[0] + fmla v30.2d, v4.2d, v15.d[1] + fmla v31.2d, v5.2d, v15.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 @@ -248,25 +248,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v4.2d, v12.2d[1] - fmla v19.2d, v5.2d, v12.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v4.2d, v12.d[1] + fmla v19.2d, v5.2d, v12.d[1] - fmla v20.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v22.2d, v4.2d, v13.2d[1] - fmla v23.2d, v5.2d, v13.2d[1] + fmla v20.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v22.2d, v4.2d, v13.d[1] + fmla v23.2d, v5.2d, v13.d[1] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v26.2d, v4.2d, v14.2d[1] - fmla v27.2d, v5.2d, v14.2d[1] + fmla v24.2d, v4.2d, v14.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v26.2d, v4.2d, v14.d[1] + fmla v27.2d, v5.2d, v14.d[1] - fmla v28.2d, v4.2d, v15.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v30.2d, v4.2d, v15.2d[1] - fmla v31.2d, v5.2d, v15.2d[1] + fmla v28.2d, v4.2d, v15.d[0] + fmla v29.2d, v5.2d, v15.d[0] + fmla v30.2d, v4.2d, v15.d[1] + fmla v31.2d, v5.2d, v15.d[1] .endm .macro KERNEL4x8_SUB @@ -277,25 +277,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] - fmla v19.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] + fmla v19.2d, v1.2d, v8.d[1] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] - fmla v23.2d, v1.2d, v9.2d[1] + fmla v20.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] + fmla v23.2d, v1.2d, v9.d[1] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] - fmla v27.2d, v1.2d, v10.2d[1] + fmla v24.2d, v0.2d, v10.d[0] + fmla v25.2d, v1.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] + fmla v27.2d, v1.2d, v10.d[1] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] - fmla v31.2d, v1.2d, v11.2d[1] + fmla v28.2d, v0.2d, v11.d[0] + fmla v29.2d, v1.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] + fmla v31.2d, v1.2d, v11.d[1] .endm .macro SAVE4x8 @@ -369,17 +369,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] + fmla v20.2d, v0.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] + fmla v24.2d, v0.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] + fmla v28.2d, v0.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] .endm .macro SAVE2x8 @@ -499,17 +499,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v29.2d, v1.2d, v9.2d[1] + fmul v16.2d, v0.2d, v8.d[0] + fmul v29.2d, v1.2d, v9.d[1] - fmul v20.2d, v0.2d, v8.2d[1] - fmul v25.2d, v1.2d, v9.2d[0] + fmul v20.2d, v0.2d, v8.d[1] + fmul v25.2d, v1.2d, v9.d[0] - fmul v24.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v8.2d[1] + fmul v24.2d, v0.2d, v9.d[0] + fmul v21.2d, v1.2d, v8.d[1] - fmul v28.2d, v0.2d, v9.2d[1] - fmul v17.2d, v1.2d, v8.2d[0] + fmul v28.2d, v0.2d, v9.d[1] + fmul v17.2d, v1.2d, v8.d[0] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -518,61 +518,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] ld1 {v4.2d, v5.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro KERNEL4x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] ld1 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_SUB @@ -581,17 +581,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 @@ -635,10 +635,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 @@ -713,10 +713,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 @@ -747,8 +747,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 @@ -776,7 +776,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 @@ -803,8 +803,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 @@ -831,7 +831,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S old mode 100755 new mode 100644 index 6890505bd..b06c7560d --- a/kernel/arm64/dtrmm_kernel_8x4.S +++ b/kernel/arm64/dtrmm_kernel_8x4.S @@ -157,25 +157,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v17.2d, v1.2d, v8.2d[0] - fmul v18.2d, v2.2d, v8.2d[0] - fmul v19.2d, v3.2d, v8.2d[0] + fmul v16.2d, v0.2d, v8.d[0] + fmul v17.2d, v1.2d, v8.d[0] + fmul v18.2d, v2.2d, v8.d[0] + fmul v19.2d, v3.2d, v8.d[0] - fmul v20.2d, v0.2d, v8.2d[1] - fmul v21.2d, v1.2d, v8.2d[1] - fmul v22.2d, v2.2d, v8.2d[1] - fmul v23.2d, v3.2d, v8.2d[1] + fmul v20.2d, v0.2d, v8.d[1] + fmul v21.2d, v1.2d, v8.d[1] + fmul v22.2d, v2.2d, v8.d[1] + fmul v23.2d, v3.2d, v8.d[1] - fmul v24.2d, v0.2d, v9.2d[0] - fmul v25.2d, v1.2d, v9.2d[0] - fmul v26.2d, v2.2d, v9.2d[0] - fmul v27.2d, v3.2d, v9.2d[0] + fmul v24.2d, v0.2d, v9.d[0] + fmul v25.2d, v1.2d, v9.d[0] + fmul v26.2d, v2.2d, v9.d[0] + fmul v27.2d, v3.2d, v9.d[0] - fmul v28.2d, v0.2d, v9.2d[1] - fmul v29.2d, v1.2d, v9.2d[1] - fmul v30.2d, v2.2d, v9.2d[1] - fmul v31.2d, v3.2d, v9.2d[1] + fmul v28.2d, v0.2d, v9.d[1] + fmul v29.2d, v1.2d, v9.d[1] + fmul v30.2d, v2.2d, v9.d[1] + fmul v31.2d, v3.2d, v9.d[1] ld1 {v4.2d, v5.2d}, [pA] add pA, pA, #32 @@ -186,25 +186,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] - fmla v22.2d, v2.2d, v8.2d[1] - fmla v23.2d, v3.2d, v8.2d[1] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] + fmla v22.2d, v2.2d, v8.d[1] + fmla v23.2d, v3.2d, v8.d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v25.2d, v1.2d, v9.2d[0] - fmla v26.2d, v2.2d, v9.2d[0] - fmla v27.2d, v3.2d, v9.2d[0] + fmla v24.2d, v0.2d, v9.d[0] + fmla v25.2d, v1.2d, v9.d[0] + fmla v26.2d, v2.2d, v9.d[0] + fmla v27.2d, v3.2d, v9.d[0] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v29.2d, v1.2d, v9.2d[1] - fmla v30.2d, v2.2d, v9.2d[1] - fmla v31.2d, v3.2d, v9.2d[1] + fmla v28.2d, v0.2d, v9.d[1] + fmla v29.2d, v1.2d, v9.d[1] + fmla v30.2d, v2.2d, v9.d[1] + fmla v31.2d, v3.2d, v9.d[1] ld1 {v4.2d, v5.2d}, [pA] add pA, pA, #32 @@ -217,25 +217,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v6.2d, v12.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v19.2d, v7.2d, v12.d[0] - fmla v20.2d, v4.2d, v12.2d[1] - fmla v21.2d, v5.2d, v12.2d[1] - fmla v22.2d, v6.2d, v12.2d[1] - fmla v23.2d, v7.2d, v12.2d[1] + fmla v20.2d, v4.2d, v12.d[1] + fmla v21.2d, v5.2d, v12.d[1] + fmla v22.2d, v6.2d, v12.d[1] + fmla v23.2d, v7.2d, v12.d[1] - fmla v24.2d, v4.2d, v13.2d[0] - fmla v25.2d, v5.2d, v13.2d[0] - fmla v26.2d, v6.2d, v13.2d[0] - fmla v27.2d, v7.2d, v13.2d[0] + fmla v24.2d, v4.2d, v13.d[0] + fmla v25.2d, v5.2d, v13.d[0] + fmla v26.2d, v6.2d, v13.d[0] + fmla v27.2d, v7.2d, v13.d[0] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v29.2d, v5.2d, v13.2d[1] - fmla v30.2d, v6.2d, v13.2d[1] - fmla v31.2d, v7.2d, v13.2d[1] + fmla v28.2d, v4.2d, v13.d[1] + fmla v29.2d, v5.2d, v13.d[1] + fmla v30.2d, v6.2d, v13.d[1] + fmla v31.2d, v7.2d, v13.d[1] ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 @@ -248,25 +248,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v6.2d, v12.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v19.2d, v7.2d, v12.d[0] - fmla v20.2d, v4.2d, v12.2d[1] - fmla v21.2d, v5.2d, v12.2d[1] - fmla v22.2d, v6.2d, v12.2d[1] - fmla v23.2d, v7.2d, v12.2d[1] + fmla v20.2d, v4.2d, v12.d[1] + fmla v21.2d, v5.2d, v12.d[1] + fmla v22.2d, v6.2d, v12.d[1] + fmla v23.2d, v7.2d, v12.d[1] - fmla v24.2d, v4.2d, v13.2d[0] - fmla v25.2d, v5.2d, v13.2d[0] - fmla v26.2d, v6.2d, v13.2d[0] - fmla v27.2d, v7.2d, v13.2d[0] + fmla v24.2d, v4.2d, v13.d[0] + fmla v25.2d, v5.2d, v13.d[0] + fmla v26.2d, v6.2d, v13.d[0] + fmla v27.2d, v7.2d, v13.d[0] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v29.2d, v5.2d, v13.2d[1] - fmla v30.2d, v6.2d, v13.2d[1] - fmla v31.2d, v7.2d, v13.2d[1] + fmla v28.2d, v4.2d, v13.d[1] + fmla v29.2d, v5.2d, v13.d[1] + fmla v30.2d, v6.2d, v13.d[1] + fmla v31.2d, v7.2d, v13.d[1] .endm .macro KERNEL8x4_SUB @@ -277,25 +277,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] - fmla v22.2d, v2.2d, v8.2d[1] - fmla v23.2d, v3.2d, v8.2d[1] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] + fmla v22.2d, v2.2d, v8.d[1] + fmla v23.2d, v3.2d, v8.d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v25.2d, v1.2d, v9.2d[0] - fmla v26.2d, v2.2d, v9.2d[0] - fmla v27.2d, v3.2d, v9.2d[0] + fmla v24.2d, v0.2d, v9.d[0] + fmla v25.2d, v1.2d, v9.d[0] + fmla v26.2d, v2.2d, v9.d[0] + fmla v27.2d, v3.2d, v9.d[0] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v29.2d, v1.2d, v9.2d[1] - fmla v30.2d, v2.2d, v9.2d[1] - fmla v31.2d, v3.2d, v9.2d[1] + fmla v28.2d, v0.2d, v9.d[1] + fmla v29.2d, v1.2d, v9.d[1] + fmla v30.2d, v2.2d, v9.d[1] + fmla v31.2d, v3.2d, v9.d[1] .endm .macro SAVE8x4 @@ -351,17 +351,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 @@ -406,10 +406,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 @@ -490,15 +490,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] - fmla v22.2d, v2.2d, v8.2d[1] - fmla v23.2d, v3.2d, v8.2d[1] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] + fmla v22.2d, v2.2d, v8.d[1] + fmla v23.2d, v3.2d, v8.d[1] .endm .macro SAVE8x2 @@ -534,10 +534,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 @@ -568,8 +568,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 @@ -597,7 +597,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 @@ -629,10 +629,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] .endm .macro SAVE8x1 @@ -660,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 @@ -686,7 +686,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/sgemm_kernel_16x4.S b/kernel/arm64/sgemm_kernel_16x4.S index 22b55b01c..68366d9f2 100644 --- a/kernel/arm64/sgemm_kernel_16x4.S +++ b/kernel/arm64/sgemm_kernel_16x4.S @@ -158,25 +158,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v3.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v18.4s, v2.4s, v8.2s[0] - fmul v19.4s, v3.4s, v8.2s[0] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v18.4s, v2.4s, v8.s[0] + fmul v19.4s, v3.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v22.4s, v2.4s, v8.2s[1] - fmul v23.4s, v3.4s, v8.2s[1] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v22.4s, v2.4s, v8.s[1] + fmul v23.4s, v3.4s, v8.s[1] - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v26.4s, v2.4s, v9.2s[0] - fmul v27.4s, v3.4s, v9.2s[0] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v26.4s, v2.4s, v9.s[0] + fmul v27.4s, v3.4s, v9.s[0] - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] - fmul v30.4s, v2.4s, v9.2s[1] - fmul v31.4s, v3.4s, v9.2s[1] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] + fmul v30.4s, v2.4s, v9.s[1] + fmul v31.4s, v3.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -191,25 +191,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL16x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v26.4s, v2.4s, v9.2s[0] - fmla v27.4s, v3.4s, v9.2s[0] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v26.4s, v2.4s, v9.s[0] + fmla v27.4s, v3.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] - fmla v30.4s, v2.4s, v9.2s[1] - fmla v31.4s, v3.4s, v9.2s[1] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] + fmla v30.4s, v2.4s, v9.s[1] + fmla v31.4s, v3.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -224,25 +224,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL16x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v18.4s, v6.4s, v12.2s[0] - fmla v19.4s, v7.4s, v12.2s[0] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v18.4s, v6.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v22.4s, v6.4s, v12.2s[1] - fmla v23.4s, v7.4s, v12.2s[1] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v22.4s, v6.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v26.4s, v6.4s, v13.2s[0] - fmla v27.4s, v7.4s, v13.2s[0] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v26.4s, v6.4s, v13.s[0] + fmla v27.4s, v7.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] - fmla v30.4s, v6.4s, v13.2s[1] - fmla v31.4s, v7.4s, v13.2s[1] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] + fmla v30.4s, v6.4s, v13.s[1] + fmla v31.4s, v7.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -257,25 +257,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL16x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v18.4s, v6.4s, v12.2s[0] - fmla v19.4s, v7.4s, v12.2s[0] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v18.4s, v6.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v22.4s, v6.4s, v12.2s[1] - fmla v23.4s, v7.4s, v12.2s[1] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v22.4s, v6.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v26.4s, v6.4s, v13.2s[0] - fmla v27.4s, v7.4s, v13.2s[0] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v26.4s, v6.4s, v13.s[0] + fmla v27.4s, v7.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] - fmla v30.4s, v6.4s, v13.2s[1] - fmla v31.4s, v7.4s, v13.2s[1] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] + fmla v30.4s, v6.4s, v13.s[1] + fmla v31.4s, v7.4s, v13.s[1] .endm .macro KERNEL16x4_SUB @@ -290,25 +290,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v26.4s, v2.4s, v9.2s[0] - fmla v27.4s, v3.4s, v9.2s[0] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v26.4s, v2.4s, v9.s[0] + fmla v27.4s, v3.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] - fmla v30.4s, v2.4s, v9.2s[1] - fmla v31.4s, v3.4s, v9.2s[1] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] + fmla v30.4s, v2.4s, v9.s[1] + fmla v31.4s, v3.4s, v9.s[1] .endm .macro SAVE16x4 @@ -370,14 +370,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -388,14 +388,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -406,14 +406,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -424,14 +424,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] .endm .macro KERNEL8x4_SUB @@ -442,14 +442,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] .endm .macro SAVE8x4 @@ -501,17 +501,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmul v16.2s, v0.2s, v8.2s[0] - fmul v29.2s, v1.2s, v9.2s[1] + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] - fmul v20.2s, v0.2s, v8.2s[1] - fmul v25.2s, v1.2s, v9.2s[0] + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] - fmul v24.2s, v0.2s, v9.2s[0] - fmul v21.2s, v1.2s, v8.2s[1] + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] - fmul v28.2s, v0.2s, v9.2s[1] - fmul v17.2s, v1.2s, v8.2s[0] + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -520,61 +520,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB @@ -583,17 +583,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -638,10 +638,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -729,15 +729,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] .endm .macro SAVE16x2 @@ -777,11 +777,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] .endm .macro SAVE8x2 @@ -817,10 +817,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -852,8 +852,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -882,7 +882,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA] add pA, pA, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -918,10 +918,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] .endm .macro SAVE16x1 @@ -951,8 +951,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] .endm .macro SAVE8x1 @@ -978,8 +978,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -1004,7 +1004,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA , pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/sgemm_kernel_4x4.S b/kernel/arm64/sgemm_kernel_4x4.S index bfa80d589..a5cf7baff 100644 --- a/kernel/arm64/sgemm_kernel_4x4.S +++ b/kernel/arm64/sgemm_kernel_4x4.S @@ -192,164 +192,164 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.4s}, [pA_0] add pA_0, pA_0, #16 - fmul v16.4s, v0.4s, v8.4s[0] - fmul v20.4s, v0.4s, v8.4s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v20.4s, v0.4s, v8.s[1] ld1 {v2.4s}, [pA_1] add pA_1, pA_1, #16 - fmul v24.4s, v0.4s, v8.4s[2] - fmul v28.4s, v0.4s, v8.4s[3] + fmul v24.4s, v0.4s, v8.s[2] + fmul v28.4s, v0.4s, v8.s[3] ld1 {v4.4s}, [pA_2] add pA_2, pA_2, #16 - fmul v17.4s, v2.4s, v8.4s[0] - fmul v21.4s, v2.4s, v8.4s[1] + fmul v17.4s, v2.4s, v8.s[0] + fmul v21.4s, v2.4s, v8.s[1] ld1 {v6.4s}, [pA_3] add pA_3, pA_3, #16 - fmul v25.4s, v2.4s, v8.4s[2] - fmul v29.4s, v2.4s, v8.4s[3] + fmul v25.4s, v2.4s, v8.s[2] + fmul v29.4s, v2.4s, v8.s[3] ld1 {v12.4s}, [pB] // for next round add pB, pB, #16 - fmul v18.4s, v4.4s, v8.4s[0] - fmul v19.4s, v6.4s, v8.4s[0] + fmul v18.4s, v4.4s, v8.s[0] + fmul v19.4s, v6.4s, v8.s[0] ld1 {v1.4s}, [pA_0] // for next round add pA_0, pA_0, #16 - fmul v22.4s, v4.4s, v8.4s[1] - fmul v23.4s, v6.4s, v8.4s[1] + fmul v22.4s, v4.4s, v8.s[1] + fmul v23.4s, v6.4s, v8.s[1] ld1 {v3.4s}, [pA_1] // for next round add pA_1, pA_1, #16 - fmul v26.4s, v4.4s, v8.4s[2] - fmul v27.4s, v6.4s, v8.4s[2] + fmul v26.4s, v4.4s, v8.s[2] + fmul v27.4s, v6.4s, v8.s[2] ld1 {v5.4s}, [pA_2] // for next round add pA_2, pA_2, #16 - fmul v30.4s, v4.4s, v8.4s[3] - fmul v31.4s, v6.4s, v8.4s[3] + fmul v30.4s, v4.4s, v8.s[3] + fmul v31.4s, v6.4s, v8.s[3] ld1 {v7.4s}, [pA_3] // for next round add pA_3, pA_3, #16 .endm .macro KERNEL16x4_M2 - fmla v16.4s, v1.4s, v12.4s[0] - fmla v17.4s, v3.4s, v12.4s[0] + fmla v16.4s, v1.4s, v12.s[0] + fmla v17.4s, v3.4s, v12.s[0] ld1 {v8.4s}, [pB] // for next round add pB, pB, #16 - fmla v18.4s, v5.4s, v12.4s[0] - fmla v19.4s, v7.4s, v12.4s[0] + fmla v18.4s, v5.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] ld1 {v0.4s}, [pA_0] // for next round add pA_0, pA_0, #16 - fmla v20.4s, v1.4s, v12.4s[1] - fmla v21.4s, v3.4s, v12.4s[1] + fmla v20.4s, v1.4s, v12.s[1] + fmla v21.4s, v3.4s, v12.s[1] ld1 {v2.4s}, [pA_1] // for next round add pA_1, pA_1, #16 - fmla v22.4s, v5.4s, v12.4s[1] - fmla v23.4s, v7.4s, v12.4s[1] + fmla v22.4s, v5.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] ld1 {v4.4s}, [pA_2] // for next round add pA_2, pA_2, #16 - fmla v24.4s, v1.4s, v12.4s[2] - fmla v25.4s, v3.4s, v12.4s[2] + fmla v24.4s, v1.4s, v12.s[2] + fmla v25.4s, v3.4s, v12.s[2] ld1 {v6.4s}, [pA_3] // for next round add pA_3, pA_3, #16 - fmla v26.4s, v5.4s, v12.4s[2] - fmla v27.4s, v7.4s, v12.4s[2] + fmla v26.4s, v5.4s, v12.s[2] + fmla v27.4s, v7.4s, v12.s[2] prfm PLDL1KEEP, [pA_2, #512] - fmla v28.4s, v1.4s, v12.4s[3] - fmla v29.4s, v3.4s, v12.4s[3] + fmla v28.4s, v1.4s, v12.s[3] + fmla v29.4s, v3.4s, v12.s[3] prfm PLDL1KEEP, [pA_3, #512] - fmla v30.4s, v5.4s, v12.4s[3] - fmla v31.4s, v7.4s, v12.4s[3] + fmla v30.4s, v5.4s, v12.s[3] + fmla v31.4s, v7.4s, v12.s[3] prfm PLDL1KEEP, [pB, #512] .endm .macro KERNEL16x4_M1 - fmla v16.4s, v0.4s, v8.4s[0] - fmla v17.4s, v2.4s, v8.4s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v2.4s, v8.s[0] ld1 {v12.4s}, [pB] // for next round add pB, pB, #16 - fmla v18.4s, v4.4s, v8.4s[0] - fmla v19.4s, v6.4s, v8.4s[0] + fmla v18.4s, v4.4s, v8.s[0] + fmla v19.4s, v6.4s, v8.s[0] ld1 {v1.4s}, [pA_0] // for next round add pA_0, pA_0, #16 - fmla v20.4s, v0.4s, v8.4s[1] - fmla v21.4s, v2.4s, v8.4s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v2.4s, v8.s[1] ld1 {v3.4s}, [pA_1] // for next round add pA_1, pA_1, #16 - fmla v22.4s, v4.4s, v8.4s[1] - fmla v23.4s, v6.4s, v8.4s[1] + fmla v22.4s, v4.4s, v8.s[1] + fmla v23.4s, v6.4s, v8.s[1] ld1 {v5.4s}, [pA_2] // for next round add 
pA_2, pA_2, #16 - fmla v24.4s, v0.4s, v8.4s[2] - fmla v25.4s, v2.4s, v8.4s[2] + fmla v24.4s, v0.4s, v8.s[2] + fmla v25.4s, v2.4s, v8.s[2] ld1 {v7.4s}, [pA_3] // for next round add pA_3, pA_3, #16 - fmla v26.4s, v4.4s, v8.4s[2] - fmla v27.4s, v6.4s, v8.4s[2] + fmla v26.4s, v4.4s, v8.s[2] + fmla v27.4s, v6.4s, v8.s[2] prfm PLDL1KEEP, [pA_0, #512] - fmla v28.4s, v0.4s, v8.4s[3] - fmla v29.4s, v2.4s, v8.4s[3] + fmla v28.4s, v0.4s, v8.s[3] + fmla v29.4s, v2.4s, v8.s[3] prfm PLDL1KEEP, [pA_1, #512] - fmla v30.4s, v4.4s, v8.4s[3] - fmla v31.4s, v6.4s, v8.4s[3] + fmla v30.4s, v4.4s, v8.s[3] + fmla v31.4s, v6.4s, v8.s[3] .endm .macro KERNEL16x4_E - fmla v16.4s, v1.4s, v12.4s[0] - fmla v17.4s, v3.4s, v12.4s[0] - fmla v18.4s, v5.4s, v12.4s[0] - fmla v19.4s, v7.4s, v12.4s[0] - fmla v20.4s, v1.4s, v12.4s[1] - fmla v21.4s, v3.4s, v12.4s[1] - fmla v22.4s, v5.4s, v12.4s[1] - fmla v23.4s, v7.4s, v12.4s[1] - fmla v24.4s, v1.4s, v12.4s[2] - fmla v25.4s, v3.4s, v12.4s[2] - fmla v26.4s, v5.4s, v12.4s[2] - fmla v27.4s, v7.4s, v12.4s[2] - fmla v28.4s, v1.4s, v12.4s[3] - fmla v29.4s, v3.4s, v12.4s[3] - fmla v30.4s, v5.4s, v12.4s[3] - fmla v31.4s, v7.4s, v12.4s[3] + fmla v16.4s, v1.4s, v12.s[0] + fmla v17.4s, v3.4s, v12.s[0] + fmla v18.4s, v5.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] + fmla v20.4s, v1.4s, v12.s[1] + fmla v21.4s, v3.4s, v12.s[1] + fmla v22.4s, v5.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] + fmla v24.4s, v1.4s, v12.s[2] + fmla v25.4s, v3.4s, v12.s[2] + fmla v26.4s, v5.4s, v12.s[2] + fmla v27.4s, v7.4s, v12.s[2] + fmla v28.4s, v1.4s, v12.s[3] + fmla v29.4s, v3.4s, v12.s[3] + fmla v30.4s, v5.4s, v12.s[3] + fmla v31.4s, v7.4s, v12.s[3] .endm .macro KERNEL16x4_SUB @@ -359,34 +359,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.4s}, [pA_0] add pA_0, pA_0, #16 - fmla v16.4s, v0.4s, v8.4s[0] - fmla v20.4s, v0.4s, v8.4s[1] - fmla v24.4s, v0.4s, v8.4s[2] - fmla v28.4s, v0.4s, v8.4s[3] + fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v24.4s, v0.4s, v8.s[2] + fmla v28.4s, v0.4s, v8.s[3] ld1 {v2.4s}, [pA_1] add pA_1, pA_1, #16 - fmla v17.4s, v2.4s, v8.4s[0] - fmla v21.4s, v2.4s, v8.4s[1] - fmla v25.4s, v2.4s, v8.4s[2] - fmla v29.4s, v2.4s, v8.4s[3] + fmla v17.4s, v2.4s, v8.s[0] + fmla v21.4s, v2.4s, v8.s[1] + fmla v25.4s, v2.4s, v8.s[2] + fmla v29.4s, v2.4s, v8.s[3] ld1 {v4.4s}, [pA_2] add pA_2, pA_2, #16 - fmla v18.4s, v4.4s, v8.4s[0] - fmla v22.4s, v4.4s, v8.4s[1] - fmla v26.4s, v4.4s, v8.4s[2] - fmla v30.4s, v4.4s, v8.4s[3] + fmla v18.4s, v4.4s, v8.s[0] + fmla v22.4s, v4.4s, v8.s[1] + fmla v26.4s, v4.4s, v8.s[2] + fmla v30.4s, v4.4s, v8.s[3] ld1 {v6.4s}, [pA_3] add pA_3, pA_3, #16 - fmla v19.4s, v6.4s, v8.4s[0] - fmla v23.4s, v6.4s, v8.4s[1] - fmla v27.4s, v6.4s, v8.4s[2] - fmla v31.4s, v6.4s, v8.4s[3] + fmla v19.4s, v6.4s, v8.s[0] + fmla v23.4s, v6.4s, v8.s[1] + fmla v27.4s, v6.4s, v8.s[2] + fmla v31.4s, v6.4s, v8.s[3] .endm .macro SAVE16x4 @@ -456,28 +456,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s, v1.2s}, [pA_0] add pA_0, pA_0, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v2.2s, v3.2s}, [pA_1] add pA_1, pA_1, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] - fmla v18.2s, v2.2s, v8.2s[0] - fmla v31.2s, v3.2s, v9.2s[1] - fmla v22.2s, v2.2s, v8.2s[1] - fmla v27.2s, v3.2s, v9.2s[0] + fmla v18.2s, v2.2s, v8.s[0] + fmla v31.2s, v3.2s, v9.s[1] + fmla v22.2s, v2.2s, v8.s[1] + fmla v27.2s, v3.2s, v9.s[0] - fmla v26.2s, v2.2s, v9.2s[0] - fmla v23.2s, v3.2s, v8.2s[1] - fmla v30.2s, v2.2s, v9.2s[1] - fmla v19.2s, v3.2s, v8.2s[0] + fmla v26.2s, v2.2s, v9.s[0] + fmla v23.2s, v3.2s, v8.s[1] + fmla v30.2s, v2.2s, v9.s[1] + fmla v19.2s, v3.2s, v8.s[0] .endm .macro SAVE8x4 @@ -556,17 +556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA_0] add pA_0, pA_0, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -614,10 +614,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA_0] add pA_0, pA_0, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -700,10 +700,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA_0] add pA_0, pA_0, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -736,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA_0] add pA_0, pA_0, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -767,7 +767,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA_0] add pA_0, pA_0, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -796,8 +796,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA_0] add pA_0 , pA_0, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -825,7 +825,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s}, [pA_0] add pA_0 , pA_0, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/sgemm_kernel_8x8.S b/kernel/arm64/sgemm_kernel_8x8.S index ac690e4d4..bd47bed31 100644 --- a/kernel/arm64/sgemm_kernel_8x8.S +++ b/kernel/arm64/sgemm_kernel_8x8.S @@ -157,22 +157,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v4.4s[0] - fmul v17.4s, v1.4s, v4.4s[0] - fmul v18.4s, v0.4s, v4.4s[1] - fmul v19.4s, v1.4s, v4.4s[1] - fmul v20.4s, v0.4s, v4.4s[2] - fmul v21.4s, v1.4s, v4.4s[2] - fmul v22.4s, v0.4s, v4.4s[3] - fmul v23.4s, v1.4s, v4.4s[3] - fmul v24.4s, v0.4s, v5.4s[0] - fmul v25.4s, v1.4s, v5.4s[0] - fmul v26.4s, v0.4s, v5.4s[1] - fmul v27.4s, v1.4s, v5.4s[1] - fmul v28.4s, v0.4s, v5.4s[2] - fmul v29.4s, v1.4s, v5.4s[2] - fmul v30.4s, v0.4s, v5.4s[3] - fmul v31.4s, v1.4s, v5.4s[3] + fmul v16.4s, v0.4s, v4.s[0] + fmul v17.4s, v1.4s, v4.s[0] + fmul v18.4s, v0.4s, v4.s[1] + fmul v19.4s, v1.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + fmul v21.4s, v1.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v23.4s, v1.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v25.4s, v1.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v27.4s, v1.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v29.4s, v1.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] + fmul v31.4s, v1.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -185,22 +185,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x8_M1 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v17.4s, v1.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v19.4s, v1.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v21.4s, v1.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v23.4s, v1.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v25.4s, v1.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v27.4s, v1.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v29.4s, v1.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] - fmla v31.4s, v1.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -213,22 +213,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x8_M2 - fmla v16.4s, v2.4s, v6.4s[0] - fmla v17.4s, v3.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v19.4s, v3.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v21.4s, v3.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v23.4s, v3.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v25.4s, v3.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v27.4s, v3.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v29.4s, v3.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] - fmla v31.4s, v3.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] ld1 {v4.4s}, [pB] add pB, pB, #16 @@ -241,22 +241,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x8_E - fmla v16.4s, v2.4s, v6.4s[0] - fmla v17.4s, v3.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v19.4s, v3.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v21.4s, v3.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v23.4s, v3.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v25.4s, v3.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v27.4s, v3.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v29.4s, v3.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] - fmla v31.4s, v3.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] .endm .macro KERNEL8x8_SUB @@ -269,22 +269,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v17.4s, v1.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v19.4s, v1.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v21.4s, v1.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v23.4s, v1.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v25.4s, v1.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v27.4s, v1.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v29.4s, v1.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] - fmla v31.4s, v1.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] .endm .macro SAVE8x8 @@ -367,14 +367,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v4.4s[0] - fmul v18.4s, v0.4s, v4.4s[1] - fmul v20.4s, v0.4s, v4.4s[2] - fmul v22.4s, v0.4s, v4.4s[3] - fmul v24.4s, v0.4s, v5.4s[0] - fmul v26.4s, v0.4s, v5.4s[1] - fmul v28.4s, v0.4s, v5.4s[2] - fmul v30.4s, v0.4s, v5.4s[3] + fmul v16.4s, v0.4s, v4.s[0] + fmul v18.4s, v0.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -385,14 +385,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M1 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -403,14 +403,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M2 - fmla v16.4s, v2.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] ld1 {v4.4s}, [pB] add pB, pB, #16 @@ -421,14 +421,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_E - fmla v16.4s, v2.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] .endm .macro KERNEL4x8_SUB @@ -439,14 +439,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] .endm .macro SAVE4x8 @@ -520,14 +520,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v4.4s[0] - fmla v18.2s, v0.2s, v4.4s[1] - fmla v20.2s, v0.2s, v4.4s[2] - fmla v22.2s, v0.2s, v4.4s[3] - fmla v24.2s, v0.2s, v5.4s[0] - fmla v26.2s, v0.2s, v5.4s[1] - fmla v28.2s, v0.2s, v5.4s[2] - fmla v30.2s, v0.2s, v5.4s[3] + fmla v16.2s, v0.2s, v4.s[0] + fmla v18.2s, v0.2s, v4.s[1] + fmla v20.2s, v0.2s, v4.s[2] + fmla v22.2s, v0.2s, v4.s[3] + fmla v24.2s, v0.2s, v5.s[0] + fmla v26.2s, v0.2s, v5.s[1] + fmla v28.2s, v0.2s, v5.s[2] + fmla v30.2s, v0.2s, v5.s[3] .endm .macro SAVE2x8 @@ -601,14 +601,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0, [pA] add pA, pA, #4 - fmla s16, s0, v4.4s[0] - fmla s18, s0, v4.4s[1] - fmla s20, s0, v4.4s[2] - fmla s22, s0, v4.4s[3] - fmla s24, s0, v5.4s[0] - fmla s26, s0, v5.4s[1] - fmla s28, s0, v5.4s[2] - fmla s30, s0, v5.4s[3] + fmla s16, s0, v4.s[0] + fmla s18, s0, v4.s[1] + fmla s20, s0, v4.s[2] + fmla s22, s0, v4.s[3] + fmla s24, s0, v5.s[0] + fmla s26, s0, v5.s[1] + fmla s28, s0, v5.s[2] + fmla s30, s0, v5.s[3] .endm .macro SAVE1x8 @@ -682,14 +682,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -700,14 +700,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -718,14 +718,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -736,14 +736,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] .endm .macro KERNEL8x4_SUB @@ -754,14 +754,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] .endm .macro SAVE8x4 @@ -814,17 +814,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmul v16.2s, v0.2s, v8.2s[0] - fmul v29.2s, v1.2s, v9.2s[1] + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] - fmul v20.2s, v0.2s, v8.2s[1] - fmul v25.2s, v1.2s, v9.2s[0] + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] - fmul v24.2s, v0.2s, v9.2s[0] - fmul v21.2s, v1.2s, v8.2s[1] + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] - fmul v28.2s, v0.2s, v9.2s[1] - fmul v17.2s, v1.2s, v8.2s[0] + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -833,61 +833,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB @@ -896,17 +896,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -951,10 +951,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -1034,11 +1034,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] .endm .macro SAVE8x2 @@ -1074,10 +1074,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -1109,8 +1109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -1139,7 +1139,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA] add pA, pA, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -1169,8 +1169,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] .endm .macro SAVE8x1 @@ -1196,8 +1196,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -1222,7 +1222,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA , pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S old mode 100755 new mode 100644 index b99760a03..28b321651 --- a/kernel/arm64/strmm_kernel_16x4.S +++ b/kernel/arm64/strmm_kernel_16x4.S @@ -161,25 +161,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v3.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v18.4s, v2.4s, v8.2s[0] - fmul v19.4s, v3.4s, v8.2s[0] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v18.4s, v2.4s, v8.s[0] + fmul v19.4s, v3.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v22.4s, v2.4s, v8.2s[1] - fmul v23.4s, v3.4s, v8.2s[1] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v22.4s, v2.4s, v8.s[1] + fmul v23.4s, v3.4s, v8.s[1] - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v26.4s, v2.4s, v9.2s[0] - fmul v27.4s, v3.4s, v9.2s[0] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v26.4s, v2.4s, v9.s[0] + fmul v27.4s, v3.4s, v9.s[0] - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] - fmul v30.4s, v2.4s, v9.2s[1] - fmul v31.4s, v3.4s, v9.2s[1] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] + fmul v30.4s, v2.4s, v9.s[1] + fmul v31.4s, v3.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -194,25 +194,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL16x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v26.4s, v2.4s, v9.2s[0] - fmla v27.4s, v3.4s, v9.2s[0] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v26.4s, v2.4s, v9.s[0] + fmla v27.4s, v3.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] - fmla v30.4s, v2.4s, v9.2s[1] - fmla v31.4s, v3.4s, v9.2s[1] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] + fmla v30.4s, v2.4s, v9.s[1] + fmla v31.4s, v3.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -227,25 +227,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL16x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v18.4s, v6.4s, v12.2s[0] - fmla v19.4s, v7.4s, v12.2s[0] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v18.4s, v6.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v22.4s, v6.4s, v12.2s[1] - fmla v23.4s, v7.4s, v12.2s[1] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v22.4s, v6.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v26.4s, v6.4s, v13.2s[0] - fmla v27.4s, v7.4s, v13.2s[0] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v26.4s, v6.4s, v13.s[0] + fmla v27.4s, v7.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] - fmla v30.4s, v6.4s, v13.2s[1] - fmla v31.4s, v7.4s, v13.2s[1] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] + fmla v30.4s, v6.4s, v13.s[1] + fmla v31.4s, v7.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -260,25 +260,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL16x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v18.4s, v6.4s, v12.2s[0] - fmla v19.4s, v7.4s, v12.2s[0] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v18.4s, v6.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v22.4s, v6.4s, v12.2s[1] - fmla v23.4s, v7.4s, v12.2s[1] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v22.4s, v6.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v26.4s, v6.4s, v13.2s[0] - fmla v27.4s, v7.4s, v13.2s[0] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v26.4s, v6.4s, v13.s[0] + fmla v27.4s, v7.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] - fmla v30.4s, v6.4s, v13.2s[1] - fmla v31.4s, v7.4s, v13.2s[1] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] + fmla v30.4s, v6.4s, v13.s[1] + fmla v31.4s, v7.4s, v13.s[1] .endm .macro KERNEL16x4_SUB @@ -293,25 +293,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v26.4s, v2.4s, v9.2s[0] - fmla v27.4s, v3.4s, v9.2s[0] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v26.4s, v2.4s, v9.s[0] + fmla v27.4s, v3.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] - fmla v30.4s, v2.4s, v9.2s[1] - fmla v31.4s, v3.4s, v9.2s[1] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] + fmla v30.4s, v2.4s, v9.s[1] + fmla v31.4s, v3.4s, v9.s[1] .endm .macro SAVE16x4 @@ -369,14 +369,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -387,14 +387,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -405,14 +405,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -423,14 +423,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] .endm .macro KERNEL8x4_SUB @@ -441,14 +441,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] .endm .macro SAVE8x4 @@ -496,17 +496,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmul v16.2s, v0.2s, v8.2s[0] - fmul v29.2s, v1.2s, v9.2s[1] + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] - fmul v20.2s, v0.2s, v8.2s[1] - fmul v25.2s, v1.2s, v9.2s[0] + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] - fmul v24.2s, v0.2s, v9.2s[0] - fmul v21.2s, v1.2s, v8.2s[1] + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] - fmul v28.2s, v0.2s, v9.2s[1] - fmul v17.2s, v1.2s, v8.2s[0] + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -515,61 +515,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB @@ -578,17 +578,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -633,10 +633,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -718,15 +718,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] .endm .macro SAVE16x2 @@ -764,11 +764,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] .endm .macro SAVE8x2 @@ -802,10 +802,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -837,8 +837,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -866,7 +866,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA] add pA, pA, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -901,10 +901,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] .endm .macro SAVE16x1 @@ -934,8 +934,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] .endm .macro SAVE8x1 @@ -961,8 +961,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -987,7 +987,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA , pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/strmm_kernel_4x4.S b/kernel/arm64/strmm_kernel_4x4.S index 674e200d8..eeb3e6e72 100644 --- a/kernel/arm64/strmm_kernel_4x4.S +++ b/kernel/arm64/strmm_kernel_4x4.S @@ -147,17 +147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmul v16.2s, v0.2s, v8.2s[0] - fmul v29.2s, v1.2s, v9.2s[1] + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] - fmul v20.2s, v0.2s, v8.2s[1] - fmul v25.2s, v1.2s, v9.2s[0] + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] - fmul v24.2s, v0.2s, v9.2s[0] - fmul v21.2s, v1.2s, v8.2s[1] + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] - fmul v28.2s, v0.2s, v9.2s[1] - fmul v17.2s, v1.2s, v8.2s[0] + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -166,61 +166,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB @@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -280,10 +280,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -353,10 +353,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -386,8 +386,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -414,7 +414,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA] add pA, pA, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -440,8 +440,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -468,7 +468,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA , pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/strmm_kernel_8x8.S b/kernel/arm64/strmm_kernel_8x8.S old mode 100755 new mode 100644 index 98b912934..843f0c890 --- a/kernel/arm64/strmm_kernel_8x8.S +++ b/kernel/arm64/strmm_kernel_8x8.S @@ -159,22 +159,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v4.4s[0] - fmul v17.4s, v1.4s, v4.4s[0] - fmul v18.4s, v0.4s, v4.4s[1] - fmul v19.4s, v1.4s, v4.4s[1] - fmul v20.4s, v0.4s, v4.4s[2] - fmul v21.4s, v1.4s, v4.4s[2] - fmul v22.4s, v0.4s, v4.4s[3] - fmul v23.4s, v1.4s, v4.4s[3] - fmul v24.4s, v0.4s, v5.4s[0] - fmul v25.4s, v1.4s, v5.4s[0] - fmul v26.4s, v0.4s, v5.4s[1] - fmul v27.4s, v1.4s, v5.4s[1] - fmul v28.4s, v0.4s, v5.4s[2] - fmul v29.4s, v1.4s, v5.4s[2] - fmul v30.4s, v0.4s, v5.4s[3] - fmul v31.4s, v1.4s, v5.4s[3] + fmul v16.4s, v0.4s, v4.s[0] + fmul v17.4s, v1.4s, v4.s[0] + fmul v18.4s, v0.4s, v4.s[1] + fmul v19.4s, v1.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + fmul v21.4s, v1.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v23.4s, v1.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v25.4s, v1.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v27.4s, v1.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v29.4s, v1.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] + fmul v31.4s, v1.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -187,22 +187,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x8_M1 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v17.4s, v1.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v19.4s, v1.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v21.4s, v1.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v23.4s, v1.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v25.4s, v1.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v27.4s, v1.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v29.4s, v1.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] - fmla v31.4s, v1.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -215,22 +215,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x8_M2 - fmla v16.4s, v2.4s, v6.4s[0] - fmla v17.4s, v3.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v19.4s, v3.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v21.4s, v3.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v23.4s, v3.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v25.4s, v3.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v27.4s, v3.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v29.4s, v3.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] - fmla v31.4s, v3.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] ld1 {v4.4s}, [pB] add pB, pB, #16 @@ -243,22 +243,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x8_E - fmla v16.4s, v2.4s, v6.4s[0] - fmla v17.4s, v3.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v19.4s, v3.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v21.4s, v3.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v23.4s, v3.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v25.4s, v3.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v27.4s, v3.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v29.4s, v3.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] - fmla v31.4s, v3.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] .endm .macro KERNEL8x8_SUB @@ -271,22 +271,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v17.4s, v1.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v19.4s, v1.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v21.4s, v1.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v23.4s, v1.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v25.4s, v1.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v27.4s, v1.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v29.4s, v1.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] - fmla v31.4s, v1.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] .endm .macro SAVE8x8 @@ -361,14 +361,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v4.4s[0] - fmul v18.4s, v0.4s, v4.4s[1] - fmul v20.4s, v0.4s, v4.4s[2] - fmul v22.4s, v0.4s, v4.4s[3] - fmul v24.4s, v0.4s, v5.4s[0] - fmul v26.4s, v0.4s, v5.4s[1] - fmul v28.4s, v0.4s, v5.4s[2] - fmul v30.4s, v0.4s, v5.4s[3] + fmul v16.4s, v0.4s, v4.s[0] + fmul v18.4s, v0.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -379,14 +379,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M1 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -397,14 +397,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M2 - fmla v16.4s, v2.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] ld1 {v4.4s}, [pB] add pB, pB, #16 @@ -415,14 +415,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x8_E - fmla v16.4s, v2.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] .endm .macro KERNEL4x8_SUB @@ -433,14 +433,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] .endm .macro SAVE4x8 @@ -514,14 +514,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v4.4s[0] - fmla v18.2s, v0.2s, v4.4s[1] - fmla v20.2s, v0.2s, v4.4s[2] - fmla v22.2s, v0.2s, v4.4s[3] - fmla v24.2s, v0.2s, v5.4s[0] - fmla v26.2s, v0.2s, v5.4s[1] - fmla v28.2s, v0.2s, v5.4s[2] - fmla v30.2s, v0.2s, v5.4s[3] + fmla v16.2s, v0.2s, v4.s[0] + fmla v18.2s, v0.2s, v4.s[1] + fmla v20.2s, v0.2s, v4.s[2] + fmla v22.2s, v0.2s, v4.s[3] + fmla v24.2s, v0.2s, v5.s[0] + fmla v26.2s, v0.2s, v5.s[1] + fmla v28.2s, v0.2s, v5.s[2] + fmla v30.2s, v0.2s, v5.s[3] .endm .macro SAVE2x8 @@ -595,14 +595,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0, [pA] add pA, pA, #4 - fmla s16, s0, v4.4s[0] - fmla s18, s0, v4.4s[1] - fmla s20, s0, v4.4s[2] - fmla s22, s0, v4.4s[3] - fmla s24, s0, v5.4s[0] - fmla s26, s0, v5.4s[1] - fmla s28, s0, v5.4s[2] - fmla s30, s0, v5.4s[3] + fmla s16, s0, v4.s[0] + fmla s18, s0, v4.s[1] + fmla s20, s0, v4.s[2] + fmla s22, s0, v4.s[3] + fmla s24, s0, v5.s[0] + fmla s26, s0, v5.s[1] + fmla s28, s0, v5.s[2] + fmla s30, s0, v5.s[3] .endm .macro SAVE1x8 @@ -676,14 +676,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -694,14 +694,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -712,14 +712,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -730,14 +730,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] .endm .macro KERNEL8x4_SUB @@ -748,14 +748,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] .endm .macro SAVE8x4 @@ -808,17 +808,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmul v16.2s, v0.2s, v8.2s[0] - fmul v29.2s, v1.2s, v9.2s[1] + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] - fmul v20.2s, v0.2s, v8.2s[1] - fmul v25.2s, v1.2s, v9.2s[0] + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] - fmul v24.2s, v0.2s, v9.2s[0] - fmul v21.2s, v1.2s, v8.2s[1] + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] - fmul v28.2s, v0.2s, v9.2s[1] - fmul v17.2s, v1.2s, v8.2s[0] + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -827,61 +827,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB @@ -890,17 +890,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -945,10 +945,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -1028,11 +1028,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] .endm .macro SAVE8x2 @@ -1068,10 +1068,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -1103,8 +1103,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -1133,7 +1133,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA] add pA, pA, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -1163,8 +1163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] .endm .macro SAVE8x1 @@ -1190,8 +1190,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -1216,7 +1216,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA , pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/zgemm_kernel_4x4.S b/kernel/arm64/zgemm_kernel_4x4.S index 28ce3de40..1cb695e56 100644 --- a/kernel/arm64/zgemm_kernel_4x4.S +++ b/kernel/arm64/zgemm_kernel_4x4.S @@ -182,93 +182,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] + fmul v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.2d, v0.2d, v9.2d[0] + fmls v17.2d, v0.2d, v9.d[0] #else - fmul v17.2d, v0.2d, v9.2d[0] + fmul v17.2d, v0.2d, v9.d[0] #endif - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - fmul v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] + fmul v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b - fmls v19.2d, v2.2d, v9.2d[0] + fmls v19.2d, v2.2d, v9.d[0] #else - fmul v19.2d, v2.2d, v9.2d[0] + fmul v19.2d, v2.2d, v9.d[0] #endif - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_ir v19.2d, v3.2d, v8.d[0] - fmul v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] + fmul v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.2d, v0.2d, v9.2d[1] + fmls v21.2d, v0.2d, v9.d[1] #else - fmul v21.2d, v0.2d, v9.2d[1] + fmul v21.2d, v0.2d, v9.d[1] #endif - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - fmul v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] + fmul v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b - fmls v23.2d, v2.2d, v9.2d[1] + fmls v23.2d, v2.2d, v9.d[1] #else - fmul v23.2d, v2.2d, v9.2d[1] + fmul v23.2d, v2.2d, v9.d[1] #endif - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_ir v23.2d, v3.2d, v8.d[1] - fmul v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] + fmul v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.2d, v0.2d, v11.2d[0] + fmls v25.2d, v0.2d, v11.d[0] #else - fmul v25.2d, v0.2d, v11.2d[0] + fmul v25.2d, v0.2d, v11.d[0] #endif - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_ir v25.2d, v1.2d, v10.d[0] - fmul v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] + fmul v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.2d, v2.2d, v11.2d[0] + fmls v27.2d, v2.2d, v11.d[0] #else - fmul v27.2d, v2.2d, v11.2d[0] + fmul v27.2d, v2.2d, v11.d[0] #endif - OP_ir v27.2d, v3.2d, v10.2d[0] + OP_ir v27.2d, v3.2d, v10.d[0] - fmul v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] + fmul v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.2d, v0.2d, v11.2d[1] + fmls v29.2d, v0.2d, v11.d[1] #else - fmul v29.2d, v0.2d, v11.2d[1] + fmul v29.2d, v0.2d, v11.d[1] #endif - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_ir v29.2d, v1.2d, v10.d[1] - fmul v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] + fmul v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] #if defined(NR) || defined(NC) 
|| defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.2d, v2.2d, v11.2d[1] + fmls v31.2d, v2.2d, v11.d[1] #else - fmul v31.2d, v2.2d, v11.2d[1] + fmul v31.2d, v2.2d, v11.d[1] #endif - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_ir v31.2d, v3.2d, v10.d[1] ld2 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -281,161 +281,161 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] ld2 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] ld2 {v14.2d, v15.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] ld2 {v4.2d, v5.2d} , [pA] // For next round add pA, pA, #32 - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] ld2 {v6.2d, v7.2d} , [pA] // For next round add pA, pA, #32 - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #512] - OP_rr v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] - OP_ri v27.2d, v2.2d, v11.2d[0] - OP_ir v27.2d, v3.2d, v10.2d[0] + OP_rr v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] + OP_ri v27.2d, v2.2d, v11.d[0] + OP_ir v27.2d, v3.2d, v10.d[0] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] - OP_rr v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] - OP_ri v31.2d, v2.2d, v11.2d[1] - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_rr v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] + OP_ri v31.2d, v2.2d, v11.d[1] + OP_ir v31.2d, v3.2d, v10.d[1] .endm .macro KERNEL4x4_M2 - OP_rr v16.2d, v4.2d, v12.2d[0] - OP_ii v16.2d, v5.2d, v13.2d[0] - OP_ri v17.2d, v4.2d, v13.2d[0] - OP_ir v17.2d, v5.2d, v12.2d[0] + OP_rr v16.2d, v4.2d, v12.d[0] + OP_ii v16.2d, v5.2d, v13.d[0] + OP_ri v17.2d, v4.2d, v13.d[0] + OP_ir v17.2d, v5.2d, v12.d[0] ld2 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v18.2d, v6.2d, v12.2d[0] - OP_ii v18.2d, v7.2d, v13.2d[0] - OP_ri v19.2d, v6.2d, v13.2d[0] - OP_ir v19.2d, v7.2d, v12.2d[0] + OP_rr v18.2d, v6.2d, v12.d[0] + OP_ii v18.2d, v7.2d, v13.d[0] + OP_ri v19.2d, v6.2d, v13.d[0] + OP_ir v19.2d, v7.2d, v12.d[0] ld2 {v10.2d, v11.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v20.2d, v4.2d, v12.2d[1] - OP_ii v20.2d, 
v5.2d, v13.2d[1] - OP_ri v21.2d, v4.2d, v13.2d[1] - OP_ir v21.2d, v5.2d, v12.2d[1] + OP_rr v20.2d, v4.2d, v12.d[1] + OP_ii v20.2d, v5.2d, v13.d[1] + OP_ri v21.2d, v4.2d, v13.d[1] + OP_ir v21.2d, v5.2d, v12.d[1] ld2 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 - OP_rr v22.2d, v6.2d, v12.2d[1] - OP_ii v22.2d, v7.2d, v13.2d[1] - OP_ri v23.2d, v6.2d, v13.2d[1] - OP_ir v23.2d, v7.2d, v12.2d[1] + OP_rr v22.2d, v6.2d, v12.d[1] + OP_ii v22.2d, v7.2d, v13.d[1] + OP_ri v23.2d, v6.2d, v13.d[1] + OP_ir v23.2d, v7.2d, v12.d[1] ld2 {v2.2d, v3.2d}, [pA] // For next round add pA, pA, #32 - OP_rr v24.2d, v4.2d, v14.2d[0] - OP_ii v24.2d, v5.2d, v15.2d[0] - OP_ri v25.2d, v4.2d, v15.2d[0] - OP_ir v25.2d, v5.2d, v14.2d[0] + OP_rr v24.2d, v4.2d, v14.d[0] + OP_ii v24.2d, v5.2d, v15.d[0] + OP_ri v25.2d, v4.2d, v15.d[0] + OP_ir v25.2d, v5.2d, v14.d[0] prfm PLDL1KEEP, [pA, #512] - OP_rr v26.2d, v6.2d, v14.2d[0] - OP_ii v26.2d, v7.2d, v15.2d[0] - OP_ri v27.2d, v6.2d, v15.2d[0] - OP_ir v27.2d, v7.2d, v14.2d[0] + OP_rr v26.2d, v6.2d, v14.d[0] + OP_ii v26.2d, v7.2d, v15.d[0] + OP_ri v27.2d, v6.2d, v15.d[0] + OP_ir v27.2d, v7.2d, v14.d[0] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.2d, v4.2d, v14.2d[1] - OP_ii v28.2d, v5.2d, v15.2d[1] - OP_ri v29.2d, v4.2d, v15.2d[1] - OP_ir v29.2d, v5.2d, v14.2d[1] + OP_rr v28.2d, v4.2d, v14.d[1] + OP_ii v28.2d, v5.2d, v15.d[1] + OP_ri v29.2d, v4.2d, v15.d[1] + OP_ir v29.2d, v5.2d, v14.d[1] - OP_rr v30.2d, v6.2d, v14.2d[1] - OP_ii v30.2d, v7.2d, v15.2d[1] - OP_ri v31.2d, v6.2d, v15.2d[1] - OP_ir v31.2d, v7.2d, v14.2d[1] + OP_rr v30.2d, v6.2d, v14.d[1] + OP_ii v30.2d, v7.2d, v15.d[1] + OP_ri v31.2d, v6.2d, v15.d[1] + OP_ir v31.2d, v7.2d, v14.d[1] .endm .macro KERNEL4x4_E - OP_rr v16.2d, v4.2d, v12.2d[0] - OP_ii v16.2d, v5.2d, v13.2d[0] - OP_ri v17.2d, v4.2d, v13.2d[0] - OP_ir v17.2d, v5.2d, v12.2d[0] + OP_rr v16.2d, v4.2d, v12.d[0] + OP_ii v16.2d, v5.2d, v13.d[0] + OP_ri v17.2d, v4.2d, v13.d[0] + OP_ir v17.2d, v5.2d, v12.d[0] - OP_rr v18.2d, v6.2d, v12.2d[0] - OP_ii v18.2d, v7.2d, v13.2d[0] - OP_ri v19.2d, v6.2d, v13.2d[0] - OP_ir v19.2d, v7.2d, v12.2d[0] + OP_rr v18.2d, v6.2d, v12.d[0] + OP_ii v18.2d, v7.2d, v13.d[0] + OP_ri v19.2d, v6.2d, v13.d[0] + OP_ir v19.2d, v7.2d, v12.d[0] - OP_rr v20.2d, v4.2d, v12.2d[1] - OP_ii v20.2d, v5.2d, v13.2d[1] - OP_ri v21.2d, v4.2d, v13.2d[1] - OP_ir v21.2d, v5.2d, v12.2d[1] + OP_rr v20.2d, v4.2d, v12.d[1] + OP_ii v20.2d, v5.2d, v13.d[1] + OP_ri v21.2d, v4.2d, v13.d[1] + OP_ir v21.2d, v5.2d, v12.d[1] - OP_rr v22.2d, v6.2d, v12.2d[1] - OP_ii v22.2d, v7.2d, v13.2d[1] - OP_ri v23.2d, v6.2d, v13.2d[1] - OP_ir v23.2d, v7.2d, v12.2d[1] + OP_rr v22.2d, v6.2d, v12.d[1] + OP_ii v22.2d, v7.2d, v13.d[1] + OP_ri v23.2d, v6.2d, v13.d[1] + OP_ir v23.2d, v7.2d, v12.d[1] - OP_rr v24.2d, v4.2d, v14.2d[0] - OP_ii v24.2d, v5.2d, v15.2d[0] - OP_ri v25.2d, v4.2d, v15.2d[0] - OP_ir v25.2d, v5.2d, v14.2d[0] + OP_rr v24.2d, v4.2d, v14.d[0] + OP_ii v24.2d, v5.2d, v15.d[0] + OP_ri v25.2d, v4.2d, v15.d[0] + OP_ir v25.2d, v5.2d, v14.d[0] - OP_rr v26.2d, v6.2d, v14.2d[0] - OP_ii v26.2d, v7.2d, v15.2d[0] - OP_ri v27.2d, v6.2d, v15.2d[0] - OP_ir v27.2d, v7.2d, v14.2d[0] + OP_rr v26.2d, v6.2d, v14.d[0] + OP_ii v26.2d, v7.2d, v15.d[0] + OP_ri v27.2d, v6.2d, v15.d[0] + OP_ir v27.2d, v7.2d, v14.d[0] - OP_rr v28.2d, v4.2d, v14.2d[1] - OP_ii v28.2d, v5.2d, v15.2d[1] - OP_ri v29.2d, v4.2d, v15.2d[1] - OP_ir v29.2d, v5.2d, v14.2d[1] + OP_rr v28.2d, v4.2d, v14.d[1] + OP_ii v28.2d, v5.2d, v15.d[1] + OP_ri v29.2d, v4.2d, v15.d[1] + OP_ir v29.2d, v5.2d, v14.d[1] - OP_rr v30.2d, v6.2d, 
v14.2d[1] - OP_ii v30.2d, v7.2d, v15.2d[1] - OP_ri v31.2d, v6.2d, v15.2d[1] - OP_ir v31.2d, v7.2d, v14.2d[1] + OP_rr v30.2d, v6.2d, v14.d[1] + OP_ii v30.2d, v7.2d, v15.d[1] + OP_ri v31.2d, v6.2d, v15.d[1] + OP_ir v31.2d, v7.2d, v14.d[1] .endm .macro KERNEL4x4_SUB @@ -448,45 +448,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] - OP_rr v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] - OP_ri v27.2d, v2.2d, v11.2d[0] - OP_ir v27.2d, v3.2d, v10.2d[0] + OP_rr v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] + OP_ri v27.2d, v2.2d, v11.d[0] + OP_ir v27.2d, v3.2d, v10.d[0] - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] - OP_rr v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] - OP_ri v31.2d, v2.2d, v11.2d[1] - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_rr v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] + OP_ri v31.2d, v2.2d, v11.d[1] + OP_ir v31.2d, v3.2d, v10.d[1] .endm .macro SAVE4x4 @@ -582,25 +582,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] .endm .macro SAVE2x4 @@ -669,25 +669,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 - OP_rr d16, d0, v8.2d[0] - OP_ii d16, d1, v9.2d[0] - OP_ri d17, d0, v9.2d[0] - OP_ir d17, d1, v8.2d[0] + OP_rr d16, d0, v8.d[0] + OP_ii d16, d1, v9.d[0] + OP_ri d17, d0, v9.d[0] + OP_ir d17, d1, v8.d[0] - OP_rr d20, d0, v8.2d[1] - OP_ii d20, d1, v9.2d[1] - OP_ri d21, d0, v9.2d[1] - OP_ir d21, d1, v8.2d[1] + OP_rr d20, d0, v8.d[1] + OP_ii d20, d1, v9.d[1] + OP_ri d21, d0, v9.d[1] + OP_ir d21, d1, v8.d[1] - OP_rr d24, d0, v10.2d[0] - OP_ii d24, d1, v11.2d[0] - OP_ri d25, d0, v11.2d[0] - OP_ir d25, d1, v10.2d[0] + OP_rr d24, d0, v10.d[0] + OP_ii d24, d1, v11.d[0] + OP_ri d25, d0, v11.d[0] + OP_ir d25, d1, v10.d[0] - OP_rr d28, d0, v10.2d[1] - OP_ii d28, d1, v11.2d[1] - OP_ri d29, d0, v11.2d[1] - OP_ir d29, d1, v10.2d[1] + OP_rr d28, d0, v10.d[1] + OP_ii d28, d1, v11.d[1] + OP_ri d29, d0, v11.d[1] + OP_ir d29, d1, v10.d[1] .endm .macro SAVE1x4 @@ -756,25 +756,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] .endm .macro SAVE4x2 @@ -833,15 +833,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] .endm .macro SAVE2x2 @@ -886,15 +886,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 - OP_rr d16, d0, v8.2d[0] - OP_ii d16, d1, v9.2d[0] - OP_ri d17, d0, v9.2d[0] - OP_ir d17, d1, v8.2d[0] + OP_rr d16, d0, v8.d[0] + OP_ii d16, d1, v9.d[0] + OP_ri d17, d0, v9.d[0] + OP_ir d17, d1, v8.d[0] - OP_rr d20, d0, v8.2d[1] - OP_ii d20, d1, v9.2d[1] - OP_ri d21, d0, v9.2d[1] - OP_ir d21, d1, v8.2d[1] + OP_rr d20, d0, v8.d[1] + OP_ii d20, d1, v9.d[1] + OP_ri d21, d0, v9.d[1] + OP_ir d21, d1, v8.d[1] .endm .macro SAVE1x2 diff --git a/kernel/arm64/ztrmm_kernel_4x4.S b/kernel/arm64/ztrmm_kernel_4x4.S index 3ff8227e3..7945870d6 100644 --- a/kernel/arm64/ztrmm_kernel_4x4.S +++ b/kernel/arm64/ztrmm_kernel_4x4.S @@ -185,93 +185,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] + fmul v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.2d, v0.2d, v9.2d[0] + fmls v17.2d, v0.2d, v9.d[0] #else - fmul v17.2d, v0.2d, v9.2d[0] + fmul v17.2d, v0.2d, v9.d[0] #endif - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - fmul v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] + fmul v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b - fmls v19.2d, v2.2d, v9.2d[0] + fmls v19.2d, v2.2d, v9.d[0] #else - fmul v19.2d, v2.2d, v9.2d[0] + fmul v19.2d, v2.2d, v9.d[0] #endif - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_ir v19.2d, v3.2d, v8.d[0] - fmul v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] + fmul v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.2d, v0.2d, v9.2d[1] + fmls v21.2d, v0.2d, v9.d[1] #else - fmul v21.2d, v0.2d, v9.2d[1] + fmul v21.2d, v0.2d, v9.d[1] #endif - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - fmul v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] + fmul v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b - fmls v23.2d, v2.2d, v9.2d[1] + fmls v23.2d, v2.2d, v9.d[1] #else - fmul v23.2d, v2.2d, v9.2d[1] + fmul v23.2d, v2.2d, v9.d[1] #endif - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_ir v23.2d, v3.2d, v8.d[1] - fmul v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] + fmul v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ 
defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.2d, v0.2d, v11.2d[0] + fmls v25.2d, v0.2d, v11.d[0] #else - fmul v25.2d, v0.2d, v11.2d[0] + fmul v25.2d, v0.2d, v11.d[0] #endif - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_ir v25.2d, v1.2d, v10.d[0] - fmul v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] + fmul v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.2d, v2.2d, v11.2d[0] + fmls v27.2d, v2.2d, v11.d[0] #else - fmul v27.2d, v2.2d, v11.2d[0] + fmul v27.2d, v2.2d, v11.d[0] #endif - OP_ir v27.2d, v3.2d, v10.2d[0] + OP_ir v27.2d, v3.2d, v10.d[0] - fmul v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] + fmul v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.2d, v0.2d, v11.2d[1] + fmls v29.2d, v0.2d, v11.d[1] #else - fmul v29.2d, v0.2d, v11.2d[1] + fmul v29.2d, v0.2d, v11.d[1] #endif - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_ir v29.2d, v1.2d, v10.d[1] - fmul v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] + fmul v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.2d, v2.2d, v11.2d[1] + fmls v31.2d, v2.2d, v11.d[1] #else - fmul v31.2d, v2.2d, v11.2d[1] + fmul v31.2d, v2.2d, v11.d[1] #endif - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_ir v31.2d, v3.2d, v10.d[1] ld2 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -284,161 +284,161 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] ld2 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] ld2 {v14.2d, v15.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] ld2 {v4.2d, v5.2d} , [pA] // For next round add pA, pA, #32 - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] ld2 {v6.2d, v7.2d} , [pA] // For next round add pA, pA, #32 - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #512] - OP_rr v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] - OP_ri v27.2d, v2.2d, v11.2d[0] - OP_ir v27.2d, v3.2d, v10.2d[0] + OP_rr v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] + OP_ri v27.2d, v2.2d, v11.d[0] + OP_ir v27.2d, v3.2d, v10.d[0] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] - OP_rr v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] - OP_ri v31.2d, v2.2d, v11.2d[1] - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_rr v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] + OP_ri v31.2d, v2.2d, v11.d[1] + OP_ir v31.2d, v3.2d, v10.d[1] .endm .macro KERNEL4x4_M2 - OP_rr v16.2d, v4.2d, v12.2d[0] - OP_ii v16.2d, v5.2d, v13.2d[0] - OP_ri v17.2d, v4.2d, v13.2d[0] - OP_ir v17.2d, v5.2d, v12.2d[0] + OP_rr v16.2d, v4.2d, v12.d[0] + OP_ii v16.2d, v5.2d, v13.d[0] + OP_ri v17.2d, v4.2d, v13.d[0] + OP_ir v17.2d, v5.2d, v12.d[0] ld2 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v18.2d, v6.2d, v12.2d[0] - OP_ii v18.2d, v7.2d, v13.2d[0] - OP_ri v19.2d, v6.2d, v13.2d[0] - OP_ir v19.2d, v7.2d, v12.2d[0] + OP_rr v18.2d, v6.2d, v12.d[0] + OP_ii v18.2d, v7.2d, v13.d[0] + OP_ri v19.2d, v6.2d, v13.d[0] + OP_ir v19.2d, v7.2d, v12.d[0] ld2 {v10.2d, v11.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v20.2d, v4.2d, v12.2d[1] - OP_ii v20.2d, v5.2d, v13.2d[1] - OP_ri v21.2d, v4.2d, v13.2d[1] - OP_ir v21.2d, v5.2d, v12.2d[1] + OP_rr v20.2d, v4.2d, v12.d[1] + OP_ii v20.2d, v5.2d, v13.d[1] + OP_ri v21.2d, v4.2d, v13.d[1] + OP_ir v21.2d, v5.2d, v12.d[1] ld2 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 - OP_rr v22.2d, v6.2d, v12.2d[1] - OP_ii v22.2d, v7.2d, v13.2d[1] - OP_ri v23.2d, v6.2d, v13.2d[1] - OP_ir v23.2d, v7.2d, v12.2d[1] + OP_rr v22.2d, v6.2d, v12.d[1] + OP_ii v22.2d, v7.2d, v13.d[1] + 
OP_ri v23.2d, v6.2d, v13.d[1] + OP_ir v23.2d, v7.2d, v12.d[1] ld2 {v2.2d, v3.2d}, [pA] // For next round add pA, pA, #32 - OP_rr v24.2d, v4.2d, v14.2d[0] - OP_ii v24.2d, v5.2d, v15.2d[0] - OP_ri v25.2d, v4.2d, v15.2d[0] - OP_ir v25.2d, v5.2d, v14.2d[0] + OP_rr v24.2d, v4.2d, v14.d[0] + OP_ii v24.2d, v5.2d, v15.d[0] + OP_ri v25.2d, v4.2d, v15.d[0] + OP_ir v25.2d, v5.2d, v14.d[0] prfm PLDL1KEEP, [pA, #512] - OP_rr v26.2d, v6.2d, v14.2d[0] - OP_ii v26.2d, v7.2d, v15.2d[0] - OP_ri v27.2d, v6.2d, v15.2d[0] - OP_ir v27.2d, v7.2d, v14.2d[0] + OP_rr v26.2d, v6.2d, v14.d[0] + OP_ii v26.2d, v7.2d, v15.d[0] + OP_ri v27.2d, v6.2d, v15.d[0] + OP_ir v27.2d, v7.2d, v14.d[0] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.2d, v4.2d, v14.2d[1] - OP_ii v28.2d, v5.2d, v15.2d[1] - OP_ri v29.2d, v4.2d, v15.2d[1] - OP_ir v29.2d, v5.2d, v14.2d[1] + OP_rr v28.2d, v4.2d, v14.d[1] + OP_ii v28.2d, v5.2d, v15.d[1] + OP_ri v29.2d, v4.2d, v15.d[1] + OP_ir v29.2d, v5.2d, v14.d[1] - OP_rr v30.2d, v6.2d, v14.2d[1] - OP_ii v30.2d, v7.2d, v15.2d[1] - OP_ri v31.2d, v6.2d, v15.2d[1] - OP_ir v31.2d, v7.2d, v14.2d[1] + OP_rr v30.2d, v6.2d, v14.d[1] + OP_ii v30.2d, v7.2d, v15.d[1] + OP_ri v31.2d, v6.2d, v15.d[1] + OP_ir v31.2d, v7.2d, v14.d[1] .endm .macro KERNEL4x4_E - OP_rr v16.2d, v4.2d, v12.2d[0] - OP_ii v16.2d, v5.2d, v13.2d[0] - OP_ri v17.2d, v4.2d, v13.2d[0] - OP_ir v17.2d, v5.2d, v12.2d[0] + OP_rr v16.2d, v4.2d, v12.d[0] + OP_ii v16.2d, v5.2d, v13.d[0] + OP_ri v17.2d, v4.2d, v13.d[0] + OP_ir v17.2d, v5.2d, v12.d[0] - OP_rr v18.2d, v6.2d, v12.2d[0] - OP_ii v18.2d, v7.2d, v13.2d[0] - OP_ri v19.2d, v6.2d, v13.2d[0] - OP_ir v19.2d, v7.2d, v12.2d[0] + OP_rr v18.2d, v6.2d, v12.d[0] + OP_ii v18.2d, v7.2d, v13.d[0] + OP_ri v19.2d, v6.2d, v13.d[0] + OP_ir v19.2d, v7.2d, v12.d[0] - OP_rr v20.2d, v4.2d, v12.2d[1] - OP_ii v20.2d, v5.2d, v13.2d[1] - OP_ri v21.2d, v4.2d, v13.2d[1] - OP_ir v21.2d, v5.2d, v12.2d[1] + OP_rr v20.2d, v4.2d, v12.d[1] + OP_ii v20.2d, v5.2d, v13.d[1] + OP_ri v21.2d, v4.2d, v13.d[1] + OP_ir v21.2d, v5.2d, v12.d[1] - OP_rr v22.2d, v6.2d, v12.2d[1] - OP_ii v22.2d, v7.2d, v13.2d[1] - OP_ri v23.2d, v6.2d, v13.2d[1] - OP_ir v23.2d, v7.2d, v12.2d[1] + OP_rr v22.2d, v6.2d, v12.d[1] + OP_ii v22.2d, v7.2d, v13.d[1] + OP_ri v23.2d, v6.2d, v13.d[1] + OP_ir v23.2d, v7.2d, v12.d[1] - OP_rr v24.2d, v4.2d, v14.2d[0] - OP_ii v24.2d, v5.2d, v15.2d[0] - OP_ri v25.2d, v4.2d, v15.2d[0] - OP_ir v25.2d, v5.2d, v14.2d[0] + OP_rr v24.2d, v4.2d, v14.d[0] + OP_ii v24.2d, v5.2d, v15.d[0] + OP_ri v25.2d, v4.2d, v15.d[0] + OP_ir v25.2d, v5.2d, v14.d[0] - OP_rr v26.2d, v6.2d, v14.2d[0] - OP_ii v26.2d, v7.2d, v15.2d[0] - OP_ri v27.2d, v6.2d, v15.2d[0] - OP_ir v27.2d, v7.2d, v14.2d[0] + OP_rr v26.2d, v6.2d, v14.d[0] + OP_ii v26.2d, v7.2d, v15.d[0] + OP_ri v27.2d, v6.2d, v15.d[0] + OP_ir v27.2d, v7.2d, v14.d[0] - OP_rr v28.2d, v4.2d, v14.2d[1] - OP_ii v28.2d, v5.2d, v15.2d[1] - OP_ri v29.2d, v4.2d, v15.2d[1] - OP_ir v29.2d, v5.2d, v14.2d[1] + OP_rr v28.2d, v4.2d, v14.d[1] + OP_ii v28.2d, v5.2d, v15.d[1] + OP_ri v29.2d, v4.2d, v15.d[1] + OP_ir v29.2d, v5.2d, v14.d[1] - OP_rr v30.2d, v6.2d, v14.2d[1] - OP_ii v30.2d, v7.2d, v15.2d[1] - OP_ri v31.2d, v6.2d, v15.2d[1] - OP_ir v31.2d, v7.2d, v14.2d[1] + OP_rr v30.2d, v6.2d, v14.d[1] + OP_ii v30.2d, v7.2d, v15.d[1] + OP_ri v31.2d, v6.2d, v15.d[1] + OP_ir v31.2d, v7.2d, v14.d[1] .endm .macro KERNEL4x4_SUB @@ -451,45 +451,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] - OP_rr v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] - OP_ri v27.2d, v2.2d, v11.2d[0] - OP_ir v27.2d, v3.2d, v10.2d[0] + OP_rr v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] + OP_ri v27.2d, v2.2d, v11.d[0] + OP_ir v27.2d, v3.2d, v10.d[0] - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] - OP_rr v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] - OP_ri v31.2d, v2.2d, v11.2d[1] - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_rr v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] + OP_ri v31.2d, v2.2d, v11.d[1] + OP_ir v31.2d, v3.2d, v10.d[1] .endm .macro SAVE4x4 @@ -577,25 +577,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] .endm .macro SAVE2x4 @@ -660,25 +660,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 - OP_rr d16, d0, v8.2d[0] - OP_ii d16, d1, v9.2d[0] - OP_ri d17, d0, v9.2d[0] - OP_ir d17, d1, v8.2d[0] + OP_rr d16, d0, v8.d[0] + OP_ii d16, d1, v9.d[0] + OP_ri d17, d0, v9.d[0] + OP_ir d17, d1, v8.d[0] - OP_rr d20, d0, v8.2d[1] - OP_ii d20, d1, v9.2d[1] - OP_ri d21, d0, v9.2d[1] - OP_ir d21, d1, v8.2d[1] + OP_rr d20, d0, v8.d[1] + OP_ii d20, d1, v9.d[1] + OP_ri d21, d0, v9.d[1] + OP_ir d21, d1, v8.d[1] - OP_rr d24, d0, v10.2d[0] - OP_ii d24, d1, v11.2d[0] - OP_ri d25, d0, v11.2d[0] - OP_ir d25, d1, v10.2d[0] + OP_rr d24, d0, v10.d[0] + OP_ii d24, d1, v11.d[0] + OP_ri d25, d0, v11.d[0] + OP_ir d25, d1, v10.d[0] - OP_rr d28, d0, v10.2d[1] - OP_ii d28, d1, v11.2d[1] - OP_ri d29, d0, v11.2d[1] - OP_ir d29, d1, v10.2d[1] + OP_rr d28, d0, v10.d[1] + OP_ii d28, d1, v11.d[1] + OP_ri d29, d0, v11.d[1] + OP_ir d29, d1, v10.d[1] .endm .macro SAVE1x4 @@ -743,25 +743,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] .endm .macro SAVE4x2 @@ -816,15 +816,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] .endm .macro SAVE2x2 @@ -867,15 +867,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 - OP_rr d16, d0, v8.2d[0] - OP_ii d16, d1, v9.2d[0] - OP_ri d17, d0, v9.2d[0] - OP_ir d17, d1, v8.2d[0] + OP_rr d16, d0, v8.d[0] + OP_ii d16, d1, v9.d[0] + OP_ri d17, d0, v9.d[0] + OP_ir d17, d1, v8.d[0] - OP_rr d20, d0, v8.2d[1] - OP_ii d20, d1, v9.2d[1] - OP_ri d21, d0, v9.2d[1] - OP_ir d21, d1, v8.2d[1] + OP_rr d20, d0, v8.d[1] + OP_ii d20, d1, v9.d[1] + OP_ri d21, d0, v9.d[1] + OP_ir d21, d1, v8.d[1] .endm .macro SAVE1x2 diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 760d568cd..b37a4213b 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -3,14 +3,18 @@ #CGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c -STRMMKERNEL = gemm_kernel_power6.S +STRMMKERNEL = strmm_kernel_16x8_power8.S DTRMMKERNEL = dtrmm_kernel_16x4_power8.S -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +CTRMMKERNEL = ctrmm_kernel_8x4_power8.S ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S -SGEMMKERNEL = gemm_kernel_power6.S -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMKERNEL = sgemm_kernel_16x8_power8.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = ../generic/gemm_tcopy_16.c +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o @@ -24,11 +28,15 @@ DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMKERNEL = cgemm_kernel_8x4_power8.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o ZGEMMKERNEL = zgemm_kernel_8x2_power8.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c @@ -97,56 +105,56 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #ISMINKERNEL = ../arm/imin.c #IDMINKERNEL = ../arm/imin.c # -#SASUMKERNEL = ../arm/asum.c -#DASUMKERNEL = ../arm/asum.c -#CASUMKERNEL = ../arm/zasum.c -#ZASUMKERNEL = ../arm/zasum.c +SASUMKERNEL = sasum.c +DASUMKERNEL = dasum.c +CASUMKERNEL = casum.c +ZASUMKERNEL = zasum.c # #SAXPYKERNEL = ../arm/axpy.c -#DAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = daxpy.c #CAXPYKERNEL = ../arm/zaxpy.c -#ZAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = zaxpy.c # -#SCOPYKERNEL = ../arm/copy.c -#DCOPYKERNEL = ../arm/copy.c -#CCOPYKERNEL = ../arm/zcopy.c -#ZCOPYKERNEL = ../arm/zcopy.c +SCOPYKERNEL = scopy.c +DCOPYKERNEL = dcopy.c +CCOPYKERNEL = ccopy.c +ZCOPYKERNEL = zcopy.c # -#SDOTKERNEL = ../arm/dot.c -#DDOTKERNEL = ../arm/dot.c +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c #CDOTKERNEL = ../arm/zdot.c -#ZDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = zdot.c # #SNRM2KERNEL = ../arm/nrm2.c #DNRM2KERNEL = ../arm/nrm2.c #CNRM2KERNEL = ../arm/znrm2.c #ZNRM2KERNEL = ../arm/znrm2.c # -#SROTKERNEL = ../arm/rot.c -#DROTKERNEL = ../arm/rot.c +SROTKERNEL = srot.c +DROTKERNEL = drot.c #CROTKERNEL = ../arm/zrot.c #ZROTKERNEL = ../arm/zrot.c # -#SSCALKERNEL = ../arm/scal.c -#DSCALKERNEL = ../arm/scal.c +SSCALKERNEL = sscal.c +DSCALKERNEL = dscal.c #CSCALKERNEL = ../arm/zscal.c -#ZSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = zscal.c # -#SSWAPKERNEL = ../arm/swap.c -#DSWAPKERNEL = ../arm/swap.c -#CSWAPKERNEL = 
../arm/zswap.c -#ZSWAPKERNEL = ../arm/zswap.c +SSWAPKERNEL = sswap.c +DSWAPKERNEL = dswap.c +CSWAPKERNEL = cswap.c +ZSWAPKERNEL = zswap.c # #SGEMVNKERNEL = ../arm/gemv_n.c -#DGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = dgemv_n.c #CGEMVNKERNEL = ../arm/zgemv_n.c #ZGEMVNKERNEL = ../arm/zgemv_n.c # #SGEMVTKERNEL = ../arm/gemv_t.c #DGEMVTKERNEL = ../arm/gemv_t.c #CGEMVTKERNEL = ../arm/zgemv_t.c -#ZGEMVTKERNEL = ../arm/zgemv_t.c +#ZGEMVTKERNEL = zgemv_t_4.c #SSYMV_U_KERNEL = ../generic/symv_k.c diff --git a/kernel/power/casum.c b/kernel/power/casum.c new file mode 100644 index 000000000..aeed0ca78 --- /dev/null +++ b/kernel/power/casum.c @@ -0,0 +1,151 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#if defined(POWER8) +#include "casum_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec) +{ + + BLASLONG i=0; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + + while ( i< n ) + { + + temp0 = ABS(x[0]); + temp1 = ABS(x[1]); + temp2 = ABS(x[2]); + temp3 = ABS(x[3]); + temp4 = ABS(x[4]); + temp5 = ABS(x[5]); + temp6 = ABS(x[6]); + temp7 = ABS(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=4; + + } + + svec[0] = sum0+sum1+sum2+sum3; + svec[1] = 0.0; + svec[2] = 0.0; + svec[3] = 0.0; + +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ip=0; + FLOAT sumf = 0.0; + FLOAT svec[4] __attribute__ ((aligned (16)));; + BLASLONG n1; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + if ( inc_x == 1 ) + { + + n1 = n & -16; + if ( n1 > 0 ) + { + + casum_kernel_16(n1, x, svec); + sumf = svec[0] + svec[1]+svec[2]+svec[3]; + i=n1; + ip = 2 * n1; + } + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + ip += 2; + i++; + } + + } + else + { + inc_x2 = 2 * inc_x; + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + ip += inc_x2; + i++; + } + + } + return(sumf); +} + + diff --git a/kernel/power/casum_microk_power8.c b/kernel/power/casum_microk_power8.c new file mode 100644 index 000000000..cb50234ce --- /dev/null +++ b/kernel/power/casum_microk_power8.c @@ -0,0 +1,177 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
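The casum_microk_power8.c kernel that follows keeps eight VSX accumulators of absolute values (vs32-vs39), folds them with a short xvaddsp reduction tree, and stores one vector of four partial sums that the C wrapper then adds together; the scalar fallback in casum.c does the same with four float accumulators. The split-accumulator idea in portable C looks roughly like this (a sketch only, assuming n counts complex elements and is a multiple of 4, as the n & -16 blocking in the wrapper guarantees):

#include <math.h>

/* Four running sums shorten the dependency chain and mirror the
 * svec[0..3] partial sums produced by the microkernel. Illustrative only. */
static void casum_partial(long n, const float *x, float svec[4])
{
    float s0 = 0.0f, s1 = 0.0f, s2 = 0.0f, s3 = 0.0f;
    for (long i = 0; i < n; i += 2) {   /* 2 complex = 4 floats per step */
        s0 += fabsf(x[0]);
        s1 += fabsf(x[1]);
        s2 += fabsf(x[2]);
        s3 += fabsf(x[3]);
        x += 4;
    }
    svec[0] = s0; svec[1] = s1; svec[2] = s2; svec[3] = s3;
}

The caller then sums svec[0..3] into the final scalar result, exactly as the wrapper does after the assembly kernel returns.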
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline)); + +static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "dcbt %2 , %4 \n\t" + + "xxlxor 32,32,32 \n\t" + "xxlxor 33,33,33 \n\t" + "xxlxor 34,34,34 \n\t" + "xxlxor 35,35,35 \n\t" + "xxlxor 36,36,36 \n\t" + "xxlxor 37,37,37 \n\t" + "xxlxor 38,38,38 \n\t" + "xxlxor 39,39,39 \n\t" + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -16 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2 , %4 \n\t" + + "xvabssp 48, 40 \n\t" + "xvabssp 49, 41 \n\t" + "xvabssp 50, 42 \n\t" + "xvabssp 51, 43 \n\t" + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + + "xvabssp 52, 44 \n\t" + "xvabssp 53, 45 \n\t" + + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + + "xvabssp 54, 46 \n\t" + "xvabssp 55, 47 \n\t" + + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + + "xvaddsp 32, 32, 48 \n\t" + "xvaddsp 33, 33, 49 \n\t" + + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "xvaddsp 34, 34, 50 \n\t" + "xvaddsp 35, 35, 51 \n\t" + "addi %2, %2, 128 \n\t" + "xvaddsp 36, 36, 52 \n\t" + "xvaddsp 37, 37, 53 \n\t" + "addic. 
%0 , %0 , -16 \n\t" + "xvaddsp 38, 38, 54 \n\t" + "xvaddsp 39, 39, 55 \n\t" + + "bgt 1b \n\t" + + "2: \n\t" + + + "xvabssp 48, 40 \n\t" + "xvabssp 49, 41 \n\t" + "xvabssp 50, 42 \n\t" + "xvabssp 51, 43 \n\t" + "xvabssp 52, 44 \n\t" + "xvabssp 53, 45 \n\t" + "xvabssp 54, 46 \n\t" + "xvabssp 55, 47 \n\t" + + "xvaddsp 32, 32, 48 \n\t" + "xvaddsp 33, 33, 49 \n\t" + "xvaddsp 34, 34, 50 \n\t" + "xvaddsp 35, 35, 51 \n\t" + "xvaddsp 36, 36, 52 \n\t" + "xvaddsp 37, 37, 53 \n\t" + "xvaddsp 38, 38, 54 \n\t" + "xvaddsp 39, 39, 55 \n\t" + + "xvaddsp 32, 32, 33 \n\t" + "xvaddsp 34, 34, 35 \n\t" + "xvaddsp 36, 36, 37 \n\t" + "xvaddsp 38, 38, 39 \n\t" + + "xvaddsp 32, 32, 34 \n\t" + "xvaddsp 36, 36, 38 \n\t" + + "xvaddsp 32, 32, 36 \n\t" + + + "stxvw4x 32, 0, %3 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (svec), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2", "memory" + ); + +} + + diff --git a/kernel/power/ccopy.c b/kernel/power/ccopy.c new file mode 100644 index 000000000..ce7d67475 --- /dev/null +++ b/kernel/power/ccopy.c @@ -0,0 +1,140 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
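The new kernel/power/ccopy.c routes the single-precision complex copy (y := x) to a POWER8 microkernel: for unit strides it hands the largest block of complex elements to ccopy_kernel_32 and finishes the remainder with a scalar loop, while any other stride falls back to a plain strided loop. A compact sketch of that unit-stride structure (ccopy_sketch and kernel32 are illustrative stand-ins, and the 32-element blocking is inferred from the kernel name):

#include <stddef.h>

/* Sketch of the unit-stride path: a vector kernel handles multiples of
 * 32 complex elements, a scalar loop copies the tail. */
static void ccopy_sketch(size_t n, const float *x, float *y,
                         void (*kernel32)(size_t, const float *, float *))
{
    size_t n1 = n & ~(size_t)31;        /* largest multiple of 32 <= n */
    if (n1 > 0)
        kernel32(n1, x, y);             /* copies 2*n1 floats */
    for (size_t i = n1; i < n; i++) {   /* remaining complex elements */
        y[2 * i]     = x[2 * i];
        y[2 * i + 1] = x[2 * i + 1];
    }
}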
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "ccopy_microk_power8.c" +#endif + +#ifndef HAVE_KERNEL_32 + +static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + ccopy_kernel_32(n1, x, y); + i=n1; + ix=n1*2; + iy=n1*2; + } + + while(i < n) + { + y[iy] = x[iy] ; + y[iy+1] = x[ix+1] ; + ix+=2; + iy+=2; + i++ ; + + } + + + } + else + { + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while(i < n) + { + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/ccopy_microk_power8.c b/kernel/power/ccopy_microk_power8.c new file mode 100644 index 000000000..95b3559ba --- /dev/null +++ b/kernel/power/ccopy_microk_power8.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
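When no microkernel defines HAVE_KERNEL_32, ccopy.c above falls back to a generic C ccopy_kernel_32 whose only job is to copy n complex elements (2*n floats) from x to y. A minimal sketch of such a fallback, assuming, as the caller guarantees, that n is a positive multiple of 32 (illustrative; not the code from the patch):

#include <stddef.h>

/* Unrolled copy of 2*n floats (n complex values), 4 complex per step. */
static void ccopy_kernel_32_sketch(size_t n, const float *x, float *y)
{
    for (size_t i = 0; i < 2 * n; i += 8) {
        y[i]     = x[i];     y[i + 1] = x[i + 1];
        y[i + 2] = x[i + 2]; y[i + 3] = x[i + 3];
        y[i + 4] = x[i + 4]; y[i + 5] = x[i + 5];
        y[i + 6] = x[i + 6]; y[i + 7] = x[i + 7];
    }
}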
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_32 1 + +static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvw4x 50, 0, %2 \n\t" + "lxvw4x 51, %5, %2 \n\t" + "lxvw4x 52, %6, %2 \n\t" + "lxvw4x 53, %7, %2 \n\t" + "lxvw4x 54, %8, %2 \n\t" + "lxvw4x 55, %9, %2 \n\t" + "lxvw4x 56, %10, %2 \n\t" + "lxvw4x 57, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "stxvw4x 40, 0, %1 \n\t" + "stxvw4x 41, %5, %1 \n\t" + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "stxvw4x 42, %6, %1 \n\t" + "stxvw4x 43, %7, %1 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "stxvw4x 44, %8, %1 \n\t" + "stxvw4x 45, %9, %1 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "stxvw4x 46, %10, %1 \n\t" + "stxvw4x 47, %11, %1 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "stxvw4x 50, 0, %1 \n\t" + "stxvw4x 51, %5, %1 \n\t" + "lxvw4x 50, 0, %2 \n\t" + "lxvw4x 51, %5, %2 \n\t" + "stxvw4x 52, %6, %1 \n\t" + "stxvw4x 53, %7, %1 \n\t" + "lxvw4x 52, %6, %2 \n\t" + "lxvw4x 53, %7, %2 \n\t" + "stxvw4x 54, %8, %1 \n\t" + "stxvw4x 55, %9, %1 \n\t" + "lxvw4x 54, %8, %2 \n\t" + "lxvw4x 55, %9, %2 \n\t" + "stxvw4x 56, %10, %1 \n\t" + "stxvw4x 57, %11, %1 \n\t" + "lxvw4x 56, %10, %2 \n\t" + "lxvw4x 57, %11, %2 \n\t" + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "addic. 
%0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "stxvw4x 40, 0, %1 \n\t" + "stxvw4x 41, %5, %1 \n\t" + "stxvw4x 42, %6, %1 \n\t" + "stxvw4x 43, %7, %1 \n\t" + "stxvw4x 44, %8, %1 \n\t" + "stxvw4x 45, %9, %1 \n\t" + "stxvw4x 46, %10, %1 \n\t" + "stxvw4x 47, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "stxvw4x 50, 0, %1 \n\t" + "stxvw4x 51, %5, %1 \n\t" + "stxvw4x 52, %6, %1 \n\t" + "stxvw4x 53, %7, %1 \n\t" + "stxvw4x 54, %8, %1 \n\t" + "stxvw4x 55, %9, %1 \n\t" + "stxvw4x 56, %10, %1 \n\t" + "stxvw4x 57, %11, %1 \n\t" + + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S new file mode 100644 index 000000000..0c462ce8e --- /dev/null +++ b/kernel/power/cgemm_kernel_8x4_power8.S @@ -0,0 +1,407 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 32000 +#define ALPHA_R_SP 296(SP) +#define ALPHA_I_SP 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R_SP 224(SP) +#define ALPHA_I_SP 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define o0 0 + +#define alpha_dr vs28 +#define alpha_di vs29 +#define alpha_sr vs30 +#define alpha_si vs31 + +#define FRAMEPOINTER r12 + +#define BBUFFER r14 +#define L r15 +#define o12 r16 +#define o4 r17 +#define T2 r19 +#define BBO r20 +#define o8 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 
240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) + stw r14, 212(SP) +#endif + + stfs f1, ALPHA_R_SP + stfs f2, ALPHA_I_SP + // stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#else +#ifdef DOUBLE + lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER) + lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER) + lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER) +#else + lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER) +#else + lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + +#include "cgemm_macros_8x4_power8.S" + + cmpwi cr0, M, 0 + ble L999_H1 + cmpwi cr0, N, 0 + ble L999_H1 + cmpwi cr0, K, 0 + ble L999_H1 + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 384 + li o4 , 4 + li o8 , 8 + li o12 , 12 + li o16 , 16 + li o32 , 32 + li o48 , 48 + + addi BBUFFER, SP, 512+4096 + li T1, -4096 + and BBUFFER, BBUFFER, T1 + + +#ifdef __64BIT__ + addi T1 , SP, 296 +#else + addi T1 , SP, 224 +#endif + + stxsspx vs1, 0, T1 + lxsspx alpha_dr, 0, T1 + stxsspx vs2, o8 , T1 + lxsspx alpha_di, o8, T1 + addi T1, SP, 360 + li T2, 0 + + stw T2, 0(T1) + stw T2, 4(T1) + stw T2, 8(T1) + stxsspx alpha_dr, o12, T1 + lxvw4x alpha_sr, o0 , T1 + addi T1, T1, 16 + + stw T2, 0(T1) + stw T2, 4(T1) + stw T2, 8(T1) + stxsspx alpha_di, o12, T1 + lxvw4x alpha_si, o0 , T1 + + .align 5 + +#include "cgemm_logic_8x4_power8.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) + lwz r14, 212(SP) +#endif + + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/cgemm_logic_8x4_power8.S b/kernel/power/cgemm_logic_8x4_power8.S new file mode 100644 index 000000000..db2a57f91 --- /dev/null +++ 
b/kernel/power/cgemm_logic_8x4_power8.S @@ -0,0 +1,1459 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + srawi. J, N, 2 + ble CGEMM_L4_END + +CGEMM_L4_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 3 + +CGEMM_L4_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge CGEMM_L4_COPYB + + + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + srawi. I, M, 3 + ble CGEMM_L4x8_END + +CGEMM_L4x8_BEGIN: + + + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L4x8_SUB4 + +CGEMM_L4x8_LOOP_START: + + dcbt AO, PRE + dcbt BO, PRE + LOAD4x8_1 + dcbt BO, PRE + KERNEL4x8_I1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL4x8_2 + dcbt BO, PRE + KERNEL4x8_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL4x8_2 + + dcbt BO, PRE + KERNEL4x8_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL4x8_2 + dcbt BO, PRE + KERNEL4x8_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL4x8_2 + + addic. 
L, L, -2 + ble CGEMM_L4x8_LOOP_END + + .align 5 + +CGEMM_L4x8_LOOP: + + dcbt BO, PRE + KERNEL4x8_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL4x8_2 + dcbt BO, PRE + KERNEL4x8_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL4x8_2 + + dcbt BO, PRE + KERNEL4x8_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL4x8_2 + dcbt BO, PRE + KERNEL4x8_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL4x8_2 + + addic. L, L, -1 + bgt CGEMM_L4x8_LOOP + +CGEMM_L4x8_LOOP_END: + + dcbt BO, PRE + KERNEL4x8_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b CGEMM_L4x8_SUB1 + +CGEMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b CGEMM_L4x8_SUB1 + +CGEMM_L4x8_SUB0: + + andi. L, K, 7 + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble CGEMM_L4x8_SAVE + b CGEMM_L4x8_SUB2 + +CGEMM_L4x8_SUB1: + + andi. L, K, 7 + ble CGEMM_L4x8_SAVE + +CGEMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. L, L, -1 + bgt CGEMM_L4x8_SUB2 + +CGEMM_L4x8_SAVE: + + SAVE4x8 + + addic. I, I, -1 + bgt CGEMM_L4x8_BEGIN + +CGEMM_L4x8_END: + +CGEMM_L4x4_BEGIN: + + andi. T2, M, 7 + ble CGEMM_L4x1_END + + andi. T1, M, 4 + ble CGEMM_L4x4_END + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L4x4_SUB4 + +CGEMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble CGEMM_L4x4_LOOP_END + + .align 5 + +CGEMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -1 + bgt CGEMM_L4x4_LOOP + +CGEMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b CGEMM_L4x4_SUB1 + +CGEMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b CGEMM_L4x4_SUB1 + +CGEMM_L4x4_SUB0: + + andi. L, K, 7 + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble CGEMM_L4x4_SAVE + b CGEMM_L4x4_SUB2 + +CGEMM_L4x4_SUB1: + + andi. L, K, 7 + ble CGEMM_L4x4_SAVE + +CGEMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt CGEMM_L4x4_SUB2 + +CGEMM_L4x4_SAVE: + + SAVE4x4 + +CGEMM_L4x4_END: + +CGEMM_L4x2_BEGIN: + + + andi. T1, M, 2 + ble CGEMM_L4x2_END + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L4x2_SUB4 + +CGEMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble CGEMM_L4x2_LOOP_END + + .align 5 + +CGEMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt CGEMM_L4x2_LOOP + +CGEMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b CGEMM_L4x2_SUB1 + +CGEMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b CGEMM_L4x2_SUB1 + +CGEMM_L4x2_SUB0: + + andi. L, K, 7 + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble CGEMM_L4x2_SAVE + b CGEMM_L4x2_SUB2 + +CGEMM_L4x2_SUB1: + + andi. 
L, K, 7 + ble CGEMM_L4x2_SAVE + +CGEMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt CGEMM_L4x2_SUB2 + +CGEMM_L4x2_SAVE: + + SAVE4x2 + +CGEMM_L4x2_END: + +CGEMM_L4x1_BEGIN: + + + andi. T1, M, 1 + ble CGEMM_L4x1_END + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L4x1_SUB4 + +CGEMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble CGEMM_L4x1_LOOP_END + + .align 5 + +CGEMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt CGEMM_L4x1_LOOP + +CGEMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b CGEMM_L4x1_SUB1 + +CGEMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b CGEMM_L4x1_SUB1 + +CGEMM_L4x1_SUB0: + + andi. L, K, 7 + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble CGEMM_L4x1_SAVE + b CGEMM_L4x1_SUB2 + +CGEMM_L4x1_SUB1: + + andi. L, K, 7 + ble CGEMM_L4x1_SAVE + +CGEMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt CGEMM_L4x1_SUB2 + +CGEMM_L4x1_SAVE: + + SAVE4x1 + +CGEMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 + + addic. J, J, -1 + bgt CGEMM_L4_BEGIN + + andi. T2, N, 3 + ble L999_H2 + +CGEMM_L4_END: + + b CGEMM_L2_BEGIN + +L999_H1: + + b L999_H2 + +CGEMM_L2_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 2 + +CGEMM_L2_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge CGEMM_L2_COPYB + + + andi. T1, N, 2 + ble CGEMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 3 + ble CGEMM_L2x8_END + +CGEMM_L2x8_BEGIN: + + + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L2x8_SUB4 + +CGEMM_L2x8_LOOP_START: + + dcbt AO, PRE + LOAD2x8_1 + KERNEL2x8_I1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -2 + ble CGEMM_L2x8_LOOP_END + + .align 5 + +CGEMM_L2x8_LOOP: + + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -1 + bgt CGEMM_L2x8_LOOP + +CGEMM_L2x8_LOOP_END: + + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b CGEMM_L2x8_SUB1 + +CGEMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b CGEMM_L2x8_SUB1 + +CGEMM_L2x8_SUB0: + + andi. L, K, 7 + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble CGEMM_L2x8_SAVE + b CGEMM_L2x8_SUB2 + +CGEMM_L2x8_SUB1: + + andi. 
L, K, 7 + ble CGEMM_L2x8_SAVE + +CGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt CGEMM_L2x8_SUB2 + +CGEMM_L2x8_SAVE: + + SAVE2x8 + + addic. I, I, -1 + bgt CGEMM_L2x8_BEGIN + +CGEMM_L2x8_END: + +CGEMM_L2x4_BEGIN: + + andi. T2, M, 7 + ble CGEMM_L2x1_END + + andi. T1, M, 4 + ble CGEMM_L2x4_END + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L2x4_SUB4 + +CGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble CGEMM_L2x4_LOOP_END + + .align 5 + +CGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt CGEMM_L2x4_LOOP + +CGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b CGEMM_L2x4_SUB1 + +CGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b CGEMM_L2x4_SUB1 + +CGEMM_L2x4_SUB0: + + andi. L, K, 7 + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble CGEMM_L2x4_SAVE + b CGEMM_L2x4_SUB2 + +CGEMM_L2x4_SUB1: + + andi. L, K, 7 + ble CGEMM_L2x4_SAVE + +CGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt CGEMM_L2x4_SUB2 + +CGEMM_L2x4_SAVE: + + SAVE2x4 + +CGEMM_L2x4_END: + +CGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble CGEMM_L2x2_END + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L2x2_SUB4 + +CGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble CGEMM_L2x2_LOOP_END + + .align 5 + +CGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt CGEMM_L2x2_LOOP + +CGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b CGEMM_L2x2_SUB1 + +CGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b CGEMM_L2x2_SUB1 + +CGEMM_L2x2_SUB0: + + andi. L, K, 7 + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble CGEMM_L2x2_SAVE + b CGEMM_L2x2_SUB2 + +CGEMM_L2x2_SUB1: + + andi. L, K, 7 + ble CGEMM_L2x2_SAVE + +CGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt CGEMM_L2x2_SUB2 + +CGEMM_L2x2_SAVE: + + SAVE2x2 + +CGEMM_L2x2_END: + +CGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble CGEMM_L2x1_END + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L2x1_SUB4 + +CGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble CGEMM_L2x1_LOOP_END + + .align 5 + +CGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. 
L, L, -1 + bgt CGEMM_L2x1_LOOP + +CGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b CGEMM_L2x1_SUB1 + +CGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b CGEMM_L2x1_SUB1 + +CGEMM_L2x1_SUB0: + + andi. L, K, 7 + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble CGEMM_L2x1_SAVE + b CGEMM_L2x1_SUB2 + +CGEMM_L2x1_SUB1: + + andi. L, K, 7 + ble CGEMM_L2x1_SAVE + +CGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt CGEMM_L2x1_SUB2 + +CGEMM_L2x1_SAVE: + + SAVE2x1 + +CGEMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 + +CGEMM_L2_END: + + b CGEMM_L1_BEGIN + +L999_H2: + + b L999 + +CGEMM_L1_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 1 + +CGEMM_L1_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge CGEMM_L1_COPYB + + + andi. T1, N, 1 + ble CGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 3 + ble CGEMM_L1x8_END + +CGEMM_L1x8_BEGIN: + + + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L1x8_SUB4 + +CGEMM_L1x8_LOOP_START: + + dcbt AO, PRE + LOAD1x8_1 + KERNEL1x8_I1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -2 + ble CGEMM_L1x8_LOOP_END + + .align 5 + +CGEMM_L1x8_LOOP: + + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -1 + bgt CGEMM_L1x8_LOOP + +CGEMM_L1x8_LOOP_END: + + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b CGEMM_L1x8_SUB1 + +CGEMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b CGEMM_L1x8_SUB1 + +CGEMM_L1x8_SUB0: + + andi. L, K, 7 + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble CGEMM_L1x8_SAVE + b CGEMM_L1x8_SUB2 + +CGEMM_L1x8_SUB1: + + andi. L, K, 7 + ble CGEMM_L1x8_SAVE + +CGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt CGEMM_L1x8_SUB2 + +CGEMM_L1x8_SAVE: + + SAVE1x8 + + addic. I, I, -1 + bgt CGEMM_L1x8_BEGIN + +CGEMM_L1x8_END: + +CGEMM_L1x4_BEGIN: + + andi. T2, M, 7 + ble CGEMM_L1x1_END + + andi. T1, M, 4 + ble CGEMM_L1x4_END + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L1x4_SUB4 + +CGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble CGEMM_L1x4_LOOP_END + + .align 5 + +CGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. 
L, L, -1 + bgt CGEMM_L1x4_LOOP + +CGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b CGEMM_L1x4_SUB1 + +CGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b CGEMM_L1x4_SUB1 + +CGEMM_L1x4_SUB0: + + andi. L, K, 7 + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble CGEMM_L1x4_SAVE + b CGEMM_L1x4_SUB2 + +CGEMM_L1x4_SUB1: + + andi. L, K, 7 + ble CGEMM_L1x4_SAVE + +CGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt CGEMM_L1x4_SUB2 + +CGEMM_L1x4_SAVE: + + SAVE1x4 + +CGEMM_L1x4_END: + +CGEMM_L1x2_BEGIN: + + + andi. T1, M, 2 + ble CGEMM_L1x2_END + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L1x2_SUB4 + +CGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble CGEMM_L1x2_LOOP_END + + .align 5 + +CGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt CGEMM_L1x2_LOOP + +CGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b CGEMM_L1x2_SUB1 + +CGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b CGEMM_L1x2_SUB1 + +CGEMM_L1x2_SUB0: + + andi. L, K, 7 + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble CGEMM_L1x2_SAVE + b CGEMM_L1x2_SUB2 + +CGEMM_L1x2_SUB1: + + andi. L, K, 7 + ble CGEMM_L1x2_SAVE + +CGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt CGEMM_L1x2_SUB2 + +CGEMM_L1x2_SAVE: + + SAVE1x2 + +CGEMM_L1x2_END: + +CGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble CGEMM_L1x1_END + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L1x1_SUB4 + +CGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble CGEMM_L1x1_LOOP_END + + .align 5 + +CGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt CGEMM_L1x1_LOOP + +CGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b CGEMM_L1x1_SUB1 + +CGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b CGEMM_L1x1_SUB1 + +CGEMM_L1x1_SUB0: + + andi. L, K, 7 + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble CGEMM_L1x1_SAVE + b CGEMM_L1x1_SUB2 + +CGEMM_L1x1_SUB1: + + andi. L, K, 7 + ble CGEMM_L1x1_SAVE + +CGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt CGEMM_L1x1_SUB2 + +CGEMM_L1x1_SAVE: + + SAVE1x1 + +CGEMM_L1x1_END: + +CGEMM_L1_END: diff --git a/kernel/power/cgemm_macros_8x4_power8.S b/kernel/power/cgemm_macros_8x4_power8.S new file mode 100644 index 000000000..9a18cb189 --- /dev/null +++ b/kernel/power/cgemm_macros_8x4_power8.S @@ -0,0 +1,6355 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
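The macro file that follows begins by selecting add/subtract variants (XSFADD_*/XVFADD_*) per transpose/conjugate case: for c += a*b the real part is a_r*b_r - a_i*b_i and the imaginary part is a_r*b_i + a_i*b_r, and conjugating A flips the sign of a_i while conjugating B flips the sign of b_i in those formulas. A scalar C illustration of the four sign cases (cmadd, conj_a and conj_b are illustrative names, not symbols from the patch):

/* Accumulate c += op(a) * op(b) for complex numbers given as (re, im)
 * pairs, where op() optionally conjugates its operand. Mirrors the sign
 * choices encoded by the NN/CN/NC/CC macro families. Illustrative only. */
static void cmadd(float c[2], const float a[2], const float b[2],
                  int conj_a, int conj_b)
{
    float ar = a[0], ai = conj_a ? -a[1] : a[1];
    float br = b[0], bi = conj_b ? -b[1] : b[1];
    c[0] += ar * br - ai * bi;   /* real part      */
    c[1] += ar * bi + ai * br;   /* imaginary part */
}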
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xsadddp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvsubsp + #define XVFADD_I1 xvaddsp + #define XVFADD_I2 xvaddsp + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xsadddp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvaddsp + #define XVFADD_I1 xvsubsp + #define XVFADD_I2 xvaddsp + +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xssubdp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvaddsp + #define XVFADD_I1 xvaddsp + #define XVFADD_I2 xvsubsp + +#else // CC || CR || RC || RR + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xssubdp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvsubsp + #define XVFADD_I1 xvsubsp + #define XVFADD_I2 xvsubsp + +#endif + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x 
vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + +.endm + +.macro KERNEL4x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + lxvw4x vs6, o32, AO // load a4, a5 + lxvw4x vs7, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + lxvw4x vs6, o32, AO // load a4, a5 + lxvw4x vs7, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, 
vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r 
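// Note on the _1/_2 pairing: KERNEL4x8_1 multiplies the vs0-vs3 (A) and
// vs8-vs15 (broadcast B) set while loading the next panel into vs4-vs7 and
// vs16-vs23; this KERNEL4x8_2 block does the converse, multiplying the
// vs4-vs7/vs16-vs23 set while refilling vs0-vs3/vs8-vs15. The xvmaddasp
// results in vs32-vs63 are lane-wise partial products that the SAVE macros
// later combine into real/imaginary parts using the XVFADD_* selections.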
+ xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs6, vs20 // a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs6, vs21 // a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs7, vs20 // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs7, vs21 // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs6, vs22 // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs6, vs23 // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs7, vs22 // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs6, vs20 // a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs6, vs21 // a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs7, vs20 // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs7, vs21 // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs4, vs22 
// a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs6, vs22 // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs6, vs23 // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs7, vs22 // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, 
a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro SAVE4x8 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 
// add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // 
add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 + + + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add 
a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 + + + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw vs15, vs43, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 + + + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i 
* b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 + + + xxspltw vs12, vs47, 0 + xxspltw vs13, vs47, 1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs48, 0 + xxspltw vs9, vs48, 1 + xxspltw vs10, vs48, 2 + xxspltw vs11, vs48, 3 + + + xxspltw vs12, vs49, 0 + xxspltw vs13, vs49, 1 + xxspltw vs14, vs49, 2 + xxspltw vs15, vs49, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * 
b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs50, 0 + xxspltw vs9, vs50, 1 + xxspltw vs10, vs50, 2 + xxspltw vs11, vs50, 3 + + + xxspltw vs12, vs51, 0 + xxspltw vs13, vs51, 1 + xxspltw vs14, vs51, 2 + xxspltw vs15, vs51, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs52, 0 + xxspltw vs9, vs52, 1 + xxspltw vs10, vs52, 2 + xxspltw vs11, vs52, 3 + + + xxspltw vs12, vs53, 0 + xxspltw vs13, vs53, 1 + xxspltw vs14, vs53, 2 + xxspltw vs15, vs53, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i 
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs54, 0 + xxspltw vs9, vs54, 1 + xxspltw vs10, vs54, 2 + xxspltw vs11, vs54, 3 + + + xxspltw vs12, vs55, 0 + xxspltw vs13, vs55, 1 + xxspltw vs14, vs55, 2 + xxspltw vs15, vs55, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs56, 0 + xxspltw vs9, vs56, 1 + xxspltw vs10, vs56, 2 + xxspltw vs11, vs56, 3 + + + xxspltw vs12, vs57, 0 + xxspltw vs13, vs57, 1 + xxspltw vs14, vs57, 2 + xxspltw vs15, vs57, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + 
XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs58, 0 + xxspltw vs9, vs58, 1 + xxspltw vs10, vs58, 2 + xxspltw vs11, vs58, 3 + + + xxspltw vs12, vs59, 0 + xxspltw vs13, vs59, 1 + xxspltw vs14, vs59, 2 + xxspltw vs15, vs59, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs60, 0 + xxspltw vs9, vs60, 1 + xxspltw vs10, vs60, 2 + xxspltw vs11, vs60, 3 + + + xxspltw vs12, vs61, 0 + xxspltw vs13, vs61, 1 + xxspltw vs14, vs61, 2 + xxspltw vs15, vs61, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + 
XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs62, 0 + xxspltw vs9, vs62, 1 + xxspltw vs10, vs62, 2 + xxspltw vs11, vs62, 3 + + + xxspltw vs12, vs63, 0 + xxspltw vs13, vs63, 1 + xxspltw vs14, vs63, 2 + xxspltw vs15, vs63, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs12, o0, BO // load b2_r + 
lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + +.endm + +.macro KERNEL4x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // 
load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + 
xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro SAVE4x4 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i 
* alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp 
vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 + + + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, 
alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 + + + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw vs15, vs43, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 + + + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si 
// r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 + + + xxspltw vs12, vs47, 0 + xxspltw vs13, vs47, 1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro LOAD4x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + +.endm + +.macro KERNEL4x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, 
BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + 
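+// Operand layout assumed by the loads in these N=4 kernels: the packed A
+// panel keeps complex elements interleaved (a0_r, a0_i, a1_r, a1_i within a
+// vector), while the packed B panel evidently stores each b component already
+// broadcast across a full 16-byte vector (b0_r, then b0_i, and so on), as the
+// product comments in the multiply blocks indicate.  That is why the plain
+// xvmulsp/xvmaddasp need no in-kernel splat and why BO advances twice by 64
+// bytes per k iteration.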
addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro SAVE4x2 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, 
vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + 
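+// vs20/vs21 and vs22/vs23 now hold the two alpha-scaled results, still split
+// into separate real and imaginary vectors:
+//   out_r = r_r*alpha_r - r_i*alpha_i
+//   out_i = r_r*alpha_i + r_i*alpha_r
+// The xxsldwi shifts below (vs24 was cleared to zero for exactly this) slide
+// each component into its target word, and the xvaddsp chain recombines them
+// into the interleaved (r0_r, r0_i, r1_r, r1_i) layout of the C tile before
+// the sum is folded into vs0 and stored back.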
xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro LOAD4x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxsspx vs12, o0, BO // load b2_r + lxsspx vs13, o16, BO // load b2_i + lxsspx vs14, o32, BO // load b3_r + lxsspx vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + +.endm + +.macro KERNEL4x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i + lxsspx vs18, o32, BO // load b1_r + lxsspx vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + lxsspx vs20, o0, BO // load b2_r + lxsspx vs21, o16, BO // load b2_i + lxsspx vs22, o32, BO // load b3_r + lxsspx vs23, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp 
vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + xsmuldp vs40, vs0, vs12 // a0_r*b2_r + xsmuldp vs41, vs1, vs13 // a0_i*b2_i + xsmuldp vs42, vs0, vs13 // a0_r*b2_i + xsmuldp vs43, vs1, vs12 // a0_i*b2_r + + xsmuldp vs44, vs0, vs14 // a0_r*b3_r + xsmuldp vs45, vs1, vs15 // a0_i*b3_i + xsmuldp vs46, vs0, vs15 // a0_r*b3_i + xsmuldp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro KERNEL4x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i + lxsspx vs18, o32, BO // load b1_r + lxsspx vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + lxsspx vs20, o0, BO // load b2_r + lxsspx vs21, o16, BO // load b2_i + lxsspx vs22, o32, BO // load b3_r + lxsspx vs23, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r + + xsmaddadp vs40, vs0, vs12 // a0_r*b2_r + xsmaddadp vs41, vs1, vs13 // a0_i*b2_i + xsmaddadp vs42, vs0, vs13 // a0_r*b2_i + xsmaddadp vs43, vs1, vs12 // a0_i*b2_r + + xsmaddadp vs44, vs0, vs14 // a0_r*b3_r + xsmaddadp vs45, vs1, vs15 // a0_i*b3_i + xsmaddadp vs46, vs0, vs15 // a0_r*b3_i + xsmaddadp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro KERNEL4x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxsspx vs12, o0, BO // load b2_r + lxsspx vs13, o16, BO // load b2_i + lxsspx vs14, o32, BO // load b3_r + lxsspx vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + xsmaddadp vs40, vs4, vs20 // a4_r*b2_r + xsmaddadp vs41, vs5, vs21 // a4_i*b2_i + xsmaddadp vs42, vs4, vs21 // a4_r*b2_i + xsmaddadp vs43, vs5, vs20 // a4_i*b2_r + + xsmaddadp vs44, vs4, vs22 // a4_r*b3_r + xsmaddadp vs45, vs5, vs23 // a4_i*b3_i + xsmaddadp vs46, vs4, vs23 // a4_r*b3_i + xsmaddadp vs47, vs5, vs22 // a4_i*b3_r + + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + xsmaddadp vs40, vs4, vs20 // a4_r*b2_r + xsmaddadp vs41, vs5, vs21 // a4_i*b2_i + xsmaddadp vs42, vs4, vs21 // a4_r*b2_i + xsmaddadp vs43, vs5, vs20 // a4_i*b2_r + + xsmaddadp vs44, vs4, vs22 // a4_r*b3_r + xsmaddadp vs45, vs5, vs23 // a4_i*b3_i + xsmaddadp vs46, vs4, vs23 // a4_r*b3_i + xsmaddadp vs47, vs5, vs22 // a4_i*b3_r + + +.endm + +.macro KERNEL4x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load 
b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxsspx vs12, o0, BO // load b2_r + lxsspx vs13, o16, BO // load b2_i + lxsspx vs14, o32, BO // load b3_r + lxsspx vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + xsmuldp vs40, vs0, vs12 // a0_r*b2_r + xsmuldp vs41, vs1, vs13 // a0_i*b2_i + xsmuldp vs42, vs0, vs13 // a0_r*b2_i + xsmuldp vs43, vs1, vs12 // a0_i*b2_r + + xsmuldp vs44, vs0, vs14 // a0_r*b3_r + xsmuldp vs45, vs1, vs15 // a0_i*b3_i + xsmuldp vs46, vs0, vs15 // a0_r*b3_i + xsmuldp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro KERNEL4x1_SUB1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxsspx vs12, o0, BO // load b2_r + lxsspx vs13, o16, BO // load b2_i + lxsspx vs14, o32, BO // load b3_r + lxsspx vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r + + xsmaddadp vs40, vs0, vs12 // a0_r*b2_r + xsmaddadp vs41, vs1, vs13 // a0_i*b2_i + xsmaddadp vs42, vs0, vs13 // a0_r*b2_i + xsmaddadp vs43, vs1, vs12 // a0_i*b2_r + + xsmaddadp vs44, vs0, vs14 // a0_r*b3_r + xsmaddadp vs45, vs1, vs15 // a0_i*b3_i + xsmaddadp vs46, vs0, vs15 // a0_r*b3_i + xsmaddadp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro SAVE4x1 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r 
* alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs40 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs43 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs41 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs42 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs44 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs47 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs45 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs46 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + +.endm + +.macro KERNEL2x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + lxvw4x vs6, o32, AO // load a4, a5 + lxvw4x vs7, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, 
a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + lxvw4x vs6, o32, AO // load a4, a5 + lxvw4x vs7, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, 
a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, 
a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro SAVE2x8 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i 
* b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * 
b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 + + + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 + + + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw vs15, vs43, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + 
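+// Every per-element block in these SAVE macros uses the same reduction: one
+// accumulator holds the four a*b_r products and its partner the four a*b_i
+// products, the xxspltw copies above pull the individual words apart, and the
+// XVFADD_R1/R2/I1/I2 macros (defined earlier in this file, choosing add or
+// subtract to suit the conjugation variant) fold them into the real and
+// imaginary sums.  For the plain non-conjugated case the net effect per
+// complex element is roughly this scalar C:
+//   r_r = a_r*b_r - a_i*b_i;            r_i = a_r*b_i + a_i*b_r;
+//   c_r += r_r*alpha_r - r_i*alpha_i;   c_i += r_r*alpha_i + r_i*alpha_r;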
XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 + + + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 + + + xxspltw vs12, vs47, 0 + xxspltw vs13, vs47, 1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + 
XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + +.endm + +.macro KERNEL2x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_2 + + + lxvw4x 
vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro SAVE2x4 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw 
vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + 
xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 
+**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + +.endm + +.macro KERNEL2x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro SAVE2x2 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 
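+// For each column below: splat the accumulated real/imaginary products (vs32..),
+// reduce them into r_r/r_i via the XVFADD_* macros, multiply by the complex
+// alpha (alpha_sr/alpha_si) and accumulate the result into C before storing.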
+ +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x 
vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + +.endm + +.macro KERNEL2x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i + lxsspx vs18, o32, BO // load b1_r + lxsspx vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i + lxsspx vs18, o32, BO // load b1_r + lxsspx vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + +.endm + +.macro KERNEL2x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_SUB1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, 
o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro SAVE2x1 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + +.endm + +.macro KERNEL1x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + lxvw4x vs6, o32, AO // load a4, a5 + lxvw4x vs7, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, 
a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + lxvw4x vs6, o32, AO // load a4, a5 + lxvw4x vs7, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_SUB1 + + + lxvw4x vs0, 
o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x8 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i 
* alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * 
alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + +.endm + +.macro KERNEL1x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, 
a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x4 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + 
xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + +.endm + +.macro KERNEL1x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x2 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + 
+ xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + + addi BO, BO, 32 + +.endm + +.macro KERNEL1x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + +.endm + +.macro KERNEL1x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp 
vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_SUB1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro SAVE1x1 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c new file mode 100644 index 000000000..da97c896e --- /dev/null +++ b/kernel/power/cswap.c @@ -0,0 +1,175 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + + +#if defined(POWER8) +#include "cswap_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_32 + +static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT g0, g1, g2, g3, g4, g5, g6, g7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + cswap_kernel_32(n1, x, y); + i=n1; + ix = 2* n1; + iy = 2* n1; + } + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += 2 ; + iy += 2 ; + i++ ; + + + } + + + } + else + { + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/cswap_microk_power8.c b/kernel/power/cswap_microk_power8.c new file mode 100644 index 000000000..90ab59c54 --- /dev/null +++ b/kernel/power/cswap_microk_power8.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_32 1 + +static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *x2=x+1; + FLOAT *y2=y+1; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "addi %3, %3, -4 \n\t" + "addi %4, %4, -4 \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "lxvw4x 32, 0, %2 \n\t" + "lxvw4x 33, %5, %2 \n\t" + "lxvw4x 34, %6, %2 \n\t" + "lxvw4x 35, %7, %2 \n\t" + "lxvw4x 36, %8, %2 \n\t" + "lxvw4x 37, %9, %2 \n\t" + "lxvw4x 38, %10, %2 \n\t" + "lxvw4x 39, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvw4x 48, 0, %1 \n\t" + "lxvw4x 49, %5, %1 \n\t" + "lxvw4x 50, %6, %1 \n\t" + "lxvw4x 51, %7, %1 \n\t" + "lxvw4x 52, %8, %1 \n\t" + "lxvw4x 53, %9, %1 \n\t" + "lxvw4x 54, %10, %1 \n\t" + "lxvw4x 55, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "lxvw4x 56, 0, %1 \n\t" + "lxvw4x 57, %5, %1 \n\t" + "lxvw4x 58, %6, %1 \n\t" + "lxvw4x 59, %7, %1 \n\t" + "lxvw4x 60, %8, %1 \n\t" + "lxvw4x 61, %9, %1 \n\t" + "lxvw4x 62, %10, %1 \n\t" + "lxvw4x 63, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "stxvw4x 32, 0, %3 \n\t" + "stxvw4x 33, %5, %3 \n\t" + "stxvw4x 34, %6, %3 \n\t" + "stxvw4x 35, %7, %3 \n\t" + "stxvw4x 36, %8, %3 \n\t" + "stxvw4x 37, %9, %3 \n\t" + "stxvw4x 38, %10, %3 \n\t" + "stxvw4x 39, %11, %3 \n\t" + + "addi %3, %3, 128 \n\t" + + "stxvw4x 40, 0, %3 \n\t" + "stxvw4x 41, %5, %3 \n\t" + "stxvw4x 42, %6, %3 \n\t" + "stxvw4x 43, %7, %3 \n\t" + "stxvw4x 44, %8, %3 \n\t" + "stxvw4x 45, %9, %3 \n\t" + "stxvw4x 46, %10, %3 \n\t" + "stxvw4x 47, %11, %3 \n\t" + + "addi %3, %3, 128 \n\t" + + "stxvw4x 48, 0, %4 \n\t" + "stxvw4x 49, %5, %4 \n\t" + "stxvw4x 50, %6, %4 \n\t" + "stxvw4x 51, %7, %4 \n\t" + "stxvw4x 52, %8, %4 \n\t" + "stxvw4x 53, %9, %4 \n\t" + "stxvw4x 54, %10, %4 \n\t" + "stxvw4x 55, %11, %4 \n\t" + + "addi %4, %4, 128 \n\t" + + "stxvw4x 56, 0, %4 \n\t" + "stxvw4x 57, %5, %4 \n\t" + "stxvw4x 58, %6, %4 \n\t" + "stxvw4x 59, %7, %4 \n\t" + "stxvw4x 60, %8, %4 \n\t" + "stxvw4x 61, %9, %4 \n\t" + "stxvw4x 62, %10, %4 \n\t" + "stxvw4x 63, %11, %4 \n\t" + + "addi %4, %4, 128 \n\t" + + "addic. 
%0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (y2), // 3 + "r" (x2), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory" + ); + +} + + diff --git a/kernel/power/ctrmm_kernel_8x4_power8.S b/kernel/power/ctrmm_kernel_8x4_power8.S new file mode 100644 index 000000000..460a387fb --- /dev/null +++ b/kernel/power/ctrmm_kernel_8x4_power8.S @@ -0,0 +1,399 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 400 +#define ALPHA_R_SP 304(SP) +#define ALPHA_I_SP 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R_SP 224(SP) +#define ALPHA_I_SP 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define o0 0 + +#define alpha_dr vs28 +#define alpha_di vs29 +#define alpha_sr vs30 +#define alpha_si vs31 + +#define o12 r12 +#define KKK r13 +#define K1 r14 +#define L r15 +#define o16 r16 +#define NOTUSED r17 +#define T2 r19 +#define KK r20 +#define o8 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o4 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + std r13, 288(SP) + std r12, 296(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) + stw r14, 212(SP) + stw r13, 216(SP) +#endif + + stfs f1, ALPHA_R_SP + stfs f2, ALPHA_I_SP + // stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 
FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) +#else + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + +#include "ctrmm_macros_8x4_power8.S" + + cmpwi cr0, M, 0 + ble L999_H1 + cmpwi cr0, N, 0 + ble L999_H1 + cmpwi cr0, K, 0 + ble L999_H1 + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 384 + li o4 , 4 + li o8 , 8 + li o12 , 12 + li o16 , 16 + li o32 , 32 + li o48 , 48 + + +#ifdef __64BIT__ + addi T1, SP, 304 +#else + addi T1, SP, 224 +#endif + + lxsspx alpha_dr, 0, T1 + lxsspx alpha_di, o8, T1 + addi T1, SP, 360 + li T2, 0 + + stw T2, 0(T1) + stw T2, 4(T1) + stw T2, 8(T1) + stxsspx alpha_dr, o12, T1 + lxvw4x alpha_sr, o0 , T1 + addi T1, T1, 16 + + stw T2, 0(T1) + stw T2, 4(T1) + stw T2, 8(T1) + stxsspx alpha_di, o12, T1 + lxvw4x alpha_si, o0 , T1 + + .align 5 + +#include "ctrmm_logic_8x4_power8.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + ld r13, 288(SP) + ld r12, 296(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) + lwz r14, 212(SP) + lwz r13, 216(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ctrmm_logic_8x4_power8.S b/kernel/power/ctrmm_logic_8x4_power8.S new file mode 100644 index 000000000..9ab258501 --- /dev/null +++ b/kernel/power/ctrmm_logic_8x4_power8.S @@ -0,0 +1,1769 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + srawi. J, N, 2 + ble CTRMM_L4_END + +CTRMM_L4_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 3 + ble CTRMM_L4x8_END + +CTRMM_L4x8_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L4x8_SUB4 + +CTRMM_L4x8_LOOP_START: + + dcbt AO, PRE + dcbt BO, PRE + LOAD4x8_1 + KERNEL4x8_I1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_2 + + addic. L, L, -2 + ble CTRMM_L4x8_LOOP_END + + .align 5 + +CTRMM_L4x8_LOOP: + + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_2 + + addic. L, L, -1 + bgt CTRMM_L4x8_LOOP + +CTRMM_L4x8_LOOP_END: + + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b CTRMM_L4x8_SUB1 + +CTRMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b CTRMM_L4x8_SUB1 + +CTRMM_L4x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble CTRMM_L4x8_SAVE + b CTRMM_L4x8_SUB2 + +CTRMM_L4x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L4x8_SAVE + +CTRMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. 
L, L, -1 + bgt CTRMM_L4x8_SUB2 + +CTRMM_L4x8_SAVE: + + SAVE4x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt CTRMM_L4x8_BEGIN + +CTRMM_L4x8_END: + +CTRMM_L4x4_BEGIN: + andi. T2, M, 7 + ble CTRMM_L4x1_END + + andi. T1, M, 4 + ble CTRMM_L4x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L4x4_SUB4 + +CTRMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble CTRMM_L4x4_LOOP_END + + .align 5 + +CTRMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -1 + bgt CTRMM_L4x4_LOOP + +CTRMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b CTRMM_L4x4_SUB1 + +CTRMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b CTRMM_L4x4_SUB1 + +CTRMM_L4x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble CTRMM_L4x4_SAVE + b CTRMM_L4x4_SUB2 + +CTRMM_L4x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L4x4_SAVE + +CTRMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt CTRMM_L4x4_SUB2 + +CTRMM_L4x4_SAVE: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +CTRMM_L4x4_END: + +CTRMM_L4x2_BEGIN: + + andi. 
T1, M, 2 + ble CTRMM_L4x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L4x2_SUB4 + +CTRMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble CTRMM_L4x2_LOOP_END + + .align 5 + +CTRMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt CTRMM_L4x2_LOOP + +CTRMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b CTRMM_L4x2_SUB1 + +CTRMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b CTRMM_L4x2_SUB1 + +CTRMM_L4x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble CTRMM_L4x2_SAVE + b CTRMM_L4x2_SUB2 + +CTRMM_L4x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L4x2_SAVE + +CTRMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt CTRMM_L4x2_SUB2 + +CTRMM_L4x2_SAVE: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +CTRMM_L4x2_END: + +CTRMM_L4x1_BEGIN: + + andi. T1, M, 1 + ble CTRMM_L4x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L4x1_SUB4 + +CTRMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble CTRMM_L4x1_LOOP_END + + .align 5 + +CTRMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. 
L, L, -1 + bgt CTRMM_L4x1_LOOP + +CTRMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b CTRMM_L4x1_SUB1 + +CTRMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b CTRMM_L4x1_SUB1 + +CTRMM_L4x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble CTRMM_L4x1_SAVE + b CTRMM_L4x1_SUB2 + +CTRMM_L4x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L4x1_SAVE + +CTRMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt CTRMM_L4x1_SUB2 + +CTRMM_L4x1_SAVE: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +CTRMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 4 // KK += Number of values in B +#endif + + + addic. J, J, -1 + bgt CTRMM_L4_BEGIN + + andi. T2, N, 3 + ble L999_H2 + +CTRMM_L4_END: + + b CTRMM_L2_BEGIN + +L999_H1: + + b L999_H2 + +CTRMM_L2_BEGIN: + + andi. T1, N, 2 + ble CTRMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 3 + ble CTRMM_L2x8_END + +CTRMM_L2x8_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L2x8_SUB4 + +CTRMM_L2x8_LOOP_START: + + LOAD2x8_1 + KERNEL2x8_I1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -2 + ble CTRMM_L2x8_LOOP_END + + .align 5 + +CTRMM_L2x8_LOOP: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -1 + bgt CTRMM_L2x8_LOOP + +CTRMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b CTRMM_L2x8_SUB1 + +CTRMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b CTRMM_L2x8_SUB1 + +CTRMM_L2x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble CTRMM_L2x8_SAVE + b CTRMM_L2x8_SUB2 + +CTRMM_L2x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L2x8_SAVE + +CTRMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. 
L, L, -1 + bgt CTRMM_L2x8_SUB2 + +CTRMM_L2x8_SAVE: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt CTRMM_L2x8_BEGIN + +CTRMM_L2x8_END: + +CTRMM_L2x4_BEGIN: + andi. T2, M, 7 + ble CTRMM_L2x1_END + + andi. T1, M, 4 + ble CTRMM_L2x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L2x4_SUB4 + +CTRMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble CTRMM_L2x4_LOOP_END + + .align 5 + +CTRMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt CTRMM_L2x4_LOOP + +CTRMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b CTRMM_L2x4_SUB1 + +CTRMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b CTRMM_L2x4_SUB1 + +CTRMM_L2x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble CTRMM_L2x4_SAVE + b CTRMM_L2x4_SUB2 + +CTRMM_L2x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L2x4_SAVE + +CTRMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt CTRMM_L2x4_SUB2 + +CTRMM_L2x4_SAVE: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +CTRMM_L2x4_END: + +CTRMM_L2x2_BEGIN: + + andi. 
T1, M, 2 + ble CTRMM_L2x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L2x2_SUB4 + +CTRMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble CTRMM_L2x2_LOOP_END + + .align 5 + +CTRMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt CTRMM_L2x2_LOOP + +CTRMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b CTRMM_L2x2_SUB1 + +CTRMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b CTRMM_L2x2_SUB1 + +CTRMM_L2x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble CTRMM_L2x2_SAVE + b CTRMM_L2x2_SUB2 + +CTRMM_L2x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L2x2_SAVE + +CTRMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt CTRMM_L2x2_SUB2 + +CTRMM_L2x2_SAVE: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +CTRMM_L2x2_END: + +CTRMM_L2x1_BEGIN: + + andi. T1, M, 1 + ble CTRMM_L2x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L2x1_SUB4 + +CTRMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble CTRMM_L2x1_LOOP_END + + .align 5 + +CTRMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. 
L, L, -1 + bgt CTRMM_L2x1_LOOP + +CTRMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b CTRMM_L2x1_SUB1 + +CTRMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b CTRMM_L2x1_SUB1 + +CTRMM_L2x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble CTRMM_L2x1_SAVE + b CTRMM_L2x1_SUB2 + +CTRMM_L2x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L2x1_SAVE + +CTRMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt CTRMM_L2x1_SUB2 + +CTRMM_L2x1_SAVE: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +CTRMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 2 // KK += Number of values in B +#endif + + +CTRMM_L2_END: + + b CTRMM_L1_BEGIN + +L999_H2: + + b L999 + +CTRMM_L1_BEGIN: + + andi. T1, N, 1 + ble CTRMM_L1_END + mr CO, C + mr AO, A + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 3 + ble CTRMM_L1x8_END + +CTRMM_L1x8_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L1x8_SUB4 + +CTRMM_L1x8_LOOP_START: + + LOAD1x8_1 + KERNEL1x8_I1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -2 + ble CTRMM_L1x8_LOOP_END + + .align 5 + +CTRMM_L1x8_LOOP: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -1 + bgt CTRMM_L1x8_LOOP + +CTRMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b CTRMM_L1x8_SUB1 + +CTRMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b CTRMM_L1x8_SUB1 + +CTRMM_L1x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble CTRMM_L1x8_SAVE + b CTRMM_L1x8_SUB2 + +CTRMM_L1x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L1x8_SAVE + +CTRMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. 
L, L, -1 + bgt CTRMM_L1x8_SUB2 + +CTRMM_L1x8_SAVE: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt CTRMM_L1x8_BEGIN + +CTRMM_L1x8_END: + +CTRMM_L1x4_BEGIN: + andi. T2, M, 7 + ble CTRMM_L1x1_END + + andi. T1, M, 4 + ble CTRMM_L1x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L1x4_SUB4 + +CTRMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble CTRMM_L1x4_LOOP_END + + .align 5 + +CTRMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt CTRMM_L1x4_LOOP + +CTRMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b CTRMM_L1x4_SUB1 + +CTRMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b CTRMM_L1x4_SUB1 + +CTRMM_L1x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble CTRMM_L1x4_SAVE + b CTRMM_L1x4_SUB2 + +CTRMM_L1x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L1x4_SAVE + +CTRMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt CTRMM_L1x4_SUB2 + +CTRMM_L1x4_SAVE: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +CTRMM_L1x4_END: + +CTRMM_L1x2_BEGIN: + + andi. 
T1, M, 2 + ble CTRMM_L1x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L1x2_SUB4 + +CTRMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble CTRMM_L1x2_LOOP_END + + .align 5 + +CTRMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt CTRMM_L1x2_LOOP + +CTRMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b CTRMM_L1x2_SUB1 + +CTRMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b CTRMM_L1x2_SUB1 + +CTRMM_L1x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble CTRMM_L1x2_SAVE + b CTRMM_L1x2_SUB2 + +CTRMM_L1x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L1x2_SAVE + +CTRMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt CTRMM_L1x2_SUB2 + +CTRMM_L1x2_SAVE: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +CTRMM_L1x2_END: + +CTRMM_L1x1_BEGIN: + + andi. T1, M, 1 + ble CTRMM_L1x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L1x1_SUB4 + +CTRMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble CTRMM_L1x1_LOOP_END + + .align 5 + +CTRMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. 
L, L, -1 + bgt CTRMM_L1x1_LOOP + +CTRMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b CTRMM_L1x1_SUB1 + +CTRMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b CTRMM_L1x1_SUB1 + +CTRMM_L1x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble CTRMM_L1x1_SAVE + b CTRMM_L1x1_SUB2 + +CTRMM_L1x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L1x1_SAVE + +CTRMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt CTRMM_L1x1_SUB2 + +CTRMM_L1x1_SAVE: + + SAVE1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +CTRMM_L1x1_END: + +#if !defined(LEFT) + addi KK, KK, 1 // KK += Number of values in B +#endif + + +CTRMM_L1_END: diff --git a/kernel/power/ctrmm_macros_8x4_power8.S b/kernel/power/ctrmm_macros_8x4_power8.S new file mode 100644 index 000000000..48a21252c --- /dev/null +++ b/kernel/power/ctrmm_macros_8x4_power8.S @@ -0,0 +1,6794 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xsadddp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvsubsp + #define XVFADD_I1 xvaddsp + #define XVFADD_I2 xvaddsp + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xsadddp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvaddsp + #define XVFADD_I1 xvsubsp + #define XVFADD_I2 xvaddsp + +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xssubdp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvaddsp + #define XVFADD_I1 xvaddsp + #define XVFADD_I2 xvsubsp + +#else // CC || CR || RC || RR + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xssubdp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvsubsp + #define XVFADD_I1 xvsubsp + #define XVFADD_I2 xvsubsp + +#endif + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, 
vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, 
a1_i*b2_r + xvmaddasp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs6, vs20 // a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs6, vs21 // a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs7, vs20 // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs7, vs21 // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs6, vs22 // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs6, vs23 // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, 
a1_i*b3_i + xvmaddasp vs62, vs7, vs22 // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs6, vs20 // a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs6, vs21 // a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs7, vs20 // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs7, vs21 // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs6, vs22 // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs6, vs23 // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs7, vs22 // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, 
a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs0, 
vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro SAVE4x8 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add 
a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // 
add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 + + + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 + + + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw 
vs15, vs43, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 + + + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 + + + xxspltw vs12, vs47, 0 + xxspltw 
vs13, vs47, 1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs48, 0 + xxspltw vs9, vs48, 1 + xxspltw vs10, vs48, 2 + xxspltw vs11, vs48, 3 + + + xxspltw vs12, vs49, 0 + xxspltw vs13, vs49, 1 + xxspltw vs14, vs49, 2 + xxspltw vs15, vs49, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs50, 0 + xxspltw vs9, 
vs50, 1 + xxspltw vs10, vs50, 2 + xxspltw vs11, vs50, 3 + + + xxspltw vs12, vs51, 0 + xxspltw vs13, vs51, 1 + xxspltw vs14, vs51, 2 + xxspltw vs15, vs51, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs52, 0 + xxspltw vs9, vs52, 1 + xxspltw vs10, vs52, 2 + xxspltw vs11, vs52, 3 + + + xxspltw vs12, vs53, 0 + xxspltw vs13, vs53, 1 + xxspltw vs14, vs53, 2 + xxspltw vs15, vs53, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 
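+ // when TRMMKERNEL is defined the old C tile is not read: vs0 is cleared here and only alpha*(A*B) is stored back below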
+#endif + + + xxspltw vs8, vs54, 0 + xxspltw vs9, vs54, 1 + xxspltw vs10, vs54, 2 + xxspltw vs11, vs54, 3 + + + xxspltw vs12, vs55, 0 + xxspltw vs13, vs55, 1 + xxspltw vs14, vs55, 2 + xxspltw vs15, vs55, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs56, 0 + xxspltw vs9, vs56, 1 + xxspltw vs10, vs56, 2 + xxspltw vs11, vs56, 3 + + + xxspltw vs12, vs57, 0 + xxspltw vs13, vs57, 1 + xxspltw vs14, vs57, 2 + xxspltw vs15, vs57, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, 
vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs58, 0 + xxspltw vs9, vs58, 1 + xxspltw vs10, vs58, 2 + xxspltw vs11, vs58, 3 + + + xxspltw vs12, vs59, 0 + xxspltw vs13, vs59, 1 + xxspltw vs14, vs59, 2 + xxspltw vs15, vs59, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs60, 0 + xxspltw vs9, vs60, 1 + xxspltw vs10, vs60, 2 + xxspltw vs11, vs60, 3 + + + xxspltw vs12, vs61, 0 + xxspltw vs13, vs61, 1 + xxspltw vs14, vs61, 2 + xxspltw vs15, vs61, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor 
vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs62, 0 + xxspltw vs9, vs62, 1 + xxspltw vs10, vs62, 2 + xxspltw vs11, vs62, 3 + + + xxspltw vs12, vs63, 0 + xxspltw vs13, vs63, 1 + xxspltw vs14, vs63, 2 + xxspltw vs15, vs63, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + +.endm + +.macro KERNEL4x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs41, vs0, vs13 // 
a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, 
a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, 
a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro SAVE4x4 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * 
b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * 
b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 + + + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 + + + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw vs15, vs43, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * 
b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 + + + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 + + + xxspltw vs12, vs47, 0 + xxspltw vs13, vs47, 
1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro LOAD4x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + +.endm + +.macro KERNEL4x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + 
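+ // b0..b3 were splatted word-wise (xxspltw), so each xvmaddasp accumulates the a*b_r or a*b_i
+ // partial products for two complex elements of A; SAVE4x2 later recombines them and applies alpha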
xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // 
a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro SAVE4x2 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i 
* alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, 
alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro LOAD4x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs12, o0, T1 // load b2_r + lxsspx vs13, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs14, o0, T1 // load b3_r + lxsspx vs15, o4, T1 // load b3_i + + addi BO, BO, 32 + +.endm + +.macro KERNEL4x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs18, o0, T1 // load b1_r + lxsspx vs19, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs20, o0, T1 // load b2_r + lxsspx vs21, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs22, o0, T1 // load b3_r + lxsspx vs23, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + xsmuldp vs40, vs0, vs12 // a0_r*b2_r + xsmuldp vs41, vs1, vs13 // a0_i*b2_i + xsmuldp vs42, vs0, vs13 // a0_r*b2_i + xsmuldp vs43, vs1, vs12 // a0_i*b2_r + + xsmuldp vs44, vs0, vs14 // a0_r*b3_r + xsmuldp vs45, vs1, vs15 // a0_i*b3_i + xsmuldp vs46, vs0, vs15 // a0_r*b3_i + xsmuldp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro KERNEL4x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs18, o0, T1 // load b1_r + lxsspx vs19, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs20, o0, T1 // load b2_r + lxsspx vs21, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs22, o0, T1 // load b3_r + lxsspx vs23, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp 
vs39, vs1, vs10 // a0_i*b1_r + + xsmaddadp vs40, vs0, vs12 // a0_r*b2_r + xsmaddadp vs41, vs1, vs13 // a0_i*b2_i + xsmaddadp vs42, vs0, vs13 // a0_r*b2_i + xsmaddadp vs43, vs1, vs12 // a0_i*b2_r + + xsmaddadp vs44, vs0, vs14 // a0_r*b3_r + xsmaddadp vs45, vs1, vs15 // a0_i*b3_i + xsmaddadp vs46, vs0, vs15 // a0_r*b3_i + xsmaddadp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro KERNEL4x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs12, o0, T1 // load b2_r + lxsspx vs13, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs14, o0, T1 // load b3_r + lxsspx vs15, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + xsmaddadp vs40, vs4, vs20 // a4_r*b2_r + xsmaddadp vs41, vs5, vs21 // a4_i*b2_i + xsmaddadp vs42, vs4, vs21 // a4_r*b2_i + xsmaddadp vs43, vs5, vs20 // a4_i*b2_r + + xsmaddadp vs44, vs4, vs22 // a4_r*b3_r + xsmaddadp vs45, vs5, vs23 // a4_i*b3_i + xsmaddadp vs46, vs4, vs23 // a4_r*b3_i + xsmaddadp vs47, vs5, vs22 // a4_i*b3_r + + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + xsmaddadp vs40, vs4, vs20 // a4_r*b2_r + xsmaddadp vs41, vs5, vs21 // a4_i*b2_i + xsmaddadp vs42, vs4, vs21 // a4_r*b2_i + xsmaddadp vs43, vs5, vs20 // a4_i*b2_r + + xsmaddadp vs44, vs4, vs22 // a4_r*b3_r + xsmaddadp vs45, vs5, vs23 // a4_i*b3_i + xsmaddadp vs46, vs4, vs23 // a4_r*b3_i + xsmaddadp vs47, vs5, vs22 // a4_i*b3_r + + +.endm + +.macro KERNEL4x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs12, o0, T1 // load b2_r + lxsspx vs13, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs14, o0, T1 // load b3_r + lxsspx vs15, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + xsmuldp vs40, vs0, vs12 // a0_r*b2_r + xsmuldp vs41, vs1, vs13 // a0_i*b2_i + xsmuldp vs42, vs0, vs13 // a0_r*b2_i + xsmuldp vs43, vs1, vs12 // a0_i*b2_r + + xsmuldp vs44, vs0, vs14 // a0_r*b3_r + xsmuldp vs45, vs1, vs15 // a0_i*b3_i + xsmuldp vs46, vs0, vs15 // a0_r*b3_i + xsmuldp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro KERNEL4x1_SUB1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 
// load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs12, o0, T1 // load b2_r + lxsspx vs13, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs14, o0, T1 // load b3_r + lxsspx vs15, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r + + xsmaddadp vs40, vs0, vs12 // a0_r*b2_r + xsmaddadp vs41, vs1, vs13 // a0_i*b2_i + xsmaddadp vs42, vs0, vs13 // a0_r*b2_i + xsmaddadp vs43, vs1, vs12 // a0_i*b2_r + + xsmaddadp vs44, vs0, vs14 // a0_r*b3_r + xsmaddadp vs45, vs1, vs15 // a0_i*b3_i + xsmaddadp vs46, vs0, vs15 // a0_r*b3_i + xsmaddadp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro SAVE4x1 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs40 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs43 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs41 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs42 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * 
alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs44 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs47 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs45 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs46 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + 
xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, 
a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro 
SAVE2x8 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, 
r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 
// r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 + + + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 + + + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw vs15, vs43, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi 
vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 + + + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 + + + xxspltw vs12, vs47, 0 + xxspltw vs13, vs47, 1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i 
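+	// Note on this shift/merge sequence (restating the inline comments above and
+	// below): the xxsldwi shifts by 3, 2, 1 and 0 words against the zeroed vs24
+	// place r0_r, r0_i, r1_r and r1_i into words 0, 1, 2 and 3 respectively; the
+	// xvaddsp chain that follows packs them into one vector so the two complex
+	// C elements held in vs0 are updated with a single vector add and store.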
+ xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddasp vs32, vs4, vs16 // 
a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro SAVE2x4 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, 
alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * 
alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // 
a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro SAVE2x2 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // 
r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi T1, T1,8 + + 
lxsspx vs18, o0, T1 // load b1_r + lxsspx vs19, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs18, o0, T1 // load b1_r + lxsspx vs19, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + +.endm + +.macro KERNEL2x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_SUB1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro SAVE2x1 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx 
vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 
+ xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 
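+// vs8/vs9 hold b0_r and b0_i splatted across all four word lanes, so each xvmaddasp
+// below accumulates four partial products at once: vs32/vs34/vs36/vs38 collect the
+// a*b0_r terms and vs33/vs35/vs37/vs39 the a*b0_i terms; the real/imaginary
+// recombination and the alpha scaling are deferred to SAVE1x8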
+ + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x8 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp 
vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 
// r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x 
vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x4 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 
// add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, 
AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x2 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + 
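+// the _1 and _2 kernel variants double-buffer their operands: _1 multiplies the values
+// already held in vs0/vs1 and vs8/vs9 while fetching the next element into vs4/vs5 and
+// vs16/vs17, and _2 does the reverse, so the loads overlap the xsmaddadp dependency chain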
addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + +.endm + +.macro KERNEL1x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_SUB1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro SAVE1x1 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c new file mode 100644 index 000000000..77f5345ba --- /dev/null +++ b/kernel/power/dasum.c @@ -0,0 +1,144 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#if defined(POWER8)
+#include "dasum_microk_power8.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_16
+
+static void dasum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
+{
+
+    BLASLONG i=0;
+    FLOAT *x = x1;
+    FLOAT temp0, temp1, temp2, temp3;
+    FLOAT temp4, temp5, temp6, temp7;
+    FLOAT sum0 = 0.0;
+    FLOAT sum1 = 0.0;
+    FLOAT sum2 = 0.0;
+    FLOAT sum3 = 0.0;
+
+    while ( i< n )
+    {
+
+        temp0 = ABS(x[0]);
+        temp1 = ABS(x[1]);
+        temp2 = ABS(x[2]);
+        temp3 = ABS(x[3]);
+        temp4 = ABS(x[4]);
+        temp5 = ABS(x[5]);
+        temp6 = ABS(x[6]);
+        temp7 = ABS(x[7]);
+
+        sum0 += temp0;
+        sum1 += temp1;
+        sum2 += temp2;
+        sum3 += temp3;
+
+        sum0 += temp4;
+        sum1 += temp5;
+        sum2 += temp6;
+        sum3 += temp7;
+
+        x+=8;
+        i+=8;
+
+    }
+
+    svec[0] = sum0+sum1+sum2+sum3;
+    svec[1] = 0.0;
+
+}
+
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0;
+    FLOAT sumf = 0.0;
+    FLOAT svec[2] __attribute__ ((aligned (16)));
+    BLASLONG n1;
+
+    if (n <= 0 || inc_x <= 0) return(sumf);
+
+    if ( inc_x == 1 )
+    {
+
+        n1 = n & -16;
+        if ( n1 > 0 )
+        {
+
+            dasum_kernel_16(n1, x, svec);
+            sumf = svec[0] + svec[1];
+            i=n1;
+        }
+
+        while(i < n)
+        {
+            sumf += ABS(x[i]);
+            i++;
+        }
+
+    }
+    else
+    {
+
+        n *= inc_x;
+        while(i < n)
+        {
+            sumf += ABS(x[i]);
+            i += inc_x;
+        }
+
+    }
+    return(sumf);
+}
+
+
diff --git a/kernel/power/dasum_microk_power8.c b/kernel/power/dasum_microk_power8.c
new file mode 100644
index 000000000..cc38c4f7d
--- /dev/null
+++ b/kernel/power/dasum_microk_power8.c
@@ -0,0 +1,177 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline)); + +static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "dcbt %2 , %4 \n\t" + + "xxlxor 32,32,32 \n\t" + "xxlxor 33,33,33 \n\t" + "xxlxor 34,34,34 \n\t" + "xxlxor 35,35,35 \n\t" + "xxlxor 36,36,36 \n\t" + "xxlxor 37,37,37 \n\t" + "xxlxor 38,38,38 \n\t" + "xxlxor 39,39,39 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -16 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2 , %4 \n\t" + + "xvabsdp 48, 40 \n\t" + "xvabsdp 49, 41 \n\t" + "xvabsdp 50, 42 \n\t" + "xvabsdp 51, 43 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + + "xvabsdp 52, 44 \n\t" + "xvabsdp 53, 45 \n\t" + + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + + "xvabsdp 54, 46 \n\t" + "xvabsdp 55, 47 \n\t" + + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + + "xvadddp 32, 32, 48 \n\t" + "xvadddp 33, 33, 49 \n\t" + + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "xvadddp 34, 34, 50 \n\t" + "xvadddp 35, 35, 51 \n\t" + "addi %2, %2, 128 \n\t" + "xvadddp 36, 36, 52 \n\t" + "xvadddp 37, 37, 53 \n\t" + "addic. 
%0 , %0 , -16 \n\t" + "xvadddp 38, 38, 54 \n\t" + "xvadddp 39, 39, 55 \n\t" + + "bgt 1b \n\t" + + "2: \n\t" + + + "xvabsdp 48, 40 \n\t" + "xvabsdp 49, 41 \n\t" + "xvabsdp 50, 42 \n\t" + "xvabsdp 51, 43 \n\t" + "xvabsdp 52, 44 \n\t" + "xvabsdp 53, 45 \n\t" + "xvabsdp 54, 46 \n\t" + "xvabsdp 55, 47 \n\t" + + "xvadddp 32, 32, 48 \n\t" + "xvadddp 33, 33, 49 \n\t" + "xvadddp 34, 34, 50 \n\t" + "xvadddp 35, 35, 51 \n\t" + "xvadddp 36, 36, 52 \n\t" + "xvadddp 37, 37, 53 \n\t" + "xvadddp 38, 38, 54 \n\t" + "xvadddp 39, 39, 55 \n\t" + + "xvadddp 32, 32, 33 \n\t" + "xvadddp 34, 34, 35 \n\t" + "xvadddp 36, 36, 37 \n\t" + "xvadddp 38, 38, 39 \n\t" + + "xvadddp 32, 32, 34 \n\t" + "xvadddp 36, 36, 38 \n\t" + + "xvadddp 32, 32, 36 \n\t" + + + "stxvd2x 32, 0, %3 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (svec), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2", "memory" + ); + +} + + diff --git a/kernel/power/daxpy.c b/kernel/power/daxpy.c new file mode 100644 index 000000000..4365bd88d --- /dev/null +++ b/kernel/power/daxpy.c @@ -0,0 +1,136 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + + +#if defined(POWER8) +#include "daxpy_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG register i = 0; + FLOAT a = *alpha; + + while(i < n) + { + y[i] += a * x[i]; + y[i+1] += a * x[i+1]; + y[i+2] += a * x[i+2]; + y[i+3] += a * x[i+3]; + y[i+4] += a * x[i+4]; + y[i+5] += a * x[i+5]; + y[i+6] += a * x[i+6]; + y[i+7] += a * x[i+7]; + i+=8 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT a2[4]; + a2[0]=da; + a2[1]=da; + a2[2]=da; + a2[3]=da; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + daxpy_kernel_8(n1, x, y , a2 ); + + i = n1; + while(i < n) + { + + y[i] += da * x[i] ; + i++ ; + + } + return(0); + + + } + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = da * x[ix] ; + FLOAT m2 = da * x[ix+inc_x] ; + FLOAT m3 = da * x[ix+2*inc_x] ; + FLOAT m4 = da * x[ix+3*inc_x] ; + + y[iy] += m1 ; + y[iy+inc_y] += m2 ; + y[iy+2*inc_y] += m3 ; + y[iy+3*inc_y] += m4 ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + i+=4 ; + + } + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/power/daxpy_microk_power8.c b/kernel/power/daxpy_microk_power8.c new file mode 100644 index 000000000..bb3f73aca --- /dev/null +++ b/kernel/power/daxpy_microk_power8.c @@ -0,0 +1,201 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +#define HAVE_KERNEL_8 1 +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *y2=y+1; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "lxsdx 33, %5, %4 \n\t" + "xxspltd 32, 33, 0 \n\t" + "addi %8, %8, -8 \n\t" + + "dcbt %2, %9 \n\t" + "dcbt %3, %9 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + + "lxvd2x 48, 0, %3 \n\t" + "lxvd2x 49, %5, %3 \n\t" + "lxvd2x 50, %6, %3 \n\t" + "lxvd2x 51, %7, %3 \n\t" + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + "lxvd2x 44, 0, %2 \n\t" + "lxvd2x 45, %5, %2 \n\t" + "lxvd2x 46, %6, %2 \n\t" + "lxvd2x 47, %7, %2 \n\t" + + "lxvd2x 52, 0, %3 \n\t" + "lxvd2x 53, %5, %3 \n\t" + "lxvd2x 54, %6, %3 \n\t" + "lxvd2x 55, %7, %3 \n\t" + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + "addic. %0 , %0 , -16 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %9 \n\t" + "dcbt %3, %9 \n\t" + + "xvmaddadp 48, 40, 32 \n\t" + "xvmaddadp 49, 41, 32 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + + "stxvd2x 48, 0, %8 \n\t" + "stxvd2x 49, %5, %8 \n\t" + + "xvmaddadp 50, 42, 32 \n\t" + "xvmaddadp 51, 43, 32 \n\t" + + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + + "stxvd2x 50, %6, %8 \n\t" + "stxvd2x 51, %7, %8 \n\t" + + "lxvd2x 48, 0, %3 \n\t" + "lxvd2x 49, %5, %3 \n\t" + "lxvd2x 50, %6, %3 \n\t" + "lxvd2x 51, %7, %3 \n\t" + + "addi %2, %2, 64 \n\t" + "addi %8, %8, 64 \n\t" + + "xvmaddadp 52, 44, 32 \n\t" + "addi %3, %3, 64 \n\t" + "xvmaddadp 53, 45, 32 \n\t" + + "lxvd2x 44, 0, %2 \n\t" + "lxvd2x 45, %5, %2 \n\t" + + "stxvd2x 52, 0, %8 \n\t" + "stxvd2x 53, %5, %8 \n\t" + + "xvmaddadp 54, 46, 32 \n\t" + "xvmaddadp 55, 47, 32 \n\t" + + "lxvd2x 46, %6, %2 \n\t" + "lxvd2x 47, %7, %2 \n\t" + + "stxvd2x 54, %6, %8 \n\t" + "stxvd2x 55, %7, %8 \n\t" + + "addi %2, %2, 64 \n\t" + "addi %8, %8, 64 \n\t" + + "lxvd2x 52, 0, %3 \n\t" + "lxvd2x 53, %5, %3 \n\t" + "lxvd2x 54, %6, %3 \n\t" + "lxvd2x 55, %7, %3 \n\t" + + "addi %3, %3, 64 \n\t" + + + "addic. 
%0 , %0 , -16 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + + "xvmaddadp 48, 40, 32 \n\t" + "xvmaddadp 49, 41, 32 \n\t" + "xvmaddadp 50, 42, 32 \n\t" + "xvmaddadp 51, 43, 32 \n\t" + + "xvmaddadp 52, 44, 32 \n\t" + "xvmaddadp 53, 45, 32 \n\t" + "xvmaddadp 54, 46, 32 \n\t" + "xvmaddadp 55, 47, 32 \n\t" + + "stxvd2x 48, 0, %8 \n\t" + "stxvd2x 49, %5, %8 \n\t" + "stxvd2x 50, %6, %8 \n\t" + "stxvd2x 51, %7, %8 \n\t" + + "addi %8, %8, 64 \n\t" + + "stxvd2x 52, 0, %8 \n\t" + "stxvd2x 53, %5, %8 \n\t" + "stxvd2x 54, %6, %8 \n\t" + "stxvd2x 55, %7, %8 \n\t" + + "addi %8, %8, 64 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (y1), // 3 + "r" (alpha), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (y2), // 8 + "r" (pre) // 9 + : "cr0", "%0", "%2" , "%3", "%8", "memory" + ); + +} + + diff --git a/kernel/power/dcopy.c b/kernel/power/dcopy.c new file mode 100644 index 000000000..059c0e5a9 --- /dev/null +++ b/kernel/power/dcopy.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+
+#if defined(POWER8)
+#include "dcopy_microk_power8.c"
+#endif
+
+#ifndef HAVE_KERNEL_32
+
+static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+
+    BLASLONG i=0;
+    FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
+    FLOAT *x1=x;
+    FLOAT *y1=y;
+
+    while ( i<n )
+    {
+
+        f0 = x1[0];
+        f1 = x1[1];
+        f2 = x1[2];
+        f3 = x1[3];
+        f4 = x1[4];
+        f5 = x1[5];
+        f6 = x1[6];
+        f7 = x1[7];
+
+        y1[0] = f0;
+        y1[1] = f1;
+        y1[2] = f2;
+        y1[3] = f3;
+        y1[4] = f4;
+        y1[5] = f5;
+        y1[6] = f6;
+        y1[7] = f7;
+
+        x1 += 8;
+        y1 += 8;
+        i += 8;
+
+    }
+
+}
+
+#endif
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+    BLASLONG i=0;
+    BLASLONG ix=0,iy=0;
+
+    if ( n <= 0 ) return(0);
+
+    if ( (inc_x == 1) && (inc_y == 1) )
+    {
+
+        BLASLONG n1 = n & -32;
+
+        if ( n1 > 0 )
+        {
+            dcopy_kernel_32(n1, x, y);
+            i=n1;
+        }
+
+        while(i < n)
+        {
+            y[i] = x[i] ;
+            i++ ;
+
+        }
+
+
+    }
+    else
+    {
+
+        while(i < n)
+        {
+            y[iy] = x[ix] ;
+            ix += inc_x ;
+            iy += inc_y ;
+            i++ ;
+
+        }
+
+    }
+    return(0);
+
+
+}
+
+
diff --git a/kernel/power/dcopy_microk_power8.c b/kernel/power/dcopy_microk_power8.c
new file mode 100644
index 000000000..04f7db556
--- /dev/null
+++ b/kernel/power/dcopy_microk_power8.c
@@ -0,0 +1,174 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_32 1 + +static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvd2x 50, 0, %2 \n\t" + "lxvd2x 51, %5, %2 \n\t" + "lxvd2x 52, %6, %2 \n\t" + "lxvd2x 53, %7, %2 \n\t" + "lxvd2x 54, %8, %2 \n\t" + "lxvd2x 55, %9, %2 \n\t" + "lxvd2x 56, %10, %2 \n\t" + "lxvd2x 57, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "stxvd2x 40, 0, %1 \n\t" + "stxvd2x 41, %5, %1 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "stxvd2x 42, %6, %1 \n\t" + "stxvd2x 43, %7, %1 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "stxvd2x 44, %8, %1 \n\t" + "stxvd2x 45, %9, %1 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "stxvd2x 46, %10, %1 \n\t" + "stxvd2x 47, %11, %1 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "stxvd2x 50, 0, %1 \n\t" + "stxvd2x 51, %5, %1 \n\t" + "lxvd2x 50, 0, %2 \n\t" + "lxvd2x 51, %5, %2 \n\t" + "stxvd2x 52, %6, %1 \n\t" + "stxvd2x 53, %7, %1 \n\t" + "lxvd2x 52, %6, %2 \n\t" + "lxvd2x 53, %7, %2 \n\t" + "stxvd2x 54, %8, %1 \n\t" + "stxvd2x 55, %9, %1 \n\t" + "lxvd2x 54, %8, %2 \n\t" + "lxvd2x 55, %9, %2 \n\t" + "stxvd2x 56, %10, %1 \n\t" + "stxvd2x 57, %11, %1 \n\t" + "lxvd2x 56, %10, %2 \n\t" + "lxvd2x 57, %11, %2 \n\t" + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "stxvd2x 40, 0, %1 \n\t" + "stxvd2x 41, %5, %1 \n\t" + "stxvd2x 42, %6, %1 \n\t" + "stxvd2x 43, %7, %1 \n\t" + "stxvd2x 44, %8, %1 \n\t" + "stxvd2x 45, %9, %1 \n\t" + "stxvd2x 46, %10, %1 \n\t" + "stxvd2x 47, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "stxvd2x 50, 0, %1 \n\t" + "stxvd2x 51, %5, %1 \n\t" + "stxvd2x 52, %6, %1 \n\t" + "stxvd2x 53, %7, %1 \n\t" + "stxvd2x 54, %8, %1 \n\t" + "stxvd2x 55, %9, %1 \n\t" + "stxvd2x 56, %10, %1 \n\t" + "stxvd2x 57, %11, %1 \n\t" + + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + diff --git a/kernel/power/ddot.c b/kernel/power/ddot.c new file mode 100644 index 000000000..cef60a2e5 --- /dev/null +++ b/kernel/power/ddot.c @@ -0,0 +1,139 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/20 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + + +#if defined(POWER8) +#include "ddot_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + BLASLONG register i = 0; + FLOAT dot = 0.0; + + while(i < n) + { + dot += y[i] * x[i] + + y[i+1] * x[i+1] + + y[i+2] * x[i+2] + + y[i+3] * x[i+3] + + y[i+4] * x[i+4] + + y[i+5] * x[i+5] + + y[i+6] * x[i+6] + + y[i+7] * x[i+7] ; + + i+=8 ; + + } + *d += dot; + +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + FLOAT dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + ddot_kernel_8(n1, x, y , &dot ); + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = y[iy] * x[ix] ; + FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; + + FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; + FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + + temp1 += m1+m3; + temp2 += m2+m4; + + i+=4 ; + + } + + while(i < n) + { + + temp1 += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + dot = temp1 + temp2; + return(dot); + +} + + diff --git a/kernel/power/ddot_microk_power8.c b/kernel/power/ddot_microk_power8.c new file mode 100644 index 000000000..b88049212 --- /dev/null +++ b/kernel/power/ddot_microk_power8.c @@ -0,0 +1,178 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/20 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + "xxlxor 32,32,32 \n\t" + "xxlxor 33,33,33 \n\t" + "xxlxor 34,34,34 \n\t" + "xxlxor 35,35,35 \n\t" + "xxlxor 36,36,36 \n\t" + "xxlxor 37,37,37 \n\t" + "xxlxor 38,38,38 \n\t" + "xxlxor 39,39,39 \n\t" + + "dcbt %2, %12 \n\t" + "dcbt %3, %12 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 48, 0, %3 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 49, %5, %3 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 50, %6, %3 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 51, %7, %3 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 52, %8, %3 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 53, %9, %3 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 54, %10, %3 \n\t" + "lxvd2x 47, %11, %2 \n\t" + "lxvd2x 55, %11, %3 \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. 
%0 , %0 , -16 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %12 \n\t" + "dcbt %3, %12 \n\t" + + "xvmaddadp 32, 40, 48 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 48, 0, %3 \n\t" + "xvmaddadp 33, 41, 49 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 49, %5, %3 \n\t" + "xvmaddadp 34, 42, 50 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 50, %6, %3 \n\t" + "xvmaddadp 35, 43, 51 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 51, %7, %3 \n\t" + "xvmaddadp 36, 44, 52 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 52, %8, %3 \n\t" + "xvmaddadp 37, 45, 53 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 53, %9, %3 \n\t" + "xvmaddadp 38, 46, 54 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 54, %10, %3 \n\t" + "xvmaddadp 39, 47, 55 \n\t" + + "lxvd2x 47, %11, %2 \n\t" + "lxvd2x 55, %11, %3 \n\t" + + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %0 , %0 , -16 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmaddadp 32, 40, 48 \n\t" + "xvmaddadp 33, 41, 49 \n\t" + "xvmaddadp 34, 42, 50 \n\t" + "xvmaddadp 35, 43, 51 \n\t" + "xvmaddadp 36, 44, 52 \n\t" + "xvmaddadp 37, 45, 53 \n\t" + "xvmaddadp 38, 46, 54 \n\t" + "xvmaddadp 39, 47, 55 \n\t" + + "xvadddp 32, 32, 33 \n\t" + "xvadddp 34, 34, 35 \n\t" + "xvadddp 36, 36, 37 \n\t" + "xvadddp 38, 38, 39 \n\t" + + "xvadddp 32, 32, 34 \n\t" + "xvadddp 36, 36, 38 \n\t" + + "xvadddp 32, 32, 36 \n\t" + + "xxswapd 33, 32 \n\t" + + "xsadddp 32, 32, 33 \n\t" + + "stxsdx 32, 0, %4 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (y1), // 3 + "r" (dot), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112), // 11 + "r" (pre) // 12 + : "cr0", "%0", "%2" , "%3", "memory" + ); + +} + + diff --git a/kernel/power/dgemv_n.c b/kernel/power/dgemv_n.c new file mode 100644 index 000000000..812d09d15 --- /dev/null +++ b/kernel/power/dgemv_n.c @@ -0,0 +1,426 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
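/*
 * The ddot micro-kernel above keeps eight independent accumulators
 * (vs32..vs39), each updated by xvmaddadp on a 2-wide double vector, and
 * only combines them after the loop: an xvadddp tree followed by
 * xxswapd + xsadddp for the final horizontal add into *dot.  A scalar
 * sketch of that accumulation strategy (illustrative only; the real
 * kernel is the asm above):
 */
#include <stddef.h>

static double ddot_ref_8acc(size_t n, const double *x, const double *y)
{
    double acc[8] = {0.0};                      /* mirrors vs32..vs39     */
    size_t i;

    for (i = 0; i + 16 <= n; i += 16)           /* 16 doubles per pass    */
        for (size_t k = 0; k < 8; k++)
            acc[k] += x[i + 2*k]     * y[i + 2*k]
                    + x[i + 2*k + 1] * y[i + 2*k + 1];

    for (; i < n; i++)                          /* scalar tail            */
        acc[0] += x[i] * y[i];

    /* pairwise reduction, same shape as the xvadddp tree */
    return ((acc[0] + acc[1]) + (acc[2] + acc[3]))
         + ((acc[4] + acc[5]) + (acc[6] + acc[7]));
}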
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/30 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + +#include "common.h" + + +#if defined(POWER8) +#include "dgemv_n_microk_power8.c" +#endif + + +#define NBMAX 4096 + +#ifndef HAVE_KERNEL_4x4 + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + FLOAT x[4] __attribute__ ((aligned (16)));; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + for ( i=0; i<4; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + } +} + +#endif + +#ifndef HAVE_KERNEL_4x2 + +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1; + FLOAT x[4] __attribute__ ((aligned (16)));; + a0 = ap[0]; + a1 = ap[1]; + + for ( i=0; i<2; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1]; + } +} + + +#endif + +#ifndef HAVE_KERNEL_4x1 + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0; + FLOAT x[4] __attribute__ ((aligned (16)));; + a0 = ap; + + for ( i=0; i<1; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0]; + y[i+1] += a0[i+1]*x[0]; + y[i+2] += a0[i+2]*x[0]; + y[i+3] += a0[i+3]*x[0]; + } +} + + +#endif + + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + if ( inc_dest != 1 ) + { + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*8); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,alpha_r); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,alpha_r); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha_r); + a_ptr += lda; + x_ptr += 1; + + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha_r); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha_r); + a_ptr += lda; + + } + + } + + a += NB; + if ( 
inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + + diff --git a/kernel/power/dgemv_n_microk_power8.c b/kernel/power/dgemv_n_microk_power8.c new file mode 100644 index 000000000..9eabe555c --- /dev/null +++ b/kernel/power/dgemv_n_microk_power8.c @@ -0,0 +1,301 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
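/*
 * dgemv_n.c above computes y += alpha * A * x by sweeping the columns
 * four at a time (dgemv_kernel_4x4, with 4x2/4x1 cleanup) and limiting
 * each pass to NBMAX rows so the accumulated block of y stays cache
 * resident; when inc_y != 1 the block is built in a contiguous ybuffer
 * and scattered back by add_y().  A compact reference of that blocking,
 * unit strides only -- a sketch with illustrative names, not the
 * routine itself:
 */
#include <stddef.h>

#define NB_MAX 4096                             /* same role as NBMAX above */

static void dgemv_n_ref(int m, int n, double alpha,
                        const double *a, int lda,
                        const double *x, double *y)
{
    for (int i0 = 0; i0 < m; i0 += NB_MAX) {
        int nb = (m - i0 < NB_MAX) ? m - i0 : NB_MAX;
        int j = 0;
        for (; j + 4 <= n; j += 4) {                  /* 4 columns per pass */
            const double *a0 = a + i0 + (size_t)(j + 0) * lda;
            const double *a1 = a + i0 + (size_t)(j + 1) * lda;
            const double *a2 = a + i0 + (size_t)(j + 2) * lda;
            const double *a3 = a + i0 + (size_t)(j + 3) * lda;
            double x0 = alpha * x[j],     x1 = alpha * x[j + 1];
            double x2 = alpha * x[j + 2], x3 = alpha * x[j + 3];
            for (int i = 0; i < nb; i++)
                y[i0 + i] += a0[i] * x0 + a1[i] * x1 + a2[i] * x2 + a3[i] * x3;
        }
        for (; j < n; j++) {                          /* leftover columns   */
            const double *aj = a + i0 + (size_t)j * lda;
            double xj = alpha * x[j];
            for (int i = 0; i < nb; i++)
                y[i0 + i] += aj[i] * xj;
        }
    }
}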
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/30 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_4x4 1 + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i=n; + BLASLONG o8 = 8; + BLASLONG o16 = 16; + BLASLONG o24 = 24; + BLASLONG pre = 384; + + FLOAT *a0,*a1,*a2,*a3; + FLOAT *y1=y+1; + FLOAT x[4] __attribute__ ((aligned (16)));; + a0 = ap[0]+1; + a1 = ap[1]+1; + a2 = ap[2]+1; + a3 = ap[3]+1; + + x[0]=xo[0] * *alpha; + x[1]=xo[1] * *alpha; + x[2]=xo[2] * *alpha; + x[3]=xo[3] * *alpha; + + + __asm__ __volatile__ + ( + "lxvdsx 32, 0 , %1 \n\t" // x0 + "lxvdsx 33,%3 , %1 \n\t" // x1 + "lxvdsx 34,%4 , %1 \n\t" // x2 + "lxvdsx 35,%5 , %1 \n\t" // x3 + "addi %2 , %2 , -8 \n\t" + "addi %6 , %6 , -8 \n\t" + "addi %7 , %7 , -8 \n\t" + "addi %8 , %8 , -8 \n\t" + "addi %9 , %9 , -8 \n\t" + + "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] + "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] + + "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1] + "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3] + + "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1] + "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3] + + "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1] + "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3] + + "addi %6, %6, 32 \n\t" + "addi %7, %7, 32 \n\t" + "addi %8, %8, 32 \n\t" + "addi %9, %9, 32 \n\t" + + "addic. %0 , %0 , -4 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %10 \n\t" + + "lxvd2x 40, 0, %2 \n\t" // y0, y1 + "lxvd2x 41,%4, %2 \n\t" // y2, y3 + + "dcbt %6, %10 \n\t" + "dcbt %7, %10 \n\t" + "dcbt %8, %10 \n\t" + "dcbt %9, %10 \n\t" + + "xvmaddadp 40, 48, 32 \n\t" + "xvmaddadp 41, 49, 32 \n\t" + + "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] + "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] + + "xvmaddadp 40, 50, 33 \n\t" + "addi %6, %6, 32 \n\t" + "xvmaddadp 41, 51, 33 \n\t" + + "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1] + "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3] + + "xvmaddadp 40, 52, 34 \n\t" + "addi %7, %7, 32 \n\t" + "xvmaddadp 41, 53, 34 \n\t" + + "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1] + "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3] + + "xvmaddadp 40, 54, 35 \n\t" + "addi %8, %8, 32 \n\t" + "xvmaddadp 41, 55, 35 \n\t" + + "stxvd2x 40, 0, %2 \n\t" // y0, y1 + "stxvd2x 41,%4, %2 \n\t" // y2, y3 + + "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1] + "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3] + + "addi %9, %9, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. 
%0 , %0 , -4 \n\t" + "ble 2f \n\t" + + + "lxvd2x 40, 0, %2 \n\t" // y0, y1 + "lxvd2x 41,%4, %2 \n\t" // y2, y3 + + "xvmaddadp 40, 48, 32 \n\t" + "xvmaddadp 41, 49, 32 \n\t" + + "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] + "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] + + "xvmaddadp 40, 50, 33 \n\t" + "addi %6, %6, 32 \n\t" + "xvmaddadp 41, 51, 33 \n\t" + + "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1] + "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3] + + "xvmaddadp 40, 52, 34 \n\t" + "addi %7, %7, 32 \n\t" + "xvmaddadp 41, 53, 34 \n\t" + + "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1] + "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3] + + "xvmaddadp 40, 54, 35 \n\t" + "addi %8, %8, 32 \n\t" + "xvmaddadp 41, 55, 35 \n\t" + + "stxvd2x 40, 0, %2 \n\t" // y0, y1 + "stxvd2x 41,%4, %2 \n\t" // y2, y3 + + "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1] + "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3] + + "addi %9, %9, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. %0 , %0 , -4 \n\t" + "ble 2f \n\t" + + + "lxvd2x 40, 0, %2 \n\t" // y0, y1 + "lxvd2x 41,%4, %2 \n\t" // y2, y3 + + "xvmaddadp 40, 48, 32 \n\t" + "xvmaddadp 41, 49, 32 \n\t" + + "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] + "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] + + "xvmaddadp 40, 50, 33 \n\t" + "addi %6, %6, 32 \n\t" + "xvmaddadp 41, 51, 33 \n\t" + + "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1] + "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3] + + "xvmaddadp 40, 52, 34 \n\t" + "addi %7, %7, 32 \n\t" + "xvmaddadp 41, 53, 34 \n\t" + + "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1] + "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3] + + "xvmaddadp 40, 54, 35 \n\t" + "addi %8, %8, 32 \n\t" + "xvmaddadp 41, 55, 35 \n\t" + + "stxvd2x 40, 0, %2 \n\t" // y0, y1 + "stxvd2x 41,%4, %2 \n\t" // y2, y3 + + "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1] + "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3] + + "addi %9, %9, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. %0 , %0 , -4 \n\t" + "ble 2f \n\t" + + + "lxvd2x 40, 0, %2 \n\t" // y0, y1 + "lxvd2x 41,%4, %2 \n\t" // y2, y3 + + "xvmaddadp 40, 48, 32 \n\t" + "xvmaddadp 41, 49, 32 \n\t" + + "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] + "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] + + "xvmaddadp 40, 50, 33 \n\t" + "addi %6, %6, 32 \n\t" + "xvmaddadp 41, 51, 33 \n\t" + + "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1] + "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3] + + "xvmaddadp 40, 52, 34 \n\t" + "addi %7, %7, 32 \n\t" + "xvmaddadp 41, 53, 34 \n\t" + + "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1] + "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3] + + "xvmaddadp 40, 54, 35 \n\t" + "addi %8, %8, 32 \n\t" + "xvmaddadp 41, 55, 35 \n\t" + + "stxvd2x 40, 0, %2 \n\t" // y0, y1 + "stxvd2x 41,%4, %2 \n\t" // y2, y3 + + "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1] + "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3] + + "addi %9, %9, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. 
%0 , %0 , -4 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "lxvd2x 40, 0, %2 \n\t" // y0, y1 + "lxvd2x 41,%4, %2 \n\t" // y2, y3 + + "xvmaddadp 40, 48, 32 \n\t" + "xvmaddadp 41, 49, 32 \n\t" + + "xvmaddadp 40, 50, 33 \n\t" + "xvmaddadp 41, 51, 33 \n\t" + + "xvmaddadp 40, 52, 34 \n\t" + "xvmaddadp 41, 53, 34 \n\t" + + "xvmaddadp 40, 54, 35 \n\t" + "xvmaddadp 41, 55, 35 \n\t" + + "stxvd2x 40, 0, %2 \n\t" // y0, y1 + "stxvd2x 41,%4, %2 \n\t" // y2, y3 + + : + : + "r" (i), // 0 + "r" (x), // 1 + "r" (y1), // 2 + "r" (o8), // 3 + "r" (o16), // 4 + "r" (o24), // 5 + "r" (a0), // 6 + "r" (a1), // 7 + "r" (a2), // 8 + "r" (a3), // 9 + "r" (pre) // 10 + : "cr0", "%0", "%2" , "%6", "%7", "%8", "%9", "memory" + ); + +} + + diff --git a/kernel/power/drot.c b/kernel/power/drot.c new file mode 100644 index 000000000..c93f69b12 --- /dev/null +++ b/kernel/power/drot.c @@ -0,0 +1,167 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
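/*
 * In the dgemv_n micro-kernel above the 4-row update body is stamped out
 * four times between "addic." checks, so a full pass of the hot loop
 * handles up to 16 rows before the backward branch.  A small
 * self-contained call that exercises this no-transpose dgemv path
 * through the standard Fortran BLAS symbol (values are illustrative;
 * link against OpenBLAS, e.g. -lopenblas):
 */
#include <stdio.h>

extern void dgemv_(const char *trans, const int *m, const int *n,
                   const double *alpha, const double *a, const int *lda,
                   const double *x, const int *incx,
                   const double *beta, double *y, const int *incy);

int main(void)
{
    int m = 8, n = 4, lda = 8, inc = 1;
    double alpha = 2.0, beta = 1.0;
    double a[32], x[4] = {1, 2, 3, 4}, y[8] = {0};

    for (int j = 0; j < n; j++)                 /* leading 4x4 identity   */
        for (int i = 0; i < m; i++)
            a[i + j * lda] = (i == j) ? 1.0 : 0.0;

    dgemv_("N", &m, &n, &alpha, a, &lda, x, &inc, &beta, y, &inc);
    printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]);   /* expect 2 4 6 8 */
    return 0;
}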
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + +#include "common.h" + +#pragma GCC optimize "O1" + +#if defined(POWER8) +#include "drot_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +static void drot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3; + FLOAT x00, x01, x02, x03; + FLOAT g0, g1, g2, g3; + FLOAT y00, y01, y02, y03; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT c1=*c; + FLOAT s1=*s; + + while ( i 0 ) + { + c1[0]=c; + c1[1]=c; + c1[2]=c; + c1[3]=c; + s1[0]=s; + s1[1]=s; + s1[2]=s; + s1[3]=s; + drot_kernel_16(n1, x1, y1, c1, s1); + i=n1; + } + + while(i < n) + { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + +} + + diff --git a/kernel/power/drot_microk_power8.c b/kernel/power/drot_microk_power8.c new file mode 100644 index 000000000..4444ac7eb --- /dev/null +++ b/kernel/power/drot_microk_power8.c @@ -0,0 +1,211 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
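/*
 * drot applies the plane (Givens) rotation in place:
 *     x[i] <- c*x[i] + s*y[i],   y[i] <- c*y[i] - s*x[i].
 * The wrapper above broadcasts c and s into the four-element c1[]/s1[]
 * arrays so the vector kernel can splat them once.  The scalar fallback
 * follows the same 4-way unrolled pattern as the other C fallbacks in
 * this patch; a minimal sketch of it (not the literal kernel body;
 * assumes n is a multiple of 4):
 */
static void drot_ref_kernel(long n, double *x, double *y, double c, double s)
{
    for (long i = 0; i < n; i += 4)
        for (int k = 0; k < 4; k++) {
            double f = c * x[i + k] + s * y[i + k];   /* new x */
            double g = c * y[i + k] - s * x[i + k];   /* new y */
            x[i + k] = f;
            y[i + k] = g;
        }
}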
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* +* I don't use fused multiply-add ( precision problems with lapack ) +* +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void drot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) __attribute__ ((noinline)); + +static void drot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *x2=x+1; + FLOAT *y2=y+1; + + __asm__ __volatile__ + ( + + "lxsdx 36 , %5, %3 \n\t" // load c + "lxsdx 37 , %5, %4 \n\t" // load s + "addi %8 , %8, -8 \n\t" + "addi %9 , %9, -8 \n\t" + + "xxspltd 36 , 36, 0 \n\t" + "xxspltd 37 , 37, 0 \n\t" + + "lxvd2x 32, 0, %1 \n\t" // load x + "lxvd2x 33, %5, %1 \n\t" + "lxvd2x 34, %6, %1 \n\t" + "lxvd2x 35, %7, %1 \n\t" + + "lxvd2x 40, 0, %2 \n\t" // load y + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + + "addi %1, %1, 64 \n\t" + "addi %2, %2, 64 \n\t" + + "addic. %0 , %0 , -8 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "xvmuldp 48, 32, 36 \n\t" // c * x + "xvmuldp 49, 33, 36 \n\t" + "xvmuldp 50, 34, 36 \n\t" + "xvmuldp 51, 35, 36 \n\t" + + "xvmuldp 56, 40, 36 \n\t" // c * y + "xvmuldp 57, 41, 36 \n\t" + "xvmuldp 58, 42, 36 \n\t" + "xvmuldp 59, 43, 36 \n\t" + + "xvmuldp 52, 32, 37 \n\t" // s * x + "xvmuldp 53, 33, 37 \n\t" + + "lxvd2x 32, 0, %1 \n\t" // load x + "lxvd2x 33, %5, %1 \n\t" + + "xvmuldp 54, 34, 37 \n\t" + "xvmuldp 55, 35, 37 \n\t" + + "lxvd2x 34, %6, %1 \n\t" + "lxvd2x 35, %7, %1 \n\t" + + "xvmuldp 60, 40, 37 \n\t" // s * y + "xvmuldp 61, 41, 37 \n\t" + + "lxvd2x 40, 0, %2 \n\t" // load y + "lxvd2x 41, %5, %2 \n\t" + + "xvmuldp 62, 42, 37 \n\t" + "xvmuldp 63, 43, 37 \n\t" + + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + + "xvadddp 48, 48 , 60 \n\t" // c * x + s * y + "xvadddp 49, 49 , 61 \n\t" // c * x + s * y + + "addi %1, %1, 64 \n\t" + "addi %2, %2, 64 \n\t" + + "xvadddp 50, 50 , 62 \n\t" // c * x + s * y + "xvadddp 51, 51 , 63 \n\t" // c * x + s * y + + "xvsubdp 56, 56 , 52 \n\t" // c * y - s * x + "xvsubdp 57, 57 , 53 \n\t" // c * y - s * x + "xvsubdp 58, 58 , 54 \n\t" // c * y - s * x + "xvsubdp 59, 59 , 55 \n\t" // c * y - s * x + + "stxvd2x 48, 0, %8 \n\t" // store x + "stxvd2x 49, %5, %8 \n\t" + "stxvd2x 50, %6, %8 \n\t" + "stxvd2x 51, %7, %8 \n\t" + + "stxvd2x 56, 0, %9 \n\t" // store y + "stxvd2x 57, %5, %9 \n\t" + "stxvd2x 58, %6, %9 \n\t" + "stxvd2x 59, %7, %9 \n\t" + + "addi %8, %8, 64 \n\t" + "addi %9, %9, 64 \n\t" + + "addic. 
%0 , %0 , -8 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmuldp 48, 32, 36 \n\t" // c * x + "xvmuldp 49, 33, 36 \n\t" + "xvmuldp 50, 34, 36 \n\t" + "xvmuldp 51, 35, 36 \n\t" + + "xvmuldp 56, 40, 36 \n\t" // c * y + "xvmuldp 57, 41, 36 \n\t" + "xvmuldp 58, 42, 36 \n\t" + "xvmuldp 59, 43, 36 \n\t" + + "xvmuldp 52, 32, 37 \n\t" // s * x + "xvmuldp 53, 33, 37 \n\t" + "xvmuldp 54, 34, 37 \n\t" + "xvmuldp 55, 35, 37 \n\t" + + "xvmuldp 60, 40, 37 \n\t" // s * y + "xvmuldp 61, 41, 37 \n\t" + "xvmuldp 62, 42, 37 \n\t" + "xvmuldp 63, 43, 37 \n\t" + + "xvadddp 48, 48 , 60 \n\t" // c * x + s * y + "xvadddp 49, 49 , 61 \n\t" // c * x + s * y + "xvadddp 50, 50 , 62 \n\t" // c * x + s * y + "xvadddp 51, 51 , 63 \n\t" // c * x + s * y + + "xvsubdp 56, 56 , 52 \n\t" // c * y - s * x + "xvsubdp 57, 57 , 53 \n\t" // c * y - s * x + "xvsubdp 58, 58 , 54 \n\t" // c * y - s * x + "xvsubdp 59, 59 , 55 \n\t" // c * y - s * x + + "stxvd2x 48, 0, %8 \n\t" // store x + "stxvd2x 49, %5, %8 \n\t" + "stxvd2x 50, %6, %8 \n\t" + "stxvd2x 51, %7, %8 \n\t" + + "stxvd2x 56, 0, %9 \n\t" // store y + "stxvd2x 57, %5, %9 \n\t" + "stxvd2x 58, %6, %9 \n\t" + "stxvd2x 59, %7, %9 \n\t" + + + + : + : + "r" (i), // 0 + "r" (x1), // 1 + "r" (y1), // 2 + "r" (c), // 3 + "r" (s), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (x2), // 8 + "r" (y2) // 9 + : "cr0", "%0", "%1" , "%2", "%8", "%9", "memory" + ); + +} + + diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c new file mode 100644 index 000000000..c62a56315 --- /dev/null +++ b/kernel/power/dscal.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
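/*
 * The header of drot_microk_power8.c notes that fused multiply-add is
 * avoided because of precision problems with the LAPACK tests: an
 * xvmaddadp would compute fma(c, x, s*y) with a single rounding, which
 * can differ in the last bit from the reference (c*x) + (s*y), so the
 * kernel uses xvmuldp followed by xvadddp/xvsubdp instead.  Illustrative
 * comparison using C99 fma() (compile with -lm):
 */
#include <math.h>
#include <stdio.h>

int main(void)
{
    double c = 0.6, s = 0.8, x = 1.0 / 3.0, y = 2.0 / 7.0;

    double two_roundings = c * x + s * y;       /* what the kernel does   */
    double one_rounding  = fma(c, x, s * y);    /* what FMA would give    */

    /* the two results may differ by one ulp */
    printf("%.17g\n%.17g\n", two_roundings, one_rounding);
    return 0;
}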
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "dscal_microk_power8.c" +#endif + +#if !defined(HAVE_KERNEL_8) + +static void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x ) +{ + + BLASLONG i; + FLOAT alpha = *da; + + for( i=0; i 0 ) + { + FLOAT alpha[2]; + alpha[0]=da; + alpha[1]=da; + dscal_kernel_8_zero(n1 , alpha , x); + j=n1; + } + + while(j < n) + { + + x[j]=0.0; + j++; + } + + } + else + { + + BLASLONG n1 = n & -16; + if ( n1 > 0 ) + { + FLOAT alpha[2]; + alpha[0]=da; + alpha[1]=da; + dscal_kernel_8(n1 , alpha , x); + j=n1; + } + while(j < n) + { + + x[j] = da * x[j] ; + j++; + } + } + + + } + else + { + + if ( da == 0.0 ) + { + + while(j < n) + { + + x[i]=0.0; + i += inc_x ; + j++; + } + + } + else + { + + while(j < n) + { + + x[i] = da * x[i] ; + i += inc_x ; + j++; + } + } + + } + return 0; + +} + + diff --git a/kernel/power/dscal_microk_power8.c b/kernel/power/dscal_microk_power8.c new file mode 100644 index 000000000..d90c3d80c --- /dev/null +++ b/kernel/power/dscal_microk_power8.c @@ -0,0 +1,219 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
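/*
 * dscal.c above dispatches on alpha: for alpha == 0.0 and unit stride it
 * calls dscal_kernel_8_zero (further below), which only stores zeros and
 * never reads x; otherwise the multiply kernel is used, with alpha
 * splat across a vector register inside the asm (lxsdx + xxspltd).  A
 * minimal scalar equivalent of the multiply kernel, following the
 * pattern of the other fallbacks in this patch (sketch only; the wrapper
 * calls it with n a multiple of 16):
 */
static void dscal_ref_kernel(long n, const double *da, double *x)
{
    double alpha = *da;

    for (long i = 0; i < n; i += 8) {
        x[i]     *= alpha;  x[i + 1] *= alpha;
        x[i + 2] *= alpha;  x[i + 3] *= alpha;
        x[i + 4] *= alpha;  x[i + 5] *= alpha;
        x[i + 6] *= alpha;  x[i + 7] *= alpha;
    }
}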
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *x2=x+1; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "lxsdx 33, 0, %3 \n\t" + "xxspltd 32, 33, 0 \n\t" + "addi %1, %1, -8 \n\t" + + "dcbt %2, %4 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -16 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %4 \n\t" + + "xvmuldp 48, 40, 32 \n\t" + "xvmuldp 49, 41, 32 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "xvmuldp 50, 42, 32 \n\t" + "xvmuldp 51, 43, 32 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "xvmuldp 52, 44, 32 \n\t" + "xvmuldp 53, 45, 32 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "xvmuldp 54, 46, 32 \n\t" + "xvmuldp 55, 47, 32 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "stxvd2x 48, 0, %1 \n\t" + "stxvd2x 49, %5, %1 \n\t" + "stxvd2x 50, %6, %1 \n\t" + "stxvd2x 51, %7, %1 \n\t" + "stxvd2x 52, %8, %1 \n\t" + "stxvd2x 53, %9, %1 \n\t" + "stxvd2x 54, %10, %1 \n\t" + "stxvd2x 55, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "addic. 
%0 , %0 , -16 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmuldp 48, 40, 32 \n\t" + "xvmuldp 49, 41, 32 \n\t" + "xvmuldp 50, 42, 32 \n\t" + "xvmuldp 51, 43, 32 \n\t" + "xvmuldp 52, 44, 32 \n\t" + "xvmuldp 53, 45, 32 \n\t" + "xvmuldp 54, 46, 32 \n\t" + "xvmuldp 55, 47, 32 \n\t" + + "stxvd2x 48, 0, %1 \n\t" + "stxvd2x 49, %5, %1 \n\t" + "stxvd2x 50, %6, %1 \n\t" + "stxvd2x 51, %7, %1 \n\t" + "stxvd2x 52, %8, %1 \n\t" + "stxvd2x 53, %9, %1 \n\t" + "stxvd2x 54, %10, %1 \n\t" + "stxvd2x 55, %11, %1 \n\t" + + : + : + "r" (i), // 0 + "r" (x2), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *x2=x+1; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "xxlxor 32 , 32 , 32 \n\t" + "addi %1, %1, -8 \n\t" + + + ".align 5 \n\t" + "1: \n\t" + + "stxvd2x 32, 0, %1 \n\t" + "stxvd2x 32, %5, %1 \n\t" + "stxvd2x 32, %6, %1 \n\t" + "stxvd2x 32, %7, %1 \n\t" + "stxvd2x 32, %8, %1 \n\t" + "stxvd2x 32, %9, %1 \n\t" + "stxvd2x 32, %10, %1 \n\t" + "stxvd2x 32, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "addic. %0 , %0 , -16 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + : + : + "r" (i), // 0 + "r" (x2), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c new file mode 100644 index 000000000..fd2dec9c4 --- /dev/null +++ b/kernel/power/dswap.c @@ -0,0 +1,154 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "dswap_microk_power8.c" +#endif + +#ifndef HAVE_KERNEL_32 + +static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT g0, g1, g2, g3, g4, g5, g6, g7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + dswap_kernel_32(n1, x, y); + i=n1; + } + + while(i < n) + { + temp = y[i]; + y[i] = x[i] ; + x[i] = temp; + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = y[iy]; + y[iy] = x[ix] ; + x[ix] = temp; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/dswap_microk_power8.c b/kernel/power/dswap_microk_power8.c new file mode 100644 index 000000000..77747c3b9 --- /dev/null +++ b/kernel/power/dswap_microk_power8.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
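/*
 * dswap exchanges x and y; the micro-kernel below moves 32 doubles per
 * iteration and loads 16 vector registers from each operand before
 * issuing any store, so no element is overwritten before it has been
 * read.  Element-wise scalar sketch of the same exchange (illustrative;
 * the wrapper above hands the kernel a block that is a multiple of 32):
 */
static void dswap_ref_kernel(long n, double *x, double *y)
{
    for (long i = 0; i < n; i++) {
        double t = x[i];
        x[i] = y[i];
        y[i] = t;
    }
}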
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_32 1 + +static void dswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void dswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *x2=x+1; + FLOAT *y2=y+1; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "addi %3, %3, -8 \n\t" + "addi %4, %4, -8 \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %5, %2 \n\t" + "lxvd2x 34, %6, %2 \n\t" + "lxvd2x 35, %7, %2 \n\t" + "lxvd2x 36, %8, %2 \n\t" + "lxvd2x 37, %9, %2 \n\t" + "lxvd2x 38, %10, %2 \n\t" + "lxvd2x 39, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvd2x 48, 0, %1 \n\t" + "lxvd2x 49, %5, %1 \n\t" + "lxvd2x 50, %6, %1 \n\t" + "lxvd2x 51, %7, %1 \n\t" + "lxvd2x 52, %8, %1 \n\t" + "lxvd2x 53, %9, %1 \n\t" + "lxvd2x 54, %10, %1 \n\t" + "lxvd2x 55, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "lxvd2x 56, 0, %1 \n\t" + "lxvd2x 57, %5, %1 \n\t" + "lxvd2x 58, %6, %1 \n\t" + "lxvd2x 59, %7, %1 \n\t" + "lxvd2x 60, %8, %1 \n\t" + "lxvd2x 61, %9, %1 \n\t" + "lxvd2x 62, %10, %1 \n\t" + "lxvd2x 63, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" + + "addi %3, %3, 128 \n\t" + + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n\t" + + "addi %3, %3, 128 \n\t" + + "stxvd2x 48, 0, %4 \n\t" + "stxvd2x 49, %5, %4 \n\t" + "stxvd2x 50, %6, %4 \n\t" + "stxvd2x 51, %7, %4 \n\t" + "stxvd2x 52, %8, %4 \n\t" + "stxvd2x 53, %9, %4 \n\t" + "stxvd2x 54, %10, %4 \n\t" + "stxvd2x 55, %11, %4 \n\t" + + "addi %4, %4, 128 \n\t" + + "stxvd2x 56, 0, %4 \n\t" + "stxvd2x 57, %5, %4 \n\t" + "stxvd2x 58, %6, %4 \n\t" + "stxvd2x 59, %7, %4 \n\t" + "stxvd2x 60, %8, %4 \n\t" + "stxvd2x 61, %9, %4 \n\t" + "stxvd2x 62, %10, %4 \n\t" + "stxvd2x 63, %11, %4 \n\t" + + "addi %4, %4, 128 \n\t" + + "addic. 
%0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (y2), // 3 + "r" (x2), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory" + ); + +} + + diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c new file mode 100644 index 000000000..43311f2ba --- /dev/null +++ b/kernel/power/sasum.c @@ -0,0 +1,146 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#if defined(POWER8) +#include "sasum_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_32 + +static void sasum_kernel_32(BLASLONG n, FLOAT *x1, FLOAT *svec) +{ + + BLASLONG i=0; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + + while ( i< n ) + { + + temp0 = ABS(x[0]); + temp1 = ABS(x[1]); + temp2 = ABS(x[2]); + temp3 = ABS(x[3]); + temp4 = ABS(x[4]); + temp5 = ABS(x[5]); + temp6 = ABS(x[6]); + temp7 = ABS(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=8; + + } + + svec[0] = sum0+sum1+sum2+sum3; + svec[1] = 0.0; + svec[2] = 0.0; + svec[3] = 0.0; + +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + FLOAT svec[4] __attribute__ ((aligned (16)));; + BLASLONG n1; + + if (n <= 0 || inc_x <= 0) return(sumf); + + if ( inc_x == 1 ) + { + + n1 = n & -32; + if ( n1 > 0 ) + { + + sasum_kernel_32(n1, x, svec); + sumf = svec[0] + svec[1]+svec[2]+svec[3]; + i=n1; + } + + while(i < n) + { + sumf += ABS(x[i]); + i++; + } + + } + else + { + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + + } + return(sumf); +} + + diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c new file mode 100644 index 000000000..847fffe04 --- /dev/null +++ b/kernel/power/sasum_microk_power8.c @@ -0,0 +1,177 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_32 1 +static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline)); + +static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "dcbt %2 , %4 \n\t" + + "xxlxor 32,32,32 \n\t" + "xxlxor 33,33,33 \n\t" + "xxlxor 34,34,34 \n\t" + "xxlxor 35,35,35 \n\t" + "xxlxor 36,36,36 \n\t" + "xxlxor 37,37,37 \n\t" + "xxlxor 38,38,38 \n\t" + "xxlxor 39,39,39 \n\t" + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2 , %4 \n\t" + + "xvabssp 48, 40 \n\t" + "xvabssp 49, 41 \n\t" + "xvabssp 50, 42 \n\t" + "xvabssp 51, 43 \n\t" + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + + "xvabssp 52, 44 \n\t" + "xvabssp 53, 45 \n\t" + + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + + "xvabssp 54, 46 \n\t" + "xvabssp 55, 47 \n\t" + + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + + "xvaddsp 32, 32, 48 \n\t" + "xvaddsp 33, 33, 49 \n\t" + + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "xvaddsp 34, 34, 50 \n\t" + "xvaddsp 35, 35, 51 \n\t" + "addi %2, %2, 128 \n\t" + "xvaddsp 36, 36, 52 \n\t" + "xvaddsp 37, 37, 53 \n\t" + "addic. 
%0 , %0 , -32 \n\t" + "xvaddsp 38, 38, 54 \n\t" + "xvaddsp 39, 39, 55 \n\t" + + "bgt 1b \n\t" + + "2: \n\t" + + + "xvabssp 48, 40 \n\t" + "xvabssp 49, 41 \n\t" + "xvabssp 50, 42 \n\t" + "xvabssp 51, 43 \n\t" + "xvabssp 52, 44 \n\t" + "xvabssp 53, 45 \n\t" + "xvabssp 54, 46 \n\t" + "xvabssp 55, 47 \n\t" + + "xvaddsp 32, 32, 48 \n\t" + "xvaddsp 33, 33, 49 \n\t" + "xvaddsp 34, 34, 50 \n\t" + "xvaddsp 35, 35, 51 \n\t" + "xvaddsp 36, 36, 52 \n\t" + "xvaddsp 37, 37, 53 \n\t" + "xvaddsp 38, 38, 54 \n\t" + "xvaddsp 39, 39, 55 \n\t" + + "xvaddsp 32, 32, 33 \n\t" + "xvaddsp 34, 34, 35 \n\t" + "xvaddsp 36, 36, 37 \n\t" + "xvaddsp 38, 38, 39 \n\t" + + "xvaddsp 32, 32, 34 \n\t" + "xvaddsp 36, 36, 38 \n\t" + + "xvaddsp 32, 32, 36 \n\t" + + + "stxvw4x 32, 0, %3 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (svec), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2", "memory" + ); + +} + + diff --git a/kernel/power/scopy.c b/kernel/power/scopy.c new file mode 100644 index 000000000..167c29bab --- /dev/null +++ b/kernel/power/scopy.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
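/*
 * sasum returns the sum of |x[i]|; the ABS macro above maps to fabsf()
 * from <math.h> in single precision.  The micro-kernel keeps eight
 * 4-wide float accumulators (xvabssp + xvaddsp), folds them into one
 * vector after the loop and stores it to the 16-byte aligned svec[4];
 * the C wrapper then adds the four lanes and handles the tail.  Scalar
 * sketch of that finishing step (illustrative helper, not the wrapper):
 */
#include <math.h>

static float sasum_finish(const float svec[4], const float *x,
                          long n1, long n)
{
    float sum = svec[0] + svec[1] + svec[2] + svec[3];   /* lane reduction */

    for (long i = n1; i < n; i++)                        /* scalar tail    */
        sum += fabsf(x[i]);

    return sum;
}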
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "scopy_microk_power8.c" +#endif + +#ifndef HAVE_KERNEL_32 + +static void scopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + scopy_kernel_32(n1, x, y); + i=n1; + } + + while(i < n) + { + y[i] = x[i] ; + i++ ; + + } + + + } + else + { + + while(i < n) + { + y[iy] = x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/scopy_microk_power8.c b/kernel/power/scopy_microk_power8.c new file mode 100644 index 000000000..2e08e3561 --- /dev/null +++ b/kernel/power/scopy_microk_power8.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
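/*
 * scopy copies x into y; the micro-kernel below moves 32 floats (eight
 * lxvw4x/stxvw4x pairs) per iteration and the wrapper above handles the
 * tail and the strided case.  For the unit-stride block the fallback is
 * equivalent to a plain copy -- a sketch only, not the literal kernel
 * body (the wrapper calls it with n a multiple of 32):
 */
#include <string.h>

static void scopy_ref_kernel(long n, const float *x, float *y)
{
    memcpy(y, x, (size_t)n * sizeof(float));
}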
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_32 1 + +static void scopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void scopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "stxvw4x 40, 0, %1 \n\t" + "stxvw4x 41, %5, %1 \n\t" + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "stxvw4x 42, %6, %1 \n\t" + "stxvw4x 43, %7, %1 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "stxvw4x 44, %8, %1 \n\t" + "stxvw4x 45, %9, %1 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "stxvw4x 46, %10, %1 \n\t" + "stxvw4x 47, %11, %1 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "stxvw4x 40, 0, %1 \n\t" + "stxvw4x 41, %5, %1 \n\t" + "stxvw4x 42, %6, %1 \n\t" + "stxvw4x 43, %7, %1 \n\t" + "stxvw4x 44, %8, %1 \n\t" + "stxvw4x 45, %9, %1 \n\t" + "stxvw4x 46, %10, %1 \n\t" + "stxvw4x 47, %11, %1 \n\t" + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + diff --git a/kernel/power/sdot.c b/kernel/power/sdot.c new file mode 100644 index 000000000..52fb1fe24 --- /dev/null +++ b/kernel/power/sdot.c @@ -0,0 +1,126 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
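The scopy_microk_power8.c loop above is software-pipelined: eight vectors are loaded before the loop, each iteration stores the block loaded on the previous pass while loading the next one, and the final eight stores are drained after the branch at label 2. Before the sdot.c listing that follows, here is a minimal C sketch of that structure, assuming a contiguous copy whose length is a non-zero multiple of 32 (illustration only; the memcpy calls stand in for the lxvw4x/stxvw4x groups, and the names are invented):

#include <string.h>
#include <stdio.h>

/* Illustration only: prime 8 "registers" before the loop, then in each
 * pass store the previous block while loading the next, and drain after
 * the loop, as scopy_kernel_32 does with vs40..vs47. */
static void pipelined_copy_32(long n, const float *x, float *y)  /* n % 32 == 0, n >= 32 */
{
    float reg[32];                       /* stands in for vs40..vs47 (8 x 4 lanes) */

    memcpy(reg, x, sizeof reg);          /* prime: the 8 lxvw4x before the loop    */
    x += 32;
    n -= 32;

    while (n > 0) {                      /* "1:" loop body                         */
        memcpy(y, reg, sizeof reg);      /*   store previous block (stxvw4x)       */
        memcpy(reg, x, sizeof reg);      /*   load next block      (lxvw4x)        */
        x += 32;
        y += 32;
        n -= 32;
    }

    memcpy(y, reg, sizeof reg);          /* "2:" drain the last block              */
}

int main(void)
{
    float src[96], dst[96];
    for (int i = 0; i < 96; i++) { src[i] = (float)i; dst[i] = -1.0f; }
    pipelined_copy_32(96, src, dst);
    printf("dst[0]=%g dst[95]=%g\n", dst[0], dst[95]);   /* 0 and 95 */
    return 0;
}

Interleaving the stores of block k with the loads of block k+1 keeps the load and store pipes busy and hides load latency behind work that is already independent.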
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "sdot_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + BLASLONG register i = 0; + FLOAT dot = 0.0; + + while(i < n) + { + dot += y[i] * x[i] + + y[i+1] * x[i+1] + + y[i+2] * x[i+2] + + y[i+3] * x[i+3] + + y[i+4] * x[i+4] + + y[i+5] * x[i+5] + + y[i+6] * x[i+6] + + y[i+7] * x[i+7] ; + + i+=8 ; + + } + *d += dot; + +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + FLOAT dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -32; + + if ( n1 ) + sdot_kernel_16(n1, x, y , &dot ); + + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + BLASLONG n1 = n & -2; + + while(i < n1) + { + + dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; + ix += inc_x*2 ; + iy += inc_y*2 ; + i+=2 ; + + } + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); + +} + + diff --git a/kernel/power/sdot_microk_power8.c b/kernel/power/sdot_microk_power8.c new file mode 100644 index 000000000..6dd588acd --- /dev/null +++ b/kernel/power/sdot_microk_power8.c @@ -0,0 +1,179 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
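Both the unrolled C fallback in sdot.c above and the VSX microkernel that follows avoid a single serial accumulator: the assembly keeps eight vector partial sums (vs32-vs39), folds them pairwise after the loop, and leaves four lane sums in tempdot[] for the wrapper to add. A short sketch of that idea, assuming a contiguous vector length that is a multiple of 4 (illustration only, not one of the committed files, names invented):

#include <stdio.h>

/* Illustration only: dot product with independent partial sums, the idea
 * behind vs32..vs39 and tempdot[4] in the POWER8 sdot microkernel. */
static float sdot_blocked(long n, const float *x, const float *y)  /* n % 4 == 0 */
{
    float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f };   /* independent accumulators */

    for (long i = 0; i < n; i += 4)
        for (int l = 0; l < 4; l++)
            acc[l] += x[i + l] * y[i + l];       /* one xvmaddasp lane each */

    /* final reduction, done once outside the hot loop */
    return (acc[0] + acc[1]) + (acc[2] + acc[3]);
}

int main(void)
{
    float x[16], y[16];
    for (int i = 0; i < 16; i++) { x[i] = 1.0f; y[i] = (float)i; }
    printf("dot = %g\n", sdot_blocked(16, x, y));   /* 0+1+...+15 = 120 */
    return 0;
}

Because float addition is not associative, the result can differ in the last bits from a strictly left-to-right loop; that is the expected behaviour of these blocked kernels.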
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG pre = 384; + FLOAT tempdot[4]; + + + __asm__ __volatile__ + ( + "xxlxor 32,32,32 \n\t" + "xxlxor 33,33,33 \n\t" + "xxlxor 34,34,34 \n\t" + "xxlxor 35,35,35 \n\t" + "xxlxor 36,36,36 \n\t" + "xxlxor 37,37,37 \n\t" + "xxlxor 38,38,38 \n\t" + "xxlxor 39,39,39 \n\t" + + "dcbt %2, %12 \n\t" + "dcbt %3, %12 \n\t" + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 48, 0, %3 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 49, %5, %3 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 50, %6, %3 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 51, %7, %3 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 52, %8, %3 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 53, %9, %3 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 54, %10, %3 \n\t" + "lxvw4x 47, %11, %2 \n\t" + "lxvw4x 55, %11, %3 \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %12 \n\t" + "dcbt %3, %12 \n\t" + + "xvmaddasp 32, 40, 48 \n\t" + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 48, 0, %3 \n\t" + "xvmaddasp 33, 41, 49 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 49, %5, %3 \n\t" + "xvmaddasp 34, 42, 50 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 50, %6, %3 \n\t" + "xvmaddasp 35, 43, 51 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 51, %7, %3 \n\t" + "xvmaddasp 36, 44, 52 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 52, %8, %3 \n\t" + "xvmaddasp 37, 45, 53 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 53, %9, %3 \n\t" + "xvmaddasp 38, 46, 54 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 54, %10, %3 \n\t" + "xvmaddasp 39, 47, 55 \n\t" + + "lxvw4x 47, %11, %2 \n\t" + "lxvw4x 55, %11, %3 \n\t" + + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. 
%0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" + "xvmaddasp 33, 41, 49 \n\t" + "xvmaddasp 34, 42, 50 \n\t" + "xvmaddasp 35, 43, 51 \n\t" + "xvmaddasp 36, 44, 52 \n\t" + "xvmaddasp 37, 45, 53 \n\t" + "xvmaddasp 38, 46, 54 \n\t" + "xvmaddasp 39, 47, 55 \n\t" + + "xvaddsp 32, 32 , 33 \n\t" + "xvaddsp 34, 34 , 35 \n\t" + "xvaddsp 36, 36 , 37 \n\t" + "xvaddsp 38, 38 , 39 \n\t" + + "xvaddsp 32, 32 , 34 \n\t" + "xvaddsp 36, 36 , 38 \n\t" + + "xvaddsp 32, 32 , 36 \n\t" + + "stxvw4x 32, 0 , %4 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (y1), // 3 + "r" (tempdot), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112), // 11 + "r" (pre) // 12 + : "cr0", "%0", "%2" , "%3", "memory" + ); + + *dot = tempdot[0] + tempdot[1] + tempdot[2] + tempdot[3]; + + +} + + diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S new file mode 100644 index 000000000..77f3f7cfb --- /dev/null +++ b/kernel/power/sgemm_kernel_16x8_power8.S @@ -0,0 +1,371 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 32752 +#define ALPHA_SP 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA_SP 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define alpha_r vs30 +#define alpha_vr vs31 + +#define o0 0 + +#define FRAMEPOINTER r12 + +#define BBUFFER r14 +#define o4 r15 +#define o12 r16 +#define o8 r17 +#define L r18 +#define T1 r19 +#define KK r20 +#define BBO r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "sgemm_macros_16x8_power8.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) 
+ std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) + stw r14, 212(SP) +#endif + + // stfd f1, ALPHA_SP + // stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif +#endif + + slwi LDC, LDC, 2 + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#else + lwz OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif +#endif +#endif +#endif + + + cmpwi cr0, M, 0 + ble L999_H1 + cmpwi cr0, N, 0 + ble L999_H1 + cmpwi cr0, K, 0 + ble L999_H1 + + li PRE, 256 + li o4 , 4 + li o8 , 8 + li o12, 12 + li o16, 16 + li o32, 32 + li o48, 48 + + addi BBUFFER, SP, 512+4096 + li T1, -4096 + and BBUFFER, BBUFFER, T1 + + addi T1, SP, 300 + stxsspx f1, o0 , T1 + stxsspx f1, o4 , T1 + stxsspx f1, o8 , T1 + stxsspx f1, o12 , T1 + + lxsspx alpha_r, o0, T1 + lxvw4x alpha_vr, o0, T1 + + + +#include "sgemm_logic_16x8_power8.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) + lwz r14, 212(SP) +#endif + + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/sgemm_logic_16x8_power8.S b/kernel/power/sgemm_logic_16x8_power8.S new file mode 100644 index 000000000..06bb79ea3 --- /dev/null +++ b/kernel/power/sgemm_logic_16x8_power8.S @@ -0,0 +1,2323 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + srawi. J, N, 3 + ble SGEMM_L8_END + +SGEMM_L8_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 3 + +SGEMM_L8_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge SGEMM_L8_COPYB + + mr CO, C + mr AO, A + slwi T1, LDC , 3 + add C, C, T1 + srawi. I, M, 4 + ble SGEMM_L8x16_END + +SGEMM_L8x16_BEGIN: + + + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L8x16_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L8x16_SUB4 + +SGEMM_L8x16_LOOP_START: + + dcbt AO, PRE + dcbt BO, PRE + LOAD8x16_1 + dcbt BO, PRE + KERNEL8x16_I1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL8x16_2 + dcbt BO, PRE + KERNEL8x16_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL8x16_2 + + dcbt BO, PRE + KERNEL8x16_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL8x16_2 + dcbt BO, PRE + KERNEL8x16_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL8x16_2 + + addic. L, L, -2 + ble SGEMM_L8x16_LOOP_END + + .align 5 + +SGEMM_L8x16_LOOP: + + dcbt BO, PRE + KERNEL8x16_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL8x16_2 + dcbt BO, PRE + KERNEL8x16_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL8x16_2 + + dcbt BO, PRE + KERNEL8x16_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL8x16_2 + dcbt BO, PRE + KERNEL8x16_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL8x16_2 + + addic. 
L, L, -1 + bgt SGEMM_L8x16_LOOP + +SGEMM_L8x16_LOOP_END: + + dcbt BO, PRE + KERNEL8x16_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL8x16_2 + dcbt BO, PRE + KERNEL8x16_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL8x16_2 + + dcbt BO, PRE + KERNEL8x16_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL8x16_2 + KERNEL8x16_1 + KERNEL8x16_E2 + + b SGEMM_L8x16_SUB1 + +SGEMM_L8x16_SUB4: + + dcbt AO, PRE + KERNEL8x16_SUBI1 + KERNEL8x16_SUB1 + dcbt AO, PRE + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + + b SGEMM_L8x16_SUB1 + +SGEMM_L8x16_SUB0: + + andi. L, K, 7 + + KERNEL8x16_SUBI1 + + addic. L, L, -1 + ble SGEMM_L8x16_SAVE + b SGEMM_L8x16_SUB2 + +SGEMM_L8x16_SUB1: + + andi. L, K, 7 + ble SGEMM_L8x16_SAVE + +SGEMM_L8x16_SUB2: + + KERNEL8x16_SUB1 + + addic. L, L, -1 + bgt SGEMM_L8x16_SUB2 + +SGEMM_L8x16_SAVE: + + SAVE8x16 + + addic. I, I, -1 + bgt SGEMM_L8x16_BEGIN + +SGEMM_L8x16_END: + +SGEMM_L8x8_BEGIN: + + andi. T2, M, 15 + ble SGEMM_L8x1_END + + andi. T1, M, 8 + ble SGEMM_L8x8_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L8x8_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L8x8_SUB4 + +SGEMM_L8x8_LOOP_START: + + LOAD8x8_1 + KERNEL8x8_I1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + addic. L, L, -2 + ble SGEMM_L8x8_LOOP_END + + .align 5 + +SGEMM_L8x8_LOOP: + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + addic. L, L, -1 + bgt SGEMM_L8x8_LOOP + +SGEMM_L8x8_LOOP_END: + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_E2 + + b SGEMM_L8x8_SUB1 + +SGEMM_L8x8_SUB4: + + KERNEL8x8_SUBI1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + + b SGEMM_L8x8_SUB1 + +SGEMM_L8x8_SUB0: + + andi. L, K, 7 + + KERNEL8x8_SUBI1 + + addic. L, L, -1 + ble SGEMM_L8x8_SAVE + b SGEMM_L8x8_SUB2 + +SGEMM_L8x8_SUB1: + + andi. L, K, 7 + ble SGEMM_L8x8_SAVE + +SGEMM_L8x8_SUB2: + + KERNEL8x8_SUB1 + + addic. L, L, -1 + bgt SGEMM_L8x8_SUB2 + +SGEMM_L8x8_SAVE: + + SAVE8x8 + +SGEMM_L8x8_END: + +SGEMM_L8x4_BEGIN: + + + andi. T1, M, 4 + ble SGEMM_L8x4_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L8x4_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L8x4_SUB4 + +SGEMM_L8x4_LOOP_START: + + LOAD8x4_1 + KERNEL8x4_I1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + addic. L, L, -2 + ble SGEMM_L8x4_LOOP_END + + .align 5 + +SGEMM_L8x4_LOOP: + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + addic. L, L, -1 + bgt SGEMM_L8x4_LOOP + +SGEMM_L8x4_LOOP_END: + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_E2 + + b SGEMM_L8x4_SUB1 + +SGEMM_L8x4_SUB4: + + KERNEL8x4_SUBI1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + + b SGEMM_L8x4_SUB1 + +SGEMM_L8x4_SUB0: + + andi. L, K, 7 + + KERNEL8x4_SUBI1 + + addic. L, L, -1 + ble SGEMM_L8x4_SAVE + b SGEMM_L8x4_SUB2 + +SGEMM_L8x4_SUB1: + + andi. L, K, 7 + ble SGEMM_L8x4_SAVE + +SGEMM_L8x4_SUB2: + + KERNEL8x4_SUB1 + + addic. L, L, -1 + bgt SGEMM_L8x4_SUB2 + +SGEMM_L8x4_SAVE: + + SAVE8x4 + +SGEMM_L8x4_END: + +SGEMM_L8x2_BEGIN: + + + andi. T1, M, 2 + ble SGEMM_L8x2_END + mr BO, BBUFFER + srawi. 
L, K, 3 + ble SGEMM_L8x2_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L8x2_SUB4 + +SGEMM_L8x2_LOOP_START: + + LOAD8x2_1 + KERNEL8x2_I1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + addic. L, L, -2 + ble SGEMM_L8x2_LOOP_END + + .align 5 + +SGEMM_L8x2_LOOP: + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + addic. L, L, -1 + bgt SGEMM_L8x2_LOOP + +SGEMM_L8x2_LOOP_END: + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_E2 + + b SGEMM_L8x2_SUB1 + +SGEMM_L8x2_SUB4: + + KERNEL8x2_SUBI1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + + b SGEMM_L8x2_SUB1 + +SGEMM_L8x2_SUB0: + + andi. L, K, 7 + + KERNEL8x2_SUBI1 + + addic. L, L, -1 + ble SGEMM_L8x2_SAVE + b SGEMM_L8x2_SUB2 + +SGEMM_L8x2_SUB1: + + andi. L, K, 7 + ble SGEMM_L8x2_SAVE + +SGEMM_L8x2_SUB2: + + KERNEL8x2_SUB1 + + addic. L, L, -1 + bgt SGEMM_L8x2_SUB2 + +SGEMM_L8x2_SAVE: + + SAVE8x2 + +SGEMM_L8x2_END: + +SGEMM_L8x1_BEGIN: + + + andi. T1, M, 1 + ble SGEMM_L8x1_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L8x1_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L8x1_SUB4 + +SGEMM_L8x1_LOOP_START: + + LOAD8x1_1 + KERNEL8x1_I1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + addic. L, L, -2 + ble SGEMM_L8x1_LOOP_END + + .align 5 + +SGEMM_L8x1_LOOP: + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + addic. L, L, -1 + bgt SGEMM_L8x1_LOOP + +SGEMM_L8x1_LOOP_END: + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_E2 + + b SGEMM_L8x1_SUB1 + +SGEMM_L8x1_SUB4: + + KERNEL8x1_SUBI1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + + b SGEMM_L8x1_SUB1 + +SGEMM_L8x1_SUB0: + + andi. L, K, 7 + + KERNEL8x1_SUBI1 + + addic. L, L, -1 + ble SGEMM_L8x1_SAVE + b SGEMM_L8x1_SUB2 + +SGEMM_L8x1_SUB1: + + andi. L, K, 7 + ble SGEMM_L8x1_SAVE + +SGEMM_L8x1_SUB2: + + KERNEL8x1_SUB1 + + addic. L, L, -1 + bgt SGEMM_L8x1_SUB2 + +SGEMM_L8x1_SAVE: + + SAVE8x1 + +SGEMM_L8x1_END: + + slwi T1, K, 5 + add B, B, T1 + + addic. J, J, -1 + bgt SGEMM_L8_BEGIN + + andi. T2, N, 7 + ble L999 + +SGEMM_L8_END: + + b SGEMM_L4_BEGIN + +L999_H1: + + b L999 + +SGEMM_L4_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 2 + +SGEMM_L4_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge SGEMM_L4_COPYB + + andi. T1, N, 4 + ble SGEMM_L4_END + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + srawi. I, M, 4 + ble SGEMM_L4x16_END + +SGEMM_L4x16_BEGIN: + + + mr BO, BBUFFER + srawi. 
L, K, 3 + ble SGEMM_L4x16_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L4x16_SUB4 + +SGEMM_L4x16_LOOP_START: + + dcbt AO, PRE + LOAD4x16_1 + KERNEL4x16_I1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -2 + ble SGEMM_L4x16_LOOP_END + + .align 5 + +SGEMM_L4x16_LOOP: + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -1 + bgt SGEMM_L4x16_LOOP + +SGEMM_L4x16_LOOP_END: + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + KERNEL4x16_E2 + + b SGEMM_L4x16_SUB1 + +SGEMM_L4x16_SUB4: + + dcbt AO, PRE + KERNEL4x16_SUBI1 + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + b SGEMM_L4x16_SUB1 + +SGEMM_L4x16_SUB0: + + andi. L, K, 7 + + KERNEL4x16_SUBI1 + + addic. L, L, -1 + ble SGEMM_L4x16_SAVE + b SGEMM_L4x16_SUB2 + +SGEMM_L4x16_SUB1: + + andi. L, K, 7 + ble SGEMM_L4x16_SAVE + +SGEMM_L4x16_SUB2: + + KERNEL4x16_SUB1 + + addic. L, L, -1 + bgt SGEMM_L4x16_SUB2 + +SGEMM_L4x16_SAVE: + + SAVE4x16 + + addic. I, I, -1 + bgt SGEMM_L4x16_BEGIN + +SGEMM_L4x16_END: + +SGEMM_L4x8_BEGIN: + + andi. T2, M, 15 + ble SGEMM_L4x1_END + + andi. T1, M, 8 + ble SGEMM_L4x8_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L4x8_SUB4 + +SGEMM_L4x8_LOOP_START: + + LOAD4x8_1 + KERNEL4x8_I1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -2 + ble SGEMM_L4x8_LOOP_END + + .align 5 + +SGEMM_L4x8_LOOP: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -1 + bgt SGEMM_L4x8_LOOP + +SGEMM_L4x8_LOOP_END: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b SGEMM_L4x8_SUB1 + +SGEMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b SGEMM_L4x8_SUB1 + +SGEMM_L4x8_SUB0: + + andi. L, K, 7 + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble SGEMM_L4x8_SAVE + b SGEMM_L4x8_SUB2 + +SGEMM_L4x8_SUB1: + + andi. L, K, 7 + ble SGEMM_L4x8_SAVE + +SGEMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. L, L, -1 + bgt SGEMM_L4x8_SUB2 + +SGEMM_L4x8_SAVE: + + SAVE4x8 + +SGEMM_L4x8_END: + +SGEMM_L4x4_BEGIN: + + + andi. T1, M, 4 + ble SGEMM_L4x4_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L4x4_SUB4 + +SGEMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble SGEMM_L4x4_LOOP_END + + .align 5 + +SGEMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. 
L, L, -1 + bgt SGEMM_L4x4_LOOP + +SGEMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b SGEMM_L4x4_SUB1 + +SGEMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b SGEMM_L4x4_SUB1 + +SGEMM_L4x4_SUB0: + + andi. L, K, 7 + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble SGEMM_L4x4_SAVE + b SGEMM_L4x4_SUB2 + +SGEMM_L4x4_SUB1: + + andi. L, K, 7 + ble SGEMM_L4x4_SAVE + +SGEMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt SGEMM_L4x4_SUB2 + +SGEMM_L4x4_SAVE: + + SAVE4x4 + +SGEMM_L4x4_END: + +SGEMM_L4x2_BEGIN: + + + andi. T1, M, 2 + ble SGEMM_L4x2_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L4x2_SUB4 + +SGEMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble SGEMM_L4x2_LOOP_END + + .align 5 + +SGEMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt SGEMM_L4x2_LOOP + +SGEMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b SGEMM_L4x2_SUB1 + +SGEMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b SGEMM_L4x2_SUB1 + +SGEMM_L4x2_SUB0: + + andi. L, K, 7 + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble SGEMM_L4x2_SAVE + b SGEMM_L4x2_SUB2 + +SGEMM_L4x2_SUB1: + + andi. L, K, 7 + ble SGEMM_L4x2_SAVE + +SGEMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt SGEMM_L4x2_SUB2 + +SGEMM_L4x2_SAVE: + + SAVE4x2 + +SGEMM_L4x2_END: + +SGEMM_L4x1_BEGIN: + + + andi. T1, M, 1 + ble SGEMM_L4x1_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L4x1_SUB4 + +SGEMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble SGEMM_L4x1_LOOP_END + + .align 5 + +SGEMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt SGEMM_L4x1_LOOP + +SGEMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b SGEMM_L4x1_SUB1 + +SGEMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b SGEMM_L4x1_SUB1 + +SGEMM_L4x1_SUB0: + + andi. L, K, 7 + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble SGEMM_L4x1_SAVE + b SGEMM_L4x1_SUB2 + +SGEMM_L4x1_SUB1: + + andi. L, K, 7 + ble SGEMM_L4x1_SAVE + +SGEMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. 
L, L, -1 + bgt SGEMM_L4x1_SUB2 + +SGEMM_L4x1_SAVE: + + SAVE4x1 + +SGEMM_L4x1_END: + + slwi T1, K, 4 + add B, B, T1 + +SGEMM_L4_END: +SGEMM_L2_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 1 + +SGEMM_L2_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge SGEMM_L2_COPYB + + andi. T1, N, 2 + ble SGEMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 4 + ble SGEMM_L2x16_END + +SGEMM_L2x16_BEGIN: + + + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L2x16_SUB4 + +SGEMM_L2x16_LOOP_START: + + dcbt AO, PRE + LOAD2x16_1 + KERNEL2x16_I1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble SGEMM_L2x16_LOOP_END + + .align 5 + +SGEMM_L2x16_LOOP: + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt SGEMM_L2x16_LOOP + +SGEMM_L2x16_LOOP_END: + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + KERNEL2x16_E2 + + b SGEMM_L2x16_SUB1 + +SGEMM_L2x16_SUB4: + + dcbt AO, PRE + KERNEL2x16_SUBI1 + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b SGEMM_L2x16_SUB1 + +SGEMM_L2x16_SUB0: + + andi. L, K, 7 + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble SGEMM_L2x16_SAVE + b SGEMM_L2x16_SUB2 + +SGEMM_L2x16_SUB1: + + andi. L, K, 7 + ble SGEMM_L2x16_SAVE + +SGEMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. L, L, -1 + bgt SGEMM_L2x16_SUB2 + +SGEMM_L2x16_SAVE: + + SAVE2x16 + + addic. I, I, -1 + bgt SGEMM_L2x16_BEGIN + +SGEMM_L2x16_END: + +SGEMM_L2x8_BEGIN: + + andi. T2, M, 15 + ble SGEMM_L2x1_END + + andi. T1, M, 8 + ble SGEMM_L2x8_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L2x8_SUB4 + +SGEMM_L2x8_LOOP_START: + + LOAD2x8_1 + KERNEL2x8_I1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -2 + ble SGEMM_L2x8_LOOP_END + + .align 5 + +SGEMM_L2x8_LOOP: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -1 + bgt SGEMM_L2x8_LOOP + +SGEMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b SGEMM_L2x8_SUB1 + +SGEMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b SGEMM_L2x8_SUB1 + +SGEMM_L2x8_SUB0: + + andi. L, K, 7 + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble SGEMM_L2x8_SAVE + b SGEMM_L2x8_SUB2 + +SGEMM_L2x8_SUB1: + + andi. L, K, 7 + ble SGEMM_L2x8_SAVE + +SGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. 
L, L, -1 + bgt SGEMM_L2x8_SUB2 + +SGEMM_L2x8_SAVE: + + SAVE2x8 + +SGEMM_L2x8_END: + +SGEMM_L2x4_BEGIN: + + + andi. T1, M, 4 + ble SGEMM_L2x4_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L2x4_SUB4 + +SGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble SGEMM_L2x4_LOOP_END + + .align 5 + +SGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt SGEMM_L2x4_LOOP + +SGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b SGEMM_L2x4_SUB1 + +SGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b SGEMM_L2x4_SUB1 + +SGEMM_L2x4_SUB0: + + andi. L, K, 7 + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble SGEMM_L2x4_SAVE + b SGEMM_L2x4_SUB2 + +SGEMM_L2x4_SUB1: + + andi. L, K, 7 + ble SGEMM_L2x4_SAVE + +SGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt SGEMM_L2x4_SUB2 + +SGEMM_L2x4_SAVE: + + SAVE2x4 + +SGEMM_L2x4_END: + +SGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble SGEMM_L2x2_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L2x2_SUB4 + +SGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble SGEMM_L2x2_LOOP_END + + .align 5 + +SGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt SGEMM_L2x2_LOOP + +SGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b SGEMM_L2x2_SUB1 + +SGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b SGEMM_L2x2_SUB1 + +SGEMM_L2x2_SUB0: + + andi. L, K, 7 + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble SGEMM_L2x2_SAVE + b SGEMM_L2x2_SUB2 + +SGEMM_L2x2_SUB1: + + andi. L, K, 7 + ble SGEMM_L2x2_SAVE + +SGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt SGEMM_L2x2_SUB2 + +SGEMM_L2x2_SAVE: + + SAVE2x2 + +SGEMM_L2x2_END: + +SGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble SGEMM_L2x1_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L2x1_SUB4 + +SGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble SGEMM_L2x1_LOOP_END + + .align 5 + +SGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt SGEMM_L2x1_LOOP + +SGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b SGEMM_L2x1_SUB1 + +SGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b SGEMM_L2x1_SUB1 + +SGEMM_L2x1_SUB0: + + andi. L, K, 7 + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble SGEMM_L2x1_SAVE + b SGEMM_L2x1_SUB2 + +SGEMM_L2x1_SUB1: + + andi. 
L, K, 7 + ble SGEMM_L2x1_SAVE + +SGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt SGEMM_L2x1_SUB2 + +SGEMM_L2x1_SAVE: + + SAVE2x1 + +SGEMM_L2x1_END: + + slwi T1, K, 3 + add B, B, T1 + +SGEMM_L2_END: +SGEMM_L1_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 0 + +SGEMM_L1_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge SGEMM_L1_COPYB + + andi. T1, N, 1 + ble SGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 4 + ble SGEMM_L1x16_END + +SGEMM_L1x16_BEGIN: + + + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L1x16_SUB4 + +SGEMM_L1x16_LOOP_START: + + dcbt AO, PRE + LOAD1x16_1 + KERNEL1x16_I1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble SGEMM_L1x16_LOOP_END + + .align 5 + +SGEMM_L1x16_LOOP: + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt SGEMM_L1x16_LOOP + +SGEMM_L1x16_LOOP_END: + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + KERNEL1x16_E2 + + b SGEMM_L1x16_SUB1 + +SGEMM_L1x16_SUB4: + + dcbt AO, PRE + KERNEL1x16_SUBI1 + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b SGEMM_L1x16_SUB1 + +SGEMM_L1x16_SUB0: + + andi. L, K, 7 + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble SGEMM_L1x16_SAVE + b SGEMM_L1x16_SUB2 + +SGEMM_L1x16_SUB1: + + andi. L, K, 7 + ble SGEMM_L1x16_SAVE + +SGEMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. L, L, -1 + bgt SGEMM_L1x16_SUB2 + +SGEMM_L1x16_SAVE: + + SAVE1x16 + + addic. I, I, -1 + bgt SGEMM_L1x16_BEGIN + +SGEMM_L1x16_END: + +SGEMM_L1x8_BEGIN: + + andi. T2, M, 15 + ble SGEMM_L1x1_END + + andi. T1, M, 8 + ble SGEMM_L1x8_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L1x8_SUB4 + +SGEMM_L1x8_LOOP_START: + + LOAD1x8_1 + KERNEL1x8_I1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -2 + ble SGEMM_L1x8_LOOP_END + + .align 5 + +SGEMM_L1x8_LOOP: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -1 + bgt SGEMM_L1x8_LOOP + +SGEMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b SGEMM_L1x8_SUB1 + +SGEMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b SGEMM_L1x8_SUB1 + +SGEMM_L1x8_SUB0: + + andi. L, K, 7 + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble SGEMM_L1x8_SAVE + b SGEMM_L1x8_SUB2 + +SGEMM_L1x8_SUB1: + + andi. 
L, K, 7 + ble SGEMM_L1x8_SAVE + +SGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt SGEMM_L1x8_SUB2 + +SGEMM_L1x8_SAVE: + + SAVE1x8 + +SGEMM_L1x8_END: + +SGEMM_L1x4_BEGIN: + + + andi. T1, M, 4 + ble SGEMM_L1x4_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L1x4_SUB4 + +SGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble SGEMM_L1x4_LOOP_END + + .align 5 + +SGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt SGEMM_L1x4_LOOP + +SGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b SGEMM_L1x4_SUB1 + +SGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b SGEMM_L1x4_SUB1 + +SGEMM_L1x4_SUB0: + + andi. L, K, 7 + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble SGEMM_L1x4_SAVE + b SGEMM_L1x4_SUB2 + +SGEMM_L1x4_SUB1: + + andi. L, K, 7 + ble SGEMM_L1x4_SAVE + +SGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt SGEMM_L1x4_SUB2 + +SGEMM_L1x4_SAVE: + + SAVE1x4 + +SGEMM_L1x4_END: + +SGEMM_L1x2_BEGIN: + + + andi. T1, M, 2 + ble SGEMM_L1x2_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L1x2_SUB4 + +SGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble SGEMM_L1x2_LOOP_END + + .align 5 + +SGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt SGEMM_L1x2_LOOP + +SGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b SGEMM_L1x2_SUB1 + +SGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b SGEMM_L1x2_SUB1 + +SGEMM_L1x2_SUB0: + + andi. L, K, 7 + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble SGEMM_L1x2_SAVE + b SGEMM_L1x2_SUB2 + +SGEMM_L1x2_SUB1: + + andi. L, K, 7 + ble SGEMM_L1x2_SAVE + +SGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt SGEMM_L1x2_SUB2 + +SGEMM_L1x2_SAVE: + + SAVE1x2 + +SGEMM_L1x2_END: + +SGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble SGEMM_L1x1_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L1x1_SUB4 + +SGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble SGEMM_L1x1_LOOP_END + + .align 5 + +SGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt SGEMM_L1x1_LOOP + +SGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b SGEMM_L1x1_SUB1 + +SGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b SGEMM_L1x1_SUB1 + +SGEMM_L1x1_SUB0: + + andi. L, K, 7 + + KERNEL1x1_SUBI1 + + addic. 
L, L, -1 + ble SGEMM_L1x1_SAVE + b SGEMM_L1x1_SUB2 + +SGEMM_L1x1_SUB1: + + andi. L, K, 7 + ble SGEMM_L1x1_SAVE + +SGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt SGEMM_L1x1_SUB2 + +SGEMM_L1x1_SAVE: + + SAVE1x1 + +SGEMM_L1x1_END: + +SGEMM_L1_END: diff --git a/kernel/power/sgemm_macros_16x8_power8.S b/kernel/power/sgemm_macros_16x8_power8.S new file mode 100644 index 000000000..71dc52979 --- /dev/null +++ b/kernel/power/sgemm_macros_16x8_power8.S @@ -0,0 +1,5888 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
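Two things tie the logic file above to the macro file that follows. First, each SGEMM_Lx_COPYB loop expands the packed B panel into BBUFFER, splatting every scalar with xxspltw so that one stored vector holds four copies of the same b value. Second, the macros keep an entire output tile in registers: in the N=8, M=16 case the accumulators vs32-vs63 hold all 16x8 results, and each xvmaddasp multiplies a 4-row slice of A by one splatted B value. A compact sketch of that broadcast-and-FMA step, with invented names (illustration only, not one of the committed files):

#include <stdio.h>

/* Illustration only: the SGEMM_Lx_COPYB idea.  Every scalar of the packed
 * B panel is broadcast into a 4-lane vector so the kernel can feed
 * xvmaddasp with a splatted B value against a 4-row vector of A. */
static void broadcast_b(const float *b, long len, float *bbuffer)
{
    for (long k = 0; k < len; k++)          /* one xxspltw + stxvw4x per scalar */
        for (int l = 0; l < 4; l++)
            bbuffer[4 * k + l] = b[k];
}

/* one 4x1 micro-update: c[0..3] += a[0..3] * b_splat, as xvmaddasp does */
static void fma_4x1(float c[4], const float a[4], const float b_splat[4])
{
    for (int l = 0; l < 4; l++)
        c[l] += a[l] * b_splat[l];
}

int main(void)
{
    float b[2] = { 3.0f, -1.0f };
    float bbuf[8];
    broadcast_b(b, 2, bbuf);

    float a[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    float c[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
    fma_4x1(c, a, &bbuf[0]);                /* c  = a * 3  */
    fma_4x1(c, a, &bbuf[4]);                /* c += a * -1 */
    printf("c = %g %g %g %g\n", c[0], c[1], c[2], c[3]);  /* 2 4 6 8 */
    return 0;
}

Broadcasting B once during the copy means the inner kernel issues only full-width vector loads and fused multiply-adds, with no per-iteration splat work.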
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=8 and M=16 +**********************************************************************************************/ + +.macro LOAD8x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + +.endm + +.macro KERNEL8x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs20, o0, T1 + lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 + + addi BO, BO, 128 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + xvmulsp vs48, vs0, vs12 + xvmulsp vs49, vs1, vs12 + xvmulsp vs50, vs2, vs12 + xvmulsp vs51, vs3, vs12 + + xvmulsp vs52, vs0, vs13 + xvmulsp vs53, vs1, vs13 + xvmulsp vs54, vs2, vs13 + xvmulsp vs55, vs3, vs13 + + xvmulsp vs56, vs0, vs14 + xvmulsp vs57, vs1, vs14 + xvmulsp vs58, vs2, vs14 + xvmulsp vs59, vs3, vs14 + + xvmulsp vs60, vs0, vs15 + xvmulsp vs61, vs1, vs15 + xvmulsp vs62, vs2, vs15 + xvmulsp vs63, vs3, vs15 + + +.endm + +.macro KERNEL8x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs20, o0, T1 + lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 + + addi BO, BO, 128 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + xvmaddasp vs48, vs0, vs12 + xvmaddasp vs49, vs1, vs12 + xvmaddasp vs50, vs2, vs12 + xvmaddasp vs51, vs3, vs12 + + xvmaddasp vs52, vs0, vs13 + xvmaddasp vs53, vs1, vs13 + xvmaddasp vs54, vs2, vs13 + xvmaddasp vs55, vs3, vs13 + + xvmaddasp vs56, vs0, vs14 + xvmaddasp vs57, vs1, vs14 + xvmaddasp vs58, vs2, vs14 + xvmaddasp vs59, vs3, vs14 + + xvmaddasp vs60, vs0, vs15 + xvmaddasp vs61, vs1, vs15 + xvmaddasp vs62, vs2, vs15 + xvmaddasp vs63, vs3, vs15 + + +.endm + +.macro KERNEL8x16_2 + + + lxvw4x vs0, o0, AO + 
lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + xvmaddasp vs48, vs4, vs20 + xvmaddasp vs49, vs5, vs20 + xvmaddasp vs50, vs6, vs20 + xvmaddasp vs51, vs7, vs20 + + xvmaddasp vs52, vs4, vs21 + xvmaddasp vs53, vs5, vs21 + xvmaddasp vs54, vs6, vs21 + xvmaddasp vs55, vs7, vs21 + + xvmaddasp vs56, vs4, vs22 + xvmaddasp vs57, vs5, vs22 + xvmaddasp vs58, vs6, vs22 + xvmaddasp vs59, vs7, vs22 + + xvmaddasp vs60, vs4, vs23 + xvmaddasp vs61, vs5, vs23 + xvmaddasp vs62, vs6, vs23 + xvmaddasp vs63, vs7, vs23 + + +.endm + +.macro KERNEL8x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + xvmaddasp vs48, vs4, vs20 + xvmaddasp vs49, vs5, vs20 + xvmaddasp vs50, vs6, vs20 + xvmaddasp vs51, vs7, vs20 + + xvmaddasp vs52, vs4, vs21 + xvmaddasp vs53, vs5, vs21 + xvmaddasp vs54, vs6, vs21 + xvmaddasp vs55, vs7, vs21 + + xvmaddasp vs56, vs4, vs22 + xvmaddasp vs57, vs5, vs22 + xvmaddasp vs58, vs6, vs22 + xvmaddasp vs59, vs7, vs22 + + xvmaddasp vs60, vs4, vs23 + xvmaddasp vs61, vs5, vs23 + xvmaddasp vs62, vs6, vs23 + xvmaddasp vs63, vs7, vs23 + + +.endm + +.macro KERNEL8x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + xvmulsp vs48, vs0, vs12 + xvmulsp vs49, vs1, vs12 + xvmulsp vs50, vs2, vs12 + xvmulsp vs51, vs3, vs12 + + xvmulsp vs52, vs0, vs13 + xvmulsp vs53, vs1, vs13 + xvmulsp vs54, vs2, vs13 + xvmulsp vs55, vs3, vs13 + + xvmulsp vs56, vs0, vs14 + xvmulsp vs57, vs1, vs14 + xvmulsp vs58, vs2, vs14 + xvmulsp vs59, vs3, vs14 + + xvmulsp vs60, vs0, vs15 + xvmulsp vs61, vs1, vs15 + xvmulsp vs62, vs2, vs15 + xvmulsp vs63, vs3, vs15 + + +.endm + +.macro KERNEL8x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi 
AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + xvmaddasp vs48, vs0, vs12 + xvmaddasp vs49, vs1, vs12 + xvmaddasp vs50, vs2, vs12 + xvmaddasp vs51, vs3, vs12 + + xvmaddasp vs52, vs0, vs13 + xvmaddasp vs53, vs1, vs13 + xvmaddasp vs54, vs2, vs13 + xvmaddasp vs55, vs3, vs13 + + xvmaddasp vs56, vs0, vs14 + xvmaddasp vs57, vs1, vs14 + xvmaddasp vs58, vs2, vs14 + xvmaddasp vs59, vs3, vs14 + + xvmaddasp vs60, vs0, vs15 + xvmaddasp vs61, vs1, vs15 + xvmaddasp vs62, vs2, vs15 + xvmaddasp vs63, vs3, vs15 + + +.endm + +.macro SAVE8x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + xvmulsp vs2, vs42, alpha_vr + xvmulsp vs3, vs43, alpha_vr +#else + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + xvmaddasp vs2, vs42, alpha_vr + xvmaddasp vs3, vs43, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + xvmulsp vs2, vs46, alpha_vr + xvmulsp vs3, vs47, alpha_vr +#else + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + xvmaddasp vs2, vs46, alpha_vr + xvmaddasp vs3, vs47, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs48, alpha_vr + xvmulsp vs1, vs49, alpha_vr + xvmulsp vs2, vs50, alpha_vr + xvmulsp vs3, vs51, alpha_vr +#else + 
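+	/* plain GEMM: vs0-vs3 still hold the C row loaded above, so the
+	   scaled accumulators are added on top of it */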
xvmaddasp vs0, vs48, alpha_vr + xvmaddasp vs1, vs49, alpha_vr + xvmaddasp vs2, vs50, alpha_vr + xvmaddasp vs3, vs51, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs52, alpha_vr + xvmulsp vs1, vs53, alpha_vr + xvmulsp vs2, vs54, alpha_vr + xvmulsp vs3, vs55, alpha_vr +#else + xvmaddasp vs0, vs52, alpha_vr + xvmaddasp vs1, vs53, alpha_vr + xvmaddasp vs2, vs54, alpha_vr + xvmaddasp vs3, vs55, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs56, alpha_vr + xvmulsp vs1, vs57, alpha_vr + xvmulsp vs2, vs58, alpha_vr + xvmulsp vs3, vs59, alpha_vr +#else + xvmaddasp vs0, vs56, alpha_vr + xvmaddasp vs1, vs57, alpha_vr + xvmaddasp vs2, vs58, alpha_vr + xvmaddasp vs3, vs59, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs60, alpha_vr + xvmulsp vs1, vs61, alpha_vr + xvmulsp vs2, vs62, alpha_vr + xvmulsp vs3, vs63, alpha_vr +#else + xvmaddasp vs0, vs60, alpha_vr + xvmaddasp vs1, vs61, alpha_vr + xvmaddasp vs2, vs62, alpha_vr + xvmaddasp vs3, vs63, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=8 +**********************************************************************************************/ + +.macro LOAD8x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + +.endm + +.macro KERNEL8x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs20, o0, T1 + lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 + + addi BO, BO, 128 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + xvmulsp vs40, vs0, vs12 + xvmulsp vs41, vs1, vs12 + + xvmulsp vs42, vs0, vs13 + xvmulsp vs43, vs1, vs13 + + xvmulsp vs44, vs0, vs14 + xvmulsp vs45, vs1, vs14 + + xvmulsp vs46, vs0, vs15 + xvmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs20, o0, T1 + lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 + + addi BO, BO, 128 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + 
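+	/* the remaining B values vs9-vs15 update the accumulator pairs
+	   vs34/vs35 ... vs46/vs47, two vectors (8 floats of A) per B value */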
+ xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + xvmaddasp vs40, vs0, vs12 + xvmaddasp vs41, vs1, vs12 + + xvmaddasp vs42, vs0, vs13 + xvmaddasp vs43, vs1, vs13 + + xvmaddasp vs44, vs0, vs14 + xvmaddasp vs45, vs1, vs14 + + xvmaddasp vs46, vs0, vs15 + xvmaddasp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + xvmaddasp vs40, vs4, vs20 + xvmaddasp vs41, vs5, vs20 + + xvmaddasp vs42, vs4, vs21 + xvmaddasp vs43, vs5, vs21 + + xvmaddasp vs44, vs4, vs22 + xvmaddasp vs45, vs5, vs22 + + xvmaddasp vs46, vs4, vs23 + xvmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + xvmaddasp vs40, vs4, vs20 + xvmaddasp vs41, vs5, vs20 + + xvmaddasp vs42, vs4, vs21 + xvmaddasp vs43, vs5, vs21 + + xvmaddasp vs44, vs4, vs22 + xvmaddasp vs45, vs5, vs22 + + xvmaddasp vs46, vs4, vs23 + xvmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + xvmulsp vs40, vs0, vs12 + xvmulsp vs41, vs1, vs12 + + xvmulsp vs42, vs0, vs13 + xvmulsp vs43, vs1, vs13 + + xvmulsp vs44, vs0, vs14 + xvmulsp vs45, vs1, vs14 + + xvmulsp vs46, vs0, vs15 + xvmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + xvmaddasp vs40, vs0, vs12 + xvmaddasp vs41, vs1, vs12 + + xvmaddasp vs42, vs0, vs13 + xvmaddasp vs43, vs1, vs13 + + xvmaddasp vs44, vs0, vs14 + xvmaddasp vs45, vs1, vs14 + + xvmaddasp vs46, vs0, vs15 + xvmaddasp vs47, vs1, vs15 + + +.endm + +.macro SAVE8x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, 
alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs38, alpha_vr + xvmulsp vs1, vs39, alpha_vr +#else + xvmaddasp vs0, vs38, alpha_vr + xvmaddasp vs1, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr +#else + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs42, alpha_vr + xvmulsp vs1, vs43, alpha_vr +#else + xvmaddasp vs0, vs42, alpha_vr + xvmaddasp vs1, vs43, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr +#else + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs46, alpha_vr + xvmulsp vs1, vs47, alpha_vr +#else + xvmaddasp vs0, vs46, alpha_vr + xvmaddasp vs1, vs47, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=4 +**********************************************************************************************/ + +.macro LOAD8x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + +.endm + +.macro KERNEL8x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs20, o0, T1 + lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 + + addi BO, BO, 128 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + xvmulsp vs36, vs0, vs12 + + xvmulsp vs37, vs0, vs13 + + xvmulsp vs38, vs0, vs14 + + xvmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs20, o0, T1 + 
lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 + + addi BO, BO, 128 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + xvmaddasp vs36, vs0, vs12 + + xvmaddasp vs37, vs0, vs13 + + xvmaddasp vs38, vs0, vs14 + + xvmaddasp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + xvmaddasp vs36, vs4, vs20 + + xvmaddasp vs37, vs4, vs21 + + xvmaddasp vs38, vs4, vs22 + + xvmaddasp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + xvmaddasp vs36, vs4, vs20 + + xvmaddasp vs37, vs4, vs21 + + xvmaddasp vs38, vs4, vs22 + + xvmaddasp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + xvmulsp vs36, vs0, vs12 + + xvmulsp vs37, vs0, vs13 + + xvmulsp vs38, vs0, vs14 + + xvmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + xvmaddasp vs36, vs0, vs12 + + xvmaddasp vs37, vs0, vs13 + + xvmaddasp vs38, vs0, vs14 + + xvmaddasp vs39, vs0, vs15 + + +.endm + +.macro SAVE8x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs33, alpha_vr +#else + xvmaddasp vs0, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs35, alpha_vr +#else + xvmaddasp vs0, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs37, alpha_vr +#else + xvmaddasp vs0, vs37, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + 
+#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs38, alpha_vr +#else + xvmaddasp vs0, vs38, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs39, alpha_vr +#else + xvmaddasp vs0, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=2 +**********************************************************************************************/ + +.macro LOAD8x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + addi T1, T1, 64 + + lxsspx vs12, o0, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 + + + addi BO, BO, 128 + +.endm + +.macro KERNEL8x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 + + addi T1, T1, 64 + + lxsspx vs20, o0, T1 + lxsspx vs21, o16, T1 + lxsspx vs22, o32, T1 + lxsspx vs23, o48, T1 + + + addi BO, BO, 128 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + xsmuldp vs40, vs0, vs12 + xsmuldp vs41, vs1, vs12 + + xsmuldp vs42, vs0, vs13 + xsmuldp vs43, vs1, vs13 + + xsmuldp vs44, vs0, vs14 + xsmuldp vs45, vs1, vs14 + + xsmuldp vs46, vs0, vs15 + xsmuldp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 + + addi T1, T1, 64 + + lxsspx vs20, o0, T1 + lxsspx vs21, o16, T1 + lxsspx vs22, o32, T1 + lxsspx vs23, o48, T1 + + + addi BO, BO, 128 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + xsmaddadp vs40, vs0, vs12 + xsmaddadp vs41, vs1, vs12 + + xsmaddadp vs42, vs0, vs13 + xsmaddadp vs43, vs1, vs13 + + xsmaddadp vs44, vs0, vs14 + xsmaddadp vs45, vs1, vs14 + + xsmaddadp vs46, vs0, vs15 + xsmaddadp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + addi T1, T1, 64 + + lxsspx vs12, o0, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 + + + addi BO, BO, 128 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 + + xsmaddadp vs40, vs4, vs20 + xsmaddadp vs41, vs5, vs20 + + xsmaddadp vs42, vs4, vs21 + xsmaddadp vs43, vs5, vs21 + + xsmaddadp vs44, vs4, vs22 + xsmaddadp vs45, vs5, vs22 + + xsmaddadp vs46, vs4, vs23 + xsmaddadp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x2_E2 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + 
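+	/* the M=2 tail works on scalars: lxsspx converts each single-precision
+	   element to double-precision form in the VSR, so xsmaddadp/xsmuldp are
+	   used here and stxsspx rounds back to single precision on store */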
xsmaddadp vs39, vs5, vs19 + + xsmaddadp vs40, vs4, vs20 + xsmaddadp vs41, vs5, vs20 + + xsmaddadp vs42, vs4, vs21 + xsmaddadp vs43, vs5, vs21 + + xsmaddadp vs44, vs4, vs22 + xsmaddadp vs45, vs5, vs22 + + xsmaddadp vs46, vs4, vs23 + xsmaddadp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + addi T1, T1, 64 + + lxsspx vs12, o0, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 + + + addi BO, BO, 128 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + xsmuldp vs40, vs0, vs12 + xsmuldp vs41, vs1, vs12 + + xsmuldp vs42, vs0, vs13 + xsmuldp vs43, vs1, vs13 + + xsmuldp vs44, vs0, vs14 + xsmuldp vs45, vs1, vs14 + + xsmuldp vs46, vs0, vs15 + xsmuldp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + addi T1, T1, 64 + + lxsspx vs12, o0, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 + + + addi BO, BO, 128 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + xsmaddadp vs40, vs0, vs12 + xsmaddadp vs41, vs1, vs12 + + xsmaddadp vs42, vs0, vs13 + xsmaddadp vs43, vs1, vs13 + + xsmaddadp vs44, vs0, vs14 + xsmaddadp vs45, vs1, vs14 + + xsmaddadp vs46, vs0, vs15 + xsmaddadp vs47, vs1, vs15 + + +.endm + +.macro SAVE8x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r + xsmuldp vs1, vs35, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs36, alpha_r + xsmuldp vs1, vs37, alpha_r +#else + xsmaddadp vs0, vs36, alpha_r + xsmaddadp vs1, vs37, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs38, alpha_r + xsmuldp vs1, vs39, alpha_r +#else + xsmaddadp vs0, vs38, alpha_r + xsmaddadp vs1, vs39, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs40, alpha_r + xsmuldp vs1, vs41, alpha_r +#else + xsmaddadp vs0, vs40, alpha_r + xsmaddadp vs1, vs41, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs42, alpha_r + xsmuldp vs1, vs43, alpha_r +#else + xsmaddadp 
vs0, vs42, alpha_r + xsmaddadp vs1, vs43, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs44, alpha_r + xsmuldp vs1, vs45, alpha_r +#else + xsmaddadp vs0, vs44, alpha_r + xsmaddadp vs1, vs45, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs46, alpha_r + xsmuldp vs1, vs47, alpha_r +#else + xsmaddadp vs0, vs46, alpha_r + xsmaddadp vs1, vs47, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=1 +**********************************************************************************************/ + +.macro LOAD8x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + addi T1, T1, 64 + + lxsspx vs12, o0, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 + + + addi BO, BO, 128 + +.endm + +.macro KERNEL8x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 + + addi T1, T1, 64 + + lxsspx vs20, o0, T1 + lxsspx vs21, o16, T1 + lxsspx vs22, o32, T1 + lxsspx vs23, o48, T1 + + + addi BO, BO, 128 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 + + xsmuldp vs36, vs0, vs12 + + xsmuldp vs37, vs0, vs13 + + xsmuldp vs38, vs0, vs14 + + xsmuldp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 + + addi T1, T1, 64 + + lxsspx vs20, o0, T1 + lxsspx vs21, o16, T1 + lxsspx vs22, o32, T1 + lxsspx vs23, o48, T1 + + + addi BO, BO, 128 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + xsmaddadp vs36, vs0, vs12 + + xsmaddadp vs37, vs0, vs13 + + xsmaddadp vs38, vs0, vs14 + + xsmaddadp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + addi T1, T1, 64 + + lxsspx vs12, o0, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 + + + addi BO, BO, 128 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + xsmaddadp vs36, vs4, vs20 + + xsmaddadp vs37, vs4, vs21 + + xsmaddadp vs38, vs4, vs22 + + xsmaddadp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x1_E2 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + xsmaddadp vs36, vs4, vs20 + + xsmaddadp vs37, vs4, vs21 + + xsmaddadp vs38, vs4, vs22 + + xsmaddadp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + addi T1, T1, 64 + + lxsspx vs12, o0, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 + + + addi BO, BO, 128 + + + xsmuldp 
vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 + + xsmuldp vs36, vs0, vs12 + + xsmuldp vs37, vs0, vs13 + + xsmuldp vs38, vs0, vs14 + + xsmuldp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + addi T1, T1, 64 + + lxsspx vs12, o0, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 + + + addi BO, BO, 128 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + xsmaddadp vs36, vs0, vs12 + + xsmaddadp vs37, vs0, vs13 + + xsmaddadp vs38, vs0, vs14 + + xsmaddadp vs39, vs0, vs15 + + +.endm + +.macro SAVE8x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs33, alpha_r +#else + xsmaddadp vs0, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs35, alpha_r +#else + xsmaddadp vs0, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs36, alpha_r +#else + xsmaddadp vs0, vs36, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs37, alpha_r +#else + xsmaddadp vs0, vs37, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs38, alpha_r +#else + xsmaddadp vs0, vs38, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs39, alpha_r +#else + xsmaddadp vs0, vs39, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro LOAD4x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + +.endm + +.macro KERNEL4x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi BO, BO, 64 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, 
vs11 + xvmulsp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi BO, BO, 64 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + +.endm + +.macro KERNEL4x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + +.endm + +.macro KERNEL4x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + +.endm + +.macro SAVE4x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif 
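+	/* for TRMM the C tile is not read, so the result is just alpha * acc;
+	   otherwise the 64-byte row loaded into vs0-vs3 above is updated */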
+ +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + xvmulsp vs2, vs42, alpha_vr + xvmulsp vs3, vs43, alpha_vr +#else + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + xvmaddasp vs2, vs42, alpha_vr + xvmaddasp vs3, vs43, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + xvmulsp vs2, vs46, alpha_vr + xvmulsp vs3, vs47, alpha_vr +#else + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + xvmaddasp vs2, vs46, alpha_vr + xvmaddasp vs3, vs47, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + +.endm + +.macro KERNEL4x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi BO, BO, 64 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi BO, BO, 64 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + + + 
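+	/* consume the vs4/vs5 and vs16-vs19 operands staged by KERNEL4x8_1;
+	   the loads above already fetched the next vs0/vs1 and vs8-vs11 set */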
xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + +.endm + +.macro SAVE4x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs38, alpha_vr + xvmulsp vs1, vs39, alpha_vr +#else + xvmaddasp vs0, vs38, alpha_vr + xvmaddasp vs1, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + +.endm + +.macro KERNEL4x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi BO, BO, 64 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, 
T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi BO, BO, 64 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs33, alpha_vr +#else + xvmaddasp vs0, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs35, alpha_vr +#else + xvmaddasp vs0, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro LOAD4x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + + addi BO, BO, 64 + +.endm + +.macro KERNEL4x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 + + + addi BO, BO, 64 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 + + + addi BO, BO, 64 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_2 + + + 
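+/* second pipeline stage for the scalar M=2 tail: reload vs0/vs1 and vs8-vs11
+   while the xsmaddadp chain below consumes vs4/vs5 and vs16-vs19 */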
lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + + addi BO, BO, 64 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x2_E2 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + + addi BO, BO, 64 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + + addi BO, BO, 64 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r + xsmuldp vs1, vs35, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs36, alpha_r + xsmuldp vs1, vs37, alpha_r +#else + xsmaddadp vs0, vs36, alpha_r + xsmaddadp vs1, vs37, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs38, alpha_r + xsmuldp vs1, vs39, alpha_r +#else + xsmaddadp vs0, vs38, alpha_r + xsmaddadp vs1, vs39, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro LOAD4x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + + addi BO, BO, 64 + +.endm + +.macro KERNEL4x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 + + + addi BO, BO, 64 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, 
vs10 + + xsmuldp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 + + + addi BO, BO, 64 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + + addi BO, BO, 64 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + + addi BO, BO, 64 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + + addi BO, BO, 64 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs33, alpha_r +#else + xsmaddadp vs0, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs35, alpha_r +#else + xsmaddadp vs0, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=16 +**********************************************************************************************/ + +.macro LOAD2x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + +.endm + +.macro KERNEL2x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + 
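+	/* second B value (vs9) against the same four A vectors fills vs36-vs39 */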
xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + +.endm + +.macro KERNEL2x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + +.endm + +.macro SAVE2x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + +.endm + +.macro KERNEL2x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_1 + + + lxvw4x vs4, o0, AO + 
lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + +.endm + +.macro KERNEL2x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + +.endm + +.macro SAVE2x4 + + mr T1, CO + 
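+	/* T1 walks the two rows of the 4-wide C tile, advancing by LDC after
+	   each store; CO itself moves on by 16 bytes at the end of the macro */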
+#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs33, alpha_vr +#else + xvmaddasp vs0, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + + + addi BO, BO, 32 + +.endm + +.macro KERNEL2x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + + + addi BO, BO, 32 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x2_E2 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r + xsmuldp vs1, vs35, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + + + addi BO, BO, 32 + +.endm + +.macro KERNEL2x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + 
+ lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + + + addi BO, BO, 32 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs33, alpha_r +#else + xsmaddadp vs0, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +.macro LOAD1x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + + addi BO, BO, 16 + +.endm + +.macro KERNEL1x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs16, o0, T1 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs16, o0, T1 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + +.endm + +.macro KERNEL1x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, 
T1 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + +.endm + +.macro SAVE1x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + + addi BO, BO, 16 + +.endm + +.macro KERNEL1x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs16, o0, T1 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs16, o0, T1 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + + addi BO, BO, 16 + +.endm + +.macro KERNEL1x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs16, o0, T1 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs16, o0, T1 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + 
+.endm + +.macro KERNEL1x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + + addi BO, BO, 16 + +.endm + +.macro KERNEL1x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + + addi BO, BO, 16 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x2_E2 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + + addi BO, BO, 16 + +.endm + +.macro KERNEL1x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + + addi BO, BO, 16 + + + xsmaddadp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + + addi BO, 
BO, 16 + + + xsmuldp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + diff --git a/kernel/power/srot.c b/kernel/power/srot.c new file mode 100644 index 000000000..d464846a4 --- /dev/null +++ b/kernel/power/srot.c @@ -0,0 +1,167 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
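Throughout the SAVE*x* macros above, the TRMMKERNEL conditional selects between overwriting C with alpha times the accumulators (xvmulsp / xsmuldp, no load of C) and adding alpha times the accumulators to the existing C (load C, then xvmaddasp / xsmaddadp). A minimal C sketch of that per-element choice, purely for illustration (the names are hypothetical, not part of the patch):

    /* Per-element meaning of the two SAVE paths selected by TRMMKERNEL. */
    static inline float save_elem(float c_old, float acc, float alpha, int trmmkernel)
    {
        if (trmmkernel)
            return alpha * acc;          /* TRMM path: C is overwritten        */
        return c_old + alpha * acc;      /* GEMM-style path: accumulate into C */
    }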
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/26 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + +#include "common.h" + +#pragma GCC optimize "O1" + +#if defined(POWER8) +#include "srot_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +static void srot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3; + FLOAT x00, x01, x02, x03; + FLOAT g0, g1, g2, g3; + FLOAT y00, y01, y02, y03; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT c1=*c; + FLOAT s1=*s; + + while ( i 0 ) + { + c1[0]=c; + c1[1]=c; + c1[2]=c; + c1[3]=c; + s1[0]=s; + s1[1]=s; + s1[2]=s; + s1[3]=s; + srot_kernel_16(n1, x1, y1, c1, s1); + i=n1; + } + + while(i < n) + { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + +} + + diff --git a/kernel/power/srot_microk_power8.c b/kernel/power/srot_microk_power8.c new file mode 100644 index 000000000..ade65500f --- /dev/null +++ b/kernel/power/srot_microk_power8.c @@ -0,0 +1,208 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
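For orientation, the operation that the new kernel/power/srot.c implements is the standard BLAS plane rotation: each pair (x[i], y[i]) becomes (c*x[i] + s*y[i], c*y[i] - s*x[i]). A minimal scalar sketch of that update with hypothetical names (the file's own unit-stride fallback follows the same recurrence, just unrolled by 16):

    #include <stddef.h>

    /* Reference plane rotation for contiguous x and y:
     * x <- c*x + s*y,  y <- c*y - s*x, element by element. */
    static void srot_ref(size_t n, float *x, float *y, float c, float s)
    {
        for (size_t i = 0; i < n; i++) {
            float t = c * x[i] + s * y[i];
            y[i]    = c * y[i] - s * x[i];
            x[i]    = t;
        }
    }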
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* +* I don't use fused multiply-add ( precision problems with lapack ) +* +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void srot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) __attribute__ ((noinline)); + +static void srot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *x2=x+1; + FLOAT *y2=y+1; + + __asm__ __volatile__ + ( + + "lxvw4x 36 , 0, %3 \n\t" // load c + "lxvw4x 37 , 0, %4 \n\t" // load s + "addi %8 , %8, -4 \n\t" + "addi %9 , %9, -4 \n\t" + + "lxvw4x 32, 0, %1 \n\t" // load x + "lxvw4x 33, %5, %1 \n\t" + "lxvw4x 34, %6, %1 \n\t" + "lxvw4x 35, %7, %1 \n\t" + + "lxvw4x 40, 0, %2 \n\t" // load y + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + + "addi %1, %1, 64 \n\t" + "addi %2, %2, 64 \n\t" + + "addic. %0 , %0 , -16 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "xvmulsp 48, 32, 36 \n\t" // c * x + "xvmulsp 49, 33, 36 \n\t" + "xvmulsp 50, 34, 36 \n\t" + "xvmulsp 51, 35, 36 \n\t" + + "xvmulsp 56, 40, 36 \n\t" // c * y + "xvmulsp 57, 41, 36 \n\t" + "xvmulsp 58, 42, 36 \n\t" + "xvmulsp 59, 43, 36 \n\t" + + "xvmulsp 52, 32, 37 \n\t" // s * x + "xvmulsp 53, 33, 37 \n\t" + + "lxvw4x 32, 0, %1 \n\t" // load x + "lxvw4x 33, %5, %1 \n\t" + + "xvmulsp 54, 34, 37 \n\t" + "xvmulsp 55, 35, 37 \n\t" + + "lxvw4x 34, %6, %1 \n\t" + "lxvw4x 35, %7, %1 \n\t" + + "xvmulsp 60, 40, 37 \n\t" // s * y + "xvmulsp 61, 41, 37 \n\t" + + "lxvw4x 40, 0, %2 \n\t" // load y + "lxvw4x 41, %5, %2 \n\t" + + "xvmulsp 62, 42, 37 \n\t" + "xvmulsp 63, 43, 37 \n\t" + + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + + "xvaddsp 48, 48 , 60 \n\t" // c * x + s * y + "xvaddsp 49, 49 , 61 \n\t" // c * x + s * y + + "addi %1, %1, 64 \n\t" + "addi %2, %2, 64 \n\t" + + "xvaddsp 50, 50 , 62 \n\t" // c * x + s * y + "xvaddsp 51, 51 , 63 \n\t" // c * x + s * y + + "xvsubsp 56, 56 , 52 \n\t" // c * y - s * x + "xvsubsp 57, 57 , 53 \n\t" // c * y - s * x + "xvsubsp 58, 58 , 54 \n\t" // c * y - s * x + "xvsubsp 59, 59 , 55 \n\t" // c * y - s * x + + "stxvw4x 48, 0, %8 \n\t" // store x + "stxvw4x 49, %5, %8 \n\t" + "stxvw4x 50, %6, %8 \n\t" + "stxvw4x 51, %7, %8 \n\t" + + "stxvw4x 56, 0, %9 \n\t" // store y + "stxvw4x 57, %5, %9 \n\t" + "stxvw4x 58, %6, %9 \n\t" + "stxvw4x 59, %7, %9 \n\t" + + "addi %8, %8, 64 \n\t" + "addi %9, %9, 64 \n\t" + + "addic. 
%0 , %0 , -16 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmulsp 48, 32, 36 \n\t" // c * x + "xvmulsp 49, 33, 36 \n\t" + "xvmulsp 50, 34, 36 \n\t" + "xvmulsp 51, 35, 36 \n\t" + + "xvmulsp 56, 40, 36 \n\t" // c * y + "xvmulsp 57, 41, 36 \n\t" + "xvmulsp 58, 42, 36 \n\t" + "xvmulsp 59, 43, 36 \n\t" + + "xvmulsp 52, 32, 37 \n\t" // s * x + "xvmulsp 53, 33, 37 \n\t" + "xvmulsp 54, 34, 37 \n\t" + "xvmulsp 55, 35, 37 \n\t" + + "xvmulsp 60, 40, 37 \n\t" // s * y + "xvmulsp 61, 41, 37 \n\t" + "xvmulsp 62, 42, 37 \n\t" + "xvmulsp 63, 43, 37 \n\t" + + "xvaddsp 48, 48 , 60 \n\t" // c * x + s * y + "xvaddsp 49, 49 , 61 \n\t" // c * x + s * y + "xvaddsp 50, 50 , 62 \n\t" // c * x + s * y + "xvaddsp 51, 51 , 63 \n\t" // c * x + s * y + + "xvsubsp 56, 56 , 52 \n\t" // c * y - s * x + "xvsubsp 57, 57 , 53 \n\t" // c * y - s * x + "xvsubsp 58, 58 , 54 \n\t" // c * y - s * x + "xvsubsp 59, 59 , 55 \n\t" // c * y - s * x + + "stxvw4x 48, 0, %8 \n\t" // store x + "stxvw4x 49, %5, %8 \n\t" + "stxvw4x 50, %6, %8 \n\t" + "stxvw4x 51, %7, %8 \n\t" + + "stxvw4x 56, 0, %9 \n\t" // store y + "stxvw4x 57, %5, %9 \n\t" + "stxvw4x 58, %6, %9 \n\t" + "stxvw4x 59, %7, %9 \n\t" + + + + : + : + "r" (i), // 0 + "r" (x1), // 1 + "r" (y1), // 2 + "r" (c), // 3 + "r" (s), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (x2), // 8 + "r" (y2) // 9 + : "cr0", "%0", "%1" , "%2", "%8", "%9", "memory" + ); + +} + + diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c new file mode 100644 index 000000000..c6ef5e969 --- /dev/null +++ b/kernel/power/sscal.c @@ -0,0 +1,179 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
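The header note in srot_microk_power8.c about not using fused multiply-add matches the instruction selection above: the kernel forms c*x, s*y, c*y and s*x with xvmulsp and then combines them with xvaddsp/xvsubsp, so every product is rounded before the add or subtract. A hedged C illustration of the numerical difference (assuming the compiler does not contract the first form into an fma, e.g. with -ffp-contract=off):

    #include <math.h>

    /* Both products rounded to float, then one more rounding on the add;
     * this mirrors the xvmulsp + xvaddsp sequence in the kernel above.   */
    static inline float rot_x_separate(float c, float s, float x, float y)
    {
        return (c * x) + (s * y);
    }

    /* Fused form shown only for contrast: c*x stays unrounded until the
     * final add, which can change the last bit relative to the reference
     * results the LAPACK tests compare against.                          */
    static inline float rot_x_fused(float c, float s, float x, float y)
    {
        return fmaf(c, x, s * y);
    }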
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "sscal_microk_power8.c" +#endif + + +#if !defined(HAVE_KERNEL_16) + +static void sscal_kernel_16( BLASLONG n, FLOAT *da , FLOAT *x ) +{ + + BLASLONG i; + FLOAT alpha = *da; + + for( i=0; i 0 ) + { + alpha[0]=da; + alpha[1]=da; + alpha[2]=da; + alpha[3]=da; + sscal_kernel_16_zero(n1 , alpha , x); + j=n1; + } + + while(j < n) + { + + x[j]=0.0; + j++; + } + + } + else + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + alpha[0]=da; + alpha[1]=da; + alpha[2]=da; + alpha[3]=da; + sscal_kernel_16(n1 , alpha , x); + j=n1; + } + while(j < n) + { + + x[j] = da * x[j] ; + j++; + } + } + + + } + else + { + + if ( da == 0.0 ) + { + + while(j < n) + { + + x[i]=0.0; + i += inc_x ; + j++; + } + + } + else + { + + while(j < n) + { + + x[i] = da * x[i] ; + i += inc_x ; + j++; + } + } + + } + return 0; + +} + + diff --git a/kernel/power/sscal_microk_power8.c b/kernel/power/sscal_microk_power8.c new file mode 100644 index 000000000..963cec777 --- /dev/null +++ b/kernel/power/sscal_microk_power8.c @@ -0,0 +1,218 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
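The driver logic in the new kernel/power/sscal.c splits the contiguous case into a vector kernel on the largest multiple-of-32 prefix plus a scalar tail, with a separate zero-fill path when the scalar is 0. A simplified sketch of that control flow (hypothetical names; the real file splats the scalar into a 4-element aligned array before calling sscal_kernel_16 or sscal_kernel_16_zero):

    /* Outline of the SSCAL driver: vectorizable prefix of n & -32 elements,
     * scalar tail, and a dedicated path for da == 0.                        */
    static void sscal_outline(long n, float da, float *x, long inc_x)
    {
        if (inc_x == 1) {
            long n1 = n & -32;                 /* prefix handled by the vector kernel */
            long j  = n1;                      /* scalar tail starts here             */
            if (da == 0.0f) {
                for (long k = 0; k < n1; k++) x[k] = 0.0f;  /* stands in for sscal_kernel_16_zero */
                for (; j < n; j++) x[j] = 0.0f;
            } else {
                for (long k = 0; k < n1; k++) x[k] *= da;   /* stands in for sscal_kernel_16      */
                for (; j < n; j++) x[j] *= da;
            }
        } else {
            long i = 0;
            for (long j = 0; j < n; j++, i += inc_x)
                x[i] = (da == 0.0f) ? 0.0f : da * x[i];
        }
    }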
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void sscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void sscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *x2=x+1; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "lxvw4x 32, 0, %3 \n\t" + "addi %1, %1, -4 \n\t" + + "dcbt %2, %4 \n\t" + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %4 \n\t" + + "xvmulsp 48, 40, 32 \n\t" + "xvmulsp 49, 41, 32 \n\t" + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "xvmulsp 50, 42, 32 \n\t" + "xvmulsp 51, 43, 32 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "xvmulsp 52, 44, 32 \n\t" + "xvmulsp 53, 45, 32 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "xvmulsp 54, 46, 32 \n\t" + "xvmulsp 55, 47, 32 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "stxvw4x 48, 0, %1 \n\t" + "stxvw4x 49, %5, %1 \n\t" + "stxvw4x 50, %6, %1 \n\t" + "stxvw4x 51, %7, %1 \n\t" + "stxvw4x 52, %8, %1 \n\t" + "stxvw4x 53, %9, %1 \n\t" + "stxvw4x 54, %10, %1 \n\t" + "stxvw4x 55, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmulsp 48, 40, 32 \n\t" + "xvmulsp 49, 41, 32 \n\t" + "xvmulsp 50, 42, 32 \n\t" + "xvmulsp 51, 43, 32 \n\t" + "xvmulsp 52, 44, 32 \n\t" + "xvmulsp 53, 45, 32 \n\t" + "xvmulsp 54, 46, 32 \n\t" + "xvmulsp 55, 47, 32 \n\t" + + "stxvw4x 48, 0, %1 \n\t" + "stxvw4x 49, %5, %1 \n\t" + "stxvw4x 50, %6, %1 \n\t" + "stxvw4x 51, %7, %1 \n\t" + "stxvw4x 52, %8, %1 \n\t" + "stxvw4x 53, %9, %1 \n\t" + "stxvw4x 54, %10, %1 \n\t" + "stxvw4x 55, %11, %1 \n\t" + + : + : + "r" (i), // 0 + "r" (x2), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + +static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *x2=x+1; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "xxlxor 32 , 32 , 32 \n\t" + "addi %1, %1, -4 \n\t" + + + ".align 5 \n\t" + "1: \n\t" + + "stxvw4x 32, 0, %1 \n\t" + "stxvw4x 32, %5, %1 \n\t" + "stxvw4x 32, %6, %1 \n\t" + "stxvw4x 32, %7, %1 \n\t" + "stxvw4x 32, %8, %1 \n\t" + "stxvw4x 32, %9, %1 \n\t" + "stxvw4x 32, %10, %1 \n\t" + "stxvw4x 32, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "addic. 
%0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + : + : + "r" (i), // 0 + "r" (x2), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c new file mode 100644 index 000000000..932652b37 --- /dev/null +++ b/kernel/power/sswap.c @@ -0,0 +1,154 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "sswap_microk_power8.c" +#endif + +#ifndef HAVE_KERNEL_32 + +static void sswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT g0, g1, g2, g3, g4, g5, g6, g7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + sswap_kernel_32(n1, x, y); + i=n1; + } + + while(i < n) + { + temp = y[i]; + y[i] = x[i] ; + x[i] = temp; + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = y[iy]; + y[iy] = x[ix] ; + x[ix] = temp; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/sswap_microk_power8.c b/kernel/power/sswap_microk_power8.c new file mode 100644 index 000000000..c48e743de --- /dev/null +++ b/kernel/power/sswap_microk_power8.c @@ -0,0 +1,136 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
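The new kernel/power/sswap.c follows the same shape as sscal.c: when both increments are 1 it hands the first n & -32 elements to sswap_kernel_32 and finishes with a scalar loop, otherwise it swaps element by element with the given strides. A minimal sketch of the element-wise behaviour it implements (illustrative only):

    /* Strided element-wise swap of two single-precision vectors. */
    static void sswap_ref(long n, float *x, long inc_x, float *y, long inc_y)
    {
        long ix = 0, iy = 0;
        for (long i = 0; i < n; i++) {
            float t = y[iy];
            y[iy]   = x[ix];
            x[ix]   = t;
            ix += inc_x;
            iy += inc_y;
        }
    }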
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_32 1 + +static void sswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *x2=x+1; + FLOAT *y2=y+1; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "addi %3, %3, -4 \n\t" + "addi %4, %4, -4 \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "lxvw4x 32, 0, %2 \n\t" + "lxvw4x 33, %5, %2 \n\t" + "lxvw4x 34, %6, %2 \n\t" + "lxvw4x 35, %7, %2 \n\t" + "lxvw4x 36, %8, %2 \n\t" + "lxvw4x 37, %9, %2 \n\t" + "lxvw4x 38, %10, %2 \n\t" + "lxvw4x 39, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvw4x 48, 0, %1 \n\t" + "lxvw4x 49, %5, %1 \n\t" + "lxvw4x 50, %6, %1 \n\t" + "lxvw4x 51, %7, %1 \n\t" + "lxvw4x 52, %8, %1 \n\t" + "lxvw4x 53, %9, %1 \n\t" + "lxvw4x 54, %10, %1 \n\t" + "lxvw4x 55, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "stxvw4x 32, 0, %3 \n\t" + "stxvw4x 33, %5, %3 \n\t" + "stxvw4x 34, %6, %3 \n\t" + "stxvw4x 35, %7, %3 \n\t" + "stxvw4x 36, %8, %3 \n\t" + "stxvw4x 37, %9, %3 \n\t" + "stxvw4x 38, %10, %3 \n\t" + "stxvw4x 39, %11, %3 \n\t" + + "addi %3, %3, 128 \n\t" + + "stxvw4x 48, 0, %4 \n\t" + "stxvw4x 49, %5, %4 \n\t" + "stxvw4x 50, %6, %4 \n\t" + "stxvw4x 51, %7, %4 \n\t" + "stxvw4x 52, %8, %4 \n\t" + "stxvw4x 53, %9, %4 \n\t" + "stxvw4x 54, %10, %4 \n\t" + "stxvw4x 55, %11, %4 \n\t" + + "addi %4, %4, 128 \n\t" + + "addic. 
%0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (y2), // 3 + "r" (x2), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory" + ); + +} + + diff --git a/kernel/power/strmm_kernel_16x8_power8.S b/kernel/power/strmm_kernel_16x8_power8.S new file mode 100644 index 000000000..f756d5d92 --- /dev/null +++ b/kernel/power/strmm_kernel_16x8_power8.S @@ -0,0 +1,369 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 340 +#define ALPHA_SP 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA_SP 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define alpha_r vs30 +#define alpha_vr vs31 + +#define o0 0 + +#define TBUFFER r13 +#define o12 r14 +#define o4 r15 +#define K1 r16 +#define o8 r17 +#define L r18 +#define T1 r19 +#define KK r20 +#define KKK r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "strmm_macros_16x8_power8.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + std r13, 288(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) + stw r14, 212(SP) + stw r13, 216(SP) +#endif + + // stfd f1, ALPHA_SP + // stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, FRAMESLOT(0) + 
STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif +#endif + + mr KK, OFFSET +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, KK +#endif + + + cmpwi cr0, M, 0 + ble L999_H1 + cmpwi cr0, N, 0 + ble L999_H1 + cmpwi cr0, K, 0 + ble L999_H1 + + li PRE, 256 + li o4 , 4 + li o8 , 8 + li o12, 12 + li o16, 16 + li o32, 32 + li o48, 48 + addi TBUFFER, SP, 320 + + addi T1, SP, 300 + stxsspx f1, o0 , T1 + stxsspx f1, o4 , T1 + stxsspx f1, o8 , T1 + stxsspx f1, o12 , T1 + + lxsspx alpha_r, o0, T1 + lxvw4x alpha_vr, o0, T1 + + + +#include "strmm_logic_16x8_power8.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + ld r13, 288(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) + lwz r14, 212(SP) + lwz r13, 216(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/strmm_logic_16x8_power8.S b/kernel/power/strmm_logic_16x8_power8.S new file mode 100644 index 000000000..fb2d3f94b --- /dev/null +++ b/kernel/power/strmm_logic_16x8_power8.S @@ -0,0 +1,2968 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
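The logic file that follows drives the 16x8 macros as a register-blocked loop nest: columns are processed eight at a time (STRMM_L8_BEGIN), rows sixteen at a time (STRMM_L8x16_BEGIN), the K loop is unrolled by eight, and the KK/OFFSET arithmetic skips the part of the packed A and B panels that the triangular operand masks out. A rough C sketch of a single 16x8 tile under those conventions (illustrative only; it ignores the TRMM offset handling and the smaller M/N remainder cases):

    /* One 16x8 tile: a[] is the packed A panel (16 values per K step),
     * b[] is the packed B panel (8 values per K step); the TRMM save path
     * overwrites C with alpha times the accumulators.                      */
    static void strmm_tile_16x8(long k, float alpha,
                                const float *a, const float *b,
                                float *c, long ldc)
    {
        float acc[16][8] = {{0.0f}};
        for (long l = 0; l < k; l++)                    /* K loop, unrolled by 8 in the assembly */
            for (long jj = 0; jj < 8; jj++)
                for (long ii = 0; ii < 16; ii++)
                    acc[ii][jj] += a[l * 16 + ii] * b[l * 8 + jj];
        for (long jj = 0; jj < 8; jj++)                 /* SAVE8x16, TRMMKERNEL path */
            for (long ii = 0; ii < 16; ii++)
                c[ii + jj * ldc] = alpha * acc[ii][jj];
    }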
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + srawi. J, N, 3 + ble STRMM_L8_END + +STRMM_L8_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 3 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 4 + ble STRMM_L8x16_END + +STRMM_L8x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L8x16_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L8x16_SUB4 + +STRMM_L8x16_LOOP_START: + + dcbt AO, PRE + LOAD8x16_1 + dcbt AO, PRE + KERNEL8x16_I1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + addic. L, L, -2 + ble STRMM_L8x16_LOOP_END + + .align 5 + +STRMM_L8x16_LOOP: + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + addic. L, L, -1 + bgt STRMM_L8x16_LOOP + +STRMM_L8x16_LOOP_END: + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + KERNEL8x16_E2 + + b STRMM_L8x16_SUB1 + +STRMM_L8x16_SUB4: + + dcbt AO, PRE + KERNEL8x16_SUBI1 + dcbt AO, PRE + KERNEL8x16_SUB1 + dcbt AO, PRE + KERNEL8x16_SUB1 + dcbt AO, PRE + KERNEL8x16_SUB1 + + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + + b STRMM_L8x16_SUB1 + +STRMM_L8x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x16_SUBI1 + + addic. L, L, -1 + ble STRMM_L8x16_SAVE + b STRMM_L8x16_SUB2 + +STRMM_L8x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L8x16_SAVE + +STRMM_L8x16_SUB2: + + KERNEL8x16_SUB1 + + addic. 
L, L, -1 + bgt STRMM_L8x16_SUB2 + +STRMM_L8x16_SAVE: + + SAVE8x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt STRMM_L8x16_BEGIN + +STRMM_L8x16_END: + +STRMM_L8x8_BEGIN: + andi. T2, M, 15 + ble STRMM_L8x1_END + + andi. T1, M, 8 + ble STRMM_L8x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L8x8_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L8x8_SUB4 + +STRMM_L8x8_LOOP_START: + + LOAD8x8_1 + KERNEL8x8_I1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + addic. L, L, -2 + ble STRMM_L8x8_LOOP_END + + .align 5 + +STRMM_L8x8_LOOP: + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + addic. L, L, -1 + bgt STRMM_L8x8_LOOP + +STRMM_L8x8_LOOP_END: + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_E2 + + b STRMM_L8x8_SUB1 + +STRMM_L8x8_SUB4: + + KERNEL8x8_SUBI1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + + b STRMM_L8x8_SUB1 + +STRMM_L8x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x8_SUBI1 + + addic. L, L, -1 + ble STRMM_L8x8_SAVE + b STRMM_L8x8_SUB2 + +STRMM_L8x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L8x8_SAVE + +STRMM_L8x8_SUB2: + + KERNEL8x8_SUB1 + + addic. L, L, -1 + bgt STRMM_L8x8_SUB2 + +STRMM_L8x8_SAVE: + + SAVE8x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +STRMM_L8x8_END: + +STRMM_L8x4_BEGIN: + + andi. 
T1, M, 4 + ble STRMM_L8x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L8x4_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L8x4_SUB4 + +STRMM_L8x4_LOOP_START: + + LOAD8x4_1 + KERNEL8x4_I1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + addic. L, L, -2 + ble STRMM_L8x4_LOOP_END + + .align 5 + +STRMM_L8x4_LOOP: + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + addic. L, L, -1 + bgt STRMM_L8x4_LOOP + +STRMM_L8x4_LOOP_END: + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_E2 + + b STRMM_L8x4_SUB1 + +STRMM_L8x4_SUB4: + + KERNEL8x4_SUBI1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + + b STRMM_L8x4_SUB1 + +STRMM_L8x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x4_SUBI1 + + addic. L, L, -1 + ble STRMM_L8x4_SAVE + b STRMM_L8x4_SUB2 + +STRMM_L8x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L8x4_SAVE + +STRMM_L8x4_SUB2: + + KERNEL8x4_SUB1 + + addic. L, L, -1 + bgt STRMM_L8x4_SUB2 + +STRMM_L8x4_SAVE: + + SAVE8x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +STRMM_L8x4_END: + +STRMM_L8x2_BEGIN: + + andi. T1, M, 2 + ble STRMM_L8x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L8x2_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L8x2_SUB4 + +STRMM_L8x2_LOOP_START: + + LOAD8x2_1 + KERNEL8x2_I1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + addic. L, L, -2 + ble STRMM_L8x2_LOOP_END + + .align 5 + +STRMM_L8x2_LOOP: + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + addic. 
L, L, -1 + bgt STRMM_L8x2_LOOP + +STRMM_L8x2_LOOP_END: + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_E2 + + b STRMM_L8x2_SUB1 + +STRMM_L8x2_SUB4: + + KERNEL8x2_SUBI1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + + b STRMM_L8x2_SUB1 + +STRMM_L8x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x2_SUBI1 + + addic. L, L, -1 + ble STRMM_L8x2_SAVE + b STRMM_L8x2_SUB2 + +STRMM_L8x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L8x2_SAVE + +STRMM_L8x2_SUB2: + + KERNEL8x2_SUB1 + + addic. L, L, -1 + bgt STRMM_L8x2_SUB2 + +STRMM_L8x2_SAVE: + + SAVE8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +STRMM_L8x2_END: + +STRMM_L8x1_BEGIN: + + andi. T1, M, 1 + ble STRMM_L8x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 2 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L8x1_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L8x1_SUB4 + +STRMM_L8x1_LOOP_START: + + LOAD8x1_1 + KERNEL8x1_I1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + addic. L, L, -2 + ble STRMM_L8x1_LOOP_END + + .align 5 + +STRMM_L8x1_LOOP: + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + addic. L, L, -1 + bgt STRMM_L8x1_LOOP + +STRMM_L8x1_LOOP_END: + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_E2 + + b STRMM_L8x1_SUB1 + +STRMM_L8x1_SUB4: + + KERNEL8x1_SUBI1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + + b STRMM_L8x1_SUB1 + +STRMM_L8x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x1_SUBI1 + + addic. L, L, -1 + ble STRMM_L8x1_SAVE + b STRMM_L8x1_SUB2 + +STRMM_L8x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L8x1_SAVE + +STRMM_L8x1_SUB2: + + KERNEL8x1_SUB1 + + addic. 
L, L, -1 + bgt STRMM_L8x1_SUB2 + +STRMM_L8x1_SAVE: + + SAVE8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +STRMM_L8x1_END: + + slwi T1, K, 5 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 8 // KK += Number of values in B +#endif + + + addic. J, J, -1 + bgt STRMM_L8_BEGIN + + andi. T2, N, 7 + ble L999 + +STRMM_L8_END: + + b STRMM_L4_BEGIN + +L999_H1: + + b L999 + +STRMM_L4_BEGIN: + + andi. T1, N, 4 + ble STRMM_L4_END + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 4 + ble STRMM_L4x16_END + +STRMM_L4x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L4x16_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L4x16_SUB4 + +STRMM_L4x16_LOOP_START: + + dcbt AO, PRE + LOAD4x16_1 + dcbt AO, PRE + KERNEL4x16_I1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -2 + ble STRMM_L4x16_LOOP_END + + .align 5 + +STRMM_L4x16_LOOP: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -1 + bgt STRMM_L4x16_LOOP + +STRMM_L4x16_LOOP_END: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + KERNEL4x16_E2 + + b STRMM_L4x16_SUB1 + +STRMM_L4x16_SUB4: + + dcbt AO, PRE + KERNEL4x16_SUBI1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + b STRMM_L4x16_SUB1 + +STRMM_L4x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x16_SUBI1 + + addic. L, L, -1 + ble STRMM_L4x16_SAVE + b STRMM_L4x16_SUB2 + +STRMM_L4x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L4x16_SAVE + +STRMM_L4x16_SUB2: + + KERNEL4x16_SUB1 + + addic. 
L, L, -1 + bgt STRMM_L4x16_SUB2 + +STRMM_L4x16_SAVE: + + SAVE4x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt STRMM_L4x16_BEGIN + +STRMM_L4x16_END: + +STRMM_L4x8_BEGIN: + andi. T2, M, 15 + ble STRMM_L4x1_END + + andi. T1, M, 8 + ble STRMM_L4x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L4x8_SUB4 + +STRMM_L4x8_LOOP_START: + + LOAD4x8_1 + KERNEL4x8_I1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -2 + ble STRMM_L4x8_LOOP_END + + .align 5 + +STRMM_L4x8_LOOP: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -1 + bgt STRMM_L4x8_LOOP + +STRMM_L4x8_LOOP_END: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b STRMM_L4x8_SUB1 + +STRMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b STRMM_L4x8_SUB1 + +STRMM_L4x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble STRMM_L4x8_SAVE + b STRMM_L4x8_SUB2 + +STRMM_L4x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L4x8_SAVE + +STRMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. L, L, -1 + bgt STRMM_L4x8_SUB2 + +STRMM_L4x8_SAVE: + + SAVE4x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +STRMM_L4x8_END: + +STRMM_L4x4_BEGIN: + + andi. 
T1, M, 4 + ble STRMM_L4x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L4x4_SUB4 + +STRMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble STRMM_L4x4_LOOP_END + + .align 5 + +STRMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -1 + bgt STRMM_L4x4_LOOP + +STRMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b STRMM_L4x4_SUB1 + +STRMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b STRMM_L4x4_SUB1 + +STRMM_L4x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble STRMM_L4x4_SAVE + b STRMM_L4x4_SUB2 + +STRMM_L4x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L4x4_SAVE + +STRMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt STRMM_L4x4_SUB2 + +STRMM_L4x4_SAVE: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +STRMM_L4x4_END: + +STRMM_L4x2_BEGIN: + + andi. T1, M, 2 + ble STRMM_L4x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L4x2_SUB4 + +STRMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble STRMM_L4x2_LOOP_END + + .align 5 + +STRMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. 
L, L, -1 + bgt STRMM_L4x2_LOOP + +STRMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b STRMM_L4x2_SUB1 + +STRMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b STRMM_L4x2_SUB1 + +STRMM_L4x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble STRMM_L4x2_SAVE + b STRMM_L4x2_SUB2 + +STRMM_L4x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L4x2_SAVE + +STRMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt STRMM_L4x2_SUB2 + +STRMM_L4x2_SAVE: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +STRMM_L4x2_END: + +STRMM_L4x1_BEGIN: + + andi. T1, M, 1 + ble STRMM_L4x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 2 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L4x1_SUB4 + +STRMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble STRMM_L4x1_LOOP_END + + .align 5 + +STRMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt STRMM_L4x1_LOOP + +STRMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b STRMM_L4x1_SUB1 + +STRMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b STRMM_L4x1_SUB1 + +STRMM_L4x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble STRMM_L4x1_SAVE + b STRMM_L4x1_SUB2 + +STRMM_L4x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L4x1_SAVE + +STRMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. 
L, L, -1 + bgt STRMM_L4x1_SUB2 + +STRMM_L4x1_SAVE: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +STRMM_L4x1_END: + + slwi T1, K, 4 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 4 // KK += Number of values in B +#endif + + +STRMM_L4_END: +STRMM_L2_BEGIN: + + andi. T1, N, 2 + ble STRMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 4 + ble STRMM_L2x16_END + +STRMM_L2x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L2x16_SUB4 + +STRMM_L2x16_LOOP_START: + + dcbt AO, PRE + LOAD2x16_1 + dcbt AO, PRE + KERNEL2x16_I1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble STRMM_L2x16_LOOP_END + + .align 5 + +STRMM_L2x16_LOOP: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt STRMM_L2x16_LOOP + +STRMM_L2x16_LOOP_END: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + KERNEL2x16_E2 + + b STRMM_L2x16_SUB1 + +STRMM_L2x16_SUB4: + + dcbt AO, PRE + KERNEL2x16_SUBI1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b STRMM_L2x16_SUB1 + +STRMM_L2x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble STRMM_L2x16_SAVE + b STRMM_L2x16_SUB2 + +STRMM_L2x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L2x16_SAVE + +STRMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. 
L, L, -1 + bgt STRMM_L2x16_SUB2 + +STRMM_L2x16_SAVE: + + SAVE2x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt STRMM_L2x16_BEGIN + +STRMM_L2x16_END: + +STRMM_L2x8_BEGIN: + andi. T2, M, 15 + ble STRMM_L2x1_END + + andi. T1, M, 8 + ble STRMM_L2x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L2x8_SUB4 + +STRMM_L2x8_LOOP_START: + + LOAD2x8_1 + KERNEL2x8_I1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -2 + ble STRMM_L2x8_LOOP_END + + .align 5 + +STRMM_L2x8_LOOP: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -1 + bgt STRMM_L2x8_LOOP + +STRMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b STRMM_L2x8_SUB1 + +STRMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b STRMM_L2x8_SUB1 + +STRMM_L2x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble STRMM_L2x8_SAVE + b STRMM_L2x8_SUB2 + +STRMM_L2x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L2x8_SAVE + +STRMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt STRMM_L2x8_SUB2 + +STRMM_L2x8_SAVE: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +STRMM_L2x8_END: + +STRMM_L2x4_BEGIN: + + andi. 
T1, M, 4 + ble STRMM_L2x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L2x4_SUB4 + +STRMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble STRMM_L2x4_LOOP_END + + .align 5 + +STRMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt STRMM_L2x4_LOOP + +STRMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b STRMM_L2x4_SUB1 + +STRMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b STRMM_L2x4_SUB1 + +STRMM_L2x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble STRMM_L2x4_SAVE + b STRMM_L2x4_SUB2 + +STRMM_L2x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L2x4_SAVE + +STRMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt STRMM_L2x4_SUB2 + +STRMM_L2x4_SAVE: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +STRMM_L2x4_END: + +STRMM_L2x2_BEGIN: + + andi. T1, M, 2 + ble STRMM_L2x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L2x2_SUB4 + +STRMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble STRMM_L2x2_LOOP_END + + .align 5 + +STRMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. 
L, L, -1 + bgt STRMM_L2x2_LOOP + +STRMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b STRMM_L2x2_SUB1 + +STRMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b STRMM_L2x2_SUB1 + +STRMM_L2x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble STRMM_L2x2_SAVE + b STRMM_L2x2_SUB2 + +STRMM_L2x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L2x2_SAVE + +STRMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt STRMM_L2x2_SUB2 + +STRMM_L2x2_SAVE: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +STRMM_L2x2_END: + +STRMM_L2x1_BEGIN: + + andi. T1, M, 1 + ble STRMM_L2x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 2 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L2x1_SUB4 + +STRMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble STRMM_L2x1_LOOP_END + + .align 5 + +STRMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt STRMM_L2x1_LOOP + +STRMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b STRMM_L2x1_SUB1 + +STRMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b STRMM_L2x1_SUB1 + +STRMM_L2x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble STRMM_L2x1_SAVE + b STRMM_L2x1_SUB2 + +STRMM_L2x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L2x1_SAVE + +STRMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. 
L, L, -1 + bgt STRMM_L2x1_SUB2 + +STRMM_L2x1_SAVE: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +STRMM_L2x1_END: + + slwi T1, K, 3 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 2 // KK += Number of values in B +#endif + + +STRMM_L2_END: +STRMM_L1_BEGIN: + + andi. T1, N, 1 + ble STRMM_L1_END + mr CO, C + mr AO, A + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 4 + ble STRMM_L1x16_END + +STRMM_L1x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 2 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L1x16_SUB4 + +STRMM_L1x16_LOOP_START: + + dcbt AO, PRE + LOAD1x16_1 + dcbt AO, PRE + KERNEL1x16_I1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble STRMM_L1x16_LOOP_END + + .align 5 + +STRMM_L1x16_LOOP: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt STRMM_L1x16_LOOP + +STRMM_L1x16_LOOP_END: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + KERNEL1x16_E2 + + b STRMM_L1x16_SUB1 + +STRMM_L1x16_SUB4: + + dcbt AO, PRE + KERNEL1x16_SUBI1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b STRMM_L1x16_SUB1 + +STRMM_L1x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble STRMM_L1x16_SAVE + b STRMM_L1x16_SUB2 + +STRMM_L1x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L1x16_SAVE + +STRMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. 
L, L, -1 + bgt STRMM_L1x16_SUB2 + +STRMM_L1x16_SAVE: + + SAVE1x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt STRMM_L1x16_BEGIN + +STRMM_L1x16_END: + +STRMM_L1x8_BEGIN: + andi. T2, M, 15 + ble STRMM_L1x1_END + + andi. T1, M, 8 + ble STRMM_L1x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 2 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L1x8_SUB4 + +STRMM_L1x8_LOOP_START: + + LOAD1x8_1 + KERNEL1x8_I1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -2 + ble STRMM_L1x8_LOOP_END + + .align 5 + +STRMM_L1x8_LOOP: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -1 + bgt STRMM_L1x8_LOOP + +STRMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b STRMM_L1x8_SUB1 + +STRMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b STRMM_L1x8_SUB1 + +STRMM_L1x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble STRMM_L1x8_SAVE + b STRMM_L1x8_SUB2 + +STRMM_L1x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L1x8_SAVE + +STRMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt STRMM_L1x8_SUB2 + +STRMM_L1x8_SAVE: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +STRMM_L1x8_END: + +STRMM_L1x4_BEGIN: + + andi. 
T1, M, 4 + ble STRMM_L1x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 2 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L1x4_SUB4 + +STRMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble STRMM_L1x4_LOOP_END + + .align 5 + +STRMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt STRMM_L1x4_LOOP + +STRMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b STRMM_L1x4_SUB1 + +STRMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b STRMM_L1x4_SUB1 + +STRMM_L1x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble STRMM_L1x4_SAVE + b STRMM_L1x4_SUB2 + +STRMM_L1x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L1x4_SAVE + +STRMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt STRMM_L1x4_SUB2 + +STRMM_L1x4_SAVE: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +STRMM_L1x4_END: + +STRMM_L1x2_BEGIN: + + andi. T1, M, 2 + ble STRMM_L1x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 2 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L1x2_SUB4 + +STRMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble STRMM_L1x2_LOOP_END + + .align 5 + +STRMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. 
L, L, -1 + bgt STRMM_L1x2_LOOP + +STRMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b STRMM_L1x2_SUB1 + +STRMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b STRMM_L1x2_SUB1 + +STRMM_L1x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble STRMM_L1x2_SAVE + b STRMM_L1x2_SUB2 + +STRMM_L1x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L1x2_SAVE + +STRMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt STRMM_L1x2_SUB2 + +STRMM_L1x2_SAVE: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +STRMM_L1x2_END: + +STRMM_L1x1_BEGIN: + + andi. T1, M, 1 + ble STRMM_L1x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 2 // Number of values in B shifted + slwi T2, KK, 2 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L1x1_SUB4 + +STRMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble STRMM_L1x1_LOOP_END + + .align 5 + +STRMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt STRMM_L1x1_LOOP + +STRMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b STRMM_L1x1_SUB1 + +STRMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b STRMM_L1x1_SUB1 + +STRMM_L1x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble STRMM_L1x1_SAVE + b STRMM_L1x1_SUB2 + +STRMM_L1x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L1x1_SAVE + +STRMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. 
L, L, -1
+	bgt	STRMM_L1x1_SUB2
+
+STRMM_L1x1_SAVE:
+
+	SAVE1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+	sub	T1, K, KKK	// K - KKK -> TEMP1
+	slwi	T2, T1, 2	// TEMP1 * Number of values in B shifted -> TEMP2
+	slwi	T1, T1, 2	// TEMP1 * Number of values in A shifted -> TEMP1
+	add	BO, BO, T2	// BO += TEMP2 * number of values in B shifted
+	add	AO, AO, T1	// AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+	addi	KK, KK, 1	// KK += Number of values in A
+#endif
+
+
+STRMM_L1x1_END:
+
+#if !defined(LEFT)
+	addi	KK, KK, 1	// KK += Number of values in B
+#endif
+
+
+STRMM_L1_END:
diff --git a/kernel/power/strmm_macros_16x8_power8.S b/kernel/power/strmm_macros_16x8_power8.S
new file mode 100644
index 000000000..27bc1e89c
--- /dev/null
+++ b/kernel/power/strmm_macros_16x8_power8.S
@@ -0,0 +1,5840 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=8 and M=16 +**********************************************************************************************/ + +.macro LOAD8x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + xvmulsp vs48, vs0, vs12 + xvmulsp vs49, vs1, vs12 + xvmulsp vs50, vs2, vs12 + xvmulsp vs51, vs3, vs12 + + xvmulsp vs52, vs0, vs13 + xvmulsp vs53, vs1, vs13 + xvmulsp vs54, vs2, vs13 + xvmulsp vs55, vs3, vs13 + + xvmulsp vs56, vs0, vs14 + xvmulsp vs57, vs1, vs14 + xvmulsp vs58, vs2, vs14 + xvmulsp vs59, vs3, vs14 + + xvmulsp vs60, vs0, vs15 + xvmulsp vs61, vs1, vs15 + xvmulsp vs62, vs2, vs15 + xvmulsp vs63, vs3, vs15 + + +.endm + +.macro KERNEL8x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + xvmaddasp vs48, vs0, vs12 + xvmaddasp vs49, vs1, vs12 + xvmaddasp vs50, vs2, vs12 + xvmaddasp vs51, vs3, vs12 + + xvmaddasp vs52, vs0, vs13 + xvmaddasp vs53, vs1, vs13 + xvmaddasp vs54, vs2, vs13 + xvmaddasp vs55, vs3, vs13 + + xvmaddasp vs56, vs0, vs14 + xvmaddasp vs57, vs1, vs14 + xvmaddasp vs58, vs2, vs14 + xvmaddasp vs59, vs3, vs14 + + xvmaddasp vs60, vs0, vs15 + xvmaddasp vs61, vs1, vs15 + xvmaddasp vs62, vs2, vs15 + xvmaddasp vs63, 
vs3, vs15 + + +.endm + +.macro KERNEL8x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + xvmaddasp vs48, vs4, vs20 + xvmaddasp vs49, vs5, vs20 + xvmaddasp vs50, vs6, vs20 + xvmaddasp vs51, vs7, vs20 + + xvmaddasp vs52, vs4, vs21 + xvmaddasp vs53, vs5, vs21 + xvmaddasp vs54, vs6, vs21 + xvmaddasp vs55, vs7, vs21 + + xvmaddasp vs56, vs4, vs22 + xvmaddasp vs57, vs5, vs22 + xvmaddasp vs58, vs6, vs22 + xvmaddasp vs59, vs7, vs22 + + xvmaddasp vs60, vs4, vs23 + xvmaddasp vs61, vs5, vs23 + xvmaddasp vs62, vs6, vs23 + xvmaddasp vs63, vs7, vs23 + + +.endm + +.macro KERNEL8x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + xvmaddasp vs48, vs4, vs20 + xvmaddasp vs49, vs5, vs20 + xvmaddasp vs50, vs6, vs20 + xvmaddasp vs51, vs7, vs20 + + xvmaddasp vs52, vs4, vs21 + xvmaddasp vs53, vs5, vs21 + xvmaddasp vs54, vs6, vs21 + xvmaddasp vs55, vs7, vs21 + + xvmaddasp vs56, vs4, vs22 + xvmaddasp vs57, vs5, vs22 + xvmaddasp vs58, vs6, vs22 + xvmaddasp vs59, vs7, vs22 + + xvmaddasp vs60, vs4, vs23 + xvmaddasp vs61, vs5, vs23 + xvmaddasp vs62, vs6, vs23 + xvmaddasp vs63, vs7, vs23 + + +.endm + +.macro KERNEL8x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + xvmulsp vs48, vs0, vs12 + xvmulsp vs49, vs1, vs12 + xvmulsp vs50, vs2, vs12 + xvmulsp vs51, vs3, vs12 + + xvmulsp vs52, vs0, vs13 + xvmulsp vs53, vs1, vs13 + xvmulsp vs54, vs2, vs13 + xvmulsp vs55, vs3, vs13 + + xvmulsp vs56, vs0, vs14 + xvmulsp vs57, vs1, vs14 + xvmulsp vs58, vs2, vs14 + xvmulsp vs59, vs3, vs14 + + xvmulsp vs60, vs0, vs15 + xvmulsp vs61, vs1, vs15 + xvmulsp vs62, vs2, vs15 + xvmulsp vs63, vs3, vs15 + + +.endm + 
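As a rough scalar sketch of what the KERNEL8x16 macros above accumulate (illustrative C only, not part of this patch or of the OpenBLAS sources): vs0..vs3 hold 16 floats of A, vs8..vs15 hold 8 B values splatted by xxspltw, and vs32..vs63 hold a 16x8 tile of partial sums that each k-step updates with xvmaddasp; the SAVE8x16 macro that follows then writes alpha times that tile to C (adding the existing C values when TRMMKERNEL is not defined).

/*
 * Illustrative scalar model of one k-step of the 16x8 micro-kernel above.
 * The function name and data layout are chosen for the example only.
 */
#include <stdio.h>

static void kernel8x16_step(float acc[8][16], const float a[16], const float b[8])
{
    for (int j = 0; j < 8; j++)          /* one splatted B value per column of the tile */
        for (int i = 0; i < 16; i++)     /* 16 A values = four 4-float VSX vectors      */
            acc[j][i] += a[i] * b[j];    /* scalar view of the xvmaddasp accumulation   */
}

int main(void)
{
    float acc[8][16] = {{0}};
    float a[16], b[8];
    for (int i = 0; i < 16; i++) a[i] = (float)(i + 1);
    for (int j = 0; j < 8; j++)  b[j] = 0.5f * (float)(j + 1);

    kernel8x16_step(acc, a, b);          /* one k iteration of the inner loop */
    printf("acc[0][0]=%g acc[7][15]=%g\n", acc[0][0], acc[7][15]);
    return 0;
}

The assembly keeps the whole tile in vector registers and interleaves the loads for the next k iteration with the multiply-adds of the current one, which is why the _1/_2 macro pairs ping-pong between the vs0..vs15 and vs4..vs23 register sets.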
+.macro KERNEL8x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + xvmaddasp vs48, vs0, vs12 + xvmaddasp vs49, vs1, vs12 + xvmaddasp vs50, vs2, vs12 + xvmaddasp vs51, vs3, vs12 + + xvmaddasp vs52, vs0, vs13 + xvmaddasp vs53, vs1, vs13 + xvmaddasp vs54, vs2, vs13 + xvmaddasp vs55, vs3, vs13 + + xvmaddasp vs56, vs0, vs14 + xvmaddasp vs57, vs1, vs14 + xvmaddasp vs58, vs2, vs14 + xvmaddasp vs59, vs3, vs14 + + xvmaddasp vs60, vs0, vs15 + xvmaddasp vs61, vs1, vs15 + xvmaddasp vs62, vs2, vs15 + xvmaddasp vs63, vs3, vs15 + + +.endm + +.macro SAVE8x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + xvmulsp vs2, vs42, alpha_vr + xvmulsp vs3, vs43, alpha_vr +#else + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + xvmaddasp vs2, vs42, alpha_vr + xvmaddasp vs3, vs43, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + xvmulsp vs2, vs46, alpha_vr + xvmulsp vs3, vs47, alpha_vr +#else + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + xvmaddasp vs2, vs46, alpha_vr + xvmaddasp vs3, vs47, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + 
+#ifdef TRMMKERNEL + xvmulsp vs0, vs48, alpha_vr + xvmulsp vs1, vs49, alpha_vr + xvmulsp vs2, vs50, alpha_vr + xvmulsp vs3, vs51, alpha_vr +#else + xvmaddasp vs0, vs48, alpha_vr + xvmaddasp vs1, vs49, alpha_vr + xvmaddasp vs2, vs50, alpha_vr + xvmaddasp vs3, vs51, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs52, alpha_vr + xvmulsp vs1, vs53, alpha_vr + xvmulsp vs2, vs54, alpha_vr + xvmulsp vs3, vs55, alpha_vr +#else + xvmaddasp vs0, vs52, alpha_vr + xvmaddasp vs1, vs53, alpha_vr + xvmaddasp vs2, vs54, alpha_vr + xvmaddasp vs3, vs55, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs56, alpha_vr + xvmulsp vs1, vs57, alpha_vr + xvmulsp vs2, vs58, alpha_vr + xvmulsp vs3, vs59, alpha_vr +#else + xvmaddasp vs0, vs56, alpha_vr + xvmaddasp vs1, vs57, alpha_vr + xvmaddasp vs2, vs58, alpha_vr + xvmaddasp vs3, vs59, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs60, alpha_vr + xvmulsp vs1, vs61, alpha_vr + xvmulsp vs2, vs62, alpha_vr + xvmulsp vs3, vs63, alpha_vr +#else + xvmaddasp vs0, vs60, alpha_vr + xvmaddasp vs1, vs61, alpha_vr + xvmaddasp vs2, vs62, alpha_vr + xvmaddasp vs3, vs63, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=8 +**********************************************************************************************/ + +.macro LOAD8x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + xvmulsp vs40, vs0, vs12 + xvmulsp vs41, vs1, vs12 + + xvmulsp vs42, vs0, vs13 + xvmulsp vs43, vs1, vs13 + + xvmulsp vs44, vs0, vs14 + xvmulsp vs45, vs1, vs14 + + xvmulsp vs46, vs0, vs15 + xvmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + 
xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + xvmaddasp vs40, vs0, vs12 + xvmaddasp vs41, vs1, vs12 + + xvmaddasp vs42, vs0, vs13 + xvmaddasp vs43, vs1, vs13 + + xvmaddasp vs44, vs0, vs14 + xvmaddasp vs45, vs1, vs14 + + xvmaddasp vs46, vs0, vs15 + xvmaddasp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + xvmaddasp vs40, vs4, vs20 + xvmaddasp vs41, vs5, vs20 + + xvmaddasp vs42, vs4, vs21 + xvmaddasp vs43, vs5, vs21 + + xvmaddasp vs44, vs4, vs22 + xvmaddasp vs45, vs5, vs22 + + xvmaddasp vs46, vs4, vs23 + xvmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + xvmaddasp vs40, vs4, vs20 + xvmaddasp vs41, vs5, vs20 + + xvmaddasp vs42, vs4, vs21 + xvmaddasp vs43, vs5, vs21 + + xvmaddasp vs44, vs4, vs22 + xvmaddasp vs45, vs5, vs22 + + xvmaddasp vs46, vs4, vs23 + xvmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + xvmulsp vs40, vs0, vs12 + xvmulsp vs41, vs1, vs12 + + xvmulsp vs42, vs0, vs13 + xvmulsp vs43, vs1, vs13 + + xvmulsp vs44, vs0, vs14 + xvmulsp vs45, vs1, vs14 + + xvmulsp vs46, vs0, vs15 + xvmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + xvmaddasp vs40, vs0, vs12 + xvmaddasp vs41, vs1, vs12 + + xvmaddasp vs42, vs0, vs13 + xvmaddasp vs43, vs1, vs13 + + xvmaddasp vs44, vs0, vs14 + xvmaddasp vs45, vs1, vs14 + + xvmaddasp vs46, vs0, vs15 + 
xvmaddasp vs47, vs1, vs15 + + +.endm + +.macro SAVE8x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs38, alpha_vr + xvmulsp vs1, vs39, alpha_vr +#else + xvmaddasp vs0, vs38, alpha_vr + xvmaddasp vs1, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr +#else + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs42, alpha_vr + xvmulsp vs1, vs43, alpha_vr +#else + xvmaddasp vs0, vs42, alpha_vr + xvmaddasp vs1, vs43, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr +#else + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs46, alpha_vr + xvmulsp vs1, vs47, alpha_vr +#else + xvmaddasp vs0, vs46, alpha_vr + xvmaddasp vs1, vs47, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=4 +**********************************************************************************************/ + +.macro LOAD8x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + 
xvmulsp vs36, vs0, vs12 + + xvmulsp vs37, vs0, vs13 + + xvmulsp vs38, vs0, vs14 + + xvmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + xvmaddasp vs36, vs0, vs12 + + xvmaddasp vs37, vs0, vs13 + + xvmaddasp vs38, vs0, vs14 + + xvmaddasp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + xvmaddasp vs36, vs4, vs20 + + xvmaddasp vs37, vs4, vs21 + + xvmaddasp vs38, vs4, vs22 + + xvmaddasp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + xvmaddasp vs36, vs4, vs20 + + xvmaddasp vs37, vs4, vs21 + + xvmaddasp vs38, vs4, vs22 + + xvmaddasp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + xvmulsp vs36, vs0, vs12 + + xvmulsp vs37, vs0, vs13 + + xvmulsp vs38, vs0, vs14 + + xvmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + xvmaddasp vs36, vs0, vs12 + + xvmaddasp vs37, vs0, vs13 + + xvmaddasp vs38, vs0, vs14 + + xvmaddasp vs39, vs0, vs15 + + +.endm + +.macro SAVE8x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs33, alpha_vr +#else + xvmaddasp vs0, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs35, alpha_vr +#else + xvmaddasp vs0, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef 
TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs37, alpha_vr +#else + xvmaddasp vs0, vs37, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs38, alpha_vr +#else + xvmaddasp vs0, vs38, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs39, alpha_vr +#else + xvmaddasp vs0, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=2 +**********************************************************************************************/ + +.macro LOAD8x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + xsmuldp vs40, vs0, vs12 + xsmuldp vs41, vs1, vs12 + + xsmuldp vs42, vs0, vs13 + xsmuldp vs43, vs1, vs13 + + xsmuldp vs44, vs0, vs14 + xsmuldp vs45, vs1, vs14 + + xsmuldp vs46, vs0, vs15 + xsmuldp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + xsmaddadp vs40, vs0, vs12 + xsmaddadp vs41, vs1, vs12 + + xsmaddadp vs42, vs0, vs13 + xsmaddadp vs43, vs1, vs13 + + xsmaddadp vs44, vs0, vs14 + xsmaddadp vs45, vs1, vs14 + + xsmaddadp vs46, vs0, vs15 + xsmaddadp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 + + xsmaddadp vs40, vs4, vs20 + xsmaddadp vs41, vs5, vs20 + + xsmaddadp 
vs42, vs4, vs21 + xsmaddadp vs43, vs5, vs21 + + xsmaddadp vs44, vs4, vs22 + xsmaddadp vs45, vs5, vs22 + + xsmaddadp vs46, vs4, vs23 + xsmaddadp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x2_E2 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 + + xsmaddadp vs40, vs4, vs20 + xsmaddadp vs41, vs5, vs20 + + xsmaddadp vs42, vs4, vs21 + xsmaddadp vs43, vs5, vs21 + + xsmaddadp vs44, vs4, vs22 + xsmaddadp vs45, vs5, vs22 + + xsmaddadp vs46, vs4, vs23 + xsmaddadp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + xsmuldp vs40, vs0, vs12 + xsmuldp vs41, vs1, vs12 + + xsmuldp vs42, vs0, vs13 + xsmuldp vs43, vs1, vs13 + + xsmuldp vs44, vs0, vs14 + xsmuldp vs45, vs1, vs14 + + xsmuldp vs46, vs0, vs15 + xsmuldp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + xsmaddadp vs40, vs0, vs12 + xsmaddadp vs41, vs1, vs12 + + xsmaddadp vs42, vs0, vs13 + xsmaddadp vs43, vs1, vs13 + + xsmaddadp vs44, vs0, vs14 + xsmaddadp vs45, vs1, vs14 + + xsmaddadp vs46, vs0, vs15 + xsmaddadp vs47, vs1, vs15 + + +.endm + +.macro SAVE8x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r + xsmuldp vs1, vs35, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs36, alpha_r + xsmuldp vs1, vs37, alpha_r +#else + xsmaddadp vs0, vs36, alpha_r + xsmaddadp vs1, vs37, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs38, alpha_r + xsmuldp vs1, vs39, alpha_r +#else + xsmaddadp vs0, vs38, alpha_r + xsmaddadp vs1, vs39, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef 
TRMMKERNEL + xsmuldp vs0, vs40, alpha_r + xsmuldp vs1, vs41, alpha_r +#else + xsmaddadp vs0, vs40, alpha_r + xsmaddadp vs1, vs41, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs42, alpha_r + xsmuldp vs1, vs43, alpha_r +#else + xsmaddadp vs0, vs42, alpha_r + xsmaddadp vs1, vs43, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs44, alpha_r + xsmuldp vs1, vs45, alpha_r +#else + xsmaddadp vs0, vs44, alpha_r + xsmaddadp vs1, vs45, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs46, alpha_r + xsmuldp vs1, vs47, alpha_r +#else + xsmaddadp vs0, vs46, alpha_r + xsmaddadp vs1, vs47, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=1 +**********************************************************************************************/ + +.macro LOAD8x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 + + xsmuldp vs36, vs0, vs12 + + xsmuldp vs37, vs0, vs13 + + xsmuldp vs38, vs0, vs14 + + xsmuldp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + xsmaddadp vs36, vs0, vs12 + + xsmaddadp vs37, vs0, vs13 + + xsmaddadp vs38, vs0, vs14 + + xsmaddadp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + xsmaddadp vs36, vs4, vs20 + + xsmaddadp vs37, vs4, vs21 + + xsmaddadp vs38, vs4, vs22 + + xsmaddadp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x1_E2 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + xsmaddadp vs36, vs4, vs20 + + xsmaddadp vs37, vs4, vs21 + + xsmaddadp vs38, vs4, vs22 + + 
xsmaddadp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 + + xsmuldp vs36, vs0, vs12 + + xsmuldp vs37, vs0, vs13 + + xsmuldp vs38, vs0, vs14 + + xsmuldp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + xsmaddadp vs36, vs0, vs12 + + xsmaddadp vs37, vs0, vs13 + + xsmaddadp vs38, vs0, vs14 + + xsmaddadp vs39, vs0, vs15 + + +.endm + +.macro SAVE8x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs33, alpha_r +#else + xsmaddadp vs0, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs35, alpha_r +#else + xsmaddadp vs0, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs36, alpha_r +#else + xsmaddadp vs0, vs36, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs37, alpha_r +#else + xsmaddadp vs0, vs37, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs38, alpha_r +#else + xsmaddadp vs0, vs38, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs39, alpha_r +#else + xsmaddadp vs0, vs39, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro LOAD4x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + 
xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + +.endm + +.macro KERNEL4x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + +.endm + +.macro KERNEL4x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + 
xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + +.endm + +.macro SAVE4x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + xvmulsp vs2, vs42, alpha_vr + xvmulsp vs3, vs43, alpha_vr +#else + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + xvmaddasp vs2, vs42, alpha_vr + xvmaddasp vs3, vs43, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + xvmulsp vs2, vs46, alpha_vr + xvmulsp vs3, vs47, alpha_vr +#else + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + xvmaddasp vs2, vs46, alpha_vr + xvmaddasp vs3, vs47, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 
3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + +.endm + +.macro SAVE4x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs38, alpha_vr + xvmulsp vs1, vs39, alpha_vr +#else + xvmaddasp vs0, vs38, alpha_vr + xvmaddasp vs1, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 
+ xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs33, alpha_vr +#else + xvmaddasp vs0, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs35, alpha_vr +#else + xvmaddasp vs0, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro LOAD4x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, 
vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x2_E2 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r + xsmuldp vs1, vs35, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs36, alpha_r + xsmuldp vs1, vs37, alpha_r +#else + xsmaddadp vs0, vs36, alpha_r + xsmaddadp vs1, vs37, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs38, alpha_r + xsmuldp vs1, vs39, alpha_r +#else + xsmaddadp vs0, vs38, alpha_r + xsmaddadp vs1, vs39, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* 
Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro LOAD4x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs33, alpha_r +#else + xsmaddadp vs0, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs35, alpha_r +#else + xsmaddadp vs0, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=16 +**********************************************************************************************/ + +.macro LOAD2x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 
+ xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + +.endm + +.macro KERNEL2x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + +.endm + +.macro SAVE2x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + 
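
Note on the macros above: each LOADnxm_1 / KERNELnxm_* / SAVEnxm group implements one n-by-m micro-tile of the single-precision GEMM/TRMM update. The packed A panel is streamed through vs0-vs7, the B values are broadcast into vs8-vs23 (xxspltw in the vector cases, lxsspx in the scalar ones), partial products accumulate in vs32-vs63, and SAVE scales by alpha (alpha_vr / alpha_r), optionally adds the existing C tile, and walks the columns of C with LDC before advancing CO to the next tile. The C function below is only a reference sketch of that arithmetic, not part of the patch; the function name, the trmm flag and the packed-panel indexing are inferred from the pointer increments (AO advances 4*m bytes and BO 4*n bytes per k step) and should be read as assumptions.

/* Illustrative reference model of one n x m micro-tile update, matching what
 * a KERNEL*_SUB1 + SAVE* macro pair computes.  A is an m x k packed panel,
 * B is a k x n packed panel, C is column-major with leading dimension ldc.
 * With trmm non-zero the tile is overwritten (C = alpha*A*B, the TRMMKERNEL
 * path, xvmulsp); otherwise it accumulates (C += alpha*A*B, xvmaddasp). */
void microtile_ref(int m, int n, int k, float alpha,
                   const float *A, const float *B,
                   float *C, int ldc, int trmm)
{
    for (int j = 0; j < n; j++) {
        for (int i = 0; i < m; i++) {
            float acc = 0.0f;
            for (int p = 0; p < k; p++)
                acc += A[p * m + i] * B[p * n + j];  /* FMA chain over k */
            if (trmm)
                C[j * ldc + i] = alpha * acc;        /* xvmulsp path     */
            else
                C[j * ldc + i] += alpha * acc;       /* xvmaddasp path   */
        }
    }
}
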
+/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, 
vs8 + + xvmaddasp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs33, alpha_vr +#else + xvmaddasp vs0, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x2_E2 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r +#endif + + stxsspx 
vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r + xsmuldp vs1, vs35, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs33, alpha_r +#else + xsmaddadp vs0, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +.macro LOAD1x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + 
lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + +.endm + +.macro KERNEL1x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + +.endm + +.macro SAVE1x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add 
T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x2_E2 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + 
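Reviewer note on the SAVE macros above: every SAVEmxn variant follows the same pattern, which may be easier to read as scalar C than as VSX assembly. The sketch below is illustrative only; the function name save_tile and the column-major acc layout are assumptions made for this note, not part of the kernel. It mirrors what the assembly does: without TRMMKERNEL the existing C tile is loaded and updated as C = alpha*acc + C (xvmaddasp / xsmaddadp), with TRMMKERNEL defined it is overwritten as C = alpha*acc (xvmulsp / xsmuldp), and CO then advances by the width of the tile.

    #include <stddef.h>

    /* Scalar sketch of the SAVEmxn macros (illustration only).
     * "acc" is the accumulated A*B tile, assumed column-major here;
     * "alpha" is the value held in alpha_r / alpha_vr;
     * "ldc" is given in elements in this sketch (the assembly uses bytes);
     * "trmm" selects the TRMMKERNEL variant, which overwrites C
     * instead of accumulating into it. */
    static void save_tile(float *c, size_t ldc, const float *acc, float alpha,
                          int rows, int cols, int trmm)
    {
        for (int j = 0; j < cols; j++) {      /* one column of C per LDC step */
            for (int i = 0; i < rows; i++) {
                float v = alpha * acc[j * rows + i];
                c[j * ldc + i] = trmm ? v : c[j * ldc + i] + v;
            }
        }
    }
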
+/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmuldp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmaddadp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddadp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmuldp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddadp vs32, vs0, vs8 + + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + diff --git a/kernel/power/zasum.c b/kernel/power/zasum.c new file mode 100644 index 000000000..abd6ec08a --- /dev/null +++ b/kernel/power/zasum.c @@ -0,0 +1,149 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#if defined(POWER8) +#include "zasum_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void zasum_kernel_8(BLASLONG n, FLOAT *x1, FLOAT *svec) +{ + + BLASLONG i=0; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + + while ( i< n ) + { + + temp0 = ABS(x[0]); + temp1 = ABS(x[1]); + temp2 = ABS(x[2]); + temp3 = ABS(x[3]); + temp4 = ABS(x[4]); + temp5 = ABS(x[5]); + temp6 = ABS(x[6]); + temp7 = ABS(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=4; + + } + + svec[0] = sum0+sum1+sum2+sum3; + svec[1] = 0.0; + +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ip=0; + FLOAT sumf = 0.0; + FLOAT svec[2] __attribute__ ((aligned (16)));; + BLASLONG n1; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + if ( inc_x == 1 ) + { + + n1 = n & -8; + if ( n1 > 0 ) + { + + zasum_kernel_8(n1, x, svec); + sumf = svec[0] + svec[1]; + i=n1; + ip=2*n1; + } + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + i++; + ip+=2; + } + + } + else + { + inc_x2 = 2* inc_x; + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + ip+=inc_x2; + i++; + } + + } + return(sumf); +} + + diff --git a/kernel/power/zasum_microk_power8.c b/kernel/power/zasum_microk_power8.c new file mode 100644 index 000000000..b9f6c0ac6 --- /dev/null +++ b/kernel/power/zasum_microk_power8.c @@ -0,0 +1,177 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void zasum_kernel_8( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline)); + +static void zasum_kernel_8( BLASLONG n, FLOAT *x, FLOAT *svec) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "dcbt %2 , %4 \n\t" + + "xxlxor 32,32,32 \n\t" + "xxlxor 33,33,33 \n\t" + "xxlxor 34,34,34 \n\t" + "xxlxor 35,35,35 \n\t" + "xxlxor 36,36,36 \n\t" + "xxlxor 37,37,37 \n\t" + "xxlxor 38,38,38 \n\t" + "xxlxor 39,39,39 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -8 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2 , %4 \n\t" + + "xvabsdp 48, 40 \n\t" + "xvabsdp 49, 41 \n\t" + "xvabsdp 50, 42 \n\t" + "xvabsdp 51, 43 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + + "xvabsdp 52, 44 \n\t" + "xvabsdp 53, 45 \n\t" + + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + + "xvabsdp 54, 46 \n\t" + "xvabsdp 55, 47 \n\t" + + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + + "xvadddp 32, 32, 48 \n\t" + "xvadddp 33, 33, 49 \n\t" + + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "xvadddp 34, 34, 50 \n\t" + "xvadddp 35, 35, 51 \n\t" + "addi %2, %2, 128 \n\t" + "xvadddp 36, 36, 52 \n\t" + "xvadddp 37, 37, 53 \n\t" + "addic. 
%0 , %0 , -8 \n\t" + "xvadddp 38, 38, 54 \n\t" + "xvadddp 39, 39, 55 \n\t" + + "bgt 1b \n\t" + + "2: \n\t" + + + "xvabsdp 48, 40 \n\t" + "xvabsdp 49, 41 \n\t" + "xvabsdp 50, 42 \n\t" + "xvabsdp 51, 43 \n\t" + "xvabsdp 52, 44 \n\t" + "xvabsdp 53, 45 \n\t" + "xvabsdp 54, 46 \n\t" + "xvabsdp 55, 47 \n\t" + + "xvadddp 32, 32, 48 \n\t" + "xvadddp 33, 33, 49 \n\t" + "xvadddp 34, 34, 50 \n\t" + "xvadddp 35, 35, 51 \n\t" + "xvadddp 36, 36, 52 \n\t" + "xvadddp 37, 37, 53 \n\t" + "xvadddp 38, 38, 54 \n\t" + "xvadddp 39, 39, 55 \n\t" + + "xvadddp 32, 32, 33 \n\t" + "xvadddp 34, 34, 35 \n\t" + "xvadddp 36, 36, 37 \n\t" + "xvadddp 38, 38, 39 \n\t" + + "xvadddp 32, 32, 34 \n\t" + "xvadddp 36, 36, 38 \n\t" + + "xvadddp 32, 32, 36 \n\t" + + + "stxvd2x 32, 0, %3 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (svec), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2", "memory" + ); + +} + + diff --git a/kernel/power/zaxpy.c b/kernel/power/zaxpy.c new file mode 100644 index 000000000..0ee0c1bf9 --- /dev/null +++ b/kernel/power/zaxpy.c @@ -0,0 +1,140 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + + +#if defined(POWER8) +#include "zaxpy_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_4 + +static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG register i = 0; + BLASLONG register ix = 0; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + + + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ; + y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ; + y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ; +#endif + + ix+=4 ; + i+=2 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT da[4]; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + { + da[0] = da_r; + da[1] = da_r; + da[2] = da_i; + da[3] = da_i; + zaxpy_kernel_4(n1, x, y , da ); + ix = 2 * n1; + } + i = n1; + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + i++ ; + ix += 2; + + } + return(0); + + + } + + inc_x *=2; + inc_y *=2; + + while(i < n) + { + +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/power/zaxpy_microk_power8.c b/kernel/power/zaxpy_microk_power8.c new file mode 100644 index 000000000..c8a529fd9 --- /dev/null +++ b/kernel/power/zaxpy_microk_power8.c @@ -0,0 +1,250 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +#define HAVE_KERNEL_4 1 +static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *y2=y+1; + BLASLONG pre = 384; + +#if !defined(CONJ) + FLOAT mvec[2] = { -1.0, 1.0 }; +#else + FLOAT mvec[2] = { 1.0, -1.0 }; +#endif + + + __asm__ __volatile__ + ( + + "lxsdx 34, 0 , %4 \n\t" // alpha_r + "lxsdx 35, %5, %4 \n\t" // alpha_i + "xxspltd 32, 34, 0 \n\t" + "xxspltd 33, 35, 0 \n\t" + + "lxvd2x 36, 0, %9 \n\t" // mvec + +#if !defined(CONJ) + "xvmuldp 33, 33 , 36 \n\t" // alpha_i * mvec +#else + "xvmuldp 32, 32 , 36 \n\t" // alpha_r * mvec +#endif + + "addi %8, %8, -8 \n\t" + + "dcbt %2, %10 \n\t" + "dcbt %3, %10 \n\t" + + + "lxvd2x 40, 0, %2 \n\t" // x0 + "lxvd2x 41, %5, %2 \n\t" // x1 + "lxvd2x 42, %6, %2 \n\t" // x2 + "lxvd2x 43, %7, %2 \n\t" // x3 + + "lxvd2x 48, 0, %3 \n\t" // y0 + "lxvd2x 49, %5, %3 \n\t" // y1 + "lxvd2x 50, %6, %3 \n\t" // y2 + "lxvd2x 51, %7, %3 \n\t" // y3 + + "xxswapd 56, 40 \n\t" // exchange real and imag part + "xxswapd 57, 41 \n\t" // exchange real and imag part + "xxswapd 58, 42 \n\t" // exchange real and imag part + "xxswapd 59, 43 \n\t" // exchange real and imag part + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + "lxvd2x 44, 0, %2 \n\t" // x4 + "lxvd2x 45, %5, %2 \n\t" // x5 + "lxvd2x 46, %6, %2 \n\t" // x6 + "lxvd2x 47, %7, %2 \n\t" // x7 + + "lxvd2x 52, 0, %3 \n\t" // y4 + "lxvd2x 53, %5, %3 \n\t" // y5 + "lxvd2x 54, %6, %3 \n\t" // y6 + "lxvd2x 55, %7, %3 \n\t" // y7 + + "xxswapd 60, 44 \n\t" // exchange real and imag part + "xxswapd 61, 45 \n\t" // exchange real and imag part + "xxswapd 62, 46 \n\t" // exchange real and imag part + "xxswapd 63, 47 \n\t" // exchange real and imag part + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + "addic. 
%0 , %0 , -8 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %10 \n\t" + "dcbt %3, %10 \n\t" + + "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i + "xvmaddadp 49, 41, 32 \n\t" + "lxvd2x 40, 0, %2 \n\t" // x0 + "lxvd2x 41, %5, %2 \n\t" // x1 + "xvmaddadp 50, 42, 32 \n\t" + "xvmaddadp 51, 43, 32 \n\t" + "lxvd2x 42, %6, %2 \n\t" // x2 + "lxvd2x 43, %7, %2 \n\t" // x3 + + "xvmaddadp 52, 44, 32 \n\t" + "addi %2, %2, 64 \n\t" + "xvmaddadp 53, 45, 32 \n\t" + "lxvd2x 44, 0, %2 \n\t" // x4 + "lxvd2x 45, %5, %2 \n\t" // x5 + "xvmaddadp 54, 46, 32 \n\t" + "xvmaddadp 55, 47, 32 \n\t" + "lxvd2x 46, %6, %2 \n\t" // x6 + "lxvd2x 47, %7, %2 \n\t" // x7 + + "xvmaddadp 48, 56, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r + "addi %2, %2, 64 \n\t" + "xvmaddadp 49, 57, 33 \n\t" + "xvmaddadp 50, 58, 33 \n\t" + "xvmaddadp 51, 59, 33 \n\t" + + "xvmaddadp 52, 60, 33 \n\t" + "xvmaddadp 53, 61, 33 \n\t" + "xvmaddadp 54, 62, 33 \n\t" + "xvmaddadp 55, 63, 33 \n\t" + + "stxvd2x 48, 0, %8 \n\t" + "stxvd2x 49, %5, %8 \n\t" + "stxvd2x 50, %6, %8 \n\t" + "stxvd2x 51, %7, %8 \n\t" + + "addi %8, %8, 64 \n\t" + + "stxvd2x 52, 0, %8 \n\t" + "stxvd2x 53, %5, %8 \n\t" + "stxvd2x 54, %6, %8 \n\t" + "stxvd2x 55, %7, %8 \n\t" + + "addi %8, %8, 64 \n\t" + + "xxswapd 56, 40 \n\t" // exchange real and imag part + "xxswapd 57, 41 \n\t" // exchange real and imag part + "lxvd2x 48, 0, %3 \n\t" // y0 + "lxvd2x 49, %5, %3 \n\t" // y1 + "xxswapd 58, 42 \n\t" // exchange real and imag part + "xxswapd 59, 43 \n\t" // exchange real and imag part + "lxvd2x 50, %6, %3 \n\t" // y2 + "lxvd2x 51, %7, %3 \n\t" // y3 + + "xxswapd 60, 44 \n\t" // exchange real and imag part + "addi %3, %3, 64 \n\t" + "xxswapd 61, 45 \n\t" // exchange real and imag part + "lxvd2x 52, 0, %3 \n\t" // y4 + "lxvd2x 53, %5, %3 \n\t" // y5 + "xxswapd 62, 46 \n\t" // exchange real and imag part + "xxswapd 63, 47 \n\t" // exchange real and imag part + "lxvd2x 54, %6, %3 \n\t" // y6 + "lxvd2x 55, %7, %3 \n\t" // y7 + + "addi %3, %3, 64 \n\t" + + "addic. %0 , %0 , -8 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i + "xvmaddadp 49, 41, 32 \n\t" + "xvmaddadp 50, 42, 32 \n\t" + "xvmaddadp 51, 43, 32 \n\t" + + "xvmaddadp 52, 44, 32 \n\t" + "xvmaddadp 53, 45, 32 \n\t" + "xvmaddadp 54, 46, 32 \n\t" + "xvmaddadp 55, 47, 32 \n\t" + + "xvmaddadp 48, 56, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r + "xvmaddadp 49, 57, 33 \n\t" + "xvmaddadp 50, 58, 33 \n\t" + "xvmaddadp 51, 59, 33 \n\t" + + "xvmaddadp 52, 60, 33 \n\t" + "xvmaddadp 53, 61, 33 \n\t" + "xvmaddadp 54, 62, 33 \n\t" + "xvmaddadp 55, 63, 33 \n\t" + + + "stxvd2x 48, 0, %8 \n\t" + "stxvd2x 49, %5, %8 \n\t" + "stxvd2x 50, %6, %8 \n\t" + "stxvd2x 51, %7, %8 \n\t" + + "addi %8, %8, 64 \n\t" + + "stxvd2x 52, 0, %8 \n\t" + "stxvd2x 53, %5, %8 \n\t" + "stxvd2x 54, %6, %8 \n\t" + "stxvd2x 55, %7, %8 \n\t" + + "addi %8, %8, 64 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (y1), // 3 + "r" (alpha), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (y2), // 8 + "r" (mvec), // 9 + "r" (pre) // 10 + : "cr0", "%0", "%2" , "%3", "%8", "memory" + ); + +} + + diff --git a/kernel/power/zcopy.c b/kernel/power/zcopy.c new file mode 100644 index 000000000..a7658f7ab --- /dev/null +++ b/kernel/power/zcopy.c @@ -0,0 +1,140 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "zcopy_microk_power8.c" +#endif + +#ifndef HAVE_KERNEL_16 + +static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + zcopy_kernel_16(n1, x, y); + i=n1; + ix=n1*2; + iy=n1*2; + } + + while(i < n) + { + y[iy] = x[iy] ; + y[iy+1] = x[ix+1] ; + ix+=2; + iy+=2; + i++ ; + + } + + + } + else + { + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while(i < n) + { + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/zcopy_microk_power8.c b/kernel/power/zcopy_microk_power8.c new file mode 100644 index 000000000..73abe084e --- /dev/null +++ b/kernel/power/zcopy_microk_power8.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void zcopy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void zcopy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvd2x 50, 0, %2 \n\t" + "lxvd2x 51, %5, %2 \n\t" + "lxvd2x 52, %6, %2 \n\t" + "lxvd2x 53, %7, %2 \n\t" + "lxvd2x 54, %8, %2 \n\t" + "lxvd2x 55, %9, %2 \n\t" + "lxvd2x 56, %10, %2 \n\t" + "lxvd2x 57, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -16 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "stxvd2x 40, 0, %1 \n\t" + "stxvd2x 41, %5, %1 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "stxvd2x 42, %6, %1 \n\t" + "stxvd2x 43, %7, %1 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "stxvd2x 44, %8, %1 \n\t" + "stxvd2x 45, %9, %1 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "stxvd2x 46, %10, %1 \n\t" + "stxvd2x 47, %11, %1 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "stxvd2x 50, 0, %1 \n\t" + "stxvd2x 51, %5, %1 \n\t" + "lxvd2x 50, 0, %2 \n\t" + "lxvd2x 51, %5, %2 \n\t" + "stxvd2x 52, %6, %1 \n\t" + "stxvd2x 53, %7, %1 \n\t" + "lxvd2x 52, %6, %2 \n\t" + "lxvd2x 53, %7, %2 \n\t" + "stxvd2x 54, %8, %1 \n\t" + "stxvd2x 55, %9, %1 \n\t" + "lxvd2x 54, %8, %2 \n\t" + "lxvd2x 55, %9, %2 \n\t" + "stxvd2x 56, %10, %1 \n\t" + "stxvd2x 57, %11, %1 \n\t" + "lxvd2x 56, %10, %2 \n\t" + "lxvd2x 57, %11, %2 \n\t" + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "addic. 
%0 , %0 , -16 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "stxvd2x 40, 0, %1 \n\t" + "stxvd2x 41, %5, %1 \n\t" + "stxvd2x 42, %6, %1 \n\t" + "stxvd2x 43, %7, %1 \n\t" + "stxvd2x 44, %8, %1 \n\t" + "stxvd2x 45, %9, %1 \n\t" + "stxvd2x 46, %10, %1 \n\t" + "stxvd2x 47, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "stxvd2x 50, 0, %1 \n\t" + "stxvd2x 51, %5, %1 \n\t" + "stxvd2x 52, %6, %1 \n\t" + "stxvd2x 53, %7, %1 \n\t" + "stxvd2x 54, %8, %1 \n\t" + "stxvd2x 55, %9, %1 \n\t" + "stxvd2x 56, %10, %1 \n\t" + "stxvd2x 57, %11, %1 \n\t" + + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + diff --git a/kernel/power/zdot.c b/kernel/power/zdot.c new file mode 100644 index 000000000..1205b34b6 --- /dev/null +++ b/kernel/power/zdot.c @@ -0,0 +1,167 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" +#include + + +#if defined(POWER8) +#include "zdot_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline)); + +static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + BLASLONG register i = 0; + FLOAT dot[4] = { 0.0, 0.0, 0.0, 0.0 }; + BLASLONG j=0; + + while( i < n ) + { + + dot[0] += x[j] * y[j] ; + dot[1] += x[j+1] * y[j+1] ; + dot[2] += x[j] * y[j+1] ; + dot[3] += x[j+1] * y[j] ; + + dot[0] += x[j+2] * y[j+2] ; + dot[1] += x[j+3] * y[j+3] ; + dot[2] += x[j+2] * y[j+3] ; + dot[3] += x[j+3] * y[j+2] ; + + dot[0] += x[j+4] * y[j+4] ; + dot[1] += x[j+5] * y[j+5] ; + dot[2] += x[j+4] * y[j+5] ; + dot[3] += x[j+5] * y[j+4] ; + + dot[0] += x[j+6] * y[j+6] ; + dot[1] += x[j+7] * y[j+7] ; + dot[2] += x[j+6] * y[j+7] ; + dot[3] += x[j+7] * y[j+6] ; + + j+=8; + i+=4; + + } + d[0] = dot[0]; + d[1] = dot[1]; + d[2] = dot[2]; + d[3] = dot[3]; + +} + +#endif + +FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i; + BLASLONG ix,iy; + FLOAT _Complex result; + FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; + + if ( n <= 0 ) + { + __real__ result = 0.0 ; + __imag__ result = 0.0 ; + return(result); + + } + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -8; + + if ( n1 ) + zdot_kernel_8(n1, x, y , dot ); + + i = n1; + BLASLONG j = i * 2; + + while( i < n ) + { + + dot[0] += x[j] * y[j] ; + dot[1] += x[j+1] * y[j+1] ; + dot[2] += x[j] * y[j+1] ; + dot[3] += x[j+1] * y[j] ; + + j+=2; + i++ ; + + } + + + } + else + { + i=0; + ix=0; + iy=0; + inc_x <<= 1; + inc_y <<= 1; + while(i < n) + { + + dot[0] += x[ix] * y[iy] ; + dot[1] += x[ix+1] * y[iy+1] ; + dot[2] += x[ix] * y[iy+1] ; + dot[3] += x[ix+1] * y[iy] ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + } + +#if !defined(CONJ) + __real__ result = dot[0] - dot[1]; + __imag__ result = dot[2] + dot[3]; +#else + __real__ result = dot[0] + dot[1]; + __imag__ result = dot[2] - dot[3]; + +#endif + + return(result); + +} + + diff --git a/kernel/power/zdot_microk_power8.c b/kernel/power/zdot_microk_power8.c new file mode 100644 index 000000000..296d3d469 --- /dev/null +++ b/kernel/power/zdot_microk_power8.c @@ -0,0 +1,219 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + "xxlxor 32,32,32 \n\t" + "xxlxor 33,33,33 \n\t" + "xxlxor 34,34,34 \n\t" + "xxlxor 35,35,35 \n\t" + "xxlxor 36,36,36 \n\t" + "xxlxor 37,37,37 \n\t" + "xxlxor 38,38,38 \n\t" + "xxlxor 39,39,39 \n\t" + + "dcbt %2, %8 \n\t" + "dcbt %3, %8 \n\t" + + "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i + "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i + "lxvd2x 41, %5, %2 \n\t" // x1_r, x1_i + "lxvd2x 49, %5, %3 \n\t" // y1_r, y1_i + "lxvd2x 42, %6, %2 \n\t" // x2_r, x2_i + "lxvd2x 50, %6, %3 \n\t" // y2_r, y2_i + "lxvd2x 43, %7, %2 \n\t" // x3_r, x3_i + "lxvd2x 51, %7, %3 \n\t" // y3_r, y3_i + + "xxswapd 52,48 \n\t" // y0_i, y0_r + "xxswapd 53,49 \n\t" // y1_i, y1_r + "xxswapd 54,50 \n\t" // y2_i, y2_r + "xxswapd 55,51 \n\t" // y3_i, y3_r + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + + "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i + "lxvd2x 56, 0, %3 \n\t" // y0_r, y0_i + "lxvd2x 45, %5, %2 \n\t" // x1_r, x1_i + "lxvd2x 57, %5, %3 \n\t" // y1_r, y1_i + "lxvd2x 46, %6, %2 \n\t" // x2_r, x2_i + "lxvd2x 58, %6, %3 \n\t" // y2_r, y2_i + "lxvd2x 47, %7, %2 \n\t" // x3_r, x3_i + "lxvd2x 59, %7, %3 \n\t" // y3_r, y3_i + + "xxswapd 60,56 \n\t" // y0_i, y0_r + "xxswapd 61,57 \n\t" // y1_i, y1_r + "xxswapd 62,58 \n\t" // y2_i, y2_r + "xxswapd 63,59 \n\t" // y3_i, y3_r + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + "addic. 
%0 , %0 , -8 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %8 \n\t" + "dcbt %3, %8 \n\t" + + "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i + "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i + "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i + "lxvd2x 49, %5, %3 \n\t" // y1_r, y1_i + + "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i + "lxvd2x 50, %6, %3 \n\t" // y2_r, y2_i + "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i + "lxvd2x 51, %7, %3 \n\t" // y3_r, y3_i + + "xvmaddadp 33, 40, 52 \n\t" // x0_r * y0_i , x0_i * y0_r + "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i + "xvmaddadp 35, 41, 53 \n\t" // x1_r * y1_i , x1_i * y1_r + "lxvd2x 41, %5, %2 \n\t" // x1_r, x1_i + + "xvmaddadp 37, 42, 54 \n\t" // x2_r * y2_i , x2_i * y2_r + "lxvd2x 42, %6, %2 \n\t" // x2_r, x2_i + "xvmaddadp 39, 43, 55 \n\t" // x3_r * y3_i , x3_i * y3_r + "lxvd2x 43, %7, %2 \n\t" // x3_r, x3_i + + "xxswapd 52,48 \n\t" // y0_i, y0_r + "xxswapd 53,49 \n\t" // y1_i, y1_r + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + "xxswapd 54,50 \n\t" // y2_i, y2_r + "xxswapd 55,51 \n\t" // y3_i, y3_r + + "xvmaddadp 32, 44, 56 \n\t" // x0_r * y0_r , x0_i * y0_i + "lxvd2x 56, 0, %3 \n\t" // y0_r, y0_i + "xvmaddadp 34, 45, 57 \n\t" // x1_r * y1_r , x1_i * y1_i + "lxvd2x 57, %5, %3 \n\t" // y1_r, y1_i + "xvmaddadp 36, 46, 58 \n\t" // x2_r * y2_r , x2_i * y2_i + "lxvd2x 58, %6, %3 \n\t" // y2_r, y2_i + "xvmaddadp 38, 47, 59 \n\t" // x3_r * y3_r , x3_i * y3_i + "lxvd2x 59, %7, %3 \n\t" // y3_r, y3_i + + "xvmaddadp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r + "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i + "xvmaddadp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r + "lxvd2x 45, %5, %2 \n\t" // x1_r, x1_i + "xvmaddadp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r + "lxvd2x 46, %6, %2 \n\t" // x2_r, x2_i + "xvmaddadp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r + "lxvd2x 47, %7, %2 \n\t" // x3_r, x3_i + + "xxswapd 60,56 \n\t" // y0_i, y0_r + "xxswapd 61,57 \n\t" // y1_i, y1_r + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + "xxswapd 62,58 \n\t" // y2_i, y2_r + "xxswapd 63,59 \n\t" // y3_i, y3_r + + "addic. 
%0 , %0 , -8 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i + "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i + + "xvmaddadp 33, 40, 52 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddadp 35, 41, 53 \n\t" // x1_r * y1_i , x1_i * y1_r + "xvmaddadp 37, 42, 54 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddadp 39, 43, 55 \n\t" // x3_r * y3_i , x3_i * y3_r + + "xvmaddadp 32, 44, 56 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddadp 34, 45, 57 \n\t" // x1_r * y1_r , x1_i * y1_i + "xvmaddadp 36, 46, 58 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddadp 38, 47, 59 \n\t" // x3_r * y3_r , x3_i * y3_i + + "xvmaddadp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddadp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r + "xvmaddadp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddadp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r + + + "xvadddp 32, 32, 34 \n\t" + "xvadddp 36, 36, 38 \n\t" + + "xvadddp 33, 33, 35 \n\t" + "xvadddp 37, 37, 39 \n\t" + + "xvadddp 32, 32, 36 \n\t" + "xvadddp 33, 33, 37 \n\t" + + "stxvd2x 32, 0, %4 \n\t" + "stxvd2x 33, %5, %4 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (y1), // 3 + "r" (dot), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (pre) // 8 + : "cr0", "%0", "%2" , "%3", "memory" + ); + +} + + diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S index a7665f749..336b13b1f 100644 --- a/kernel/power/zgemm_kernel_8x2_power8.S +++ b/kernel/power/zgemm_kernel_8x2_power8.S @@ -1,38 +1,3 @@ -/*************************************************************************** -Copyright (c) 2013-2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -/************************************************************************************** -* 2016/03/05 Werner Saar (wernsaar@googlemail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ - /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ @@ -82,7 +47,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef __64BIT__ -#define STACKSIZE 320 +#define STACKSIZE 32000 #define ALPHA_R_SP 296(SP) #define ALPHA_I_SP 304(SP) #define FZERO 312(SP) @@ -133,11 +98,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define alpha_r vs30 #define alpha_i vs31 + +#define FRAMEPOINTER r12 + +#define BBUFFER r14 + #define L r15 #define ALPHA r16 #define o24 r17 #define T2 r19 -#define KK r20 +#define BBO r20 #define o8 r21 #define I r22 #define J r23 @@ -156,8 +126,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PROFCODE - addi SP, SP, -STACKSIZE - li r0, 0 + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) @@ -200,6 +174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) + std r14, 280(SP) #else stw r31, 144(SP) stw r30, 148(SP) @@ -226,37 +201,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef linux #ifdef __64BIT__ - ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #else #ifdef DOUBLE - lwz B, FRAMESLOT(0) + STACKSIZE(SP) - lwz C, FRAMESLOT(1) + STACKSIZE(SP) - lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER) + lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER) + lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER) #else - lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #endif #endif #ifdef TRMMKERNEL #if defined(linux) && defined(__64BIT__) - ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #else #ifdef DOUBLE - lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER) #else - lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #endif #endif @@ -268,34 +243,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "zgemm_macros_8x2_power8.S" cmpwi cr0, M, 0 - ble .L999 + ble L999 cmpwi cr0, N, 0 - ble .L999 + ble L999 cmpwi cr0, K, 0 - ble .L999 + ble L999 slwi LDC, LDC, ZBASE_SHIFT - li PRE, 256 + li PRE, 384 li o8 , 8 li o16 , 16 li o24 , 24 li o32 , 32 li o48 , 48 + addi BBUFFER, SP, 512+4096 + li T1, -4096 + and BBUFFER, BBUFFER, T1 + #ifdef __64BIT__ addi ALPHA, SP, 296 #else addi ALPHA, SP, 224 #endif - lxvdsx alpha_r, 0, ALPHA - lxvdsx alpha_i, o8, ALPHA + lxsdx alpha_r, 0, ALPHA + lxsdx alpha_i, o8, ALPHA - .align 5 + .align 4 #include "zgemm_logic_8x2_power8.S" -.L999: +L999: addi r3, 0, 0 lfd f14, 0(SP) @@ -339,6 +318,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) + ld r14, 280(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) @@ -360,6 +340,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE blr diff --git a/kernel/power/zgemm_logic_8x2_power8.S b/kernel/power/zgemm_logic_8x2_power8.S index 5fcade5bf..96612da82 100644 --- a/kernel/power/zgemm_logic_8x2_power8.S +++ b/kernel/power/zgemm_logic_8x2_power8.S @@ -1,83 +1,111 @@ srawi. J, N, 1 - ble .LZGEMM_L2_END + ble ZGEMM_L2_END + +ZGEMM_L2_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 1 + +ZGEMM_L2_COPYB: + + lxvdsx vs4, o0, BO // b0_r + lxvdsx vs5, o8, BO // b0_i + addi BO, BO, 16 + stxvd2x vs4, o0, BBO + stxvd2x vs5, o16, BBO + addic. T1, T1, -1 + addi BBO, BBO, 32 + + bge ZGEMM_L2_COPYB -.LZGEMM_L2_BEGIN: mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 srawi. I, M, 3 - ble .LZGEMM_L2x8_END + ble ZGEMM_L2x8_END -.LZGEMM_L2x8_BEGIN: +ZGEMM_L2x8_BEGIN: - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L2x8_SUB0 + ble ZGEMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L2x8_SUB4 + ble ZGEMM_L2x8_SUB4 -.LZGEMM_L2x8_LOOP_START: +ZGEMM_L2x8_LOOP_START: dcbt AO, PRE + dcbt BO, PRE LOAD2x8_1 dcbt AO, PRE KERNEL2x8_I1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 addic. L, L, -2 - ble .LZGEMM_L2x8_LOOP_END + ble ZGEMM_L2x8_LOOP_END .align 5 -.LZGEMM_L2x8_LOOP: +ZGEMM_L2x8_LOOP: dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 addic. L, L, -1 - bgt .LZGEMM_L2x8_LOOP + bgt ZGEMM_L2x8_LOOP -.LZGEMM_L2x8_LOOP_END: +ZGEMM_L2x8_LOOP_END: dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE @@ -88,9 +116,9 @@ KERNEL2x8_1 KERNEL2x8_E2 - b .LZGEMM_L2x8_SUB1 + b ZGEMM_L2x8_SUB1 -.LZGEMM_L2x8_SUB4: +ZGEMM_L2x8_SUB4: dcbt AO, PRE KERNEL2x8_SUBI1 @@ -106,53 +134,53 @@ KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b .LZGEMM_L2x8_SUB1 + b ZGEMM_L2x8_SUB1 -.LZGEMM_L2x8_SUB0: +ZGEMM_L2x8_SUB0: andi. L, K, 7 KERNEL2x8_SUBI1 addic. L, L, -1 - ble .LZGEMM_L2x8_SAVE - b .LZGEMM_L2x8_SUB2 + ble ZGEMM_L2x8_SAVE + b ZGEMM_L2x8_SUB2 -.LZGEMM_L2x8_SUB1: +ZGEMM_L2x8_SUB1: andi. L, K, 7 - ble .LZGEMM_L2x8_SAVE + ble ZGEMM_L2x8_SAVE -.LZGEMM_L2x8_SUB2: +ZGEMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. 
L, L, -1 - bgt .LZGEMM_L2x8_SUB2 + bgt ZGEMM_L2x8_SUB2 -.LZGEMM_L2x8_SAVE: +ZGEMM_L2x8_SAVE: SAVE2x8 addic. I, I, -1 - bgt .LZGEMM_L2x8_BEGIN + bgt ZGEMM_L2x8_BEGIN -.LZGEMM_L2x8_END: +ZGEMM_L2x8_END: -.LZGEMM_L2x4_BEGIN: +ZGEMM_L2x4_BEGIN: andi. T2, M, 7 - ble .LZGEMM_L2x1_END + ble ZGEMM_L2x1_END andi. T1, M, 4 - ble .LZGEMM_L2x4_END - mr BO, B + ble ZGEMM_L2x4_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L2x4_SUB0 + ble ZGEMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L2x4_SUB4 + ble ZGEMM_L2x4_SUB4 -.LZGEMM_L2x4_LOOP_START: +ZGEMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -166,11 +194,11 @@ KERNEL2x4_2 addic. L, L, -2 - ble .LZGEMM_L2x4_LOOP_END + ble ZGEMM_L2x4_LOOP_END .align 5 -.LZGEMM_L2x4_LOOP: +ZGEMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -183,9 +211,9 @@ KERNEL2x4_2 addic. L, L, -1 - bgt .LZGEMM_L2x4_LOOP + bgt ZGEMM_L2x4_LOOP -.LZGEMM_L2x4_LOOP_END: +ZGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -197,9 +225,9 @@ KERNEL2x4_1 KERNEL2x4_E2 - b .LZGEMM_L2x4_SUB1 + b ZGEMM_L2x4_SUB1 -.LZGEMM_L2x4_SUB4: +ZGEMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -211,48 +239,48 @@ KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b .LZGEMM_L2x4_SUB1 + b ZGEMM_L2x4_SUB1 -.LZGEMM_L2x4_SUB0: +ZGEMM_L2x4_SUB0: andi. L, K, 7 KERNEL2x4_SUBI1 addic. L, L, -1 - ble .LZGEMM_L2x4_SAVE - b .LZGEMM_L2x4_SUB2 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 -.LZGEMM_L2x4_SUB1: +ZGEMM_L2x4_SUB1: andi. L, K, 7 - ble .LZGEMM_L2x4_SAVE + ble ZGEMM_L2x4_SAVE -.LZGEMM_L2x4_SUB2: +ZGEMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt .LZGEMM_L2x4_SUB2 + bgt ZGEMM_L2x4_SUB2 -.LZGEMM_L2x4_SAVE: +ZGEMM_L2x4_SAVE: SAVE2x4 -.LZGEMM_L2x4_END: +ZGEMM_L2x4_END: -.LZGEMM_L2x2_BEGIN: +ZGEMM_L2x2_BEGIN: andi. T1, M, 2 - ble .LZGEMM_L2x2_END - mr BO, B + ble ZGEMM_L2x2_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L2x2_SUB0 + ble ZGEMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L2x2_SUB4 + ble ZGEMM_L2x2_SUB4 -.LZGEMM_L2x2_LOOP_START: +ZGEMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -266,11 +294,11 @@ KERNEL2x2_2 addic. L, L, -2 - ble .LZGEMM_L2x2_LOOP_END + ble ZGEMM_L2x2_LOOP_END .align 5 -.LZGEMM_L2x2_LOOP: +ZGEMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -283,9 +311,9 @@ KERNEL2x2_2 addic. L, L, -1 - bgt .LZGEMM_L2x2_LOOP + bgt ZGEMM_L2x2_LOOP -.LZGEMM_L2x2_LOOP_END: +ZGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -297,9 +325,9 @@ KERNEL2x2_1 KERNEL2x2_E2 - b .LZGEMM_L2x2_SUB1 + b ZGEMM_L2x2_SUB1 -.LZGEMM_L2x2_SUB4: +ZGEMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -311,48 +339,48 @@ KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b .LZGEMM_L2x2_SUB1 + b ZGEMM_L2x2_SUB1 -.LZGEMM_L2x2_SUB0: +ZGEMM_L2x2_SUB0: andi. L, K, 7 KERNEL2x2_SUBI1 addic. L, L, -1 - ble .LZGEMM_L2x2_SAVE - b .LZGEMM_L2x2_SUB2 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 -.LZGEMM_L2x2_SUB1: +ZGEMM_L2x2_SUB1: andi. L, K, 7 - ble .LZGEMM_L2x2_SAVE + ble ZGEMM_L2x2_SAVE -.LZGEMM_L2x2_SUB2: +ZGEMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt .LZGEMM_L2x2_SUB2 + bgt ZGEMM_L2x2_SUB2 -.LZGEMM_L2x2_SAVE: +ZGEMM_L2x2_SAVE: SAVE2x2 -.LZGEMM_L2x2_END: +ZGEMM_L2x2_END: -.LZGEMM_L2x1_BEGIN: +ZGEMM_L2x1_BEGIN: andi. T1, M, 1 - ble .LZGEMM_L2x1_END - mr BO, B + ble ZGEMM_L2x1_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L2x1_SUB0 + ble ZGEMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L2x1_SUB4 + ble ZGEMM_L2x1_SUB4 -.LZGEMM_L2x1_LOOP_START: +ZGEMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -366,11 +394,11 @@ KERNEL2x1_2 addic. L, L, -2 - ble .LZGEMM_L2x1_LOOP_END + ble ZGEMM_L2x1_LOOP_END .align 5 -.LZGEMM_L2x1_LOOP: +ZGEMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -383,9 +411,9 @@ KERNEL2x1_2 addic. 
L, L, -1 - bgt .LZGEMM_L2x1_LOOP + bgt ZGEMM_L2x1_LOOP -.LZGEMM_L2x1_LOOP_END: +ZGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -397,9 +425,9 @@ KERNEL2x1_1 KERNEL2x1_E2 - b .LZGEMM_L2x1_SUB1 + b ZGEMM_L2x1_SUB1 -.LZGEMM_L2x1_SUB4: +ZGEMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -411,72 +439,89 @@ KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b .LZGEMM_L2x1_SUB1 + b ZGEMM_L2x1_SUB1 -.LZGEMM_L2x1_SUB0: +ZGEMM_L2x1_SUB0: andi. L, K, 7 KERNEL2x1_SUBI1 addic. L, L, -1 - ble .LZGEMM_L2x1_SAVE - b .LZGEMM_L2x1_SUB2 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 -.LZGEMM_L2x1_SUB1: +ZGEMM_L2x1_SUB1: andi. L, K, 7 - ble .LZGEMM_L2x1_SAVE + ble ZGEMM_L2x1_SAVE -.LZGEMM_L2x1_SUB2: +ZGEMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 - bgt .LZGEMM_L2x1_SUB2 + bgt ZGEMM_L2x1_SUB2 -.LZGEMM_L2x1_SAVE: +ZGEMM_L2x1_SAVE: SAVE2x1 -.LZGEMM_L2x1_END: +ZGEMM_L2x1_END: slwi T1, K, 5 add B, B, T1 addic. J, J, -1 - bgt .LZGEMM_L2_BEGIN + bgt ZGEMM_L2_BEGIN andi. T2, N, 1 - ble .L999 + ble L999 -.LZGEMM_L2_END: +ZGEMM_L2_END: - b .LZGEMM_L1_BEGIN + b ZGEMM_L1_BEGIN -.L999_H1: +L999_H1: - b .L999 + b L999 + +ZGEMM_L1_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 0 + +ZGEMM_L1_COPYB: + + lxvdsx vs4, o0, BO // b0_r + lxvdsx vs5, o8, BO // b0_i + addi BO, BO, 16 + stxvd2x vs4, o0, BBO + stxvd2x vs5, o16, BBO + addic. T1, T1, -1 + addi BBO, BBO, 32 + + bge ZGEMM_L1_COPYB -.LZGEMM_L1_BEGIN: andi. T1, N, 1 - ble .LZGEMM_L1_END + ble ZGEMM_L1_END mr CO, C mr AO, A srawi. I, M, 3 - ble .LZGEMM_L1x8_END + ble ZGEMM_L1x8_END -.LZGEMM_L1x8_BEGIN: +ZGEMM_L1x8_BEGIN: - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L1x8_SUB0 + ble ZGEMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L1x8_SUB4 + ble ZGEMM_L1x8_SUB4 -.LZGEMM_L1x8_LOOP_START: +ZGEMM_L1x8_LOOP_START: dcbt AO, PRE LOAD1x8_1 @@ -499,11 +544,11 @@ KERNEL1x8_2 addic. L, L, -2 - ble .LZGEMM_L1x8_LOOP_END + ble ZGEMM_L1x8_LOOP_END .align 5 -.LZGEMM_L1x8_LOOP: +ZGEMM_L1x8_LOOP: dcbt AO, PRE KERNEL1x8_1 @@ -524,9 +569,9 @@ KERNEL1x8_2 addic. L, L, -1 - bgt .LZGEMM_L1x8_LOOP + bgt ZGEMM_L1x8_LOOP -.LZGEMM_L1x8_LOOP_END: +ZGEMM_L1x8_LOOP_END: dcbt AO, PRE KERNEL1x8_1 @@ -545,9 +590,9 @@ KERNEL1x8_1 KERNEL1x8_E2 - b .LZGEMM_L1x8_SUB1 + b ZGEMM_L1x8_SUB1 -.LZGEMM_L1x8_SUB4: +ZGEMM_L1x8_SUB4: dcbt AO, PRE KERNEL1x8_SUBI1 @@ -563,53 +608,53 @@ KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b .LZGEMM_L1x8_SUB1 + b ZGEMM_L1x8_SUB1 -.LZGEMM_L1x8_SUB0: +ZGEMM_L1x8_SUB0: andi. L, K, 7 KERNEL1x8_SUBI1 addic. L, L, -1 - ble .LZGEMM_L1x8_SAVE - b .LZGEMM_L1x8_SUB2 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 -.LZGEMM_L1x8_SUB1: +ZGEMM_L1x8_SUB1: andi. L, K, 7 - ble .LZGEMM_L1x8_SAVE + ble ZGEMM_L1x8_SAVE -.LZGEMM_L1x8_SUB2: +ZGEMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 - bgt .LZGEMM_L1x8_SUB2 + bgt ZGEMM_L1x8_SUB2 -.LZGEMM_L1x8_SAVE: +ZGEMM_L1x8_SAVE: SAVE1x8 addic. I, I, -1 - bgt .LZGEMM_L1x8_BEGIN + bgt ZGEMM_L1x8_BEGIN -.LZGEMM_L1x8_END: +ZGEMM_L1x8_END: -.LZGEMM_L1x4_BEGIN: +ZGEMM_L1x4_BEGIN: andi. T2, M, 7 - ble .LZGEMM_L1x1_END + ble ZGEMM_L1x1_END andi. T1, M, 4 - ble .LZGEMM_L1x4_END - mr BO, B + ble ZGEMM_L1x4_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L1x4_SUB0 + ble ZGEMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L1x4_SUB4 + ble ZGEMM_L1x4_SUB4 -.LZGEMM_L1x4_LOOP_START: +ZGEMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -623,11 +668,11 @@ KERNEL1x4_2 addic. L, L, -2 - ble .LZGEMM_L1x4_LOOP_END + ble ZGEMM_L1x4_LOOP_END .align 5 -.LZGEMM_L1x4_LOOP: +ZGEMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -640,9 +685,9 @@ KERNEL1x4_2 addic. 
L, L, -1 - bgt .LZGEMM_L1x4_LOOP + bgt ZGEMM_L1x4_LOOP -.LZGEMM_L1x4_LOOP_END: +ZGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -654,9 +699,9 @@ KERNEL1x4_1 KERNEL1x4_E2 - b .LZGEMM_L1x4_SUB1 + b ZGEMM_L1x4_SUB1 -.LZGEMM_L1x4_SUB4: +ZGEMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -668,48 +713,48 @@ KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b .LZGEMM_L1x4_SUB1 + b ZGEMM_L1x4_SUB1 -.LZGEMM_L1x4_SUB0: +ZGEMM_L1x4_SUB0: andi. L, K, 7 KERNEL1x4_SUBI1 addic. L, L, -1 - ble .LZGEMM_L1x4_SAVE - b .LZGEMM_L1x4_SUB2 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 -.LZGEMM_L1x4_SUB1: +ZGEMM_L1x4_SUB1: andi. L, K, 7 - ble .LZGEMM_L1x4_SAVE + ble ZGEMM_L1x4_SAVE -.LZGEMM_L1x4_SUB2: +ZGEMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt .LZGEMM_L1x4_SUB2 + bgt ZGEMM_L1x4_SUB2 -.LZGEMM_L1x4_SAVE: +ZGEMM_L1x4_SAVE: SAVE1x4 -.LZGEMM_L1x4_END: +ZGEMM_L1x4_END: -.LZGEMM_L1x2_BEGIN: +ZGEMM_L1x2_BEGIN: andi. T1, M, 2 - ble .LZGEMM_L1x2_END - mr BO, B + ble ZGEMM_L1x2_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L1x2_SUB0 + ble ZGEMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L1x2_SUB4 + ble ZGEMM_L1x2_SUB4 -.LZGEMM_L1x2_LOOP_START: +ZGEMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -723,11 +768,11 @@ KERNEL1x2_2 addic. L, L, -2 - ble .LZGEMM_L1x2_LOOP_END + ble ZGEMM_L1x2_LOOP_END .align 5 -.LZGEMM_L1x2_LOOP: +ZGEMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -740,9 +785,9 @@ KERNEL1x2_2 addic. L, L, -1 - bgt .LZGEMM_L1x2_LOOP + bgt ZGEMM_L1x2_LOOP -.LZGEMM_L1x2_LOOP_END: +ZGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -754,9 +799,9 @@ KERNEL1x2_1 KERNEL1x2_E2 - b .LZGEMM_L1x2_SUB1 + b ZGEMM_L1x2_SUB1 -.LZGEMM_L1x2_SUB4: +ZGEMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -768,48 +813,48 @@ KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b .LZGEMM_L1x2_SUB1 + b ZGEMM_L1x2_SUB1 -.LZGEMM_L1x2_SUB0: +ZGEMM_L1x2_SUB0: andi. L, K, 7 KERNEL1x2_SUBI1 addic. L, L, -1 - ble .LZGEMM_L1x2_SAVE - b .LZGEMM_L1x2_SUB2 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 -.LZGEMM_L1x2_SUB1: +ZGEMM_L1x2_SUB1: andi. L, K, 7 - ble .LZGEMM_L1x2_SAVE + ble ZGEMM_L1x2_SAVE -.LZGEMM_L1x2_SUB2: +ZGEMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt .LZGEMM_L1x2_SUB2 + bgt ZGEMM_L1x2_SUB2 -.LZGEMM_L1x2_SAVE: +ZGEMM_L1x2_SAVE: SAVE1x2 -.LZGEMM_L1x2_END: +ZGEMM_L1x2_END: -.LZGEMM_L1x1_BEGIN: +ZGEMM_L1x1_BEGIN: andi. T1, M, 1 - ble .LZGEMM_L1x1_END - mr BO, B + ble ZGEMM_L1x1_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L1x1_SUB0 + ble ZGEMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L1x1_SUB4 + ble ZGEMM_L1x1_SUB4 -.LZGEMM_L1x1_LOOP_START: +ZGEMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -823,11 +868,11 @@ KERNEL1x1_2 addic. L, L, -2 - ble .LZGEMM_L1x1_LOOP_END + ble ZGEMM_L1x1_LOOP_END .align 5 -.LZGEMM_L1x1_LOOP: +ZGEMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -840,9 +885,9 @@ KERNEL1x1_2 addic. L, L, -1 - bgt .LZGEMM_L1x1_LOOP + bgt ZGEMM_L1x1_LOOP -.LZGEMM_L1x1_LOOP_END: +ZGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -854,9 +899,9 @@ KERNEL1x1_1 KERNEL1x1_E2 - b .LZGEMM_L1x1_SUB1 + b ZGEMM_L1x1_SUB1 -.LZGEMM_L1x1_SUB4: +ZGEMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -868,34 +913,34 @@ KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b .LZGEMM_L1x1_SUB1 + b ZGEMM_L1x1_SUB1 -.LZGEMM_L1x1_SUB0: +ZGEMM_L1x1_SUB0: andi. L, K, 7 KERNEL1x1_SUBI1 addic. L, L, -1 - ble .LZGEMM_L1x1_SAVE - b .LZGEMM_L1x1_SUB2 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 -.LZGEMM_L1x1_SUB1: +ZGEMM_L1x1_SUB1: andi. L, K, 7 - ble .LZGEMM_L1x1_SAVE + ble ZGEMM_L1x1_SAVE -.LZGEMM_L1x1_SUB2: +ZGEMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. 
L, L, -1 - bgt .LZGEMM_L1x1_SUB2 + bgt ZGEMM_L1x1_SUB2 -.LZGEMM_L1x1_SAVE: +ZGEMM_L1x1_SAVE: SAVE1x1 -.LZGEMM_L1x1_END: +ZGEMM_L1x1_END: -.LZGEMM_L1_END: +ZGEMM_L1_END: diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S index 701ec65c8..a0fbb2e11 100644 --- a/kernel/power/zgemm_macros_8x2_power8.S +++ b/kernel/power/zgemm_macros_8x2_power8.S @@ -1,39 +1,3 @@ -/*************************************************************************** -Copyright (c) 2013-2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* 2016/03/05 Werner Saar (wernsaar@googlemail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ - - #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define XSFADD_R1 xsadddp @@ -70,12 +34,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD2x8_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -110,12 +74,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -156,36 +120,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x8_1 + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B + + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B - xvmaddadp vs36, vs2, vs16 // real*real, imag*real xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag xvmaddadp vs38, vs3, vs16 // real*real, imag*real xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - - lxvd2x vs8, o0, AO // load real,imag from A - lxvd2x vs9, o16, AO // load real,imag from A - xvmaddadp vs40, vs4, vs16 // real*real, imag*real xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag xvmaddadp vs42, vs5, vs16 // real*real, imag*real xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag - - lxvd2x vs10, o32, AO // load real,imag from A - lxvd2x vs11, o48, AO // load real,imag from A - xvmaddadp vs44, vs6, vs16 // real*real, imag*real xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag - - addi AO, AO, 64 - xvmaddadp vs46, vs7, vs16 // real*real, imag*real xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag @@ -193,101 +162,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag xvmaddadp vs50, vs1, vs18 // real*real, imag*real xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag - - lxvd2x vs12, o0, AO // load real,imag from A - lxvd2x vs13, o16, AO // load real,imag from A - xvmaddadp vs52, vs2, vs18 // real*real, imag*real xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag xvmaddadp vs54, vs3, vs18 // real*real, imag*real xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag - - lxvd2x vs14, o32, AO // load real,imag from A - lxvd2x vs15, o48, AO // load real,imag from A - xvmaddadp vs56, vs4, vs18 // real*real, imag*real xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag xvmaddadp vs58, vs5, vs18 // real*real, imag*real xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag - - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - xvmaddadp vs60, vs6, vs18 // real*real, imag*real xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag xvmaddadp vs62, vs7, vs18 // real*real, imag*real xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag - addi AO, AO, 64 - addi BO, BO, 32 .endm .macro KERNEL2x8_2 + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B + + addi BO, BO, 64 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - xvmaddadp vs36, vs10, vs20 // real*real, imag*real xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag xvmaddadp vs38, vs11, vs20 // real*real, imag*real xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag - - lxvd2x vs0, o0, AO // load real,imag from A - lxvd2x vs1, o16, AO // load real,imag from A - xvmaddadp vs40, vs12, vs20 // real*real, imag*real xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag xvmaddadp vs42, vs13, vs20 // real*real, imag*real xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag - - lxvd2x vs2, o32, AO // load real,imag from A - lxvd2x vs3, o48, AO // load real,imag from A - xvmaddadp vs44, vs14, vs20 // real*real, imag*real xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag xvmaddadp vs46, vs15, vs20 // real*real, imag*real xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag - addi AO, AO, 64 - xvmaddadp vs48, vs8, vs22 // real*real, imag*real xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag xvmaddadp vs50, vs9, vs22 // real*real, imag*real xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag - - lxvd2x vs4, o0, AO // load real,imag from A - lxvd2x vs5, o16, AO // load real,imag from A - xvmaddadp vs52, vs10, vs22 // real*real, imag*real xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag xvmaddadp vs54, vs11, vs22 // real*real, imag*real xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag - - lxvd2x vs6, o32, AO // load real,imag from A - lxvd2x vs7, o48, AO // load real,imag from A - xvmaddadp vs56, vs12, vs22 // real*real, imag*real xvmaddadp vs57, vs12, vs23 // real*imag, 
imag*imag xvmaddadp vs58, vs13, vs22 // real*real, imag*real xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag - - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B - xvmaddadp vs60, vs14, vs22 // real*real, imag*real xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag xvmaddadp vs62, vs15, vs22 // real*real, imag*real xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag - addi AO, AO, 64 - addi BO, BO, 32 .endm @@ -347,12 +294,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -407,12 +354,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -927,12 +874,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD2x4_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -953,12 +900,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -990,12 +937,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1027,12 +974,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -1088,12 +1035,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -1125,12 +1072,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1410,12 +1357,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD2x2_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1432,12 +1379,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 32 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -1459,12 +1406,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1486,12 +1433,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -1529,12 +1476,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -1556,12 +1503,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1725,12 +1672,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro LOAD2x1_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 lxvd2x vs0, o0, AO // load real,imag from A @@ -1745,12 +1692,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -1767,12 +1714,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1789,12 +1736,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -1823,12 +1770,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -1845,12 +1792,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1956,10 +1903,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD1x8_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1994,10 +1941,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2035,10 +1982,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2076,10 +2023,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -2140,10 +2087,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2181,10 +2128,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2452,10 +2399,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro LOAD1x4_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2476,10 +2423,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2502,10 +2449,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2528,10 +2475,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -2569,10 +2516,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2595,10 +2542,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2748,10 +2695,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD1x2_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2768,10 +2715,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2788,10 +2735,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 32 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2808,10 +2755,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -2839,10 +2786,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2859,10 +2806,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2954,10 +2901,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD1x1_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A @@ -2972,10 +2919,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2989,10 +2936,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -3006,10 +2953,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -3032,10 +2979,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -3049,10 +2996,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c new file mode 100644 index 000000000..213839a8f --- /dev/null +++ b/kernel/power/zscal.c @@ -0,0 +1,176 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +#include "common.h" + +#pragma GCC optimize "O1" + +#if defined(POWER8) +#include "zscal_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void zscal_kernel_8(BLASLONG n, FLOAT *x, FLOAT *alpha) +{ + + BLASLONG i=0; + FLOAT *x1=x; + FLOAT alpha_r1=alpha[0]; + FLOAT alpha_r2=alpha[1]; + FLOAT alpha_i1=alpha[2]; + FLOAT alpha_i2=alpha[3]; + FLOAT temp00, temp01, temp10, temp11, temp20, temp21, temp30, temp31; + FLOAT x0_r, x0_i, x1_r, x1_i, x2_r, x2_i, x3_r, x3_i; + + while ( i 0 ) + { + alpha[0] = da_r; + alpha[1] = da_r; + alpha[2] = -da_i; + alpha[3] = da_i; + zscal_kernel_8(n1, x, alpha); + i=n1; + ip = n1 * 2; + + } + + while ( i < n ) + { + + temp = da_r * x[ip] - da_i * x[ip+1] ; + x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; + x[ip] = temp; + ip += 2; + i++; + } + + } + else + { + + inc_x2 = 2 * inc_x; + + while ( i < n ) + { + + temp = da_r * x[ip] - da_i * x[ip+1] ; + x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; + x[ip] = temp; + ip += inc_x2; + i++; + } + + + } + + return(0); + +} + + diff --git a/kernel/power/zscal_microk_power8.c b/kernel/power/zscal_microk_power8.c new file mode 100644 index 000000000..5e09d8d79 --- /dev/null +++ b/kernel/power/zscal_microk_power8.c @@ -0,0 +1,224 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* +* I don't use fused multiply-add ( lapack precision problems ) +* +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void zscal_kernel_8( BLASLONG n, FLOAT *x, FLOAT *alpha) __attribute__ ((noinline)); + +static void zscal_kernel_8( BLASLONG n, FLOAT *x, FLOAT *alpha) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *x2=x+1; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "lxvd2x 32, 0, %3 \n\t" // alpha_r , alpha_r + "lxvd2x 33, %5, %3 \n\t" // -alpha_i , alpha_i + "addi %1, %1, -8 \n\t" + + "dcbt %2, %4 \n\t" + + "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -8 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %4 \n\t" + + "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmuldp 49, 41, 32 \n\t" + "xvmuldp 50, 42, 32 \n\t" + "xvmuldp 51, 43, 32 \n\t" + "xvmuldp 52, 44, 32 \n\t" + "xvmuldp 53, 45, 32 \n\t" + "xvmuldp 54, 46, 32 \n\t" + "xvmuldp 55, 47, 32 \n\t" + + "xxswapd 56, 40 \n\t" + "xxswapd 57, 41 \n\t" + "xxswapd 58, 42 \n\t" + "xxswapd 59, 43 \n\t" + "xxswapd 60, 44 \n\t" + "xxswapd 61, 45 \n\t" + "xxswapd 62, 46 \n\t" + "xxswapd 63, 47 \n\t" + + "xvmuldp 56, 56, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmuldp 57, 57, 33 \n\t" + + "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i + "lxvd2x 41, %5, %2 \n\t" + + "xvmuldp 58, 58, 33 \n\t" + "xvmuldp 59, 59, 33 \n\t" + + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + + "xvmuldp 60, 60, 33 \n\t" + "xvmuldp 61, 61, 33 \n\t" + + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + + "xvmuldp 62, 62, 33 \n\t" + "xvmuldp 63, 63, 33 \n\t" + + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "xvadddp 48, 48 , 56 \n\t" + "xvadddp 49, 49 , 57 \n\t" + "xvadddp 50, 50 , 58 \n\t" + "xvadddp 51, 51 , 59 \n\t" + + "stxvd2x 48, 0, %1 \n\t" + "stxvd2x 49, %5, %1 \n\t" + + "xvadddp 52, 52 , 60 \n\t" + "xvadddp 53, 53 , 61 \n\t" + + "stxvd2x 50, %6, %1 \n\t" + "stxvd2x 51, %7, %1 \n\t" + + "xvadddp 54, 54 , 62 \n\t" + "xvadddp 55, 55 , 63 \n\t" + + "stxvd2x 52, %8, %1 \n\t" + "stxvd2x 53, %9, %1 \n\t" + "stxvd2x 54, %10, %1 \n\t" + "stxvd2x 55, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "addic. 
%0 , %0 , -8 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmuldp 49, 41, 32 \n\t" + "xvmuldp 50, 42, 32 \n\t" + "xvmuldp 51, 43, 32 \n\t" + "xvmuldp 52, 44, 32 \n\t" + "xvmuldp 53, 45, 32 \n\t" + "xvmuldp 54, 46, 32 \n\t" + "xvmuldp 55, 47, 32 \n\t" + + "xxswapd 56, 40 \n\t" + "xxswapd 57, 41 \n\t" + "xxswapd 58, 42 \n\t" + "xxswapd 59, 43 \n\t" + "xxswapd 60, 44 \n\t" + "xxswapd 61, 45 \n\t" + "xxswapd 62, 46 \n\t" + "xxswapd 63, 47 \n\t" + + "xvmuldp 56, 56, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmuldp 57, 57, 33 \n\t" + "xvmuldp 58, 58, 33 \n\t" + "xvmuldp 59, 59, 33 \n\t" + "xvmuldp 60, 60, 33 \n\t" + "xvmuldp 61, 61, 33 \n\t" + "xvmuldp 62, 62, 33 \n\t" + "xvmuldp 63, 63, 33 \n\t" + + "xvadddp 48, 48 , 56 \n\t" + "xvadddp 49, 49 , 57 \n\t" + "xvadddp 50, 50 , 58 \n\t" + "xvadddp 51, 51 , 59 \n\t" + "xvadddp 52, 52 , 60 \n\t" + "xvadddp 53, 53 , 61 \n\t" + "xvadddp 54, 54 , 62 \n\t" + "xvadddp 55, 55 , 63 \n\t" + + "stxvd2x 48, 0, %1 \n\t" + "stxvd2x 49, %5, %1 \n\t" + "stxvd2x 50, %6, %1 \n\t" + "stxvd2x 51, %7, %1 \n\t" + "stxvd2x 52, %8, %1 \n\t" + "stxvd2x 53, %9, %1 \n\t" + "stxvd2x 54, %10, %1 \n\t" + "stxvd2x 55, %11, %1 \n\t" + + + : + : + "r" (i), // 0 + "r" (x2), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c new file mode 100644 index 000000000..5ec1eee2e --- /dev/null +++ b/kernel/power/zswap.c @@ -0,0 +1,175 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + + +#if defined(POWER8) +#include "zswap_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT g0, g1, g2, g3, g4, g5, g6, g7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + zswap_kernel_16(n1, x, y); + i=n1; + ix = 2* n1; + iy = 2* n1; + } + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += 2 ; + iy += 2 ; + i++ ; + + + } + + + } + else + { + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/zswap_microk_power8.c b/kernel/power/zswap_microk_power8.c new file mode 100644 index 000000000..9e5623752 --- /dev/null +++ b/kernel/power/zswap_microk_power8.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void zswap_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void zswap_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *x2=x+1; + FLOAT *y2=y+1; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "addi %3, %3, -8 \n\t" + "addi %4, %4, -8 \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %5, %2 \n\t" + "lxvd2x 34, %6, %2 \n\t" + "lxvd2x 35, %7, %2 \n\t" + "lxvd2x 36, %8, %2 \n\t" + "lxvd2x 37, %9, %2 \n\t" + "lxvd2x 38, %10, %2 \n\t" + "lxvd2x 39, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvd2x 48, 0, %1 \n\t" + "lxvd2x 49, %5, %1 \n\t" + "lxvd2x 50, %6, %1 \n\t" + "lxvd2x 51, %7, %1 \n\t" + "lxvd2x 52, %8, %1 \n\t" + "lxvd2x 53, %9, %1 \n\t" + "lxvd2x 54, %10, %1 \n\t" + "lxvd2x 55, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "lxvd2x 56, 0, %1 \n\t" + "lxvd2x 57, %5, %1 \n\t" + "lxvd2x 58, %6, %1 \n\t" + "lxvd2x 59, %7, %1 \n\t" + "lxvd2x 60, %8, %1 \n\t" + "lxvd2x 61, %9, %1 \n\t" + "lxvd2x 62, %10, %1 \n\t" + "lxvd2x 63, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" + + "addi %3, %3, 128 \n\t" + + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n\t" + + "addi %3, %3, 128 \n\t" + + "stxvd2x 48, 0, %4 \n\t" + "stxvd2x 49, %5, %4 \n\t" + "stxvd2x 50, %6, %4 \n\t" + "stxvd2x 51, %7, %4 \n\t" + "stxvd2x 52, %8, %4 \n\t" + "stxvd2x 53, %9, %4 \n\t" + "stxvd2x 54, %10, %4 \n\t" + "stxvd2x 55, %11, %4 \n\t" + + "addi %4, %4, 128 \n\t" + + "stxvd2x 56, 0, %4 \n\t" + "stxvd2x 57, %5, %4 \n\t" + "stxvd2x 58, %6, %4 \n\t" + "stxvd2x 59, %7, %4 \n\t" + "stxvd2x 60, %8, %4 \n\t" + "stxvd2x 61, %9, %4 \n\t" + "stxvd2x 62, %10, %4 \n\t" + "stxvd2x 63, %11, %4 \n\t" + + "addi %4, %4, 128 \n\t" + + "addic. 
%0 , %0 , -16 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (y2), // 3 + "r" (x2), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory" + ); + +} + + diff --git a/kernel/power/ztrmm_kernel_8x2_power8.S b/kernel/power/ztrmm_kernel_8x2_power8.S index 8b953765e..0cfe613d5 100644 --- a/kernel/power/ztrmm_kernel_8x2_power8.S +++ b/kernel/power/ztrmm_kernel_8x2_power8.S @@ -271,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#include "zgemm_macros_8x2_power8.S" +#include "ztrmm_macros_8x2_power8.S" cmpwi cr0, M, 0 ble .L999 diff --git a/kernel/power/ztrmm_macros_8x2_power8.S b/kernel/power/ztrmm_macros_8x2_power8.S new file mode 100644 index 000000000..701ec65c8 --- /dev/null +++ b/kernel/power/ztrmm_macros_8x2_power8.S @@ -0,0 +1,3110 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xsadddp + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xsadddp + +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xssubdp + +#else // CC || CR || RC || RR + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xssubdp + +#endif + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + xvmuldp vs48, vs0, vs18 // real*real, imag*real + xvmuldp vs49, vs0, vs19 // real*imag, imag*imag + xvmuldp vs50, vs1, vs18 // 
real*real, imag*real + xvmuldp vs51, vs1, vs19 // real*imag, imag*imag + xvmuldp vs52, vs2, vs18 // real*real, imag*real + xvmuldp vs53, vs2, vs19 // real*imag, imag*imag + xvmuldp vs54, vs3, vs18 // real*real, imag*real + xvmuldp vs55, vs3, vs19 // real*imag, imag*imag + xvmuldp vs56, vs4, vs18 // real*real, imag*real + xvmuldp vs57, vs4, vs19 // real*imag, imag*imag + xvmuldp vs58, vs5, vs18 // real*real, imag*real + xvmuldp vs59, vs5, vs19 // real*imag, imag*imag + xvmuldp vs60, vs6, vs18 // real*real, imag*real + xvmuldp vs61, vs6, vs19 // real*imag, imag*imag + xvmuldp vs62, vs7, vs18 // real*real, imag*real + xvmuldp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_1 + + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + + addi AO, AO, 64 + + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL2x8_2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, 
vs21 // real*imag, imag*imag + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + addi AO, AO, 64 + + xvmaddadp vs48, vs8, vs22 // real*real, imag*real + xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs50, vs9, vs22 // real*real, imag*real + xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + + xvmaddadp vs52, vs10, vs22 // real*real, imag*real + xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs54, vs11, vs22 // real*real, imag*real + xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag + + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + xvmaddadp vs56, vs12, vs22 // real*real, imag*real + xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag + xvmaddadp vs58, vs13, vs22 // real*real, imag*real + xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag + + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + xvmaddadp vs60, vs14, vs22 // real*real, imag*real + xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag + xvmaddadp vs62, vs15, vs22 // real*real, imag*real + xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + xvmaddadp vs48, vs8, vs22 // real*real, imag*real + xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs50, vs9, vs22 // real*real, imag*real + xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs52, vs10, vs22 // real*real, imag*real + xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs54, vs11, vs22 // real*real, imag*real + xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag + xvmaddadp vs56, vs12, vs22 // real*real, imag*real + xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag + xvmaddadp vs58, vs13, vs22 // real*real, imag*real + xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag + xvmaddadp vs60, vs14, vs22 // real*real, imag*real + xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag + xvmaddadp vs62, vs15, vs22 // real*real, imag*real + xvmaddadp vs63, vs15, vs23 // 
real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + xvmuldp vs48, vs0, vs18 // real*real, imag*real + xvmuldp vs49, vs0, vs19 // real*imag, imag*imag + xvmuldp vs50, vs1, vs18 // real*real, imag*real + xvmuldp vs51, vs1, vs19 // real*imag, imag*imag + xvmuldp vs52, vs2, vs18 // real*real, imag*real + xvmuldp vs53, vs2, vs19 // real*imag, imag*imag + xvmuldp vs54, vs3, vs18 // real*real, imag*real + xvmuldp vs55, vs3, vs19 // real*imag, imag*imag + xvmuldp vs56, vs4, vs18 // real*real, imag*real + xvmuldp vs57, vs4, vs19 // real*imag, imag*imag + xvmuldp vs58, vs5, vs18 // real*real, imag*real + xvmuldp vs59, vs5, vs19 // real*imag, imag*imag + xvmuldp vs60, vs6, vs18 // real*real, imag*real + xvmuldp vs61, vs6, vs19 // real*imag, imag*imag + xvmuldp vs62, vs7, vs18 // real*real, imag*real + xvmuldp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, 
vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x8 + + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp 
vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + 
XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, vs47 // imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs49, vs49 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs48 // realA*realB + XSFADD_R2 vs0, vs0, vs49 // imagA*imagB + + xxswapd vs48, vs48 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs49, vs49 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs48 // realA*imagB + XSFADD_I2 vs1, vs1, vs49 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs51, vs51 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs50 // realA*realB + XSFADD_R2 vs0, vs0, vs51 // imagA*imagB + + xxswapd vs50, vs50 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs51, vs51 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs50 // realA*imagB + XSFADD_I2 vs1, vs1, vs51 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs53, vs53 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs52 // realA*realB + XSFADD_R2 vs0, vs0, vs53 // imagA*imagB + + xxswapd vs52, vs52 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs53, vs53 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs52 // realA*imagB + XSFADD_I2 vs1, vs1, vs53 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // 
real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs55, vs55 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs54 // realA*realB + XSFADD_R2 vs0, vs0, vs55 // imagA*imagB + + xxswapd vs54, vs54 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs55, vs55 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs54 // realA*imagB + XSFADD_I2 vs1, vs1, vs55 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs57, vs57 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs56 // realA*realB + XSFADD_R2 vs0, vs0, vs57 // imagA*imagB + + xxswapd vs56, vs56 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs57, vs57 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs56 // realA*imagB + XSFADD_I2 vs1, vs1, vs57 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs59, vs59 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs58 // realA*realB + XSFADD_R2 vs0, vs0, vs59 // imagA*imagB + + xxswapd vs58, vs58 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs59, vs59 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs58 // realA*imagB + XSFADD_I2 vs1, vs1, vs59 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs61, vs61 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs60 // realA*realB + XSFADD_R2 vs0, vs0, vs61 // imagA*imagB + + xxswapd vs60, vs60 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs61, vs61 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs60 // realA*imagB + XSFADD_I2 vs1, vs1, vs61 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs63, vs63 // realA*imagB, imagA*imagB -> 
imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs62 // realA*realB + XSFADD_R2 vs0, vs0, vs63 // imagA*imagB + + xxswapd vs62, vs62 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs63, vs63 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs62 // realA*imagB + XSFADD_I2 vs1, vs1, vs63 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + addi CO, CO, 128 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + xvmuldp vs40, vs0, vs18 // real*real, imag*real + xvmuldp vs41, vs0, vs19 // real*imag, imag*imag + xvmuldp vs42, vs1, vs18 // real*real, imag*real + xvmuldp vs43, vs1, vs19 // real*imag, imag*imag + xvmuldp vs44, vs2, vs18 // real*real, imag*real + xvmuldp vs45, vs2, vs19 // real*imag, imag*imag + xvmuldp vs46, vs3, vs18 // real*real, imag*real + xvmuldp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag 
part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, 
imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + xvmuldp vs40, vs0, vs18 // real*real, imag*real + xvmuldp vs41, vs0, vs19 // real*imag, imag*imag + xvmuldp vs42, vs1, vs18 // real*real, imag*real + xvmuldp vs43, vs1, vs19 // real*imag, imag*imag + xvmuldp vs44, vs2, vs18 // real*real, imag*real + xvmuldp vs45, vs2, vs19 // real*imag, imag*imag + xvmuldp vs46, vs3, vs18 // real*real, imag*real + xvmuldp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x4 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, 
alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, 
alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, vs47 // imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + xvmuldp vs36, vs0, vs18 // real*real, imag*real + xvmuldp vs37, vs0, vs19 // real*imag, imag*imag + xvmuldp vs38, vs1, vs18 // real*real, imag*real + xvmuldp vs39, vs1, vs19 // real*imag, imag*imag + + 
+.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + xvmaddadp vs36, vs8, vs22 // real*real, imag*real + xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs38, vs9, vs22 // real*real, imag*real + xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + xvmaddadp vs36, vs8, vs22 // real*real, imag*real + xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs38, vs9, vs22 // real*real, imag*real + xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + xvmuldp vs36, vs0, vs18 // real*real, imag*real + xvmuldp vs37, vs0, vs19 // real*imag, imag*imag + xvmuldp vs38, vs1, vs18 // real*real, imag*real + xvmuldp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, 
imag*imag + + +.endm + +.macro SAVE2x2 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge 
real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + +.endm + +.macro KERNEL2x1_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + xvmuldp vs34, vs0, vs18 // real*real, imag*real + xvmuldp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_2 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + xvmaddadp vs34, vs8, vs22 // real*real, imag*real + xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + xvmaddadp vs34, vs8, vs22 // real*real, imag*real + xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + xvmuldp vs34, vs0, vs18 // real*real, imag*real + xvmuldp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, 
vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x1 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + +#endif + + stxvd2x vs8, o0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + +#endif + + stxvd2x vs8, o0, T1 + + add T1, T1, LDC + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, 
vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp 
vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, 
imag*imag + + +.endm + +.macro SAVE1x8 + + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + 
xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, vs47 // imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + 
stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + addi CO, CO, 128 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x 
vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x4 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, 
realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp 
vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x2 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + +.endm + +.macro KERNEL1x1_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO 
// load real part from B
+	lxvdsx	vs21,	o8,	BO		// load imag part from B
+
+	addi	BO, BO, 16
+
+	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
+	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_1
+
+	lxvd2x	vs8,	o0,	AO		// load real,imag from A
+
+	addi	AO, AO, 16
+
+	lxvdsx	vs20,	o0,	BO		// load real part from B
+	lxvdsx	vs21,	o8,	BO		// load imag part from B
+
+	addi	BO, BO, 16
+
+	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
+	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_2
+
+	lxvd2x	vs0,	o0,	AO		// load real,imag from A
+
+	addi	AO, AO, 16
+
+	lxvdsx	vs16,	o0,	BO		// load real part from B
+	lxvdsx	vs17,	o8,	BO		// load imag part from B
+
+	addi	BO, BO, 16
+
+	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
+	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
+	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+	lxvd2x	vs0,	o0,	AO		// load real,imag from A
+
+	addi	AO, AO, 16
+
+	lxvdsx	vs16,	o0,	BO		// load real part from B
+	lxvdsx	vs17,	o8,	BO		// load imag part from B
+
+	addi	BO, BO, 16
+
+	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
+	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+	lxvd2x	vs0,	o0,	AO		// load real,imag from A
+
+	addi	AO, AO, 16
+
+	lxvdsx	vs16,	o0,	BO		// load real part from B
+	lxvdsx	vs17,	o8,	BO		// load imag part from B
+
+	addi	BO, BO, 16
+
+	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
+	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x1
+
+
+	mr	T1,	CO
+
+#ifndef TRMMKERNEL
+
+	lxvd2x	vs16,	o0,	T1
+
+#endif
+
+
+	xxlxor	vs0,	vs0,	vs0
+	xxlxor	vs1,	vs1,	vs1
+	xxswapd	vs33,	vs33		// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
+	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB
+
+	xxswapd	vs32,	vs32		// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	xxswapd	vs33,	vs33		// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
+	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB
+
+	xsmuldp	vs4,	vs0,	alpha_r		// real*alpha_r
+	xsmuldp	vs5,	vs1,	alpha_i		// imag*alpha_i
+	xsmuldp	vs6,	vs0,	alpha_i		// real*alpha_i
+	xsmuldp	vs7,	vs1,	alpha_r		// imag*alpha_r
+
+	xssubdp	vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
+	xsadddp	vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
+	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+	xvadddp	vs8,	vs8,	vs16
+
+#endif
+
+	stxvd2x	vs8,	o0,	T1
+
+	add	T1, T1, LDC
+	addi	CO, CO, 16
+
+.endm
+
diff --git a/kernel/x86_64/KERNEL.STEAMROLLER b/kernel/x86_64/KERNEL.STEAMROLLER
index f14c82303..4ec748284 100644
--- a/kernel/x86_64/KERNEL.STEAMROLLER
+++ b/kernel/x86_64/KERNEL.STEAMROLLER
@@ -24,7 +24,7 @@ SGEMVTKERNEL = sgemv_t_4.c
 DGEMVNKERNEL = dgemv_n_4.c
 DGEMVTKERNEL = dgemv_t_4.c
 
-ZGEMVNKERNEL = zgemv_t_4.c
+ZGEMVNKERNEL = zgemv_n_4.c
 ZGEMVTKERNEL = zgemv_t_4.c
 
 DCOPYKERNEL = dcopy_bulldozer.S
diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c
index a6da1fea7..a3d20d276 100644
--- a/kernel/x86_64/sdot.c
+++ b/kernel/x86_64/sdot.c
@@ -72,18 +72,20 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
 	BLASLONG i=0;
 	BLASLONG ix=0,iy=0;
+	double dot = 0.0 ;
 
-	FLOAT  dot = 0.0 ;
+	FLOAT  mydot=0.0;
+	BLASLONG n1;
 
 	if ( n <= 0 ) return(dot);
 
 	if ( (inc_x == 1) && (inc_y == 1) )
 	{
 
-		BLASLONG n1 = n & -32;
+		n1 = n & (BLASLONG)(-32);
 
 		if ( n1 )
-			sdot_kernel_16(n1, x, y , &dot );
+			sdot_kernel_16(n1, x, y , &mydot );
 
 		i = n1;
 
@@ -94,12 +96,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 			i++ ;
 
 		}
 
+		dot+=mydot;
 		return(dot);
 
 	}
 
-	BLASLONG n1 = n & -2;
+	n1 = n & (BLASLONG)(-2);
 
 	while(i < n1)
 	{
 
@@ -124,4 +127,3 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 
 }
 
-
diff --git a/param.h b/param.h
index 31125d8e4..a6ead4b64 100644
--- a/param.h
+++ b/param.h
@@ -1961,35 +1961,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(POWER8)
 
-#define SNUMOPT		4
+#define SNUMOPT		16
 #define DNUMOPT		8
 
-#define GEMM_DEFAULT_OFFSET_A  384
-#define GEMM_DEFAULT_OFFSET_B 1024
+#define GEMM_DEFAULT_OFFSET_A 4096
+#define GEMM_DEFAULT_OFFSET_B 4096
 #define GEMM_DEFAULT_ALIGN 0x03fffUL
 
-#define SGEMM_DEFAULT_UNROLL_M 4
-#define SGEMM_DEFAULT_UNROLL_N 4
+#define SGEMM_DEFAULT_UNROLL_M 16
+#define SGEMM_DEFAULT_UNROLL_N 8
 #define DGEMM_DEFAULT_UNROLL_M 16
 #define DGEMM_DEFAULT_UNROLL_N 4
-#define CGEMM_DEFAULT_UNROLL_M 2
-#define CGEMM_DEFAULT_UNROLL_N 2
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_N 4
 #define ZGEMM_DEFAULT_UNROLL_M 8
 #define ZGEMM_DEFAULT_UNROLL_N 2
 
-#define SGEMM_DEFAULT_P 992
+#define SGEMM_DEFAULT_P 960
 #define DGEMM_DEFAULT_P 480
-#define CGEMM_DEFAULT_P 488
-#define ZGEMM_DEFAULT_P 240
+#define CGEMM_DEFAULT_P 720
+#define ZGEMM_DEFAULT_P 480
 
-#define SGEMM_DEFAULT_Q 504
+#define SGEMM_DEFAULT_Q 720
 #define DGEMM_DEFAULT_Q 720
-#define CGEMM_DEFAULT_Q 400
-#define ZGEMM_DEFAULT_Q 360
+#define CGEMM_DEFAULT_Q 720
+#define ZGEMM_DEFAULT_Q 720
 
-#define SGEMM_DEFAULT_R 28800
+#define SGEMM_DEFAULT_R 21600
 #define DGEMM_DEFAULT_R 14400
-#define ZGEMM_DEFAULT_R 7200
+#define CGEMM_DEFAULT_R 16200
+#define ZGEMM_DEFAULT_R 21600
 
 #define SYMV_P 8
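For reference, the per-element update performed by the SAVE1x*/SAVE2x* macros above can be written in scalar C roughly as follows. This is an editorial sketch of the non-conjugated case only, with invented names; it is not code from the patch, and the XSFADD_R*/XSFADD_I* helpers choose different signs for the conjugated kernel variants.

/* Sketch: combine the accumulated partial products of one complex C element,
 * scale by alpha, then either accumulate into C (the #ifndef TRMMKERNEL path)
 * or overwrite it (the TRMM path).  acc_rr = sum(realA*realB),
 * acc_ir = sum(imagA*realB), acc_ri = sum(realA*imagB),
 * acc_ii = sum(imagA*imagB).  All names here are illustrative. */
static void zgemm_save_element(double *c, double acc_rr, double acc_ir,
                               double acc_ri, double acc_ii,
                               double alpha_r, double alpha_i, int add_to_c)
{
    double re = acc_rr - acc_ii;              /* realA*realB - imagA*imagB */
    double im = acc_ri + acc_ir;              /* realA*imagB + imagA*realB */
    double out_re = re * alpha_r - im * alpha_i;
    double out_im = re * alpha_i + im * alpha_r;

    if (add_to_c) { c[0] += out_re; c[1] += out_im; }
    else          { c[0]  = out_re; c[1]  = out_im; }
}

The sdot.c change above keeps the final sum in a double and folds in the single-precision partial sum produced by the unrolled kernel afterwards, presumably to reduce rounding error in the scalar tail. A minimal scalar model of the new unit-stride path, with the SIMD call sdot_kernel_16 replaced by a plain loop and BLASLONG/FLOAT replaced by long/float for illustration, is:

/* Sketch only: mirrors the control flow of the patched function for
 * inc_x == inc_y == 1; the real code calls sdot_kernel_16 for the
 * first n1 elements instead of the first loop below. */
static float sdot_sketch(long n, const float *x, const float *y)
{
    double dot   = 0.0;          /* final sum, kept in double as in the patch */
    float  mydot = 0.0f;         /* partial sum the unrolled kernel would fill */
    long   n1    = n & -32;      /* multiple-of-32 block handled by the kernel */
    long   i;

    for (i = 0; i < n1; i++)     /* stands in for sdot_kernel_16()             */
        mydot += x[i] * y[i];

    for (; i < n; i++)           /* scalar tail, accumulated into the double   */
        dot += y[i] * x[i];

    dot += mydot;
    return (float)dot;
}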