From 3e8d6ea74f02be704d2f151231906061f695ab8a Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 3 Nov 2015 12:25:05 +0800 Subject: [PATCH 01/37] Init POWER8 kernels by POWER6. --- cpuid_power.c | 6 +++- getarch.c | 13 +++++++++ kernel/power/KERNEL.POWER8 | 56 +++++++++++++++++++++++++++++++++++++ kernel/power/gemm_ncopy_4.S | 5 ++++ kernel/power/gemm_tcopy_4.S | 5 ++++ kernel/power/gemv_n.S | 5 ++++ kernel/power/gemv_t.S | 5 ++++ kernel/power/symv_L.S | 6 +++- kernel/power/symv_U.S | 6 +++- kernel/power/zgemv_n.S | 5 ++++ kernel/power/zgemv_t.S | 5 ++++ kernel/power/zsymv_L.S | 6 +++- kernel/power/zsymv_U.S | 6 +++- param.h | 32 +++++++++++++++++++++ 14 files changed, 156 insertions(+), 5 deletions(-) create mode 100644 kernel/power/KERNEL.POWER8 diff --git a/cpuid_power.c b/cpuid_power.c index 366c6ed08..6790076f6 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -55,6 +55,7 @@ #define CPUTYPE_POWER6 5 #define CPUTYPE_CELL 6 #define CPUTYPE_PPCG4 7 +#define CPUTYPE_POWER8 8 char *cpuname[] = { "UNKNOWN", @@ -65,6 +66,7 @@ char *cpuname[] = { "POWER6", "CELL", "PPCG4", + "POWER8", }; char *lowercpuname[] = { @@ -76,6 +78,7 @@ char *lowercpuname[] = { "power6", "cell", "ppcg4", + "power8", }; char *corename[] = { @@ -87,6 +90,7 @@ char *corename[] = { "POWER6", "CELL", "PPCG4", + "POWER8", }; int detect(void){ @@ -115,7 +119,7 @@ int detect(void){ if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; - if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER6; + if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; diff --git a/getarch.c b/getarch.c index fb80a4c9b..ff607a4a5 100644 --- a/getarch.c +++ b/getarch.c @@ -565,6 +565,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "POWER6" #endif +#if defined(FORCE_POWER8) +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "POWER8" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPOWER8 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "power8" +#define CORENAME "POWER8" +#endif + #ifdef FORCE_PPCG4 #define FORCE #define ARCHITECTURE "POWER" diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 new file mode 100644 index 000000000..344b205fe --- /dev/null +++ b/kernel/power/KERNEL.POWER8 @@ -0,0 +1,56 @@ +SGEMMKERNEL = gemm_kernel_power6.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = gemm_ncopy_4.S +SGEMMOTCOPY = gemm_tcopy_4.S +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_power6.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_power6.S +CGEMMINCOPY = ../generic/zgemm_ncopy_2.c +CGEMMITCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_power6.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_power6_LN.S +STRSMKERNEL_LT = trsm_kernel_power6_LT.S +STRSMKERNEL_RN = trsm_kernel_power6_LT.S +STRSMKERNEL_RT = trsm_kernel_power6_RT.S + +DTRSMKERNEL_LN = trsm_kernel_power6_LN.S +DTRSMKERNEL_LT = trsm_kernel_power6_LT.S +DTRSMKERNEL_RN = trsm_kernel_power6_LT.S +DTRSMKERNEL_RT = trsm_kernel_power6_RT.S + +CTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S +CTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S +CTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S +CTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S +ZTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S diff --git a/kernel/power/gemm_ncopy_4.S b/kernel/power/gemm_ncopy_4.S index a4dcc49c1..d7cfe5e97 100644 --- a/kernel/power/gemm_ncopy_4.S +++ b/kernel/power/gemm_ncopy_4.S @@ -104,6 +104,11 @@ #define PREFETCHWSIZE 72 #endif +#ifdef POWER8 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 72 +#endif + #ifdef PPCG4 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 72 diff --git a/kernel/power/gemm_tcopy_4.S b/kernel/power/gemm_tcopy_4.S index 1b6af4801..46b1cd941 100644 --- a/kernel/power/gemm_tcopy_4.S +++ b/kernel/power/gemm_tcopy_4.S @@ -108,6 +108,11 @@ #define PREFETCHWSIZE 48 #endif +#ifdef POWER8 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 48 +#endif + #ifdef PPCG4 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 48 diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S index 77587ecb1..5c46c43e2 100644 --- a/kernel/power/gemv_n.S +++ 
b/kernel/power/gemv_n.S @@ -174,6 +174,11 @@ #define PREFETCHSIZE_C 40 #endif +#ifdef POWER8 +#define PREFETCHSIZE_A 96 +#define PREFETCHSIZE_C 40 +#endif + #ifndef NEEDPARAM #ifndef __64BIT__ diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S index 817a60b86..457753065 100644 --- a/kernel/power/gemv_t.S +++ b/kernel/power/gemv_t.S @@ -139,6 +139,11 @@ #define PREFETCHSIZE_C 8 #endif +#ifdef POWER8 +#define PREFETCHSIZE_A 96 +#define PREFETCHSIZE_C 8 +#endif + #define y01 f0 #define y02 f1 #define y03 f2 diff --git a/kernel/power/symv_L.S b/kernel/power/symv_L.S index f7d768c50..9f759c3f6 100644 --- a/kernel/power/symv_L.S +++ b/kernel/power/symv_L.S @@ -168,7 +168,11 @@ #define PREFETCHSIZE_A 40 #endif -#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) +#ifdef POWER8 +#define PREFETCHSIZE_A 40 +#endif + +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8) #define NOP1 #define NOP2 #else diff --git a/kernel/power/symv_U.S b/kernel/power/symv_U.S index d8e082397..e4e419baf 100644 --- a/kernel/power/symv_U.S +++ b/kernel/power/symv_U.S @@ -167,7 +167,11 @@ #define PREFETCHSIZE_A 40 #endif -#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) +#ifdef POWER8 +#define PREFETCHSIZE_A 40 +#endif + +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8) #define NOP1 #define NOP2 #else diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S index 23e0177c0..f93439986 100644 --- a/kernel/power/zgemv_n.S +++ b/kernel/power/zgemv_n.S @@ -170,6 +170,11 @@ #define PREFETCHSIZE_C 24 #endif +#ifdef POWER8 +#define PREFETCHSIZE_A 24 +#define PREFETCHSIZE_C 24 +#endif + #ifndef XCONJ #define FMADDR FMADD #define FMSUBR FNMSUB diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S index c0bad3152..2b4501434 100644 --- a/kernel/power/zgemv_t.S +++ b/kernel/power/zgemv_t.S @@ -144,6 +144,11 @@ #define PREFETCHSIZE_C 8 #endif +#ifdef POWER8 +#define PREFETCHSIZE_A 24 +#define PREFETCHSIZE_C 8 +#endif + #if !(defined(CONJ) && defined(XCONJ)) #define FMADDR FMADD #define FMSUBR FNMSUB diff --git a/kernel/power/zsymv_L.S b/kernel/power/zsymv_L.S index b348e328f..394c030fa 100644 --- a/kernel/power/zsymv_L.S +++ b/kernel/power/zsymv_L.S @@ -169,7 +169,11 @@ #define PREFETCHSIZE_A 112 #endif -#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) +#ifdef POWER8 +#define PREFETCHSIZE_A 112 +#endif + +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8) #define NOP1 #define NOP2 #else diff --git a/kernel/power/zsymv_U.S b/kernel/power/zsymv_U.S index b631cbe35..a061cd77b 100644 --- a/kernel/power/zsymv_U.S +++ b/kernel/power/zsymv_U.S @@ -166,7 +166,11 @@ #define PREFETCHSIZE_A 112 #endif -#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) +#ifdef POWER8 +#define PREFETCHSIZE_A 112 +#endif + +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8) #define NOP1 #define NOP2 #else diff --git a/param.h b/param.h index 962f80ef3..c46a1e999 100644 --- a/param.h +++ b/param.h @@ -1959,6 +1959,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif +#if defined(POWER8) + +#define SNUMOPT 4 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 384 +#define GEMM_DEFAULT_OFFSET_B 1024 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 992 +#define DGEMM_DEFAULT_P 480 +#define CGEMM_DEFAULT_P 488 +#define ZGEMM_DEFAULT_P 248 + +#define SGEMM_DEFAULT_Q 504 +#define DGEMM_DEFAULT_Q 504 +#define CGEMM_DEFAULT_Q 400 +#define ZGEMM_DEFAULT_Q 400 + +#define SYMV_P 8 + +#endif + #if defined(SPARC) && defined(V7) #define SNUMOPT 4 From 6b85dbb6dcf09261568d2297833f94150ab4c9e7 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 24 Feb 2016 14:18:39 -0500 Subject: [PATCH 02/37] Refs #696. Turn off stack limit setting on Linux. I cannot reproduce SEGFAULT of lapack-test with default stack size on ARM Linux. --- driver/others/memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 46623a52e..e0761d784 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1365,7 +1365,8 @@ void CONSTRUCTOR gotoblas_init(void) { gotoblas_memory_init(); #endif -#if defined(OS_LINUX) +//#if defined(OS_LINUX) +#if 0 struct rlimit curlimit; if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 ) { From f68141cf1d9181432ba0bd0806f45985543f2b19 Mon Sep 17 00:00:00 2001 From: Petr Cerny Date: Sat, 27 Feb 2016 16:57:22 +0100 Subject: [PATCH 03/37] collected usage notes --- USAGE.md | 199 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100644 USAGE.md diff --git a/USAGE.md b/USAGE.md new file mode 100644 index 000000000..c76ceb324 --- /dev/null +++ b/USAGE.md @@ -0,0 +1,199 @@ +# Notes on OpenBLAS usage +## Usage + +#### Program is Terminated. Because you tried to allocate too many memory regions + +In OpenBLAS, we mange a pool of memory buffers and allocate the number of +buffers as the following. +``` +#define NUM_BUFFERS (MAX_CPU_NUMBER * 2) +``` +This error indicates that the program exceeded the number of buffers. + +Please build OpenBLAS with larger `NUM_THREADS`. For example, `make +NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set +`MAX_CPU_NUMBER=NUM_THREADS`. + +#### How can I use OpenBLAS in multi-threaded applications? + +If your application is already multi-threaded, it will conflict with OpenBLAS +multi-threading. Thus, you must set OpenBLAS to use single thread in any of the +following ways: + +* `export OPENBLAS_NUM_THREADS=1` in the environment variables. +* Call `openblas_set_num_threads(1)` in the application on runtime. +* Build OpenBLAS single thread version, e.g. `make USE_THREAD=0` + +If the application is parallelized by OpenMP, please use OpenBLAS built with +`USE_OPENMP=1` + +#### How to choose TARGET manually at runtime when compiled with DYNAMIC_ARCH + +The environment variable which control the kernel selection is +`OPENBLAS_CORETYPE` (see `driver/others/dynamic.c`) e.g. `export +OPENBLAS_CORETYPE=Haswell` and the function `char* openblas_get_corename()` +returns the used target. + +#### How could I disable OpenBLAS threading affinity on runtime? + +You can define the `OPENBLAS_MAIN_FREE` or `GOTOBLAS_MAIN_FREE` environment +variable to disable threading affinity on runtime. 
For example, before the +running, +``` +export OPENBLAS_MAIN_FREE=1 +``` + +Alternatively, you can disable affinity feature with enabling `NO_AFFINITY=1` +in `Makefile.rule`. + +## Linking with the library + +* Link with shared library + +`gcc -o test test.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas` + +If the library is multithreaded, please add `-lpthread`. If the library +contains LAPACK functions, please add `-lgfortran` or other Fortran libs. + +* Link with static library + +`gcc -o test test.c /your/path/libopenblas.a` + +You can download `test.c` from https://gist.github.com/xianyi/5780018 + +On Linux, if OpenBLAS was compiled with threading support (`USE_THREAD=1` by +default), custom programs statically linked against `libopenblas.a` should also +link with the pthread library e.g.: + +``` +gcc -static -I/opt/OpenBLAS/include -L/opt/OpenBLAS/lib -o my_program my_program.c -lopenblas -lpthread +``` + +Failing to add the `-lpthread` flag will cause errors such as: + +``` +/opt/OpenBLAS/libopenblas.a(memory.o): In function `_touch_memory': +memory.c:(.text+0x15): undefined reference to `pthread_mutex_lock' +memory.c:(.text+0x41): undefined reference to `pthread_mutex_unlock' +... +``` + +## Code examples + +#### Call CBLAS interface +This example shows calling cblas_dgemm in C. https://gist.github.com/xianyi/6930656 +``` +#include +#include + +void main() +{ + int i=0; + double A[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0}; + double B[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0}; + double C[9] = {.5,.5,.5,.5,.5,.5,.5,.5,.5}; + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans,3,3,2,1,A, 3, B, 3,2,C,3); + + for(i=0; i<9; i++) + printf("%lf ", C[i]); + printf("\n"); +} +``` +`gcc -o test_cblas_open test_cblas_dgemm.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas -lpthread -lgfortran` + +#### Call BLAS Fortran interface + +This example shows calling dgemm Fortran interface in C. https://gist.github.com/xianyi/5780018 + +``` +#include "stdio.h" +#include "stdlib.h" +#include "sys/time.h" +#include "time.h" + +extern void dgemm_(char*, char*, int*, int*,int*, double*, double*, int*, double*, int*, double*, double*, int*); + +int main(int argc, char* argv[]) +{ + int i; + printf("test!\n"); + if(argc<4){ + printf("Input Error\n"); + return 1; + } + + int m = atoi(argv[1]); + int n = atoi(argv[2]); + int k = atoi(argv[3]); + int sizeofa = m * k; + int sizeofb = k * n; + int sizeofc = m * n; + char ta = 'N'; + char tb = 'N'; + double alpha = 1.2; + double beta = 0.001; + + struct timeval start,finish; + double duration; + + double* A = (double*)malloc(sizeof(double) * sizeofa); + double* B = (double*)malloc(sizeof(double) * sizeofb); + double* C = (double*)malloc(sizeof(double) * sizeofc); + + srand((unsigned)time(NULL)); + + for (i=0; i ` + +## Troubleshooting +* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first. +* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. +* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. +* The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1. +* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. 
But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). +* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. + +## BLAS reference manual +If you want to understand every BLAS function and definition, please read +[Intel MKL reference manual](https://software.intel.com/sites/products/documentation/doclib/iss/2013/mkl/mklman/GUID-F7ED9FB8-6663-4F44-A62B-61B63C4F0491.htm) +or [netlib.org](http://netlib.org/blas/) + +Here are [OpenBLAS extension functions](https://github.com/xianyi/OpenBLAS/wiki/OpenBLAS-Extensions) + +## How to reference OpenBLAS. + +You can reference our [papers](https://github.com/xianyi/OpenBLAS/wiki/publications). + +Alternatively, you can cite the OpenBLAS homepage http://www.openblas.net directly. + From efa4f5c936b6dcf61809c5068ca6362632643b56 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 1 Mar 2016 11:18:56 +0800 Subject: [PATCH 04/37] Refs #695 #783. Replace default x86_64 cgemv_t asm kernel by C kernel. --- kernel/x86_64/KERNEL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 2dcc8658b..56850f791 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -393,7 +393,7 @@ CGEMVNKERNEL = cgemv_n.S endif ifndef CGEMVTKERNEL -CGEMVTKERNEL = cgemv_t.S +CGEMVTKERNEL = ../arm/zgemv_t.c endif ifndef ZGEMVNKERNEL From 4fc8c937d4de4d1702f5255a8fe69401f613b9e2 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 1 Mar 2016 01:05:56 -0500 Subject: [PATCH 05/37] Refs #695 add testcase. --- utest/CMakeLists.txt | 7 +++ utest/Makefile | 4 ++ utest/ctest.h | 11 ++++- utest/openblas_utest.h | 1 + utest/test_potrs.c | 96 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 118 insertions(+), 1 deletion(-) create mode 100644 utest/test_potrs.c diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 6c7788d97..dfa42df67 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -5,6 +5,13 @@ set(OpenBLAS_utest_src test_amax.c ) +if (NOT NO_LAPACK) +set(OpenBLAS_utest_src + ${OpenBLAS_utest_src} + test_potrs.c + ) +endif() + set(OpenBLAS_utest_bin openblas_utest) add_executable(${OpenBLAS_utest_bin} ${OpenBLAS_utest_src}) diff --git a/utest/Makefile b/utest/Makefile index 716b1c784..9f9808920 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -11,6 +11,10 @@ include $(TOPDIR)/Makefile.system OBJS=utest_main.o test_amax.o #test_rot.o test_swap.o test_axpy.o test_dotu.o test_rotmg.o test_dsdot.o test_fork.o +ifneq ($(NO_LAPACK), 1) +OBJS += test_potrs.o +endif + all : run_test $(UTESTBIN): $(OBJS) diff --git a/utest/ctest.h b/utest/ctest.h index 01c50f73b..6d859bc4f 100644 --- a/utest/ctest.h +++ b/utest/ctest.h @@ -1,4 +1,4 @@ -/* Copyright 2011-2015 Bas van den Berg +/* Copyright 2011-2016 Bas van den Berg * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
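/* Note on the utest/ctest.h hunk that follows: it extends the bundled ctest
   framework with an interval assertion. As a usage sketch with hypothetical
   values, ASSERT_INTERVAL(1, 4, n) passes only when 1 <= n <= 4; otherwise
   assert_interval() reports the expected range and the actual value through
   CTEST_ERR. */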
@@ -212,6 +212,9 @@ void assert_not_equal(intmax_t exp, intmax_t real, const char* caller, int line) void assert_not_equal_u(uintmax_t exp, uintmax_t real, const char* caller, int line); #define ASSERT_NOT_EQUAL_U(exp, real) assert_not_equal_u(exp, real, __FILE__, __LINE__) +void assert_interval(intmax_t exp1, intmax_t exp2, intmax_t real, const char* caller, int line); +#define ASSERT_INTERVAL(exp1, exp2, real) assert_interval(exp1, exp2, real, __FILE__, __LINE__) + void assert_null(void* real, const char* caller, int line); #define ASSERT_NULL(real) assert_null((void*)real, __FILE__, __LINE__) @@ -511,6 +514,12 @@ void assert_not_equal_u(uintmax_t exp, uintmax_t real, const char* caller, int l } } +void assert_interval(intmax_t exp1, intmax_t exp2, intmax_t real, const char* caller, int line) { + if (real < exp1 || real > exp2) { + CTEST_ERR("%s:%d expected %" PRIdMAX "-%" PRIdMAX ", got %" PRIdMAX, caller, line, exp1, exp2, real); + } +} + void assert_dbl_near(double exp, double real, double tol, const char* caller, int line) { double diff = exp - real; double absdiff = diff; diff --git a/utest/openblas_utest.h b/utest/openblas_utest.h index fb70fdc27..abe381a92 100644 --- a/utest/openblas_utest.h +++ b/utest/openblas_utest.h @@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ctest.h" #include +#include #define SINGLE_EPS 1e-04 #define DOUBLE_EPS 1e-13 diff --git a/utest/test_potrs.c b/utest/test_potrs.c new file mode 100644 index 000000000..41b3f6492 --- /dev/null +++ b/utest/test_potrs.c @@ -0,0 +1,96 @@ +/***************************************************************************** +Copyright (c) 2011-2016, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +**********************************************************************************/ + +#include "openblas_utest.h" + +/* +void BLASFUNC(cpotrf)(char*, BLASINT*, complex float*, BLASINT*, BLASINT*); +void BLASFUNC(zpotrs_(char*, BLASINT*, BLASINT*, complex double*, + BLASINT*, complex double*, BLASINT*, BLASINT*); +*/ + + +//https://github.com/xianyi/OpenBLAS/issues/695 +CTEST(potrf, bug_695){ + + openblas_complex_float A1[100] = {5.8525753+0.0*I, -0.79540455-0.7066077*I, 0.98274714-1.3824869*I, 2.619998-1.8532984*I, -1.8306153+1.2336911*I, 0.32275113-0.015575029*I, 2.1968813-1.0640624*I, 0.27894387-0.97911835*I, 3.0476584-0.18548489*I, 0.3842994-0.7050991*I, + -0.79540455+0.7066077*I, 8.313246+0.0*I, -1.8076122+0.8882447*I, 0.47806996-0.48494184*I, 0.5096429+0.5395974*I, -0.7285097+0.10360408*I, -1.1760061+2.7146957*I, -0.4271084-0.042899966*I, -1.7228563-2.8335886*I, 1.8942566-0.6389735*I, + 0.98274714+1.3824869*I, -1.8076122-0.8882447*I, 9.367975+0.0*I, -0.1838578-0.6468568*I, -1.8338387-0.7064959*I, 0.041852742+0.6556877*I, 2.5673025-1.9732997*I, -1.1148382+0.15693812*I, 2.4704504+1.0389464*I, 1.0858271+1.298006*I, + 2.619998+1.8532984*I, 0.47806996+0.48494184*I, -0.1838578+0.6468568*I, 3.1117508+0.0*I, -1.956626-0.22825956*I, 0.07081801+0.31801307*I, 0.3698375+0.5400855*I, 0.80686307-1.5315914*I, 1.5649154+1.6229297*I, -0.112077385-1.2014246*I, + -1.8306153-1.2336911*I, 0.5096429-0.5395974*I, -1.8338387+0.7064959*I, -1.956626+0.22825956*I, 3.6439795+0.0*I, -0.2594722-0.48786148*I, -0.47636223+0.27821827*I, -0.61608654+2.01858*I, -2.7767487-1.7693765*I, 0.048102796+0.9741874*I, + 0.32275113+0.015575029*I, -0.7285097-0.10360408*I, 0.041852742-0.6556877*I, 0.07081801-0.31801307*I, -0.2594722+0.48786148*I, 3.624376+0.0*I, -1.6697118-0.4017511*I, -1.4397877+0.7550918*I, -0.31456697+1.0403451*I, -0.31978557-0.13701046*I, + 2.1968813+1.0640624*I, -1.1760061-2.7146957*I, 2.5673025+1.9732997*I, 0.3698375-0.5400855*I, -0.47636223-0.27821827*I, -1.6697118+0.4017511*I, 6.8273163+0.0*I, -0.10051322-0.24303961*I, 1.4415971-0.29750675*I, 1.221786+0.85654986*I, + 0.27894387+0.97911835*I, -0.4271084+0.042899966*I, -1.1148382-0.15693812*I, 0.80686307+1.5315914*I, -0.61608654-2.01858*I, -1.4397877-0.7550918*I, -0.10051322+0.24303961*I, 3.4057708+0.0*I, -0.5856801+1.0203559*I, 0.7103452-0.8422135*I, + 3.0476584+0.18548489*I, -1.7228563+2.8335886*I, 2.4704504-1.0389464*I, 1.5649154-1.6229297*I, -2.7767487+1.7693765*I, -0.31456697-1.0403451*I, 1.4415971+0.29750675*I, -0.5856801-1.0203559*I, 7.005772+0.0*I, -0.9617417+1.2486815*I, + 0.3842994+0.7050991*I, 1.8942566+0.6389735*I, 1.0858271-1.298006*I, -0.112077385+1.2014246*I, 0.048102796-0.9741874*I, -0.31978557+0.13701046*I, 1.221786-0.85654986*I, 0.7103452+0.8422135*I, -0.9617417-1.2486815*I, 3.4629636+0.0*I}; + char up = 'U'; + + blasint n=10; + blasint info[1]; + BLASFUNC(cpotrf)(&up, &n, (float*)(A1), &n, info); + //printf("%g+%g*I\n", creal(A1[91]), cimag(A1[91])); + + openblas_complex_double A2[100] = {3.0607147216796875+0.0*I, -0.5905849933624268-0.29020825028419495*I, 0.321084201335907+0.45168760418891907*I, 0.8387917876243591-0.644718587398529*I, -0.3642411530017853+0.051274992525577545*I, 0.8071482181549072+0.33944568037986755*I, 0.013674172572791576+0.21422699093818665*I, 0.35476258397102356+0.42408594489097595*I, -0.5991537570953369-0.23082709312438965*I, -0.0600702166557312-0.2113417387008667*I, + -0.7954045534133911+0.7066076993942261*I, 2.807175397872925+0.0*I, -0.1691000759601593+0.313548743724823*I, 
-0.30911174416542053+0.7447023987770081*I, -0.22347848117351532+0.03316075727343559*I, -0.4088296890258789-1.0214389562606812*I, -0.2344931811094284+0.08056317269802094*I, 0.793269693851471-0.17507623136043549*I, 0.03163455054163933+0.20559945702552795*I, 0.13581633567810059-0.2110036462545395*I, + 0.9827471375465393+1.3824869394302368*I, -1.8076121807098389-0.8882446885108948*I, 2.3277781009674072+0.0*I, 0.830405056476593-0.19296252727508545*I, 0.1394239068031311-0.5260677933692932*I, 1.239942193031311-0.09915469586849213*I, 0.06731037050485611-0.059320636093616486*I, 0.11507681757211685-0.1984301060438156*I, -0.6843825578689575+0.4647614359855652*I, 1.213119387626648-0.7757048010826111*I, + 2.619997978210449+1.8532984256744385*I, 0.4780699610710144+0.48494184017181396*I, -0.18385779857635498+0.6468567848205566*I, 2.0811400413513184+0.0*I, -0.035075582563877106+0.09732913225889206*I, 0.27337002754211426-0.9032229781150818*I, -0.8374675512313843+0.0479498989880085*I, 0.6916252374649048+0.45711082220077515*I, 0.1883818507194519+0.06482727080583572*I, -0.32384994626045227+0.05857187137007713*I, + -1.8306152820587158-1.2336910963058472*I, 0.5096428990364075-0.5395973920822144*I, -1.833838701248169+0.7064958810806274*I, -1.956626057624817+0.22825956344604492*I, 1.706615924835205+0.0*I, -0.2895336151123047+0.17579378187656403*I, -0.923172116279602-0.4530014097690582*I, 0.5040621757507324-0.37026339769363403*I, -0.2824432849884033-1.0374568700790405*I, 0.1399831622838974+0.4977008104324341*I, + 0.32275113463401794+0.015575028955936432*I, -0.7285097241401672-0.10360407829284668*I, 0.041852742433547974-0.655687689781189*I, 0.07081800699234009-0.318013072013855*I, -0.25947219133377075+0.4878614842891693*I, 1.5735365152359009+0.0*I, -0.2647853195667267-0.26654252409935*I, -0.6190430521965027-0.24699924886226654*I, -0.6288471221923828+0.48154571652412415*I, 0.02446540631353855-0.2611822783946991*I, + 2.1968812942504883+1.0640623569488525*I, -1.1760060787200928-2.714695692062378*I, 2.5673024654388428+1.9732997417449951*I, 0.3698374927043915-0.54008549451828*I, -0.4763622283935547-0.27821826934814453*I, -1.6697118282318115+0.4017511010169983*I, 1.2674795389175415+0.0*I, 0.3079095482826233-0.07258892804384232*I, -0.5929520130157471-0.038360968232154846*I, 0.04388086497783661-0.025549031794071198*I, + 0.27894386649131775+0.9791183471679688*I, -0.42710840702056885+0.0428999662399292*I, -1.1148382425308228-0.1569381207227707*I, 0.8068630695343018+1.5315914154052734*I, -0.6160865426063538-2.0185799598693848*I, -1.439787745475769-0.7550917863845825*I, -0.10051321983337402+0.24303960800170898*I, 0.9066106081008911+0.0*I, 0.05315789580345154-0.06136537343263626*I, -0.21304509043693542+0.6494344472885132*I, + 3.0476584434509277+0.1854848861694336*I, -1.7228562831878662+2.8335886001586914*I, 2.4704504013061523-1.0389463901519775*I, 1.564915418624878-1.6229296922683716*I, -2.7767486572265625+1.769376516342163*I, -0.314566969871521-1.0403450727462769*I, 1.4415971040725708+0.29750674962997437*I, -0.5856801271438599-1.0203559398651123*I, 0.5668219923973083+0.0*I, 0.033351436257362366-0.07832501083612442*I, + 0.3842993974685669+0.7050991058349609*I, 1.894256591796875+0.6389734745025635*I, 1.085827112197876-1.2980060577392578*I, -0.11207738518714905+1.2014245986938477*I, 0.04810279607772827-0.9741873741149902*I, -0.31978556513786316+0.13701045513153076*I, 1.2217860221862793-0.856549859046936*I, 0.7103452086448669+0.84221351146698*I, -0.9617416858673096-1.2486815452575684*I, 0.0756804421544075+0.0*I}; + 
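+    /* A2 is used below as the already-factorized matrix argument to zpotrs
+       (uplo = 'L'); B supplies the right-hand sides (n = 10, nrhs = 2). */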
openblas_complex_double B[20] = {-0.21782716937787788-0.9222220085490986*I, -0.7620356655676837+0.15533508334193666*I, -0.905011814118756+0.2847570854574069*I, -0.3451346708401685+1.076948486041297*I, 0.25336108035924787+0.975317836492159*I, 0.11192755545114-0.1603741874112385*I, -0.20604111555491242+0.10570814584017311*I, -1.0568488936791578-0.06025820467086475*I, -0.6650468984506477-0.5000967284800251*I, -1.0509472322215125+0.5022165705328413*I, + -0.727775859267237+0.50638268521728*I, 0.39947219167701153-0.4576746001199889*I, -0.7122162951294634-0.630289556702497*I, 0.9870834574024372-0.2825689605519449*I, 0.0628393808469436-0.1253397353973715*I, 0.8439562576196216+1.0850814110398734*I, 0.562377322638969-0.2578030745663871*I, 0.12696236014017806-0.09853584666755086*I, -0.023682508769195098+0.18093440285319276*I, -0.7264975746431271+0.31670415674097235*I}; + char lo = 'L'; + blasint nrhs = 2; + BLASFUNC(zpotrs)(&lo, &n, &nrhs, (double*)(A2), &n, (double*)(B), &n, info); + + // note that this is exactly equal to A1 + openblas_complex_float A3[100] = {5.8525753+0.0*I, -0.79540455-0.7066077*I, 0.98274714-1.3824869*I, 2.619998-1.8532984*I, -1.8306153+1.2336911*I, 0.32275113-0.015575029*I, 2.1968813-1.0640624*I, 0.27894387-0.97911835*I, 3.0476584-0.18548489*I, 0.3842994-0.7050991*I, + -0.79540455+0.7066077*I, 8.313246+0.0*I, -1.8076122+0.8882447*I, 0.47806996-0.48494184*I, 0.5096429+0.5395974*I, -0.7285097+0.10360408*I, -1.1760061+2.7146957*I, -0.4271084-0.042899966*I, -1.7228563-2.8335886*I, 1.8942566-0.6389735*I, + 0.98274714+1.3824869*I, -1.8076122-0.8882447*I, 9.367975+0.0*I, -0.1838578-0.6468568*I, -1.8338387-0.7064959*I, 0.041852742+0.6556877*I, 2.5673025-1.9732997*I, -1.1148382+0.15693812*I, 2.4704504+1.0389464*I, 1.0858271+1.298006*I, + 2.619998+1.8532984*I, 0.47806996+0.48494184*I, -0.1838578+0.6468568*I, 3.1117508+0.0*I, -1.956626-0.22825956*I, 0.07081801+0.31801307*I, 0.3698375+0.5400855*I, 0.80686307-1.5315914*I, 1.5649154+1.6229297*I, -0.112077385-1.2014246*I, + -1.8306153-1.2336911*I, 0.5096429-0.5395974*I, -1.8338387+0.7064959*I, -1.956626+0.22825956*I, 3.6439795+0.0*I, -0.2594722-0.48786148*I, -0.47636223+0.27821827*I, -0.61608654+2.01858*I, -2.7767487-1.7693765*I, 0.048102796+0.9741874*I, + 0.32275113+0.015575029*I, -0.7285097-0.10360408*I, 0.041852742-0.6556877*I, 0.07081801-0.31801307*I, -0.2594722+0.48786148*I, 3.624376+0.0*I, -1.6697118-0.4017511*I, -1.4397877+0.7550918*I, -0.31456697+1.0403451*I, -0.31978557-0.13701046*I, + 2.1968813+1.0640624*I, -1.1760061-2.7146957*I, 2.5673025+1.9732997*I, 0.3698375-0.5400855*I, -0.47636223-0.27821827*I, -1.6697118+0.4017511*I, 6.8273163+0.0*I, -0.10051322-0.24303961*I, 1.4415971-0.29750675*I, 1.221786+0.85654986*I, + 0.27894387+0.97911835*I, -0.4271084+0.042899966*I, -1.1148382-0.15693812*I, 0.80686307+1.5315914*I, -0.61608654-2.01858*I, -1.4397877-0.7550918*I, -0.10051322+0.24303961*I, 3.4057708+0.0*I, -0.5856801+1.0203559*I, 0.7103452-0.8422135*I, + 3.0476584+0.18548489*I, -1.7228563+2.8335886*I, 2.4704504-1.0389464*I, 1.5649154-1.6229297*I, -2.7767487+1.7693765*I, -0.31456697-1.0403451*I, 1.4415971+0.29750675*I, -0.5856801-1.0203559*I, 7.005772+0.0*I, -0.9617417+1.2486815*I, + 0.3842994+0.7050991*I, 1.8942566+0.6389735*I, 1.0858271-1.298006*I, -0.112077385+1.2014246*I, 0.048102796-0.9741874*I, -0.31978557+0.13701046*I, 1.221786-0.85654986*I, 0.7103452+0.8422135*I, -0.9617417-1.2486815*I, 3.4629636+0.0*I}; + + BLASFUNC(cpotrf)(&up, &n, (float*)(A3), &n, info); + // printf("%g+%g*I\n", creal(A3[91]), cimag(A3[91])); + 
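+    /* Regression check for issue #695: the second cpotrf on A3 (identical to
+       A1) must not have produced NaN entries in the returned factor. */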
if(isnan(CREAL(A3[91])) || isnan(CIMAG(A3[91]))) { + CTEST_ERR("%s:%d got NaN", __FILE__, __LINE__); + } +} From b752858d6c37c0aa393c4a0636d3cda2ff2da179 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 1 Mar 2016 07:33:56 +0100 Subject: [PATCH 06/37] added dgemm-, dtrmm-, zgemm- and ztrmm-kernel for power8 --- common_power.h | 9 +- cpuid_power.c | 6 +- getarch.c | 5 +- kernel/Makefile.L3 | 5 + kernel/power/KERNEL | 54 - kernel/power/KERNEL.POWER8 | 217 +- kernel/power/def_vsx.h | 64 + kernel/power/dgemm_kernel_16x4_power8.S | 313 +++ kernel/power/dgemm_logic_16x4_power8.S | 1647 +++++++++++ kernel/power/dgemm_macros_16x4_power8.S | 3400 +++++++++++++++++++++++ kernel/power/dtrmm_kernel_16x4_power8.S | 327 +++ kernel/power/dtrmm_logic_16x4_power8.S | 2202 +++++++++++++++ kernel/power/gemm_ncopy_4.S | 6 +- kernel/power/gemm_tcopy_4.S | 6 +- kernel/power/gemv_n.S | 5 - kernel/power/gemv_t.S | 5 - kernel/power/symv_L.S | 6 +- kernel/power/symv_U.S | 6 +- kernel/power/zgemm_kernel_8x2_power8.S | 332 +++ kernel/power/zgemm_logic_8x2_power8.S | 901 ++++++ kernel/power/zgemm_macros_8x2_power8.S | 3074 ++++++++++++++++++++ kernel/power/zgemv_n.S | 5 - kernel/power/zgemv_t.S | 5 - kernel/power/zsymv_L.S | 6 +- kernel/power/zsymv_U.S | 6 +- kernel/power/ztrmm_kernel_8x2_power8.S | 342 +++ kernel/power/ztrmm_logic_8x2_power8.S | 1201 ++++++++ param.h | 24 +- 28 files changed, 14013 insertions(+), 166 deletions(-) create mode 100644 kernel/power/def_vsx.h create mode 100644 kernel/power/dgemm_kernel_16x4_power8.S create mode 100644 kernel/power/dgemm_logic_16x4_power8.S create mode 100644 kernel/power/dgemm_macros_16x4_power8.S create mode 100644 kernel/power/dtrmm_kernel_16x4_power8.S create mode 100644 kernel/power/dtrmm_logic_16x4_power8.S create mode 100644 kernel/power/zgemm_kernel_8x2_power8.S create mode 100644 kernel/power/zgemm_logic_8x2_power8.S create mode 100644 kernel/power/zgemm_macros_8x2_power8.S create mode 100644 kernel/power/ztrmm_kernel_8x2_power8.S create mode 100644 kernel/power/ztrmm_logic_8x2_power8.S diff --git a/common_power.h b/common_power.h index ab331b04a..64e052f3d 100644 --- a/common_power.h +++ b/common_power.h @@ -236,7 +236,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HAVE_PREFETCH #endif -#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) +#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) #define DCBT_ARG 0 #else #define DCBT_ARG 8 @@ -258,6 +258,13 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define L1_PREFETCH dcbtst #endif +#if defined(POWER8) +#define L1_DUALFETCH +#define L1_PREFETCHSIZE (16 + 128 * 100) +#define L1_PREFETCH dcbtst +#endif + +# #ifndef L1_PREFETCH #define L1_PREFETCH dcbt #endif diff --git a/cpuid_power.c b/cpuid_power.c index 6790076f6..951204ae9 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -66,7 +66,7 @@ char *cpuname[] = { "POWER6", "CELL", "PPCG4", - "POWER8", + "POWER8" }; char *lowercpuname[] = { @@ -78,7 +78,7 @@ char *lowercpuname[] = { "power6", "cell", "ppcg4", - "power8", + "power8" }; char *corename[] = { @@ -90,7 +90,7 @@ char *corename[] = { "POWER6", "CELL", "PPCG4", - "POWER8", + "POWER8" }; int detect(void){ diff --git a/getarch.c b/getarch.c index ff607a4a5..f9c49e663 100644 --- a/getarch.c +++ b/getarch.c @@ -552,7 +552,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "POWER5" #endif -#if defined(FORCE_POWER6) || defined(FORCE_POWER7) || defined(FORCE_POWER8) +#if defined(FORCE_POWER6) || defined(FORCE_POWER7) #define FORCE #define ARCHITECTURE "POWER" #define SUBARCHITECTURE "POWER6" @@ -565,7 +565,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "POWER6" #endif -#if defined(FORCE_POWER8) +#if defined(FORCE_POWER8) #define FORCE #define ARCHITECTURE "POWER" #define SUBARCHITECTURE "POWER8" @@ -578,6 +578,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "POWER8" #endif + #ifdef FORCE_PPCG4 #define FORCE #define ARCHITECTURE "POWER" diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 63e675b8d..8e6827424 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -36,6 +36,11 @@ ifeq ($(CORE), HASWELL) USE_TRMM = 1 endif +ifeq ($(CORE), POWER8) +USE_TRMM = 1 +endif + + SKERNELOBJS += \ diff --git a/kernel/power/KERNEL b/kernel/power/KERNEL index cb9ed848b..eae60cdcc 100644 --- a/kernel/power/KERNEL +++ b/kernel/power/KERNEL @@ -1,57 +1,3 @@ -SGEMM_BETA = gemm_beta.S -DGEMM_BETA = gemm_beta.S -CGEMM_BETA = zgemm_beta.S -ZGEMM_BETA = zgemm_beta.S - - -ifndef SSYMV_U_KERNEL -SSYMV_U_KERNEL = symv_U.S -endif - -ifndef SSYMV_L_KERNEL -SSYMV_L_KERNEL = symv_L.S -endif - -ifndef DSYMV_U_KERNEL -DSYMV_U_KERNEL = symv_U.S -endif - -ifndef DSYMV_L_KERNEL -DSYMV_L_KERNEL = symv_L.S -endif - -ifndef CSYMV_U_KERNEL -CSYMV_U_KERNEL = zsymv_U.S -endif - -ifndef CSYMV_L_KERNEL -CSYMV_L_KERNEL = zsymv_L.S -endif - -ifndef ZSYMV_U_KERNEL -ZSYMV_U_KERNEL = zsymv_U.S -endif - -ifndef ZSYMV_L_KERNEL -ZSYMV_L_KERNEL = zsymv_L.S -endif - -ifndef CHEMV_U_KERNEL -CHEMV_U_KERNEL = zsymv_U.S -endif - -ifndef CHEMV_L_KERNEL -CHEMV_L_KERNEL = zsymv_L.S -endif - -ifndef ZHEMV_U_KERNEL -ZHEMV_U_KERNEL = zsymv_U.S -endif - -ifndef ZHEMV_L_KERNEL -ZHEMV_L_KERNEL = zsymv_L.S -endif - ifndef STRSMKERNEL_LN STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c endif diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 344b205fe..3a627e441 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -1,56 +1,173 @@ -SGEMMKERNEL = gemm_kernel_power6.S -SGEMMINCOPY = -SGEMMITCOPY = -SGEMMONCOPY = gemm_ncopy_4.S -SGEMMOTCOPY = gemm_tcopy_4.S -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = gemm_kernel_power6.S -DGEMMINCOPY = -DGEMMITCOPY = +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = dtrmm_kernel_16x4_power8.S +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = dgemm_kernel_16x4_power8.S +DGEMMINCOPY = ../generic/gemm_ncopy_16.c +DGEMMITCOPY = ../generic/gemm_tcopy_16.c DGEMMONCOPY = gemm_ncopy_4.S DGEMMOTCOPY = gemm_tcopy_4.S -DGEMMINCOPYOBJ = -DGEMMITCOPYOBJ = -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -CGEMMKERNEL = zgemm_kernel_power6.S -CGEMMINCOPY = ../generic/zgemm_ncopy_2.c -CGEMMITCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_4.c -CGEMMOTCOPY = 
../generic/zgemm_tcopy_4.c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_power6.S -ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o -STRSMKERNEL_LN = trsm_kernel_power6_LN.S -STRSMKERNEL_LT = trsm_kernel_power6_LT.S -STRSMKERNEL_RN = trsm_kernel_power6_LT.S -STRSMKERNEL_RT = trsm_kernel_power6_RT.S +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o -DTRSMKERNEL_LN = trsm_kernel_power6_LN.S -DTRSMKERNEL_LT = trsm_kernel_power6_LT.S -DTRSMKERNEL_RN = trsm_kernel_power6_LT.S -DTRSMKERNEL_RT = trsm_kernel_power6_RT.S +ZGEMMKERNEL = zgemm_kernel_8x2_power8.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMINCOPYOBJ = zgemm_incopy.o +ZGEMMITCOPYOBJ = zgemm_itcopy.o -CTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S -CTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S -CTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S -CTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -ZTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S -ZTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S -ZTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S -ZTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Todo: CGEMM3MKERNEL should be 4x4 blocksizes. 
+CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S + +#Pure C for other kernels +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +QSYMV_U_KERNEL = ../generic/symv_k.c +QSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c +XSYMV_U_KERNEL = ../generic/zsymv_k.c +XSYMV_L_KERNEL = ../generic/zsymv_k.c + +ZHEMV_U_KERNEL = ../generic/zhemv_k.c +ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +LSAME_KERNEL = ../generic/lsame.c +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +#Dump kernel +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c diff --git a/kernel/power/def_vsx.h b/kernel/power/def_vsx.h new file mode 100644 index 000000000..c2d29e268 --- /dev/null +++ b/kernel/power/def_vsx.h @@ -0,0 +1,64 @@ +#define vs0 0 +#define vs1 1 +#define vs2 2 +#define vs3 3 +#define vs4 4 +#define vs5 5 +#define vs6 6 +#define vs7 7 +#define vs8 8 +#define vs9 9 +#define vs10 10 +#define vs11 11 +#define vs12 12 +#define vs13 13 +#define vs14 14 +#define vs15 15 +#define vs16 16 +#define vs17 17 +#define vs18 18 +#define vs19 19 +#define vs20 20 +#define vs21 21 +#define vs22 22 +#define vs23 23 +#define vs24 24 +#define vs25 25 +#define vs26 26 +#define vs27 27 +#define vs28 28 +#define vs29 29 +#define vs30 30 +#define vs31 31 +#define vs32 
32 +#define vs33 33 +#define vs34 34 +#define vs35 35 +#define vs36 36 +#define vs37 37 +#define vs38 38 +#define vs39 39 +#define vs40 40 +#define vs41 41 +#define vs42 42 +#define vs43 43 +#define vs44 44 +#define vs45 45 +#define vs46 46 +#define vs47 47 +#define vs48 48 +#define vs49 49 +#define vs50 50 +#define vs51 51 +#define vs52 52 +#define vs53 53 +#define vs54 54 +#define vs55 55 +#define vs56 56 +#define vs57 57 +#define vs58 58 +#define vs59 59 +#define vs60 60 +#define vs61 61 +#define vs62 62 +#define vs63 63 diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S new file mode 100644 index 000000000..53205ade8 --- /dev/null +++ b/kernel/power/dgemm_kernel_16x4_power8.S @@ -0,0 +1,313 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_SP 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA_SP 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define alpha_r vs18 + +#define o0 0 + +#define o8 r15 +#define o24 r16 +#define ALPHA r17 +#define L r18 +#define T1 r19 +#define KK r20 +#define BB r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "dgemm_macros_16x4_power8.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) +#endif + + stfd f1, ALPHA_SP + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif +#endif + + + cmpwi cr0, M, 0 + ble L999_H1 + cmpwi cr0, N, 0 + ble L999_H1 + cmpwi cr0, K, 0 + ble L999_H1 + +#ifdef __64BIT__ + addi ALPHA, SP, 296 +#else + addi ALPHA, SP, 224 +#endif + + li PRE, 256 + li o8 , 8 + li o16, 16 + li o24, 24 + li o32, 32 + li o48, 48 + + lxvdsx alpha_r, 0, ALPHA + +#include "dgemm_logic_16x4_power8.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + 
lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/dgemm_logic_16x4_power8.S b/kernel/power/dgemm_logic_16x4_power8.S new file mode 100644 index 000000000..e19f78b8d --- /dev/null +++ b/kernel/power/dgemm_logic_16x4_power8.S @@ -0,0 +1,1647 @@ + srawi. J, N, 2 + ble DGEMM_L4_END + +DGEMM_L4_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + srawi. I, M, 4 + ble DGEMM_L4x16_END + +DGEMM_L4x16_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble DGEMM_L4x16_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L4x16_SUB4 + +DGEMM_L4x16_LOOP_START: + + dcbt AO, PRE + LOAD4x16_1 + dcbt AO, PRE + KERNEL4x16_I1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -2 + ble DGEMM_L4x16_LOOP_END + + .align 5 + +DGEMM_L4x16_LOOP: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -1 + bgt DGEMM_L4x16_LOOP + +DGEMM_L4x16_LOOP_END: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + KERNEL4x16_E2 + + b DGEMM_L4x16_SUB1 + +DGEMM_L4x16_SUB4: + + dcbt AO, PRE + KERNEL4x16_SUBI1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + b DGEMM_L4x16_SUB1 + +DGEMM_L4x16_SUB0: + + andi. L, K, 7 + + KERNEL4x16_SUBI1 + + addic. L, L, -1 + ble DGEMM_L4x16_SAVE + b DGEMM_L4x16_SUB2 + +DGEMM_L4x16_SUB1: + + andi. L, K, 7 + ble DGEMM_L4x16_SAVE + +DGEMM_L4x16_SUB2: + + KERNEL4x16_SUB1 + + addic. L, L, -1 + bgt DGEMM_L4x16_SUB2 + +DGEMM_L4x16_SAVE: + + SAVE4x16 + + addic. I, I, -1 + bgt DGEMM_L4x16_BEGIN + +DGEMM_L4x16_END: + +DGEMM_L4x8_BEGIN: + + andi. T2, M, 15 + ble DGEMM_L4x1_END + + andi. T1, M, 8 + ble DGEMM_L4x8_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L4x8_SUB4 + +DGEMM_L4x8_LOOP_START: + + LOAD4x8_1 + KERNEL4x8_I1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -2 + ble DGEMM_L4x8_LOOP_END + + .align 5 + +DGEMM_L4x8_LOOP: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. 
L, L, -1 + bgt DGEMM_L4x8_LOOP + +DGEMM_L4x8_LOOP_END: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b DGEMM_L4x8_SUB1 + +DGEMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b DGEMM_L4x8_SUB1 + +DGEMM_L4x8_SUB0: + + andi. L, K, 7 + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble DGEMM_L4x8_SAVE + b DGEMM_L4x8_SUB2 + +DGEMM_L4x8_SUB1: + + andi. L, K, 7 + ble DGEMM_L4x8_SAVE + +DGEMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. L, L, -1 + bgt DGEMM_L4x8_SUB2 + +DGEMM_L4x8_SAVE: + + SAVE4x8 + +DGEMM_L4x8_END: + +DGEMM_L4x4_BEGIN: + + + andi. T1, M, 4 + ble DGEMM_L4x4_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L4x4_SUB4 + +DGEMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble DGEMM_L4x4_LOOP_END + + .align 5 + +DGEMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -1 + bgt DGEMM_L4x4_LOOP + +DGEMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b DGEMM_L4x4_SUB1 + +DGEMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b DGEMM_L4x4_SUB1 + +DGEMM_L4x4_SUB0: + + andi. L, K, 7 + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble DGEMM_L4x4_SAVE + b DGEMM_L4x4_SUB2 + +DGEMM_L4x4_SUB1: + + andi. L, K, 7 + ble DGEMM_L4x4_SAVE + +DGEMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt DGEMM_L4x4_SUB2 + +DGEMM_L4x4_SAVE: + + SAVE4x4 + +DGEMM_L4x4_END: + +DGEMM_L4x2_BEGIN: + + + andi. T1, M, 2 + ble DGEMM_L4x2_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L4x2_SUB4 + +DGEMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble DGEMM_L4x2_LOOP_END + + .align 5 + +DGEMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt DGEMM_L4x2_LOOP + +DGEMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b DGEMM_L4x2_SUB1 + +DGEMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b DGEMM_L4x2_SUB1 + +DGEMM_L4x2_SUB0: + + andi. L, K, 7 + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble DGEMM_L4x2_SAVE + b DGEMM_L4x2_SUB2 + +DGEMM_L4x2_SUB1: + + andi. L, K, 7 + ble DGEMM_L4x2_SAVE + +DGEMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt DGEMM_L4x2_SUB2 + +DGEMM_L4x2_SAVE: + + SAVE4x2 + +DGEMM_L4x2_END: + +DGEMM_L4x1_BEGIN: + + + andi. T1, M, 1 + ble DGEMM_L4x1_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L4x1_SUB4 + +DGEMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. 
L, L, -2 + ble DGEMM_L4x1_LOOP_END + + .align 5 + +DGEMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt DGEMM_L4x1_LOOP + +DGEMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b DGEMM_L4x1_SUB1 + +DGEMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b DGEMM_L4x1_SUB1 + +DGEMM_L4x1_SUB0: + + andi. L, K, 7 + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble DGEMM_L4x1_SAVE + b DGEMM_L4x1_SUB2 + +DGEMM_L4x1_SUB1: + + andi. L, K, 7 + ble DGEMM_L4x1_SAVE + +DGEMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt DGEMM_L4x1_SUB2 + +DGEMM_L4x1_SAVE: + + SAVE4x1 + +DGEMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 + + addic. J, J, -1 + bgt DGEMM_L4_BEGIN + + andi. T2, N, 3 + ble L999 + +DGEMM_L4_END: + + b DGEMM_L2_BEGIN + +L999_H1: + + b L999 + +DGEMM_L2_BEGIN: + + andi. T1, N, 2 + ble DGEMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 4 + ble DGEMM_L2x16_END + +DGEMM_L2x16_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble DGEMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L2x16_SUB4 + +DGEMM_L2x16_LOOP_START: + + dcbt AO, PRE + LOAD2x16_1 + dcbt AO, PRE + KERNEL2x16_I1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble DGEMM_L2x16_LOOP_END + + .align 5 + +DGEMM_L2x16_LOOP: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt DGEMM_L2x16_LOOP + +DGEMM_L2x16_LOOP_END: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + KERNEL2x16_E2 + + b DGEMM_L2x16_SUB1 + +DGEMM_L2x16_SUB4: + + dcbt AO, PRE + KERNEL2x16_SUBI1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b DGEMM_L2x16_SUB1 + +DGEMM_L2x16_SUB0: + + andi. L, K, 7 + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble DGEMM_L2x16_SAVE + b DGEMM_L2x16_SUB2 + +DGEMM_L2x16_SUB1: + + andi. L, K, 7 + ble DGEMM_L2x16_SAVE + +DGEMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. L, L, -1 + bgt DGEMM_L2x16_SUB2 + +DGEMM_L2x16_SAVE: + + SAVE2x16 + + addic. I, I, -1 + bgt DGEMM_L2x16_BEGIN + +DGEMM_L2x16_END: + +DGEMM_L2x8_BEGIN: + + andi. T2, M, 15 + ble DGEMM_L2x1_END + + andi. T1, M, 8 + ble DGEMM_L2x8_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L2x8_SUB4 + +DGEMM_L2x8_LOOP_START: + + LOAD2x8_1 + KERNEL2x8_I1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -2 + ble DGEMM_L2x8_LOOP_END + + .align 5 + +DGEMM_L2x8_LOOP: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. 
L, L, -1 + bgt DGEMM_L2x8_LOOP + +DGEMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b DGEMM_L2x8_SUB1 + +DGEMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b DGEMM_L2x8_SUB1 + +DGEMM_L2x8_SUB0: + + andi. L, K, 7 + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble DGEMM_L2x8_SAVE + b DGEMM_L2x8_SUB2 + +DGEMM_L2x8_SUB1: + + andi. L, K, 7 + ble DGEMM_L2x8_SAVE + +DGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt DGEMM_L2x8_SUB2 + +DGEMM_L2x8_SAVE: + + SAVE2x8 + +DGEMM_L2x8_END: + +DGEMM_L2x4_BEGIN: + + + andi. T1, M, 4 + ble DGEMM_L2x4_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L2x4_SUB4 + +DGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble DGEMM_L2x4_LOOP_END + + .align 5 + +DGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt DGEMM_L2x4_LOOP + +DGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b DGEMM_L2x4_SUB1 + +DGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b DGEMM_L2x4_SUB1 + +DGEMM_L2x4_SUB0: + + andi. L, K, 7 + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble DGEMM_L2x4_SAVE + b DGEMM_L2x4_SUB2 + +DGEMM_L2x4_SUB1: + + andi. L, K, 7 + ble DGEMM_L2x4_SAVE + +DGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt DGEMM_L2x4_SUB2 + +DGEMM_L2x4_SAVE: + + SAVE2x4 + +DGEMM_L2x4_END: + +DGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble DGEMM_L2x2_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L2x2_SUB4 + +DGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble DGEMM_L2x2_LOOP_END + + .align 5 + +DGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt DGEMM_L2x2_LOOP + +DGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b DGEMM_L2x2_SUB1 + +DGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b DGEMM_L2x2_SUB1 + +DGEMM_L2x2_SUB0: + + andi. L, K, 7 + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble DGEMM_L2x2_SAVE + b DGEMM_L2x2_SUB2 + +DGEMM_L2x2_SUB1: + + andi. L, K, 7 + ble DGEMM_L2x2_SAVE + +DGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt DGEMM_L2x2_SUB2 + +DGEMM_L2x2_SAVE: + + SAVE2x2 + +DGEMM_L2x2_END: + +DGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble DGEMM_L2x1_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L2x1_SUB4 + +DGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. 
L, L, -2 + ble DGEMM_L2x1_LOOP_END + + .align 5 + +DGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt DGEMM_L2x1_LOOP + +DGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b DGEMM_L2x1_SUB1 + +DGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b DGEMM_L2x1_SUB1 + +DGEMM_L2x1_SUB0: + + andi. L, K, 7 + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble DGEMM_L2x1_SAVE + b DGEMM_L2x1_SUB2 + +DGEMM_L2x1_SUB1: + + andi. L, K, 7 + ble DGEMM_L2x1_SAVE + +DGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt DGEMM_L2x1_SUB2 + +DGEMM_L2x1_SAVE: + + SAVE2x1 + +DGEMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 + +DGEMM_L2_END: +DGEMM_L1_BEGIN: + + andi. T1, N, 1 + ble DGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 4 + ble DGEMM_L1x16_END + +DGEMM_L1x16_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble DGEMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L1x16_SUB4 + +DGEMM_L1x16_LOOP_START: + + dcbt AO, PRE + LOAD1x16_1 + dcbt AO, PRE + KERNEL1x16_I1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble DGEMM_L1x16_LOOP_END + + .align 5 + +DGEMM_L1x16_LOOP: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt DGEMM_L1x16_LOOP + +DGEMM_L1x16_LOOP_END: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + KERNEL1x16_E2 + + b DGEMM_L1x16_SUB1 + +DGEMM_L1x16_SUB4: + + dcbt AO, PRE + KERNEL1x16_SUBI1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b DGEMM_L1x16_SUB1 + +DGEMM_L1x16_SUB0: + + andi. L, K, 7 + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble DGEMM_L1x16_SAVE + b DGEMM_L1x16_SUB2 + +DGEMM_L1x16_SUB1: + + andi. L, K, 7 + ble DGEMM_L1x16_SAVE + +DGEMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. L, L, -1 + bgt DGEMM_L1x16_SUB2 + +DGEMM_L1x16_SAVE: + + SAVE1x16 + + addic. I, I, -1 + bgt DGEMM_L1x16_BEGIN + +DGEMM_L1x16_END: + +DGEMM_L1x8_BEGIN: + + andi. T2, M, 15 + ble DGEMM_L1x1_END + + andi. T1, M, 8 + ble DGEMM_L1x8_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L1x8_SUB4 + +DGEMM_L1x8_LOOP_START: + + LOAD1x8_1 + KERNEL1x8_I1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -2 + ble DGEMM_L1x8_LOOP_END + + .align 5 + +DGEMM_L1x8_LOOP: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. 
L, L, -1 + bgt DGEMM_L1x8_LOOP + +DGEMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b DGEMM_L1x8_SUB1 + +DGEMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b DGEMM_L1x8_SUB1 + +DGEMM_L1x8_SUB0: + + andi. L, K, 7 + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble DGEMM_L1x8_SAVE + b DGEMM_L1x8_SUB2 + +DGEMM_L1x8_SUB1: + + andi. L, K, 7 + ble DGEMM_L1x8_SAVE + +DGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt DGEMM_L1x8_SUB2 + +DGEMM_L1x8_SAVE: + + SAVE1x8 + +DGEMM_L1x8_END: + +DGEMM_L1x4_BEGIN: + + + andi. T1, M, 4 + ble DGEMM_L1x4_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L1x4_SUB4 + +DGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble DGEMM_L1x4_LOOP_END + + .align 5 + +DGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt DGEMM_L1x4_LOOP + +DGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b DGEMM_L1x4_SUB1 + +DGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b DGEMM_L1x4_SUB1 + +DGEMM_L1x4_SUB0: + + andi. L, K, 7 + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble DGEMM_L1x4_SAVE + b DGEMM_L1x4_SUB2 + +DGEMM_L1x4_SUB1: + + andi. L, K, 7 + ble DGEMM_L1x4_SAVE + +DGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt DGEMM_L1x4_SUB2 + +DGEMM_L1x4_SAVE: + + SAVE1x4 + +DGEMM_L1x4_END: + +DGEMM_L1x2_BEGIN: + + + andi. T1, M, 2 + ble DGEMM_L1x2_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L1x2_SUB4 + +DGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble DGEMM_L1x2_LOOP_END + + .align 5 + +DGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt DGEMM_L1x2_LOOP + +DGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b DGEMM_L1x2_SUB1 + +DGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b DGEMM_L1x2_SUB1 + +DGEMM_L1x2_SUB0: + + andi. L, K, 7 + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble DGEMM_L1x2_SAVE + b DGEMM_L1x2_SUB2 + +DGEMM_L1x2_SUB1: + + andi. L, K, 7 + ble DGEMM_L1x2_SAVE + +DGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt DGEMM_L1x2_SUB2 + +DGEMM_L1x2_SAVE: + + SAVE1x2 + +DGEMM_L1x2_END: + +DGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble DGEMM_L1x1_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L1x1_SUB4 + +DGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. 
L, L, -2 + ble DGEMM_L1x1_LOOP_END + + .align 5 + +DGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt DGEMM_L1x1_LOOP + +DGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b DGEMM_L1x1_SUB1 + +DGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b DGEMM_L1x1_SUB1 + +DGEMM_L1x1_SUB0: + + andi. L, K, 7 + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble DGEMM_L1x1_SAVE + b DGEMM_L1x1_SUB2 + +DGEMM_L1x1_SUB1: + + andi. L, K, 7 + ble DGEMM_L1x1_SAVE + +DGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt DGEMM_L1x1_SUB2 + +DGEMM_L1x1_SAVE: + + SAVE1x1 + +DGEMM_L1x1_END: + +DGEMM_L1_END: diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S new file mode 100644 index 000000000..d4090985b --- /dev/null +++ b/kernel/power/dgemm_macros_16x4_power8.S @@ -0,0 +1,3400 @@ +/********************************************************************* +* Macros for N=4, M=16 * +*********************************************************************/ + +.macro LOAD4x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_I1 + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + addi AO, AO, 64 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_1 + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + addi AO, AO, 64 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, 
vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_2 + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + addi AO, AO, 64 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 + + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + xvmaddadp vs60, vs12, vs31 + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + xvmaddadp vs63, vs15, vs31 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + xvmaddadp vs60, vs12, vs31 + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + xvmaddadp vs63, vs15, vs31 + +.endm + +.macro KERNEL4x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, 
vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + +.endm + +.macro KERNEL4x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + +.endm + +.macro SAVE4x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + dcbt T1, PRE + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r +#endif + + stxvd2x vs8, 0, T1 + 
stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + dcbt T1, PRE + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r + xvmaddadp vs2, vs50, alpha_r + xvmaddadp vs3, vs51, alpha_r + xvmaddadp vs4, vs52, alpha_r + xvmaddadp vs5, vs53, alpha_r + xvmaddadp vs6, vs54, alpha_r + xvmaddadp vs7, vs55, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r + xvmuldp vs2, vs50, alpha_r + xvmuldp vs3, vs51, alpha_r + xvmuldp vs4, vs52, alpha_r + xvmuldp vs5, vs53, alpha_r + xvmuldp vs6, vs54, alpha_r + xvmuldp vs7, vs55, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + dcbt T1, PRE + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r + xvmaddadp vs10, vs58, alpha_r + xvmaddadp vs11, vs59, alpha_r + xvmaddadp vs12, vs60, alpha_r + xvmaddadp vs13, vs61, alpha_r + xvmaddadp vs14, vs62, alpha_r + xvmaddadp vs15, vs63, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r + xvmuldp vs10, vs58, alpha_r + xvmuldp vs11, vs59, alpha_r + xvmuldp vs12, vs60, alpha_r + xvmuldp vs13, vs61, alpha_r + xvmuldp vs14, vs62, alpha_r + xvmuldp vs15, vs63, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + dcbt T1, PRE + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD4x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_I1 + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_1 + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 
+ + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_2 + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + +.endm + +.macro KERNEL4x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + +.endm + +.macro KERNEL4x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + +.endm + +.macro SAVE4x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, 
LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r + xvmaddadp vs2, vs50, alpha_r + xvmaddadp vs3, vs51, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r + xvmuldp vs2, vs50, alpha_r + xvmuldp vs3, vs51, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r + xvmaddadp vs10, vs58, alpha_r + xvmaddadp vs11, vs59, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r + xvmuldp vs10, vs58, alpha_r + xvmuldp vs11, vs59, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=4, M=4 * +*********************************************************************/ + +.macro LOAD4x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + 
+ lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=4, M=2 * +*********************************************************************/ + +.macro LOAD4x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx 
vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r +#else + xvmuldp vs0, vs48, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r +#else + xvmuldp vs8, vs56, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=4, M=1 * +*********************************************************************/ + +.macro LOAD4x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL 
+ xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs48, alpha_r +#else + xsmuldp vs0, vs48, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs56, alpha_r +#else + xsmuldp vs8, vs56, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=2, M=16 * +*********************************************************************/ + +.macro LOAD2x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL2x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, 
vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro SAVE2x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + 
stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD2x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 
+#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=2, M=4 * +*********************************************************************/ + +.macro LOAD2x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=2, M=2 * +*********************************************************************/ + +.macro LOAD2x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + 
xvmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=2, M=1 * +*********************************************************************/ + +.macro LOAD2x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=1, M=16 * +*********************************************************************/ + +.macro LOAD1x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL1x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, 
BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro SAVE1x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 
0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=1, M=8 * +*********************************************************************/ + +.macro LOAD1x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=1, M=4 * +*********************************************************************/ + +.macro LOAD1x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32,
vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=1, M=2 * +*********************************************************************/ + +.macro LOAD1x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=1, M=1 * +*********************************************************************/ + +.macro LOAD1x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + addi CO, CO, 8 + +.endm + diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S new file mode 100644 index 000000000..c892c65d3 --- /dev/null +++ 
b/kernel/power/dtrmm_kernel_16x4_power8.S @@ -0,0 +1,327 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_SP 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA_SP 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define alpha_r vs18 + +#define o0 0 + +#define K1 r13 +#define KKK r14 +#define o8 r15 +#define o24 r16 +#define ALPHA r17 +#define L r18 +#define T1 r19 +#define KK r20 +#define BB r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "dgemm_macros_16x4_power8.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + std r13, 288(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) + stw r14, 212(SP) + stw r13, 216(SP) +#endif + + stfd f1, ALPHA_SP + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif +#endif + + mr KK, OFFSET +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, KK +#endif + + cmpwi cr0, M, 0 + ble L999_H1 + cmpwi cr0, N, 0 + ble L999_H1 + cmpwi cr0, K, 0 + ble L999_H1 + +#ifdef __64BIT__ + addi ALPHA, SP, 296 +#else + addi ALPHA, SP, 224 +#endif + + li PRE, 256 + li o8 , 8 + li o16, 16 + li o24, 24 + li o32, 32 + li o48, 48 + + lxvdsx alpha_r, 0, ALPHA + +#include "dtrmm_logic_16x4_power8.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 
8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + ld r13, 288(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) + lwz r14, 212(SP) + lwz r13, 216(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/dtrmm_logic_16x4_power8.S b/kernel/power/dtrmm_logic_16x4_power8.S new file mode 100644 index 000000000..f2886f8d6 --- /dev/null +++ b/kernel/power/dtrmm_logic_16x4_power8.S @@ -0,0 +1,2202 @@ + srawi. J, N, 2 + ble DTRMM_L4_END + +DTRMM_L4_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 4 + ble DTRMM_L4x16_END + +DTRMM_L4x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 7 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L4x16_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L4x16_SUB4 + +DTRMM_L4x16_LOOP_START: + + dcbt AO, PRE + LOAD4x16_1 + dcbt AO, PRE + KERNEL4x16_I1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -2 + ble DTRMM_L4x16_LOOP_END + + .align 5 + +DTRMM_L4x16_LOOP: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -1 + bgt DTRMM_L4x16_LOOP + +DTRMM_L4x16_LOOP_END: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + KERNEL4x16_E2 + + b DTRMM_L4x16_SUB1 + +DTRMM_L4x16_SUB4: + + dcbt AO, PRE + KERNEL4x16_SUBI1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + b DTRMM_L4x16_SUB1 + +DTRMM_L4x16_SUB0: + + andi. 
L, K1, 7 // K1 & 7 -> L + + KERNEL4x16_SUBI1 + + addic. L, L, -1 + ble DTRMM_L4x16_SAVE + b DTRMM_L4x16_SUB2 + +DTRMM_L4x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L4x16_SAVE + +DTRMM_L4x16_SUB2: + + KERNEL4x16_SUB1 + + addic. L, L, -1 + bgt DTRMM_L4x16_SUB2 + +DTRMM_L4x16_SAVE: + + SAVE4x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt DTRMM_L4x16_BEGIN + +DTRMM_L4x16_END: + +DTRMM_L4x8_BEGIN: + andi. T2, M, 15 + ble DTRMM_L4x1_END + + andi. T1, M, 8 + ble DTRMM_L4x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L4x8_SUB4 + +DTRMM_L4x8_LOOP_START: + + LOAD4x8_1 + KERNEL4x8_I1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -2 + ble DTRMM_L4x8_LOOP_END + + .align 5 + +DTRMM_L4x8_LOOP: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -1 + bgt DTRMM_L4x8_LOOP + +DTRMM_L4x8_LOOP_END: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b DTRMM_L4x8_SUB1 + +DTRMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b DTRMM_L4x8_SUB1 + +DTRMM_L4x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble DTRMM_L4x8_SAVE + b DTRMM_L4x8_SUB2 + +DTRMM_L4x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L4x8_SAVE + +DTRMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. L, L, -1 + bgt DTRMM_L4x8_SUB2 + +DTRMM_L4x8_SAVE: + + SAVE4x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +DTRMM_L4x8_END: + +DTRMM_L4x4_BEGIN: + + andi. 
T1, M, 4 + ble DTRMM_L4x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L4x4_SUB4 + +DTRMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble DTRMM_L4x4_LOOP_END + + .align 5 + +DTRMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -1 + bgt DTRMM_L4x4_LOOP + +DTRMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b DTRMM_L4x4_SUB1 + +DTRMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b DTRMM_L4x4_SUB1 + +DTRMM_L4x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble DTRMM_L4x4_SAVE + b DTRMM_L4x4_SUB2 + +DTRMM_L4x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L4x4_SAVE + +DTRMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt DTRMM_L4x4_SUB2 + +DTRMM_L4x4_SAVE: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +DTRMM_L4x4_END: + +DTRMM_L4x2_BEGIN: + + andi. T1, M, 2 + ble DTRMM_L4x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L4x2_SUB4 + +DTRMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble DTRMM_L4x2_LOOP_END + + .align 5 + +DTRMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. 
L, L, -1 + bgt DTRMM_L4x2_LOOP + +DTRMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b DTRMM_L4x2_SUB1 + +DTRMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b DTRMM_L4x2_SUB1 + +DTRMM_L4x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble DTRMM_L4x2_SAVE + b DTRMM_L4x2_SUB2 + +DTRMM_L4x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L4x2_SAVE + +DTRMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt DTRMM_L4x2_SUB2 + +DTRMM_L4x2_SAVE: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +DTRMM_L4x2_END: + +DTRMM_L4x1_BEGIN: + + andi. T1, M, 1 + ble DTRMM_L4x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L4x1_SUB4 + +DTRMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble DTRMM_L4x1_LOOP_END + + .align 5 + +DTRMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt DTRMM_L4x1_LOOP + +DTRMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b DTRMM_L4x1_SUB1 + +DTRMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b DTRMM_L4x1_SUB1 + +DTRMM_L4x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble DTRMM_L4x1_SAVE + b DTRMM_L4x1_SUB2 + +DTRMM_L4x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L4x1_SAVE + +DTRMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. 
L, L, -1 + bgt DTRMM_L4x1_SUB2 + +DTRMM_L4x1_SAVE: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +DTRMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 4 // KK += Number of values in B +#endif + + + addic. J, J, -1 + bgt DTRMM_L4_BEGIN + + andi. T2, N, 3 + ble L999 + +DTRMM_L4_END: + + b DTRMM_L2_BEGIN + +L999_H1: + + b L999 + +DTRMM_L2_BEGIN: + + andi. T1, N, 2 + ble DTRMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 4 + ble DTRMM_L2x16_END + +DTRMM_L2x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 7 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L2x16_SUB4 + +DTRMM_L2x16_LOOP_START: + + dcbt AO, PRE + LOAD2x16_1 + dcbt AO, PRE + KERNEL2x16_I1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble DTRMM_L2x16_LOOP_END + + .align 5 + +DTRMM_L2x16_LOOP: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt DTRMM_L2x16_LOOP + +DTRMM_L2x16_LOOP_END: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + KERNEL2x16_E2 + + b DTRMM_L2x16_SUB1 + +DTRMM_L2x16_SUB4: + + dcbt AO, PRE + KERNEL2x16_SUBI1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b DTRMM_L2x16_SUB1 + +DTRMM_L2x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble DTRMM_L2x16_SAVE + b DTRMM_L2x16_SUB2 + +DTRMM_L2x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L2x16_SAVE + +DTRMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. 
L, L, -1 + bgt DTRMM_L2x16_SUB2 + +DTRMM_L2x16_SAVE: + + SAVE2x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt DTRMM_L2x16_BEGIN + +DTRMM_L2x16_END: + +DTRMM_L2x8_BEGIN: + andi. T2, M, 15 + ble DTRMM_L2x1_END + + andi. T1, M, 8 + ble DTRMM_L2x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L2x8_SUB4 + +DTRMM_L2x8_LOOP_START: + + LOAD2x8_1 + KERNEL2x8_I1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -2 + ble DTRMM_L2x8_LOOP_END + + .align 5 + +DTRMM_L2x8_LOOP: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -1 + bgt DTRMM_L2x8_LOOP + +DTRMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b DTRMM_L2x8_SUB1 + +DTRMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b DTRMM_L2x8_SUB1 + +DTRMM_L2x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble DTRMM_L2x8_SAVE + b DTRMM_L2x8_SUB2 + +DTRMM_L2x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L2x8_SAVE + +DTRMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt DTRMM_L2x8_SUB2 + +DTRMM_L2x8_SAVE: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +DTRMM_L2x8_END: + +DTRMM_L2x4_BEGIN: + + andi. 
T1, M, 4 + ble DTRMM_L2x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L2x4_SUB4 + +DTRMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble DTRMM_L2x4_LOOP_END + + .align 5 + +DTRMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt DTRMM_L2x4_LOOP + +DTRMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b DTRMM_L2x4_SUB1 + +DTRMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b DTRMM_L2x4_SUB1 + +DTRMM_L2x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble DTRMM_L2x4_SAVE + b DTRMM_L2x4_SUB2 + +DTRMM_L2x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L2x4_SAVE + +DTRMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt DTRMM_L2x4_SUB2 + +DTRMM_L2x4_SAVE: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +DTRMM_L2x4_END: + +DTRMM_L2x2_BEGIN: + + andi. T1, M, 2 + ble DTRMM_L2x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L2x2_SUB4 + +DTRMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble DTRMM_L2x2_LOOP_END + + .align 5 + +DTRMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. 
L, L, -1 + bgt DTRMM_L2x2_LOOP + +DTRMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b DTRMM_L2x2_SUB1 + +DTRMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b DTRMM_L2x2_SUB1 + +DTRMM_L2x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble DTRMM_L2x2_SAVE + b DTRMM_L2x2_SUB2 + +DTRMM_L2x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L2x2_SAVE + +DTRMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt DTRMM_L2x2_SUB2 + +DTRMM_L2x2_SAVE: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +DTRMM_L2x2_END: + +DTRMM_L2x1_BEGIN: + + andi. T1, M, 1 + ble DTRMM_L2x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L2x1_SUB4 + +DTRMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble DTRMM_L2x1_LOOP_END + + .align 5 + +DTRMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt DTRMM_L2x1_LOOP + +DTRMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b DTRMM_L2x1_SUB1 + +DTRMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b DTRMM_L2x1_SUB1 + +DTRMM_L2x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble DTRMM_L2x1_SAVE + b DTRMM_L2x1_SUB2 + +DTRMM_L2x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L2x1_SAVE + +DTRMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. 
L, L, -1 + bgt DTRMM_L2x1_SUB2 + +DTRMM_L2x1_SAVE: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +DTRMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 2 // KK += Number of values in B +#endif + + +DTRMM_L2_END: +DTRMM_L1_BEGIN: + + andi. T1, N, 1 + ble DTRMM_L1_END + mr CO, C + mr AO, A + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 4 + ble DTRMM_L1x16_END + +DTRMM_L1x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 7 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L1x16_SUB4 + +DTRMM_L1x16_LOOP_START: + + dcbt AO, PRE + LOAD1x16_1 + dcbt AO, PRE + KERNEL1x16_I1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble DTRMM_L1x16_LOOP_END + + .align 5 + +DTRMM_L1x16_LOOP: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt DTRMM_L1x16_LOOP + +DTRMM_L1x16_LOOP_END: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + KERNEL1x16_E2 + + b DTRMM_L1x16_SUB1 + +DTRMM_L1x16_SUB4: + + dcbt AO, PRE + KERNEL1x16_SUBI1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b DTRMM_L1x16_SUB1 + +DTRMM_L1x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble DTRMM_L1x16_SAVE + b DTRMM_L1x16_SUB2 + +DTRMM_L1x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L1x16_SAVE + +DTRMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. 
L, L, -1 + bgt DTRMM_L1x16_SUB2 + +DTRMM_L1x16_SAVE: + + SAVE1x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt DTRMM_L1x16_BEGIN + +DTRMM_L1x16_END: + +DTRMM_L1x8_BEGIN: + andi. T2, M, 15 + ble DTRMM_L1x1_END + + andi. T1, M, 8 + ble DTRMM_L1x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L1x8_SUB4 + +DTRMM_L1x8_LOOP_START: + + LOAD1x8_1 + KERNEL1x8_I1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -2 + ble DTRMM_L1x8_LOOP_END + + .align 5 + +DTRMM_L1x8_LOOP: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -1 + bgt DTRMM_L1x8_LOOP + +DTRMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b DTRMM_L1x8_SUB1 + +DTRMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b DTRMM_L1x8_SUB1 + +DTRMM_L1x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble DTRMM_L1x8_SAVE + b DTRMM_L1x8_SUB2 + +DTRMM_L1x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L1x8_SAVE + +DTRMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt DTRMM_L1x8_SUB2 + +DTRMM_L1x8_SAVE: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +DTRMM_L1x8_END: + +DTRMM_L1x4_BEGIN: + + andi. 
T1, M, 4 + ble DTRMM_L1x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L1x4_SUB4 + +DTRMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble DTRMM_L1x4_LOOP_END + + .align 5 + +DTRMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt DTRMM_L1x4_LOOP + +DTRMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b DTRMM_L1x4_SUB1 + +DTRMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b DTRMM_L1x4_SUB1 + +DTRMM_L1x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble DTRMM_L1x4_SAVE + b DTRMM_L1x4_SUB2 + +DTRMM_L1x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L1x4_SAVE + +DTRMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt DTRMM_L1x4_SUB2 + +DTRMM_L1x4_SAVE: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +DTRMM_L1x4_END: + +DTRMM_L1x2_BEGIN: + + andi. T1, M, 2 + ble DTRMM_L1x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L1x2_SUB4 + +DTRMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble DTRMM_L1x2_LOOP_END + + .align 5 + +DTRMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. 
L, L, -1 + bgt DTRMM_L1x2_LOOP + +DTRMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b DTRMM_L1x2_SUB1 + +DTRMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b DTRMM_L1x2_SUB1 + +DTRMM_L1x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble DTRMM_L1x2_SAVE + b DTRMM_L1x2_SUB2 + +DTRMM_L1x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L1x2_SAVE + +DTRMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt DTRMM_L1x2_SUB2 + +DTRMM_L1x2_SAVE: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +DTRMM_L1x2_END: + +DTRMM_L1x1_BEGIN: + + andi. T1, M, 1 + ble DTRMM_L1x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L1x1_SUB4 + +DTRMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble DTRMM_L1x1_LOOP_END + + .align 5 + +DTRMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt DTRMM_L1x1_LOOP + +DTRMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b DTRMM_L1x1_SUB1 + +DTRMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b DTRMM_L1x1_SUB1 + +DTRMM_L1x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble DTRMM_L1x1_SAVE + b DTRMM_L1x1_SUB2 + +DTRMM_L1x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L1x1_SAVE + +DTRMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. 
L, L, -1 + bgt DTRMM_L1x1_SUB2 + +DTRMM_L1x1_SAVE: + + SAVE1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +DTRMM_L1x1_END: + +#if !defined(LEFT) + addi KK, KK, 1 // KK += Number of values in B +#endif + + +DTRMM_L1_END: diff --git a/kernel/power/gemm_ncopy_4.S b/kernel/power/gemm_ncopy_4.S index d7cfe5e97..c6e69b4fc 100644 --- a/kernel/power/gemm_ncopy_4.S +++ b/kernel/power/gemm_ncopy_4.S @@ -104,12 +104,12 @@ #define PREFETCHWSIZE 72 #endif -#ifdef POWER8 +#ifdef PPCG4 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 72 #endif -#ifdef PPCG4 +#ifdef POWER8 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 72 #endif @@ -198,7 +198,7 @@ LL(12): STFD c12, 14 * SIZE(B) STFD c16, 15 * SIZE(B) -#ifdef POWER6 +#if defined(POWER6) || defined(POWER8) dcbtst PREA, AO1 dcbtst PREA, AO2 dcbtst PREA, AO3 diff --git a/kernel/power/gemm_tcopy_4.S b/kernel/power/gemm_tcopy_4.S index 46b1cd941..30513447e 100644 --- a/kernel/power/gemm_tcopy_4.S +++ b/kernel/power/gemm_tcopy_4.S @@ -108,12 +108,12 @@ #define PREFETCHWSIZE 48 #endif -#ifdef POWER8 +#ifdef PPCG4 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 48 #endif -#ifdef PPCG4 +#ifdef POWER8 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 48 #endif @@ -229,7 +229,7 @@ LL(12): STFD c15, 14 * SIZE(B1) STFD c16, 15 * SIZE(B1) -#ifdef POWER6 +#if defined(POWER6) || defined(POWER8) dcbtst PREA, AO1 dcbtst PREA, AO2 dcbtst PREA, AO3 diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S index 5c46c43e2..77587ecb1 100644 --- a/kernel/power/gemv_n.S +++ b/kernel/power/gemv_n.S @@ -174,11 +174,6 @@ #define PREFETCHSIZE_C 40 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 96 -#define PREFETCHSIZE_C 40 -#endif - #ifndef NEEDPARAM #ifndef __64BIT__ diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S index 457753065..817a60b86 100644 --- a/kernel/power/gemv_t.S +++ b/kernel/power/gemv_t.S @@ -139,11 +139,6 @@ #define PREFETCHSIZE_C 8 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 96 -#define PREFETCHSIZE_C 8 -#endif - #define y01 f0 #define y02 f1 #define y03 f2 diff --git a/kernel/power/symv_L.S b/kernel/power/symv_L.S index 9f759c3f6..f7d768c50 100644 --- a/kernel/power/symv_L.S +++ b/kernel/power/symv_L.S @@ -168,11 +168,7 @@ #define PREFETCHSIZE_A 40 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 40 -#endif - -#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8) +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) #define NOP1 #define NOP2 #else diff --git a/kernel/power/symv_U.S b/kernel/power/symv_U.S index e4e419baf..d8e082397 100644 --- a/kernel/power/symv_U.S +++ b/kernel/power/symv_U.S @@ -167,11 +167,7 @@ #define PREFETCHSIZE_A 40 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 40 -#endif - -#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8) +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) #define NOP1 #define NOP2 #else diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S new file mode 100644 index 000000000..03957f406 --- /dev/null +++ 
b/kernel/power/zgemm_kernel_8x2_power8.S @@ -0,0 +1,332 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R_SP 296(SP) +#define ALPHA_I_SP 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R_SP 224(SP) +#define ALPHA_I_SP 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define o0 0 +#define alpha_r vs30 +#define alpha_i vs31 + +#define L r15 +#define ALPHA r16 +#define o24 r17 +#define T2 r19 +#define KK r20 +#define o8 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) +#endif + + stfd f1, ALPHA_R_SP + stfd f2, ALPHA_I_SP + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) +#else + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + +#include "zgemm_macros_8x2_power8.S" + + cmpwi cr0, M, 0 + ble L999 + cmpwi cr0, N, 0 + ble L999 + cmpwi cr0, K, 0 + ble L999 + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 256 + li o8 , 8 + li o16 , 16 + li o24 , 24 + li o32 , 32 + li o48 , 48 + 
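+	// o8..o48 hold constant byte offsets used as the index operand of the
+	// lxvd2x/lxvdsx/stxvd2x accesses in the included macro files, and PRE = 256
+	// is the dcbt prefetch distance used in zgemm_logic_8x2_power8.S.
+	// ALPHA points at the alpha_r/alpha_i pair spilled above, so the two
+	// lxvdsx instructions below can splat them into vs30/vs31.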
+#ifdef __64BIT__ + addi ALPHA, SP, 296 +#else + addi ALPHA, SP, 224 +#endif + + lxvdsx alpha_r, 0, ALPHA + lxvdsx alpha_i, o8, ALPHA + + .align 5 + +#include "zgemm_logic_8x2_power8.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_logic_8x2_power8.S b/kernel/power/zgemm_logic_8x2_power8.S new file mode 100644 index 000000000..e829fd68e --- /dev/null +++ b/kernel/power/zgemm_logic_8x2_power8.S @@ -0,0 +1,901 @@ + srawi. J, N, 1 + ble ZGEMM_L2_END + +ZGEMM_L2_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 3 + ble ZGEMM_L2x8_END + +ZGEMM_L2x8_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble ZGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L2x8_SUB4 + +ZGEMM_L2x8_LOOP_START: + + dcbt AO, PRE + LOAD2x8_1 + dcbt AO, PRE + KERNEL2x8_I1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -2 + ble ZGEMM_L2x8_LOOP_END + + .align 5 + +ZGEMM_L2x8_LOOP: + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -1 + bgt ZGEMM_L2x8_LOOP + +ZGEMM_L2x8_LOOP_END: + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + KERNEL2x8_E2 + + b ZGEMM_L2x8_SUB1 + +ZGEMM_L2x8_SUB4: + + dcbt AO, PRE + KERNEL2x8_SUBI1 + dcbt AO, PRE + KERNEL2x8_SUB1 + dcbt AO, PRE + KERNEL2x8_SUB1 + dcbt AO, PRE + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b ZGEMM_L2x8_SUB1 + +ZGEMM_L2x8_SUB0: + + andi. L, K, 7 + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L2x8_SAVE + b ZGEMM_L2x8_SUB2 + +ZGEMM_L2x8_SUB1: + + andi. L, K, 7 + ble ZGEMM_L2x8_SAVE + +ZGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt ZGEMM_L2x8_SUB2 + +ZGEMM_L2x8_SAVE: + + SAVE2x8 + + addic. I, I, -1 + bgt ZGEMM_L2x8_BEGIN + +ZGEMM_L2x8_END: + +ZGEMM_L2x4_BEGIN: + + andi. T2, M, 7 + ble ZGEMM_L2x1_END + + andi. T1, M, 4 + ble ZGEMM_L2x4_END + mr BO, B + srawi. 
L, K, 3 + ble ZGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L2x4_SUB4 + +ZGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble ZGEMM_L2x4_LOOP_END + + .align 5 + +ZGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt ZGEMM_L2x4_LOOP + +ZGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b ZGEMM_L2x4_SUB1 + +ZGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b ZGEMM_L2x4_SUB1 + +ZGEMM_L2x4_SUB0: + + andi. L, K, 7 + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 + +ZGEMM_L2x4_SUB1: + + andi. L, K, 7 + ble ZGEMM_L2x4_SAVE + +ZGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt ZGEMM_L2x4_SUB2 + +ZGEMM_L2x4_SAVE: + + SAVE2x4 + +ZGEMM_L2x4_END: + +ZGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble ZGEMM_L2x2_END + mr BO, B + srawi. L, K, 3 + ble ZGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L2x2_SUB4 + +ZGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble ZGEMM_L2x2_LOOP_END + + .align 5 + +ZGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt ZGEMM_L2x2_LOOP + +ZGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b ZGEMM_L2x2_SUB1 + +ZGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b ZGEMM_L2x2_SUB1 + +ZGEMM_L2x2_SUB0: + + andi. L, K, 7 + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 + +ZGEMM_L2x2_SUB1: + + andi. L, K, 7 + ble ZGEMM_L2x2_SAVE + +ZGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt ZGEMM_L2x2_SUB2 + +ZGEMM_L2x2_SAVE: + + SAVE2x2 + +ZGEMM_L2x2_END: + +ZGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble ZGEMM_L2x1_END + mr BO, B + srawi. L, K, 3 + ble ZGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L2x1_SUB4 + +ZGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble ZGEMM_L2x1_LOOP_END + + .align 5 + +ZGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt ZGEMM_L2x1_LOOP + +ZGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b ZGEMM_L2x1_SUB1 + +ZGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b ZGEMM_L2x1_SUB1 + +ZGEMM_L2x1_SUB0: + + andi. L, K, 7 + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 + +ZGEMM_L2x1_SUB1: + + andi. L, K, 7 + ble ZGEMM_L2x1_SAVE + +ZGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. 
L, L, -1 + bgt ZGEMM_L2x1_SUB2 + +ZGEMM_L2x1_SAVE: + + SAVE2x1 + +ZGEMM_L2x1_END: + + slwi T1, K, 5 + add B, B, T1 + + addic. J, J, -1 + bgt ZGEMM_L2_BEGIN + + andi. T2, N, 1 + ble L999 + +ZGEMM_L2_END: + + b ZGEMM_L1_BEGIN + +L999_H1: + + b L999 + +ZGEMM_L1_BEGIN: + + andi. T1, N, 1 + ble ZGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 3 + ble ZGEMM_L1x8_END + +ZGEMM_L1x8_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble ZGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L1x8_SUB4 + +ZGEMM_L1x8_LOOP_START: + + dcbt AO, PRE + LOAD1x8_1 + dcbt AO, PRE + KERNEL1x8_I1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -2 + ble ZGEMM_L1x8_LOOP_END + + .align 5 + +ZGEMM_L1x8_LOOP: + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -1 + bgt ZGEMM_L1x8_LOOP + +ZGEMM_L1x8_LOOP_END: + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + KERNEL1x8_E2 + + b ZGEMM_L1x8_SUB1 + +ZGEMM_L1x8_SUB4: + + dcbt AO, PRE + KERNEL1x8_SUBI1 + dcbt AO, PRE + KERNEL1x8_SUB1 + dcbt AO, PRE + KERNEL1x8_SUB1 + dcbt AO, PRE + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b ZGEMM_L1x8_SUB1 + +ZGEMM_L1x8_SUB0: + + andi. L, K, 7 + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 + +ZGEMM_L1x8_SUB1: + + andi. L, K, 7 + ble ZGEMM_L1x8_SAVE + +ZGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt ZGEMM_L1x8_SUB2 + +ZGEMM_L1x8_SAVE: + + SAVE1x8 + + addic. I, I, -1 + bgt ZGEMM_L1x8_BEGIN + +ZGEMM_L1x8_END: + +ZGEMM_L1x4_BEGIN: + + andi. T2, M, 7 + ble ZGEMM_L1x1_END + + andi. T1, M, 4 + ble ZGEMM_L1x4_END + mr BO, B + srawi. L, K, 3 + ble ZGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L1x4_SUB4 + +ZGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble ZGEMM_L1x4_LOOP_END + + .align 5 + +ZGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt ZGEMM_L1x4_LOOP + +ZGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b ZGEMM_L1x4_SUB1 + +ZGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b ZGEMM_L1x4_SUB1 + +ZGEMM_L1x4_SUB0: + + andi. L, K, 7 + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 + +ZGEMM_L1x4_SUB1: + + andi. L, K, 7 + ble ZGEMM_L1x4_SAVE + +ZGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt ZGEMM_L1x4_SUB2 + +ZGEMM_L1x4_SAVE: + + SAVE1x4 + +ZGEMM_L1x4_END: + +ZGEMM_L1x2_BEGIN: + + + andi. T1, M, 2 + ble ZGEMM_L1x2_END + mr BO, B + srawi. L, K, 3 + ble ZGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L1x2_SUB4 + +ZGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. 
L, L, -2 + ble ZGEMM_L1x2_LOOP_END + + .align 5 + +ZGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt ZGEMM_L1x2_LOOP + +ZGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b ZGEMM_L1x2_SUB1 + +ZGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b ZGEMM_L1x2_SUB1 + +ZGEMM_L1x2_SUB0: + + andi. L, K, 7 + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 + +ZGEMM_L1x2_SUB1: + + andi. L, K, 7 + ble ZGEMM_L1x2_SAVE + +ZGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt ZGEMM_L1x2_SUB2 + +ZGEMM_L1x2_SAVE: + + SAVE1x2 + +ZGEMM_L1x2_END: + +ZGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble ZGEMM_L1x1_END + mr BO, B + srawi. L, K, 3 + ble ZGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L1x1_SUB4 + +ZGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble ZGEMM_L1x1_LOOP_END + + .align 5 + +ZGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt ZGEMM_L1x1_LOOP + +ZGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b ZGEMM_L1x1_SUB1 + +ZGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b ZGEMM_L1x1_SUB1 + +ZGEMM_L1x1_SUB0: + + andi. L, K, 7 + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 + +ZGEMM_L1x1_SUB1: + + andi. L, K, 7 + ble ZGEMM_L1x1_SAVE + +ZGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. 
L, L, -1 + bgt ZGEMM_L1x1_SUB2 + +ZGEMM_L1x1_SAVE: + + SAVE1x1 + +ZGEMM_L1x1_END: + +ZGEMM_L1_END: diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S new file mode 100644 index 000000000..3e5ea9ce8 --- /dev/null +++ b/kernel/power/zgemm_macros_8x2_power8.S @@ -0,0 +1,3074 @@ +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xsadddp + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xsadddp + +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xssubdp + +#else // CC || CR || RC || RR + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xssubdp + +#endif + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + xvmuldp vs48, vs0, vs18 // real*real, imag*real + xvmuldp vs49, vs0, vs19 // real*imag, imag*imag + xvmuldp vs50, vs1, vs18 // real*real, imag*real + xvmuldp vs51, vs1, vs19 // real*imag, 
imag*imag + xvmuldp vs52, vs2, vs18 // real*real, imag*real + xvmuldp vs53, vs2, vs19 // real*imag, imag*imag + xvmuldp vs54, vs3, vs18 // real*real, imag*real + xvmuldp vs55, vs3, vs19 // real*imag, imag*imag + xvmuldp vs56, vs4, vs18 // real*real, imag*real + xvmuldp vs57, vs4, vs19 // real*imag, imag*imag + xvmuldp vs58, vs5, vs18 // real*real, imag*real + xvmuldp vs59, vs5, vs19 // real*imag, imag*imag + xvmuldp vs60, vs6, vs18 // real*real, imag*real + xvmuldp vs61, vs6, vs19 // real*imag, imag*imag + xvmuldp vs62, vs7, vs18 // real*real, imag*real + xvmuldp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_1 + + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + + addi AO, AO, 64 + + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL2x8_2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + lxvd2x vs0, o0, AO // load 
real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + addi AO, AO, 64 + + xvmaddadp vs48, vs8, vs22 // real*real, imag*real + xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs50, vs9, vs22 // real*real, imag*real + xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + + xvmaddadp vs52, vs10, vs22 // real*real, imag*real + xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs54, vs11, vs22 // real*real, imag*real + xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag + + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + xvmaddadp vs56, vs12, vs22 // real*real, imag*real + xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag + xvmaddadp vs58, vs13, vs22 // real*real, imag*real + xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag + + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + xvmaddadp vs60, vs14, vs22 // real*real, imag*real + xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag + xvmaddadp vs62, vs15, vs22 // real*real, imag*real + xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + xvmaddadp vs48, vs8, vs22 // real*real, imag*real + xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs50, vs9, vs22 // real*real, imag*real + xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs52, vs10, vs22 // real*real, imag*real + xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs54, vs11, vs22 // real*real, imag*real + xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag + xvmaddadp vs56, vs12, vs22 // real*real, imag*real + xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag + xvmaddadp vs58, vs13, vs22 // real*real, imag*real + xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag + xvmaddadp vs60, vs14, vs22 // real*real, imag*real + xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag + xvmaddadp vs62, vs15, vs22 // real*real, imag*real + xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_SUBI1 + + 
lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + xvmuldp vs48, vs0, vs18 // real*real, imag*real + xvmuldp vs49, vs0, vs19 // real*imag, imag*imag + xvmuldp vs50, vs1, vs18 // real*real, imag*real + xvmuldp vs51, vs1, vs19 // real*imag, imag*imag + xvmuldp vs52, vs2, vs18 // real*real, imag*real + xvmuldp vs53, vs2, vs19 // real*imag, imag*imag + xvmuldp vs54, vs3, vs18 // real*real, imag*real + xvmuldp vs55, vs3, vs19 // real*imag, imag*imag + xvmuldp vs56, vs4, vs18 // real*real, imag*real + xvmuldp vs57, vs4, vs19 // real*imag, imag*imag + xvmuldp vs58, vs5, vs18 // real*real, imag*real + xvmuldp vs59, vs5, vs19 // real*imag, imag*imag + xvmuldp vs60, vs6, vs18 // real*real, imag*real + xvmuldp vs61, vs6, vs19 // real*imag, imag*imag + xvmuldp vs62, vs7, vs18 // real*real, imag*real + xvmuldp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // 
real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x8 + + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 
// real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, 
vs47 // imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs49, vs49 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs48 // realA*realB + XSFADD_R2 vs0, vs0, vs49 // imagA*imagB + + xxswapd vs48, vs48 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs49, vs49 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs48 // realA*imagB + XSFADD_I2 vs1, vs1, vs49 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs51, vs51 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs50 // realA*realB + XSFADD_R2 vs0, vs0, vs51 // imagA*imagB + + xxswapd vs50, vs50 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs51, vs51 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs50 // realA*imagB + XSFADD_I2 vs1, vs1, vs51 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs53, vs53 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs52 // realA*realB + XSFADD_R2 vs0, vs0, vs53 // imagA*imagB + + xxswapd vs52, vs52 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs53, vs53 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs52 // realA*imagB + XSFADD_I2 vs1, vs1, vs53 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp 
vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs55, vs55 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs54 // realA*realB + XSFADD_R2 vs0, vs0, vs55 // imagA*imagB + + xxswapd vs54, vs54 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs55, vs55 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs54 // realA*imagB + XSFADD_I2 vs1, vs1, vs55 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs57, vs57 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs56 // realA*realB + XSFADD_R2 vs0, vs0, vs57 // imagA*imagB + + xxswapd vs56, vs56 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs57, vs57 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs56 // realA*imagB + XSFADD_I2 vs1, vs1, vs57 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs59, vs59 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs58 // realA*realB + XSFADD_R2 vs0, vs0, vs59 // imagA*imagB + + xxswapd vs58, vs58 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs59, vs59 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs58 // realA*imagB + XSFADD_I2 vs1, vs1, vs59 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs61, vs61 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs60 // realA*realB + XSFADD_R2 vs0, vs0, vs61 // imagA*imagB + + xxswapd vs60, vs60 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs61, vs61 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs60 // realA*imagB + XSFADD_I2 vs1, vs1, vs61 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs63, vs63 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs62 // realA*realB + 
XSFADD_R2 vs0, vs0, vs63 // imagA*imagB + + xxswapd vs62, vs62 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs63, vs63 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs62 // realA*imagB + XSFADD_I2 vs1, vs1, vs63 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + addi CO, CO, 128 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + xvmuldp vs40, vs0, vs18 // real*real, imag*real + xvmuldp vs41, vs0, vs19 // real*imag, imag*imag + xvmuldp vs42, vs1, vs18 // real*real, imag*real + xvmuldp vs43, vs1, vs19 // real*imag, imag*imag + xvmuldp vs44, vs2, vs18 // real*real, imag*real + xvmuldp vs45, vs2, vs19 // real*imag, imag*imag + xvmuldp vs46, vs3, vs18 // real*real, imag*real + xvmuldp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // 
real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp 
vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + xvmuldp vs40, vs0, vs18 // real*real, imag*real + xvmuldp vs41, vs0, vs19 // real*imag, imag*imag + xvmuldp vs42, vs1, vs18 // real*real, imag*real + xvmuldp vs43, vs1, vs19 // real*imag, imag*imag + xvmuldp vs44, vs2, vs18 // real*real, imag*real + xvmuldp vs45, vs2, vs19 // real*imag, imag*imag + xvmuldp vs46, vs3, vs18 // real*real, imag*real + xvmuldp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x4 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + 
xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + 
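Each repetition of the reduction block in these SAVE macros collapses one accumulator pair into a single complex value and scales it by alpha before it is merged back into C. A minimal C sketch of the arithmetic for one element, assuming the plain (non-conjugated) variant of the XSFADD_* helpers, whose definitions are not in this hunk; the names below are illustrative only:

    #include <complex.h>

    /* acc_r[0] = realA*realB, acc_r[1] = imagA*realB   (e.g. vs32, vs34, ...)
     * acc_i[0] = realA*imagB, acc_i[1] = imagA*imagB   (e.g. vs33, vs35, ...) */
    double complex save_one(const double acc_r[2], const double acc_i[2],
                            double alpha_r, double alpha_i)
    {
        /* XSFADD_R1/_R2 and _I1/_I2 (plain case): form the complex product */
        double re = acc_r[0] - acc_i[1];   /* realA*realB - imagA*imagB */
        double im = acc_i[0] + acc_r[1];   /* realA*imagB + imagA*realB */

        /* xsmuldp/xssubdp/xsadddp: scale by alpha = alpha_r + i*alpha_i */
        double out_re = re * alpha_r - im * alpha_i;
        double out_im = re * alpha_i + im * alpha_r;
        return out_re + out_im * I;        /* xxpermdi: merge real and imag */
    }

In the non-TRMM path the merged result is then added to the value previously loaded from C (the xvadddp of vs8.. with vs16..) before being stored back.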
xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, vs47 // imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + xvmuldp vs36, vs0, vs18 // real*real, imag*real + xvmuldp vs37, vs0, vs19 // real*imag, imag*imag + xvmuldp vs38, vs1, vs18 // real*real, imag*real + xvmuldp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, o0, AO // load real,imag 
from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + xvmaddadp vs36, vs8, vs22 // real*real, imag*real + xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs38, vs9, vs22 // real*real, imag*real + xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + xvmaddadp vs36, vs8, vs22 // real*real, imag*real + xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs38, vs9, vs22 // real*real, imag*real + xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + xvmuldp vs36, vs0, vs18 // real*real, imag*real + xvmuldp vs37, vs0, vs19 // real*imag, imag*imag + xvmuldp vs38, vs1, vs18 // real*real, imag*real + xvmuldp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x2 + + + mr T1, CO + +#ifndef 
TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, 
vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + +.endm + +.macro KERNEL2x1_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + xvmuldp vs34, vs0, vs18 // real*real, imag*real + xvmuldp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_2 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + xvmaddadp vs34, vs8, vs22 // real*real, imag*real + xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + xvmaddadp vs34, vs8, vs22 // real*real, imag*real + xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + xvmuldp vs34, vs0, vs18 // real*real, imag*real + xvmuldp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x1 + 
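The N=2, M=1 kernels above show the FMA pattern shared by all of these macros in its smallest form: one complex element of A is multiplied against the splatted real and imaginary parts of two B elements, and the four cross products are kept in separate accumulators until SAVE combines them. A rough C equivalent of one KERNEL2x1_SUB1 step (array layout and names are mine, not from the kernel):

    /* a   = { reA, imA }                  one lxvd2x from AO
     * b   = { reB0, imB0, reB1, imB1 }    four lxvdsx splats from BO
     * acc = vs32..vs35, each holding a { ..*reB, ..*imB } lane pair   */
    void kernel2x1_sub1(const double a[2], const double b[4], double acc[4][2])
    {
        for (int n = 0; n < 2; n++) {
            acc[2*n + 0][0] += a[0] * b[2*n + 0];   /* realA*realB */
            acc[2*n + 0][1] += a[1] * b[2*n + 0];   /* imagA*realB */
            acc[2*n + 1][0] += a[0] * b[2*n + 1];   /* realA*imagB */
            acc[2*n + 1][1] += a[1] * b[2*n + 1];   /* imagA*imagB */
        }
    }

The _SUBI1 variants are identical except that they overwrite the accumulators (xvmuldp, i.e. "=" instead of "+="), which lets the inner loop start without a separate zeroing pass; the _I1/_1/_2/_E2 variants interleave two such register sets so that the loads for the next K step overlap the FMAs of the current one.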
+ + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + +#endif + + stxvd2x vs8, o0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + +#endif + + stxvd2x vs8, o0, T1 + + add T1, T1, LDC + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // 
real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, 
vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x8 + + + mr T1, CO + addi T2, 
T1, 64 + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, 
realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, vs47 // imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, 
LDC + add T2, T2, LDC + addi CO, CO, 128 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // 
load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x4 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, 
vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 
// real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x2 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + +.endm + +.macro KERNEL1x1_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from 
B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_2 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x1 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + +#endif + + stxvd2x vs8, o0, T1 + + add T1, T1, LDC + addi CO, CO, 16 + +.endm + diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S index f93439986..23e0177c0 100644 --- a/kernel/power/zgemv_n.S +++ b/kernel/power/zgemv_n.S @@ -170,11 +170,6 @@ #define PREFETCHSIZE_C 24 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 24 -#define PREFETCHSIZE_C 24 -#endif - #ifndef XCONJ #define FMADDR FMADD #define FMSUBR FNMSUB diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S index 2b4501434..c0bad3152 100644 --- a/kernel/power/zgemv_t.S +++ b/kernel/power/zgemv_t.S @@ -144,11 +144,6 @@ #define PREFETCHSIZE_C 8 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 24 -#define PREFETCHSIZE_C 8 -#endif - #if !(defined(CONJ) && defined(XCONJ)) #define FMADDR FMADD #define FMSUBR FNMSUB diff --git a/kernel/power/zsymv_L.S b/kernel/power/zsymv_L.S index 394c030fa..b348e328f 100644 --- a/kernel/power/zsymv_L.S +++ b/kernel/power/zsymv_L.S @@ -169,11 +169,7 @@ #define PREFETCHSIZE_A 112 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 
112 -#endif - -#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8) +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) #define NOP1 #define NOP2 #else diff --git a/kernel/power/zsymv_U.S b/kernel/power/zsymv_U.S index a061cd77b..b631cbe35 100644 --- a/kernel/power/zsymv_U.S +++ b/kernel/power/zsymv_U.S @@ -166,11 +166,7 @@ #define PREFETCHSIZE_A 112 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 112 -#endif - -#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8) +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) #define NOP1 #define NOP2 #else diff --git a/kernel/power/ztrmm_kernel_8x2_power8.S b/kernel/power/ztrmm_kernel_8x2_power8.S new file mode 100644 index 000000000..dbbc8f9ac --- /dev/null +++ b/kernel/power/ztrmm_kernel_8x2_power8.S @@ -0,0 +1,342 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R_SP 296(SP) +#define ALPHA_I_SP 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R_SP 224(SP) +#define ALPHA_I_SP 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define o0 0 +#define alpha_r vs30 +#define alpha_i vs31 + +#define KKK r13 +#define K1 r14 +#define L r15 +#define ALPHA r16 +#define o24 r17 +#define T2 r19 +#define KK r20 +#define o8 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + std r13, 288(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) + stw r14, 212(SP) + stw r13, 216(SP) +#endif + + stfd f1, ALPHA_R_SP + stfd f2, ALPHA_I_SP + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) +#else + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + +#include "zgemm_macros_8x2_power8.S" + + cmpwi cr0, M, 0 + ble L999 + cmpwi cr0, N, 0 + ble L999 + cmpwi cr0, K, 0 + ble L999 + + 
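The remaining setup converts LDC from elements to bytes and preloads the byte offsets and alpha used by the macros, after which the included logic file drives the blocked loops; for TRMM it also trims each block's K range using OFFSET/KK. Two small C sketches of that bookkeeping, under the assumption that ZBASE_SHIFT is 4 for double-precision complex (its actual value comes from OpenBLAS's common headers, not from this patch); the function names are illustrative:

    #include <stdint.h>

    /* slwi LDC, LDC, ZBASE_SHIFT: element stride -> byte stride
     * (one double-complex element = 2 * 8 bytes, hence a shift by 4). */
    uintptr_t ldc_bytes(uintptr_t ldc_elems)
    {
        return ldc_elems << 4;
    }

    /* The #if ladder at the top of each ZTRMM_L2x8/L2x4/... block picks
     * the inner trip count written to KKK/K1.  'mr_rows' is the number of
     * A values in the block (8, 4, 2 or 1); 2 is the number of B values. */
    long trmm_k_count(long K, long KK, int left, int transa, long mr_rows)
    {
        if ((left && !transa) || (!left && transa))
            return K - KK;                 /*  sub  T1, K, KK              */
        return KK + (left ? mr_rows : 2);  /*  addi T1, T1, mr_rows or 2   */
    }

After each block is saved, the same conditions advance AO/BO past the unused part of K and, for LEFT, bump KK by the number of A values, as the comments in the logic file note.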
slwi LDC, LDC, ZBASE_SHIFT + li PRE, 256 + li o8 , 8 + li o16 , 16 + li o24 , 24 + li o32 , 32 + li o48 , 48 + +#ifdef __64BIT__ + addi ALPHA, SP, 296 +#else + addi ALPHA, SP, 224 +#endif + + lxsdx alpha_r, 0, ALPHA + lxsdx alpha_i, o8, ALPHA + + .align 4 + +#include "ztrmm_logic_8x2_power8.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + ld r13, 288(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) + lwz r14, 212(SP) + lwz r13, 216(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrmm_logic_8x2_power8.S b/kernel/power/ztrmm_logic_8x2_power8.S new file mode 100644 index 000000000..e250dfac5 --- /dev/null +++ b/kernel/power/ztrmm_logic_8x2_power8.S @@ -0,0 +1,1201 @@ + srawi. J, N, 1 + ble ZTRMM_L2_END + +ZTRMM_L2_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 3 + ble ZTRMM_L2x8_END + +ZTRMM_L2x8_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 7 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L2x8_SUB4 + +ZTRMM_L2x8_LOOP_START: + + dcbt AO, PRE + LOAD2x8_1 + dcbt AO, PRE + KERNEL2x8_I1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -2 + ble ZTRMM_L2x8_LOOP_END + + .align 5 + +ZTRMM_L2x8_LOOP: + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. 
L, L, -1 + bgt ZTRMM_L2x8_LOOP + +ZTRMM_L2x8_LOOP_END: + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + KERNEL2x8_E2 + + b ZTRMM_L2x8_SUB1 + +ZTRMM_L2x8_SUB4: + + dcbt AO, PRE + KERNEL2x8_SUBI1 + dcbt AO, PRE + KERNEL2x8_SUB1 + dcbt AO, PRE + KERNEL2x8_SUB1 + dcbt AO, PRE + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b ZTRMM_L2x8_SUB1 + +ZTRMM_L2x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L2x8_SAVE + b ZTRMM_L2x8_SUB2 + +ZTRMM_L2x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L2x8_SAVE + +ZTRMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt ZTRMM_L2x8_SUB2 + +ZTRMM_L2x8_SAVE: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt ZTRMM_L2x8_BEGIN + +ZTRMM_L2x8_END: + +ZTRMM_L2x4_BEGIN: + andi. T2, M, 7 + ble ZTRMM_L2x1_END + + andi. T1, M, 4 + ble ZTRMM_L2x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L2x4_SUB4 + +ZTRMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble ZTRMM_L2x4_LOOP_END + + .align 5 + +ZTRMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt ZTRMM_L2x4_LOOP + +ZTRMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b ZTRMM_L2x4_SUB1 + +ZTRMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b ZTRMM_L2x4_SUB1 + +ZTRMM_L2x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L2x4_SAVE + b ZTRMM_L2x4_SUB2 + +ZTRMM_L2x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L2x4_SAVE + +ZTRMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. 
L, L, -1 + bgt ZTRMM_L2x4_SUB2 + +ZTRMM_L2x4_SAVE: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +ZTRMM_L2x4_END: + +ZTRMM_L2x2_BEGIN: + + andi. T1, M, 2 + ble ZTRMM_L2x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L2x2_SUB4 + +ZTRMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble ZTRMM_L2x2_LOOP_END + + .align 5 + +ZTRMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt ZTRMM_L2x2_LOOP + +ZTRMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b ZTRMM_L2x2_SUB1 + +ZTRMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b ZTRMM_L2x2_SUB1 + +ZTRMM_L2x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L2x2_SAVE + b ZTRMM_L2x2_SUB2 + +ZTRMM_L2x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L2x2_SAVE + +ZTRMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt ZTRMM_L2x2_SUB2 + +ZTRMM_L2x2_SAVE: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +ZTRMM_L2x2_END: + +ZTRMM_L2x1_BEGIN: + + andi. 
T1, M, 1 + ble ZTRMM_L2x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L2x1_SUB4 + +ZTRMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble ZTRMM_L2x1_LOOP_END + + .align 5 + +ZTRMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt ZTRMM_L2x1_LOOP + +ZTRMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b ZTRMM_L2x1_SUB1 + +ZTRMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b ZTRMM_L2x1_SUB1 + +ZTRMM_L2x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L2x1_SAVE + b ZTRMM_L2x1_SUB2 + +ZTRMM_L2x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L2x1_SAVE + +ZTRMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt ZTRMM_L2x1_SUB2 + +ZTRMM_L2x1_SAVE: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +ZTRMM_L2x1_END: + + slwi T1, K, 5 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 2 // KK += Number of values in B +#endif + + + addic. J, J, -1 + bgt ZTRMM_L2_BEGIN + + andi. T2, N, 1 + ble L999 + +ZTRMM_L2_END: + + b ZTRMM_L1_BEGIN + +L999_H1: + + b L999 + +ZTRMM_L1_BEGIN: + + andi. T1, N, 1 + ble ZTRMM_L1_END + mr CO, C + mr AO, A + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 3 + ble ZTRMM_L1x8_END + +ZTRMM_L1x8_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 7 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. 
L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L1x8_SUB4 + +ZTRMM_L1x8_LOOP_START: + + dcbt AO, PRE + LOAD1x8_1 + dcbt AO, PRE + KERNEL1x8_I1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -2 + ble ZTRMM_L1x8_LOOP_END + + .align 5 + +ZTRMM_L1x8_LOOP: + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -1 + bgt ZTRMM_L1x8_LOOP + +ZTRMM_L1x8_LOOP_END: + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + KERNEL1x8_E2 + + b ZTRMM_L1x8_SUB1 + +ZTRMM_L1x8_SUB4: + + dcbt AO, PRE + KERNEL1x8_SUBI1 + dcbt AO, PRE + KERNEL1x8_SUB1 + dcbt AO, PRE + KERNEL1x8_SUB1 + dcbt AO, PRE + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b ZTRMM_L1x8_SUB1 + +ZTRMM_L1x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L1x8_SAVE + b ZTRMM_L1x8_SUB2 + +ZTRMM_L1x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L1x8_SAVE + +ZTRMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt ZTRMM_L1x8_SUB2 + +ZTRMM_L1x8_SAVE: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt ZTRMM_L1x8_BEGIN + +ZTRMM_L1x8_END: + +ZTRMM_L1x4_BEGIN: + andi. T2, M, 7 + ble ZTRMM_L1x1_END + + andi. T1, M, 4 + ble ZTRMM_L1x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L1x4_SUB4 + +ZTRMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble ZTRMM_L1x4_LOOP_END + + .align 5 + +ZTRMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. 
L, L, -1 + bgt ZTRMM_L1x4_LOOP + +ZTRMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b ZTRMM_L1x4_SUB1 + +ZTRMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b ZTRMM_L1x4_SUB1 + +ZTRMM_L1x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L1x4_SAVE + b ZTRMM_L1x4_SUB2 + +ZTRMM_L1x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L1x4_SAVE + +ZTRMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt ZTRMM_L1x4_SUB2 + +ZTRMM_L1x4_SAVE: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +ZTRMM_L1x4_END: + +ZTRMM_L1x2_BEGIN: + + andi. T1, M, 2 + ble ZTRMM_L1x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L1x2_SUB4 + +ZTRMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble ZTRMM_L1x2_LOOP_END + + .align 5 + +ZTRMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt ZTRMM_L1x2_LOOP + +ZTRMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b ZTRMM_L1x2_SUB1 + +ZTRMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b ZTRMM_L1x2_SUB1 + +ZTRMM_L1x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L1x2_SAVE + b ZTRMM_L1x2_SUB2 + +ZTRMM_L1x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L1x2_SAVE + +ZTRMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. 
L, L, -1 + bgt ZTRMM_L1x2_SUB2 + +ZTRMM_L1x2_SAVE: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +ZTRMM_L1x2_END: + +ZTRMM_L1x1_BEGIN: + + andi. T1, M, 1 + ble ZTRMM_L1x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L1x1_SUB4 + +ZTRMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble ZTRMM_L1x1_LOOP_END + + .align 5 + +ZTRMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt ZTRMM_L1x1_LOOP + +ZTRMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b ZTRMM_L1x1_SUB1 + +ZTRMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b ZTRMM_L1x1_SUB1 + +ZTRMM_L1x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L1x1_SAVE + b ZTRMM_L1x1_SUB2 + +ZTRMM_L1x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L1x1_SAVE + +ZTRMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt ZTRMM_L1x1_SUB2 + +ZTRMM_L1x1_SAVE: + + SAVE1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +ZTRMM_L1x1_END: + +#if !defined(LEFT) + addi KK, KK, 1 // KK += Number of values in B +#endif + + +ZTRMM_L1_END: diff --git a/param.h b/param.h index c46a1e999..e7dca2c0d 100644 --- a/param.h +++ b/param.h @@ -1962,35 +1962,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
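/*
 * Two consistency points for the POWER8 block that follows (a reading of
 * the values, with the usual OpenBLAS meaning of P/Q/R assumed):
 *
 * - The UNROLL macros track the micro-tile of the POWER8 kernels used
 *   elsewhere in this series: DGEMM_DEFAULT_UNROLL_M/N = 16/4 matches the
 *   16x4 dgemm/dtrmm kernels (dgemm_kernel_16x4_power8.S), and
 *   ZGEMM_DEFAULT_UNROLL_M/N = 8/2 matches ztrmm_kernel_8x2_power8.S.
 *
 * - P, Q and R set the cache-blocking sizes for the packed panels; as a
 *   rough check, the packed DGEMM block is about 480 * 720 * 8 bytes
 *   ~= 2.6 MiB and the packed ZGEMM block about 240 * 360 * 16 bytes
 *   ~= 1.3 MiB.
 */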
#if defined(POWER8) #define SNUMOPT 4 -#define DNUMOPT 4 +#define DNUMOPT 8 #define GEMM_DEFAULT_OFFSET_A 384 #define GEMM_DEFAULT_OFFSET_B 1024 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 4 -#define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_M 16 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 992 #define DGEMM_DEFAULT_P 480 #define CGEMM_DEFAULT_P 488 -#define ZGEMM_DEFAULT_P 248 +#define ZGEMM_DEFAULT_P 240 #define SGEMM_DEFAULT_Q 504 -#define DGEMM_DEFAULT_Q 504 +#define DGEMM_DEFAULT_Q 720 #define CGEMM_DEFAULT_Q 400 -#define ZGEMM_DEFAULT_Q 400 +#define ZGEMM_DEFAULT_Q 360 + +#define DGEMM_DEFAULT_R 14400 +#define ZGEMM_DEFAULT_R 7200 #define SYMV_P 8 #endif + #if defined(SPARC) && defined(V7) #define SNUMOPT 4 From 1edf30b7906323d047386651053edec43c9c4454 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 1 Mar 2016 20:13:08 +0800 Subject: [PATCH 07/37] Change Opteron(SSE3) to Opteron_SSE3 at dyanmaic core name. --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index a2b7c7045..2fde07fcc 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -391,7 +391,7 @@ static char *corename[] = { "Nehalem", "Athlon", "Opteron", - "Opteron(SSE3)", + "Opteron_SSE3", "Barcelona", "Nano", "Sandybridge", From 8577be2a9583ccbafaf1d0178f4370d84bd6d3d5 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Fri, 4 Mar 2016 04:24:43 +0800 Subject: [PATCH 08/37] Modify travis script. 
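Two small reliability fixes, visible in the diff below: the Travis script
gains "set -e", so the build shell stops at the first failing command
instead of continuing through the remaining steps, and the symlink recipes
in the Makefile lose their leading "-", so make no longer ignores a failed
"ln". A minimal sketch of the recipe-prefix semantics involved
(illustrative only, not taken from this Makefile; recipe lines are
tab-indented):

    demo:
    	-false          # leading '-': make reports the error but continues
    	false           # no prefix: a non-zero exit status stops the build here
    	@echo "not reached"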
--- .travis.yml | 1 + Makefile | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 990bed864..63b469716 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,6 +25,7 @@ before_install: - if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi script: + - set -e - make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE - if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C test DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi - if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C ctest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi diff --git a/Makefile b/Makefile index 5aa10b2c3..9ba2bffb3 100644 --- a/Makefile +++ b/Makefile @@ -83,20 +83,20 @@ shared : ifndef NO_SHARED ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS)) @$(MAKE) -C exports so - @-ln -fs $(LIBSONAME) $(LIBPREFIX).so - @-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) + @ln -fs $(LIBSONAME) $(LIBPREFIX).so + @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif ifeq ($(OSNAME), FreeBSD) @$(MAKE) -C exports so - @-ln -fs $(LIBSONAME) $(LIBPREFIX).so + @ln -fs $(LIBSONAME) $(LIBPREFIX).so endif ifeq ($(OSNAME), NetBSD) @$(MAKE) -C exports so - @-ln -fs $(LIBSONAME) $(LIBPREFIX).so + @ln -fs $(LIBSONAME) $(LIBPREFIX).so endif ifeq ($(OSNAME), Darwin) @$(MAKE) -C exports dyn - @-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib + @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib endif ifeq ($(OSNAME), WINNT) @$(MAKE) -C exports dll From cc26d888b82bbbf5a20b43c23d5c11347566650f Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 4 Mar 2016 10:26:53 +0100 Subject: [PATCH 09/37] BUGFIX: increased BUFFER_SIZE for POWER8 --- common_power.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common_power.h b/common_power.h index 64e052f3d..052d38828 100644 --- a/common_power.h +++ b/common_power.h @@ -797,6 +797,8 @@ Lmcount$lazy_ptr: #define BUFFER_SIZE ( 2 << 20) #elif defined(PPC440FP2) #define BUFFER_SIZE ( 16 << 20) +#elif defined(POWER8) +#define BUFFER_SIZE ( 64 << 20) #else #define BUFFER_SIZE ( 16 << 20) #endif From 4824b88fcb1110299bfa428a11e86b2ff5c22532 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 4 Mar 2016 12:35:25 +0100 Subject: [PATCH 10/37] enabled all level1 assembly kernels for power8 --- kernel/power/KERNEL | 16 ++++ kernel/power/KERNEL.POWER8 | 152 ++++++++++++++++++------------------- 2 files changed, 92 insertions(+), 76 deletions(-) diff --git a/kernel/power/KERNEL b/kernel/power/KERNEL index eae60cdcc..565d1fdb0 100644 --- a/kernel/power/KERNEL +++ b/kernel/power/KERNEL @@ -30,3 +30,19 @@ ifndef CTRSMKERNEL_RT CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif + +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif + +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif + +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif + diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 3a627e441..5ec5e41a7 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -1,7 +1,7 @@ -SGEMM_BETA = ../generic/gemm_beta.c -DGEMM_BETA = ../generic/gemm_beta.c -CGEMM_BETA = ../generic/zgemm_beta.c -ZGEMM_BETA = ../generic/zgemm_beta.c +#SGEMM_BETA = ../generic/gemm_beta.c +#DGEMM_BETA = ../generic/gemm_beta.c +#CGEMM_BETA = ../generic/zgemm_beta.c +#ZGEMM_BETA = ../generic/zgemm_beta.c STRMMKERNEL = 
../generic/trmmkernel_2x2.c DTRMMKERNEL = dtrmm_kernel_16x4_power8.S @@ -65,78 +65,78 @@ CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S #Pure C for other kernels -SAMAXKERNEL = ../arm/amax.c -DAMAXKERNEL = ../arm/amax.c -CAMAXKERNEL = ../arm/zamax.c -ZAMAXKERNEL = ../arm/zamax.c - -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMAXKERNEL = ../arm/iamax.c -IDAMAXKERNEL = ../arm/iamax.c -ICAMAXKERNEL = ../arm/izamax.c -IZAMAXKERNEL = ../arm/izamax.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -SASUMKERNEL = ../arm/asum.c -DASUMKERNEL = ../arm/asum.c -CASUMKERNEL = ../arm/zasum.c -ZASUMKERNEL = ../arm/zasum.c - -SAXPYKERNEL = ../arm/axpy.c -DAXPYKERNEL = ../arm/axpy.c -CAXPYKERNEL = ../arm/zaxpy.c -ZAXPYKERNEL = ../arm/zaxpy.c - -SCOPYKERNEL = ../arm/copy.c -DCOPYKERNEL = ../arm/copy.c -CCOPYKERNEL = ../arm/zcopy.c -ZCOPYKERNEL = ../arm/zcopy.c - -SDOTKERNEL = ../arm/dot.c -DDOTKERNEL = ../arm/dot.c -CDOTKERNEL = ../arm/zdot.c -ZDOTKERNEL = ../arm/zdot.c - -SNRM2KERNEL = ../arm/nrm2.c -DNRM2KERNEL = ../arm/nrm2.c -CNRM2KERNEL = ../arm/znrm2.c -ZNRM2KERNEL = ../arm/znrm2.c - -SROTKERNEL = ../arm/rot.c -DROTKERNEL = ../arm/rot.c -CROTKERNEL = ../arm/zrot.c -ZROTKERNEL = ../arm/zrot.c - -SSCALKERNEL = ../arm/scal.c -DSCALKERNEL = ../arm/scal.c -CSCALKERNEL = ../arm/zscal.c -ZSCALKERNEL = ../arm/zscal.c - -SSWAPKERNEL = ../arm/swap.c -DSWAPKERNEL = ../arm/swap.c -CSWAPKERNEL = ../arm/zswap.c -ZSWAPKERNEL = ../arm/zswap.c - +#SAMAXKERNEL = ../arm/amax.c +#DAMAXKERNEL = ../arm/amax.c +#CAMAXKERNEL = ../arm/zamax.c +#ZAMAXKERNEL = ../arm/zamax.c +# +#SAMINKERNEL = ../arm/amin.c +#DAMINKERNEL = ../arm/amin.c +#CAMINKERNEL = ../arm/zamin.c +#ZAMINKERNEL = ../arm/zamin.c +# +#SMAXKERNEL = ../arm/max.c +#DMAXKERNEL = ../arm/max.c +# +#SMINKERNEL = ../arm/min.c +#DMINKERNEL = ../arm/min.c +# +#ISAMAXKERNEL = ../arm/iamax.c +#IDAMAXKERNEL = ../arm/iamax.c +#ICAMAXKERNEL = ../arm/izamax.c +#IZAMAXKERNEL = ../arm/izamax.c +# +#ISAMINKERNEL = ../arm/iamin.c +#IDAMINKERNEL = ../arm/iamin.c +#ICAMINKERNEL = ../arm/izamin.c +#IZAMINKERNEL = ../arm/izamin.c +# +#ISMAXKERNEL = ../arm/imax.c +#IDMAXKERNEL = ../arm/imax.c +# +#ISMINKERNEL = ../arm/imin.c +#IDMINKERNEL = ../arm/imin.c +# +#SASUMKERNEL = ../arm/asum.c +#DASUMKERNEL = ../arm/asum.c +#CASUMKERNEL = ../arm/zasum.c +#ZASUMKERNEL = ../arm/zasum.c +# +#SAXPYKERNEL = ../arm/axpy.c +#DAXPYKERNEL = ../arm/axpy.c +#CAXPYKERNEL = ../arm/zaxpy.c +#ZAXPYKERNEL = ../arm/zaxpy.c +# +#SCOPYKERNEL = ../arm/copy.c +#DCOPYKERNEL = ../arm/copy.c +#CCOPYKERNEL = ../arm/zcopy.c +#ZCOPYKERNEL = ../arm/zcopy.c +# +#SDOTKERNEL = ../arm/dot.c +#DDOTKERNEL = ../arm/dot.c +#CDOTKERNEL = ../arm/zdot.c +#ZDOTKERNEL = ../arm/zdot.c +# +#SNRM2KERNEL = ../arm/nrm2.c +#DNRM2KERNEL = ../arm/nrm2.c +#CNRM2KERNEL = ../arm/znrm2.c +#ZNRM2KERNEL = ../arm/znrm2.c +# +#SROTKERNEL = ../arm/rot.c +#DROTKERNEL = ../arm/rot.c +#CROTKERNEL = ../arm/zrot.c +#ZROTKERNEL = ../arm/zrot.c +# +#SSCALKERNEL = ../arm/scal.c +#DSCALKERNEL = ../arm/scal.c +#CSCALKERNEL = ../arm/zscal.c +#ZSCALKERNEL = ../arm/zscal.c +# +#SSWAPKERNEL = ../arm/swap.c +#DSWAPKERNEL = ../arm/swap.c 
+#CSWAPKERNEL = ../arm/zswap.c +#ZSWAPKERNEL = ../arm/zswap.c +# SGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = ../arm/gemv_n.c CGEMVNKERNEL = ../arm/zgemv_n.c From d5130ce7e35a96ef0fc2aa5c8a25d4dec269939b Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 4 Mar 2016 12:53:31 +0100 Subject: [PATCH 11/37] enabled gemv assembly on power8 --- kernel/power/KERNEL.POWER8 | 17 +++++++++-------- kernel/power/gemv_n.S | 6 ++++++ kernel/power/gemv_t.S | 5 +++++ kernel/power/zgemv_n.S | 5 +++++ kernel/power/zgemv_t.S | 6 ++++++ 5 files changed, 31 insertions(+), 8 deletions(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 5ec5e41a7..c2e965bb7 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -137,15 +137,16 @@ ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S #CSWAPKERNEL = ../arm/zswap.c #ZSWAPKERNEL = ../arm/zswap.c # -SGEMVNKERNEL = ../arm/gemv_n.c -DGEMVNKERNEL = ../arm/gemv_n.c -CGEMVNKERNEL = ../arm/zgemv_n.c -ZGEMVNKERNEL = ../arm/zgemv_n.c -SGEMVTKERNEL = ../arm/gemv_t.c -DGEMVTKERNEL = ../arm/gemv_t.c -CGEMVTKERNEL = ../arm/zgemv_t.c -ZGEMVTKERNEL = ../arm/zgemv_t.c +#SGEMVNKERNEL = ../arm/gemv_n.c +#DGEMVNKERNEL = ../arm/gemv_n.c +#CGEMVNKERNEL = ../arm/zgemv_n.c +#ZGEMVNKERNEL = ../arm/zgemv_n.c +# +#SGEMVTKERNEL = ../arm/gemv_t.c +#DGEMVTKERNEL = ../arm/gemv_t.c +#CGEMVTKERNEL = ../arm/zgemv_t.c +#ZGEMVTKERNEL = ../arm/zgemv_t.c SSYMV_U_KERNEL = ../generic/symv_k.c SSYMV_L_KERNEL = ../generic/symv_k.c diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S index 77587ecb1..02160bd61 100644 --- a/kernel/power/gemv_n.S +++ b/kernel/power/gemv_n.S @@ -174,6 +174,12 @@ #define PREFETCHSIZE_C 40 #endif +#ifdef POWER8 +#define PREFETCHSIZE_A 96 +#define PREFETCHSIZE_C 40 +#endif + + #ifndef NEEDPARAM #ifndef __64BIT__ diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S index 817a60b86..457753065 100644 --- a/kernel/power/gemv_t.S +++ b/kernel/power/gemv_t.S @@ -139,6 +139,11 @@ #define PREFETCHSIZE_C 8 #endif +#ifdef POWER8 +#define PREFETCHSIZE_A 96 +#define PREFETCHSIZE_C 8 +#endif + #define y01 f0 #define y02 f1 #define y03 f2 diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S index 23e0177c0..f93439986 100644 --- a/kernel/power/zgemv_n.S +++ b/kernel/power/zgemv_n.S @@ -170,6 +170,11 @@ #define PREFETCHSIZE_C 24 #endif +#ifdef POWER8 +#define PREFETCHSIZE_A 24 +#define PREFETCHSIZE_C 24 +#endif + #ifndef XCONJ #define FMADDR FMADD #define FMSUBR FNMSUB diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S index c0bad3152..9c6f510c2 100644 --- a/kernel/power/zgemv_t.S +++ b/kernel/power/zgemv_t.S @@ -144,6 +144,12 @@ #define PREFETCHSIZE_C 8 #endif +#ifdef POWER8 +#define PREFETCHSIZE_A 24 +#define PREFETCHSIZE_C 8 +#endif + + #if !(defined(CONJ) && defined(XCONJ)) #define FMADDR FMADD #define FMSUBR FNMSUB From 3e633152c6beb477cd161d1c1655982fa058ce9a Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 4 Mar 2016 13:08:18 +0100 Subject: [PATCH 12/37] enabled symv assembly kernels on power8 --- kernel/power/KERNEL.POWER8 | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index c2e965bb7..0fea3e4d1 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -148,18 +148,19 @@ ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S #CGEMVTKERNEL = ../arm/zgemv_t.c #ZGEMVTKERNEL = ../arm/zgemv_t.c -SSYMV_U_KERNEL = ../generic/symv_k.c -SSYMV_L_KERNEL = ../generic/symv_k.c -DSYMV_U_KERNEL = ../generic/symv_k.c 
-DSYMV_L_KERNEL = ../generic/symv_k.c -QSYMV_U_KERNEL = ../generic/symv_k.c -QSYMV_L_KERNEL = ../generic/symv_k.c -CSYMV_U_KERNEL = ../generic/zsymv_k.c -CSYMV_L_KERNEL = ../generic/zsymv_k.c -ZSYMV_U_KERNEL = ../generic/zsymv_k.c -ZSYMV_L_KERNEL = ../generic/zsymv_k.c -XSYMV_U_KERNEL = ../generic/zsymv_k.c -XSYMV_L_KERNEL = ../generic/zsymv_k.c + +#SSYMV_U_KERNEL = ../generic/symv_k.c +#SSYMV_L_KERNEL = ../generic/symv_k.c +#DSYMV_U_KERNEL = ../generic/symv_k.c +#DSYMV_L_KERNEL = ../generic/symv_k.c +#QSYMV_U_KERNEL = ../generic/symv_k.c +#QSYMV_L_KERNEL = ../generic/symv_k.c +#CSYMV_U_KERNEL = ../generic/zsymv_k.c +#CSYMV_L_KERNEL = ../generic/zsymv_k.c +#ZSYMV_U_KERNEL = ../generic/zsymv_k.c +#ZSYMV_L_KERNEL = ../generic/zsymv_k.c +#XSYMV_U_KERNEL = ../generic/zsymv_k.c +#XSYMV_L_KERNEL = ../generic/zsymv_k.c ZHEMV_U_KERNEL = ../generic/zhemv_k.c ZHEMV_L_KERNEL = ../generic/zhemv_k.c From 73f04c2c726cadf09b0b4db2c8bd4787efa1a154 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 4 Mar 2016 13:20:50 +0100 Subject: [PATCH 13/37] enabled hemv assemly function for power8 --- kernel/power/KERNEL.POWER8 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 0fea3e4d1..7a83cd66f 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -61,8 +61,8 @@ ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #Todo: CGEMM3MKERNEL should be 4x4 blocksizes. -CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S -ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S +#CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S +#ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S #Pure C for other kernels #SAMAXKERNEL = ../arm/amax.c @@ -162,8 +162,8 @@ ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S #XSYMV_U_KERNEL = ../generic/zsymv_k.c #XSYMV_L_KERNEL = ../generic/zsymv_k.c -ZHEMV_U_KERNEL = ../generic/zhemv_k.c -ZHEMV_L_KERNEL = ../generic/zhemv_k.c +#ZHEMV_U_KERNEL = ../generic/zhemv_k.c +#ZHEMV_L_KERNEL = ../generic/zhemv_k.c LSAME_KERNEL = ../generic/lsame.c SCABS_KERNEL = ../generic/cabs.c From 91e1c5080cd8a89cbcc6b9511ca326206987dbfc Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 4 Mar 2016 13:38:57 +0100 Subject: [PATCH 14/37] modified configuration, to use power6 sgemm kernel for power8 --- kernel/power/KERNEL.POWER8 | 8 ++++---- param.h | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 7a83cd66f..760d568cd 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -3,14 +3,14 @@ #CGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c -STRMMKERNEL = ../generic/trmmkernel_2x2.c +STRMMKERNEL = gemm_kernel_power6.S DTRMMKERNEL = dtrmm_kernel_16x4_power8.S CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMKERNEL = gemm_kernel_power6.S +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o diff --git a/param.h b/param.h index e7dca2c0d..31125d8e4 100644 --- a/param.h +++ b/param.h @@ -1968,8 +1968,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
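/*
 * This hunk goes with the KERNEL.POWER8 change above: SGEMMKERNEL now
 * points at gemm_kernel_power6.S with the 4-wide gemm_ncopy_4 /
 * gemm_tcopy_4 packing routines, so SGEMM_DEFAULT_UNROLL_M/N move back
 * from 2/2 to 4/4 to keep the advertised unroll factors in step with the
 * micro-tile the kernel and copy code use, and an SGEMM_DEFAULT_R
 * outer-blocking value is added alongside the existing DGEMM/ZGEMM ones.
 */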
#define GEMM_DEFAULT_OFFSET_B 1024 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 2 -#define SGEMM_DEFAULT_UNROLL_N 2 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 16 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 2 @@ -1987,6 +1987,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_Q 400 #define ZGEMM_DEFAULT_Q 360 +#define SGEMM_DEFAULT_R 28800 #define DGEMM_DEFAULT_R 14400 #define ZGEMM_DEFAULT_R 7200 From 0afc76fd652dd65611dfbf9e26c102e016f9dffb Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 4 Mar 2016 15:01:15 +0100 Subject: [PATCH 15/37] enabled gemm_beta assembly kernels --- kernel/power/KERNEL | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/power/KERNEL b/kernel/power/KERNEL index 565d1fdb0..3ec0aaa58 100644 --- a/kernel/power/KERNEL +++ b/kernel/power/KERNEL @@ -31,18 +31,18 @@ CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif ifndef SGEMM_BETA -SGEMM_BETA = ../generic/gemm_beta.c +SGEMM_BETA = gemm_beta.S endif ifndef DGEMM_BETA -DGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = gemm_beta.S endif ifndef CGEMM_BETA -CGEMM_BETA = ../generic/zgemm_beta.c +CGEMM_BETA = zgemm_beta.S endif ifndef ZGEMM_BETA -ZGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = zgemm_beta.S endif From 8f758eeff9a659534f7655376ec3e013763a6b2e Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 5 Mar 2016 08:32:03 +0800 Subject: [PATCH 16/37] Refs #786. avoid old assembly c/zgemv kernels. --- ctest/cin2 | 2 +- ctest/cin3 | 2 +- ctest/cin3_3m | 2 +- ctest/din2 | 2 +- ctest/din3 | 2 +- ctest/sin2 | 2 +- ctest/sin3 | 2 +- ctest/zin2 | 2 +- ctest/zin3 | 2 +- ctest/zin3_3m | 2 +- kernel/x86_64/KERNEL | 8 ++++---- kernel/x86_64/KERNEL.BARCELONA | 3 --- kernel/x86_64/KERNEL.BULLDOZER | 2 +- 13 files changed, 15 insertions(+), 18 deletions(-) diff --git a/ctest/cin2 b/ctest/cin2 index 032fcbb39..b2e1e4a0e 100644 --- a/ctest/cin2 +++ b/ctest/cin2 @@ -1,7 +1,7 @@ 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED) 16.0 THRESHOLD VALUE OF TEST RATIO diff --git a/ctest/cin3 b/ctest/cin3 index 223d165db..fbdb57857 100644 --- a/ctest/cin3 +++ b/ctest/cin3 @@ -1,7 +1,7 @@ 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO diff --git a/ctest/cin3_3m b/ctest/cin3_3m index 34014143e..5a797291a 100644 --- a/ctest/cin3_3m +++ b/ctest/cin3_3m @@ -1,7 +1,7 @@ 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO diff --git a/ctest/din2 b/ctest/din2 index 6f42b2792..df8f7b6ae 100644 --- a/ctest/din2 +++ b/ctest/din2 @@ -1,7 +1,7 @@ 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO diff --git a/ctest/din3 b/ctest/din3 index cbbcc22ab..23fedfe32 100644 --- a/ctest/din3 +++ b/ctest/din3 @@ -1,7 +1,7 @@ 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO diff --git a/ctest/sin2 b/ctest/sin2 index 3eee5c2f9..0e1ecd9d6 100644 --- a/ctest/sin2 +++ b/ctest/sin2 @@ -1,7 +1,7 @@ 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED) 16.0 THRESHOLD VALUE OF TEST RATIO diff --git a/ctest/sin3 b/ctest/sin3 index 01e32d6ee..644083f22 100644 --- a/ctest/sin3 +++ b/ctest/sin3 @@ -1,7 +1,7 @@ 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO diff --git a/ctest/zin2 b/ctest/zin2 index 4c0affe92..217697191 100644 --- a/ctest/zin2 +++ b/ctest/zin2 @@ -1,7 +1,7 @@ 'ZBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED) 16.0 THRESHOLD VALUE OF TEST RATIO diff --git a/ctest/zin3 b/ctest/zin3 index 70050b693..ee269e8d5 100644 --- a/ctest/zin3 +++ b/ctest/zin3 @@ -1,7 +1,7 @@ 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO diff --git a/ctest/zin3_3m b/ctest/zin3_3m index 33bf08353..a0d4fde0a 100644 --- a/ctest/zin3_3m +++ b/ctest/zin3_3m @@ -1,7 +1,7 @@ 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 56850f791..4874711bb 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -389,19 +389,19 @@ DGEMVTKERNEL = dgemv_t.S endif ifndef CGEMVNKERNEL -CGEMVNKERNEL = cgemv_n.S +CGEMVNKERNEL = cgemv_n_4.c endif ifndef CGEMVTKERNEL -CGEMVTKERNEL = ../arm/zgemv_t.c +CGEMVTKERNEL = cgemv_t_4.c endif ifndef ZGEMVNKERNEL -ZGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n_4.c endif ifndef ZGEMVTKERNEL -ZGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t_4.c endif ifndef QGEMVNKERNEL diff --git a/kernel/x86_64/KERNEL.BARCELONA b/kernel/x86_64/KERNEL.BARCELONA index 313c62d7c..70f3d6058 100644 --- a/kernel/x86_64/KERNEL.BARCELONA +++ b/kernel/x86_64/KERNEL.BARCELONA @@ -1,6 +1,3 @@ -ZGEMVNKERNEL = zgemv_n_dup.S -ZGEMVTKERNEL = zgemv_t.S - SGEMMKERNEL = gemm_kernel_8x4_barcelona.S SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index c8ccae1ea..90834d9ca 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -18,7 +18,7 @@ SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c -ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVNKERNEL = zgemv_n_4.c ZGEMVTKERNEL = zgemv_t_4.c DGEMVNKERNEL = dgemv_n_bulldozer.S From 085f215257c21b0d9724a623ab50b6c8fe568ca3 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sat, 5 Mar 2016 10:27:27 +0100 Subject: [PATCH 17/37] Modified assembly label name, so that they are hidden. Added license informations. --- kernel/power/dgemm_kernel_16x4_power8.S | 43 +- kernel/power/dgemm_logic_16x4_power8.S | 708 ++++++++++++----------- kernel/power/dgemm_macros_16x4_power8.S | 35 ++ kernel/power/dtrmm_kernel_16x4_power8.S | 43 +- kernel/power/dtrmm_logic_16x4_power8.S | 709 +++++++++++++----------- kernel/power/zgemm_kernel_8x2_power8.S | 43 +- kernel/power/zgemm_logic_8x2_power8.S | 366 ++++++------ kernel/power/zgemm_macros_8x2_power8.S | 36 ++ kernel/power/ztrmm_kernel_8x2_power8.S | 43 +- kernel/power/ztrmm_logic_8x2_power8.S | 404 ++++++++------ 10 files changed, 1375 insertions(+), 1055 deletions(-) diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S index 53205ade8..c67f31160 100644 --- a/kernel/power/dgemm_kernel_16x4_power8.S +++ b/kernel/power/dgemm_kernel_16x4_power8.S @@ -1,3 +1,38 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ @@ -218,11 +253,11 @@ cmpwi cr0, M, 0 - ble L999_H1 + ble .L999_H1 cmpwi cr0, N, 0 - ble L999_H1 + ble .L999_H1 cmpwi cr0, K, 0 - ble L999_H1 + ble .L999_H1 #ifdef __64BIT__ addi ALPHA, SP, 296 @@ -241,7 +276,7 @@ #include "dgemm_logic_16x4_power8.S" -L999: +.L999: addi r3, 0, 0 lfd f14, 0(SP) diff --git a/kernel/power/dgemm_logic_16x4_power8.S b/kernel/power/dgemm_logic_16x4_power8.S index e19f78b8d..49c438f61 100644 --- a/kernel/power/dgemm_logic_16x4_power8.S +++ b/kernel/power/dgemm_logic_16x4_power8.S @@ -1,25 +1,61 @@ - srawi. J, N, 2 - ble DGEMM_L4_END +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ -DGEMM_L4_BEGIN: +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + srawi. J, N, 2 + ble .LDGEMM_L4_END + +.LDGEMM_L4_BEGIN: mr CO, C mr AO, A slwi T1, LDC , 2 add C, C, T1 srawi. I, M, 4 - ble DGEMM_L4x16_END + ble .LDGEMM_L4x16_END -DGEMM_L4x16_BEGIN: +.LDGEMM_L4x16_BEGIN: mr BO, B srawi. L, K, 3 - ble DGEMM_L4x16_SUB0 + ble .LDGEMM_L4x16_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L4x16_SUB4 + ble .LDGEMM_L4x16_SUB4 -DGEMM_L4x16_LOOP_START: +.LDGEMM_L4x16_LOOP_START: dcbt AO, PRE LOAD4x16_1 @@ -42,11 +78,11 @@ DGEMM_L4x16_LOOP_START: KERNEL4x16_2 addic. L, L, -2 - ble DGEMM_L4x16_LOOP_END + ble .LDGEMM_L4x16_LOOP_END .align 5 -DGEMM_L4x16_LOOP: +.LDGEMM_L4x16_LOOP: dcbt AO, PRE KERNEL4x16_1 @@ -67,9 +103,9 @@ DGEMM_L4x16_LOOP: KERNEL4x16_2 addic. L, L, -1 - bgt DGEMM_L4x16_LOOP + bgt .LDGEMM_L4x16_LOOP -DGEMM_L4x16_LOOP_END: +.LDGEMM_L4x16_LOOP_END: dcbt AO, PRE KERNEL4x16_1 @@ -88,9 +124,9 @@ DGEMM_L4x16_LOOP_END: KERNEL4x16_1 KERNEL4x16_E2 - b DGEMM_L4x16_SUB1 + b .LDGEMM_L4x16_SUB1 -DGEMM_L4x16_SUB4: +.LDGEMM_L4x16_SUB4: dcbt AO, PRE KERNEL4x16_SUBI1 @@ -106,53 +142,53 @@ DGEMM_L4x16_SUB4: KERNEL4x16_SUB1 KERNEL4x16_SUB1 - b DGEMM_L4x16_SUB1 + b .LDGEMM_L4x16_SUB1 -DGEMM_L4x16_SUB0: +.LDGEMM_L4x16_SUB0: andi. L, K, 7 KERNEL4x16_SUBI1 addic. L, L, -1 - ble DGEMM_L4x16_SAVE - b DGEMM_L4x16_SUB2 + ble .LDGEMM_L4x16_SAVE + b .LDGEMM_L4x16_SUB2 -DGEMM_L4x16_SUB1: +.LDGEMM_L4x16_SUB1: andi. L, K, 7 - ble DGEMM_L4x16_SAVE + ble .LDGEMM_L4x16_SAVE -DGEMM_L4x16_SUB2: +.LDGEMM_L4x16_SUB2: KERNEL4x16_SUB1 addic. L, L, -1 - bgt DGEMM_L4x16_SUB2 + bgt .LDGEMM_L4x16_SUB2 -DGEMM_L4x16_SAVE: +.LDGEMM_L4x16_SAVE: SAVE4x16 addic. I, I, -1 - bgt DGEMM_L4x16_BEGIN + bgt .LDGEMM_L4x16_BEGIN -DGEMM_L4x16_END: +.LDGEMM_L4x16_END: -DGEMM_L4x8_BEGIN: +.LDGEMM_L4x8_BEGIN: andi. T2, M, 15 - ble DGEMM_L4x1_END + ble .LDGEMM_L4x1_END andi. T1, M, 8 - ble DGEMM_L4x8_END + ble .LDGEMM_L4x8_END mr BO, B srawi. L, K, 3 - ble DGEMM_L4x8_SUB0 + ble .LDGEMM_L4x8_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L4x8_SUB4 + ble .LDGEMM_L4x8_SUB4 -DGEMM_L4x8_LOOP_START: +.LDGEMM_L4x8_LOOP_START: LOAD4x8_1 KERNEL4x8_I1 @@ -166,11 +202,11 @@ DGEMM_L4x8_LOOP_START: KERNEL4x8_2 addic. L, L, -2 - ble DGEMM_L4x8_LOOP_END + ble .LDGEMM_L4x8_LOOP_END .align 5 -DGEMM_L4x8_LOOP: +.LDGEMM_L4x8_LOOP: KERNEL4x8_1 KERNEL4x8_2 @@ -183,9 +219,9 @@ DGEMM_L4x8_LOOP: KERNEL4x8_2 addic. L, L, -1 - bgt DGEMM_L4x8_LOOP + bgt .LDGEMM_L4x8_LOOP -DGEMM_L4x8_LOOP_END: +.LDGEMM_L4x8_LOOP_END: KERNEL4x8_1 KERNEL4x8_2 @@ -197,9 +233,9 @@ DGEMM_L4x8_LOOP_END: KERNEL4x8_1 KERNEL4x8_E2 - b DGEMM_L4x8_SUB1 + b .LDGEMM_L4x8_SUB1 -DGEMM_L4x8_SUB4: +.LDGEMM_L4x8_SUB4: KERNEL4x8_SUBI1 KERNEL4x8_SUB1 @@ -211,48 +247,48 @@ DGEMM_L4x8_SUB4: KERNEL4x8_SUB1 KERNEL4x8_SUB1 - b DGEMM_L4x8_SUB1 + b .LDGEMM_L4x8_SUB1 -DGEMM_L4x8_SUB0: +.LDGEMM_L4x8_SUB0: andi. L, K, 7 KERNEL4x8_SUBI1 addic. L, L, -1 - ble DGEMM_L4x8_SAVE - b DGEMM_L4x8_SUB2 + ble .LDGEMM_L4x8_SAVE + b .LDGEMM_L4x8_SUB2 -DGEMM_L4x8_SUB1: +.LDGEMM_L4x8_SUB1: andi. L, K, 7 - ble DGEMM_L4x8_SAVE + ble .LDGEMM_L4x8_SAVE -DGEMM_L4x8_SUB2: +.LDGEMM_L4x8_SUB2: KERNEL4x8_SUB1 addic. 
L, L, -1 - bgt DGEMM_L4x8_SUB2 + bgt .LDGEMM_L4x8_SUB2 -DGEMM_L4x8_SAVE: +.LDGEMM_L4x8_SAVE: SAVE4x8 -DGEMM_L4x8_END: +.LDGEMM_L4x8_END: -DGEMM_L4x4_BEGIN: +.LDGEMM_L4x4_BEGIN: andi. T1, M, 4 - ble DGEMM_L4x4_END + ble .LDGEMM_L4x4_END mr BO, B srawi. L, K, 3 - ble DGEMM_L4x4_SUB0 + ble .LDGEMM_L4x4_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L4x4_SUB4 + ble .LDGEMM_L4x4_SUB4 -DGEMM_L4x4_LOOP_START: +.LDGEMM_L4x4_LOOP_START: LOAD4x4_1 KERNEL4x4_I1 @@ -266,11 +302,11 @@ DGEMM_L4x4_LOOP_START: KERNEL4x4_2 addic. L, L, -2 - ble DGEMM_L4x4_LOOP_END + ble .LDGEMM_L4x4_LOOP_END .align 5 -DGEMM_L4x4_LOOP: +.LDGEMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 @@ -283,9 +319,9 @@ DGEMM_L4x4_LOOP: KERNEL4x4_2 addic. L, L, -1 - bgt DGEMM_L4x4_LOOP + bgt .LDGEMM_L4x4_LOOP -DGEMM_L4x4_LOOP_END: +.LDGEMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_2 @@ -297,9 +333,9 @@ DGEMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_E2 - b DGEMM_L4x4_SUB1 + b .LDGEMM_L4x4_SUB1 -DGEMM_L4x4_SUB4: +.LDGEMM_L4x4_SUB4: KERNEL4x4_SUBI1 KERNEL4x4_SUB1 @@ -311,48 +347,48 @@ DGEMM_L4x4_SUB4: KERNEL4x4_SUB1 KERNEL4x4_SUB1 - b DGEMM_L4x4_SUB1 + b .LDGEMM_L4x4_SUB1 -DGEMM_L4x4_SUB0: +.LDGEMM_L4x4_SUB0: andi. L, K, 7 KERNEL4x4_SUBI1 addic. L, L, -1 - ble DGEMM_L4x4_SAVE - b DGEMM_L4x4_SUB2 + ble .LDGEMM_L4x4_SAVE + b .LDGEMM_L4x4_SUB2 -DGEMM_L4x4_SUB1: +.LDGEMM_L4x4_SUB1: andi. L, K, 7 - ble DGEMM_L4x4_SAVE + ble .LDGEMM_L4x4_SAVE -DGEMM_L4x4_SUB2: +.LDGEMM_L4x4_SUB2: KERNEL4x4_SUB1 addic. L, L, -1 - bgt DGEMM_L4x4_SUB2 + bgt .LDGEMM_L4x4_SUB2 -DGEMM_L4x4_SAVE: +.LDGEMM_L4x4_SAVE: SAVE4x4 -DGEMM_L4x4_END: +.LDGEMM_L4x4_END: -DGEMM_L4x2_BEGIN: +.LDGEMM_L4x2_BEGIN: andi. T1, M, 2 - ble DGEMM_L4x2_END + ble .LDGEMM_L4x2_END mr BO, B srawi. L, K, 3 - ble DGEMM_L4x2_SUB0 + ble .LDGEMM_L4x2_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L4x2_SUB4 + ble .LDGEMM_L4x2_SUB4 -DGEMM_L4x2_LOOP_START: +.LDGEMM_L4x2_LOOP_START: LOAD4x2_1 KERNEL4x2_I1 @@ -366,11 +402,11 @@ DGEMM_L4x2_LOOP_START: KERNEL4x2_2 addic. L, L, -2 - ble DGEMM_L4x2_LOOP_END + ble .LDGEMM_L4x2_LOOP_END .align 5 -DGEMM_L4x2_LOOP: +.LDGEMM_L4x2_LOOP: KERNEL4x2_1 KERNEL4x2_2 @@ -383,9 +419,9 @@ DGEMM_L4x2_LOOP: KERNEL4x2_2 addic. L, L, -1 - bgt DGEMM_L4x2_LOOP + bgt .LDGEMM_L4x2_LOOP -DGEMM_L4x2_LOOP_END: +.LDGEMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_2 @@ -397,9 +433,9 @@ DGEMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_E2 - b DGEMM_L4x2_SUB1 + b .LDGEMM_L4x2_SUB1 -DGEMM_L4x2_SUB4: +.LDGEMM_L4x2_SUB4: KERNEL4x2_SUBI1 KERNEL4x2_SUB1 @@ -411,48 +447,48 @@ DGEMM_L4x2_SUB4: KERNEL4x2_SUB1 KERNEL4x2_SUB1 - b DGEMM_L4x2_SUB1 + b .LDGEMM_L4x2_SUB1 -DGEMM_L4x2_SUB0: +.LDGEMM_L4x2_SUB0: andi. L, K, 7 KERNEL4x2_SUBI1 addic. L, L, -1 - ble DGEMM_L4x2_SAVE - b DGEMM_L4x2_SUB2 + ble .LDGEMM_L4x2_SAVE + b .LDGEMM_L4x2_SUB2 -DGEMM_L4x2_SUB1: +.LDGEMM_L4x2_SUB1: andi. L, K, 7 - ble DGEMM_L4x2_SAVE + ble .LDGEMM_L4x2_SAVE -DGEMM_L4x2_SUB2: +.LDGEMM_L4x2_SUB2: KERNEL4x2_SUB1 addic. L, L, -1 - bgt DGEMM_L4x2_SUB2 + bgt .LDGEMM_L4x2_SUB2 -DGEMM_L4x2_SAVE: +.LDGEMM_L4x2_SAVE: SAVE4x2 -DGEMM_L4x2_END: +.LDGEMM_L4x2_END: -DGEMM_L4x1_BEGIN: +.LDGEMM_L4x1_BEGIN: andi. T1, M, 1 - ble DGEMM_L4x1_END + ble .LDGEMM_L4x1_END mr BO, B srawi. L, K, 3 - ble DGEMM_L4x1_SUB0 + ble .LDGEMM_L4x1_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L4x1_SUB4 + ble .LDGEMM_L4x1_SUB4 -DGEMM_L4x1_LOOP_START: +.LDGEMM_L4x1_LOOP_START: LOAD4x1_1 KERNEL4x1_I1 @@ -466,11 +502,11 @@ DGEMM_L4x1_LOOP_START: KERNEL4x1_2 addic. 
L, L, -2 - ble DGEMM_L4x1_LOOP_END + ble .LDGEMM_L4x1_LOOP_END .align 5 -DGEMM_L4x1_LOOP: +.LDGEMM_L4x1_LOOP: KERNEL4x1_1 KERNEL4x1_2 @@ -483,9 +519,9 @@ DGEMM_L4x1_LOOP: KERNEL4x1_2 addic. L, L, -1 - bgt DGEMM_L4x1_LOOP + bgt .LDGEMM_L4x1_LOOP -DGEMM_L4x1_LOOP_END: +.LDGEMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_2 @@ -497,9 +533,9 @@ DGEMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_E2 - b DGEMM_L4x1_SUB1 + b .LDGEMM_L4x1_SUB1 -DGEMM_L4x1_SUB4: +.LDGEMM_L4x1_SUB4: KERNEL4x1_SUBI1 KERNEL4x1_SUB1 @@ -511,74 +547,74 @@ DGEMM_L4x1_SUB4: KERNEL4x1_SUB1 KERNEL4x1_SUB1 - b DGEMM_L4x1_SUB1 + b .LDGEMM_L4x1_SUB1 -DGEMM_L4x1_SUB0: +.LDGEMM_L4x1_SUB0: andi. L, K, 7 KERNEL4x1_SUBI1 addic. L, L, -1 - ble DGEMM_L4x1_SAVE - b DGEMM_L4x1_SUB2 + ble .LDGEMM_L4x1_SAVE + b .LDGEMM_L4x1_SUB2 -DGEMM_L4x1_SUB1: +.LDGEMM_L4x1_SUB1: andi. L, K, 7 - ble DGEMM_L4x1_SAVE + ble .LDGEMM_L4x1_SAVE -DGEMM_L4x1_SUB2: +.LDGEMM_L4x1_SUB2: KERNEL4x1_SUB1 addic. L, L, -1 - bgt DGEMM_L4x1_SUB2 + bgt .LDGEMM_L4x1_SUB2 -DGEMM_L4x1_SAVE: +.LDGEMM_L4x1_SAVE: SAVE4x1 -DGEMM_L4x1_END: +.LDGEMM_L4x1_END: slwi T1, K, 5 add B, B, T1 addic. J, J, -1 - bgt DGEMM_L4_BEGIN + bgt .LDGEMM_L4_BEGIN andi. T2, N, 3 - ble L999 + ble .L999 -DGEMM_L4_END: +.LDGEMM_L4_END: - b DGEMM_L2_BEGIN + b .LDGEMM_L2_BEGIN -L999_H1: +.L999_H1: - b L999 + b .L999 -DGEMM_L2_BEGIN: +.LDGEMM_L2_BEGIN: andi. T1, N, 2 - ble DGEMM_L2_END + ble .LDGEMM_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 srawi. I, M, 4 - ble DGEMM_L2x16_END + ble .LDGEMM_L2x16_END -DGEMM_L2x16_BEGIN: +.LDGEMM_L2x16_BEGIN: mr BO, B srawi. L, K, 3 - ble DGEMM_L2x16_SUB0 + ble .LDGEMM_L2x16_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L2x16_SUB4 + ble .LDGEMM_L2x16_SUB4 -DGEMM_L2x16_LOOP_START: +.LDGEMM_L2x16_LOOP_START: dcbt AO, PRE LOAD2x16_1 @@ -601,11 +637,11 @@ DGEMM_L2x16_LOOP_START: KERNEL2x16_2 addic. L, L, -2 - ble DGEMM_L2x16_LOOP_END + ble .LDGEMM_L2x16_LOOP_END .align 5 -DGEMM_L2x16_LOOP: +.LDGEMM_L2x16_LOOP: dcbt AO, PRE KERNEL2x16_1 @@ -626,9 +662,9 @@ DGEMM_L2x16_LOOP: KERNEL2x16_2 addic. L, L, -1 - bgt DGEMM_L2x16_LOOP + bgt .LDGEMM_L2x16_LOOP -DGEMM_L2x16_LOOP_END: +.LDGEMM_L2x16_LOOP_END: dcbt AO, PRE KERNEL2x16_1 @@ -647,9 +683,9 @@ DGEMM_L2x16_LOOP_END: KERNEL2x16_1 KERNEL2x16_E2 - b DGEMM_L2x16_SUB1 + b .LDGEMM_L2x16_SUB1 -DGEMM_L2x16_SUB4: +.LDGEMM_L2x16_SUB4: dcbt AO, PRE KERNEL2x16_SUBI1 @@ -665,53 +701,53 @@ DGEMM_L2x16_SUB4: KERNEL2x16_SUB1 KERNEL2x16_SUB1 - b DGEMM_L2x16_SUB1 + b .LDGEMM_L2x16_SUB1 -DGEMM_L2x16_SUB0: +.LDGEMM_L2x16_SUB0: andi. L, K, 7 KERNEL2x16_SUBI1 addic. L, L, -1 - ble DGEMM_L2x16_SAVE - b DGEMM_L2x16_SUB2 + ble .LDGEMM_L2x16_SAVE + b .LDGEMM_L2x16_SUB2 -DGEMM_L2x16_SUB1: +.LDGEMM_L2x16_SUB1: andi. L, K, 7 - ble DGEMM_L2x16_SAVE + ble .LDGEMM_L2x16_SAVE -DGEMM_L2x16_SUB2: +.LDGEMM_L2x16_SUB2: KERNEL2x16_SUB1 addic. L, L, -1 - bgt DGEMM_L2x16_SUB2 + bgt .LDGEMM_L2x16_SUB2 -DGEMM_L2x16_SAVE: +.LDGEMM_L2x16_SAVE: SAVE2x16 addic. I, I, -1 - bgt DGEMM_L2x16_BEGIN + bgt .LDGEMM_L2x16_BEGIN -DGEMM_L2x16_END: +.LDGEMM_L2x16_END: -DGEMM_L2x8_BEGIN: +.LDGEMM_L2x8_BEGIN: andi. T2, M, 15 - ble DGEMM_L2x1_END + ble .LDGEMM_L2x1_END andi. T1, M, 8 - ble DGEMM_L2x8_END + ble .LDGEMM_L2x8_END mr BO, B srawi. L, K, 3 - ble DGEMM_L2x8_SUB0 + ble .LDGEMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L2x8_SUB4 + ble .LDGEMM_L2x8_SUB4 -DGEMM_L2x8_LOOP_START: +.LDGEMM_L2x8_LOOP_START: LOAD2x8_1 KERNEL2x8_I1 @@ -725,11 +761,11 @@ DGEMM_L2x8_LOOP_START: KERNEL2x8_2 addic. 
L, L, -2 - ble DGEMM_L2x8_LOOP_END + ble .LDGEMM_L2x8_LOOP_END .align 5 -DGEMM_L2x8_LOOP: +.LDGEMM_L2x8_LOOP: KERNEL2x8_1 KERNEL2x8_2 @@ -742,9 +778,9 @@ DGEMM_L2x8_LOOP: KERNEL2x8_2 addic. L, L, -1 - bgt DGEMM_L2x8_LOOP + bgt .LDGEMM_L2x8_LOOP -DGEMM_L2x8_LOOP_END: +.LDGEMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_2 @@ -756,9 +792,9 @@ DGEMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_E2 - b DGEMM_L2x8_SUB1 + b .LDGEMM_L2x8_SUB1 -DGEMM_L2x8_SUB4: +.LDGEMM_L2x8_SUB4: KERNEL2x8_SUBI1 KERNEL2x8_SUB1 @@ -770,48 +806,48 @@ DGEMM_L2x8_SUB4: KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b DGEMM_L2x8_SUB1 + b .LDGEMM_L2x8_SUB1 -DGEMM_L2x8_SUB0: +.LDGEMM_L2x8_SUB0: andi. L, K, 7 KERNEL2x8_SUBI1 addic. L, L, -1 - ble DGEMM_L2x8_SAVE - b DGEMM_L2x8_SUB2 + ble .LDGEMM_L2x8_SAVE + b .LDGEMM_L2x8_SUB2 -DGEMM_L2x8_SUB1: +.LDGEMM_L2x8_SUB1: andi. L, K, 7 - ble DGEMM_L2x8_SAVE + ble .LDGEMM_L2x8_SAVE -DGEMM_L2x8_SUB2: +.LDGEMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 - bgt DGEMM_L2x8_SUB2 + bgt .LDGEMM_L2x8_SUB2 -DGEMM_L2x8_SAVE: +.LDGEMM_L2x8_SAVE: SAVE2x8 -DGEMM_L2x8_END: +.LDGEMM_L2x8_END: -DGEMM_L2x4_BEGIN: +.LDGEMM_L2x4_BEGIN: andi. T1, M, 4 - ble DGEMM_L2x4_END + ble .LDGEMM_L2x4_END mr BO, B srawi. L, K, 3 - ble DGEMM_L2x4_SUB0 + ble .LDGEMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L2x4_SUB4 + ble .LDGEMM_L2x4_SUB4 -DGEMM_L2x4_LOOP_START: +.LDGEMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -825,11 +861,11 @@ DGEMM_L2x4_LOOP_START: KERNEL2x4_2 addic. L, L, -2 - ble DGEMM_L2x4_LOOP_END + ble .LDGEMM_L2x4_LOOP_END .align 5 -DGEMM_L2x4_LOOP: +.LDGEMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -842,9 +878,9 @@ DGEMM_L2x4_LOOP: KERNEL2x4_2 addic. L, L, -1 - bgt DGEMM_L2x4_LOOP + bgt .LDGEMM_L2x4_LOOP -DGEMM_L2x4_LOOP_END: +.LDGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -856,9 +892,9 @@ DGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_E2 - b DGEMM_L2x4_SUB1 + b .LDGEMM_L2x4_SUB1 -DGEMM_L2x4_SUB4: +.LDGEMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -870,48 +906,48 @@ DGEMM_L2x4_SUB4: KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b DGEMM_L2x4_SUB1 + b .LDGEMM_L2x4_SUB1 -DGEMM_L2x4_SUB0: +.LDGEMM_L2x4_SUB0: andi. L, K, 7 KERNEL2x4_SUBI1 addic. L, L, -1 - ble DGEMM_L2x4_SAVE - b DGEMM_L2x4_SUB2 + ble .LDGEMM_L2x4_SAVE + b .LDGEMM_L2x4_SUB2 -DGEMM_L2x4_SUB1: +.LDGEMM_L2x4_SUB1: andi. L, K, 7 - ble DGEMM_L2x4_SAVE + ble .LDGEMM_L2x4_SAVE -DGEMM_L2x4_SUB2: +.LDGEMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt DGEMM_L2x4_SUB2 + bgt .LDGEMM_L2x4_SUB2 -DGEMM_L2x4_SAVE: +.LDGEMM_L2x4_SAVE: SAVE2x4 -DGEMM_L2x4_END: +.LDGEMM_L2x4_END: -DGEMM_L2x2_BEGIN: +.LDGEMM_L2x2_BEGIN: andi. T1, M, 2 - ble DGEMM_L2x2_END + ble .LDGEMM_L2x2_END mr BO, B srawi. L, K, 3 - ble DGEMM_L2x2_SUB0 + ble .LDGEMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L2x2_SUB4 + ble .LDGEMM_L2x2_SUB4 -DGEMM_L2x2_LOOP_START: +.LDGEMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -925,11 +961,11 @@ DGEMM_L2x2_LOOP_START: KERNEL2x2_2 addic. L, L, -2 - ble DGEMM_L2x2_LOOP_END + ble .LDGEMM_L2x2_LOOP_END .align 5 -DGEMM_L2x2_LOOP: +.LDGEMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -942,9 +978,9 @@ DGEMM_L2x2_LOOP: KERNEL2x2_2 addic. L, L, -1 - bgt DGEMM_L2x2_LOOP + bgt .LDGEMM_L2x2_LOOP -DGEMM_L2x2_LOOP_END: +.LDGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -956,9 +992,9 @@ DGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_E2 - b DGEMM_L2x2_SUB1 + b .LDGEMM_L2x2_SUB1 -DGEMM_L2x2_SUB4: +.LDGEMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -970,48 +1006,48 @@ DGEMM_L2x2_SUB4: KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b DGEMM_L2x2_SUB1 + b .LDGEMM_L2x2_SUB1 -DGEMM_L2x2_SUB0: +.LDGEMM_L2x2_SUB0: andi. L, K, 7 KERNEL2x2_SUBI1 addic. 
L, L, -1 - ble DGEMM_L2x2_SAVE - b DGEMM_L2x2_SUB2 + ble .LDGEMM_L2x2_SAVE + b .LDGEMM_L2x2_SUB2 -DGEMM_L2x2_SUB1: +.LDGEMM_L2x2_SUB1: andi. L, K, 7 - ble DGEMM_L2x2_SAVE + ble .LDGEMM_L2x2_SAVE -DGEMM_L2x2_SUB2: +.LDGEMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt DGEMM_L2x2_SUB2 + bgt .LDGEMM_L2x2_SUB2 -DGEMM_L2x2_SAVE: +.LDGEMM_L2x2_SAVE: SAVE2x2 -DGEMM_L2x2_END: +.LDGEMM_L2x2_END: -DGEMM_L2x1_BEGIN: +.LDGEMM_L2x1_BEGIN: andi. T1, M, 1 - ble DGEMM_L2x1_END + ble .LDGEMM_L2x1_END mr BO, B srawi. L, K, 3 - ble DGEMM_L2x1_SUB0 + ble .LDGEMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L2x1_SUB4 + ble .LDGEMM_L2x1_SUB4 -DGEMM_L2x1_LOOP_START: +.LDGEMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -1025,11 +1061,11 @@ DGEMM_L2x1_LOOP_START: KERNEL2x1_2 addic. L, L, -2 - ble DGEMM_L2x1_LOOP_END + ble .LDGEMM_L2x1_LOOP_END .align 5 -DGEMM_L2x1_LOOP: +.LDGEMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -1042,9 +1078,9 @@ DGEMM_L2x1_LOOP: KERNEL2x1_2 addic. L, L, -1 - bgt DGEMM_L2x1_LOOP + bgt .LDGEMM_L2x1_LOOP -DGEMM_L2x1_LOOP_END: +.LDGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -1056,9 +1092,9 @@ DGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_E2 - b DGEMM_L2x1_SUB1 + b .LDGEMM_L2x1_SUB1 -DGEMM_L2x1_SUB4: +.LDGEMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -1070,59 +1106,59 @@ DGEMM_L2x1_SUB4: KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b DGEMM_L2x1_SUB1 + b .LDGEMM_L2x1_SUB1 -DGEMM_L2x1_SUB0: +.LDGEMM_L2x1_SUB0: andi. L, K, 7 KERNEL2x1_SUBI1 addic. L, L, -1 - ble DGEMM_L2x1_SAVE - b DGEMM_L2x1_SUB2 + ble .LDGEMM_L2x1_SAVE + b .LDGEMM_L2x1_SUB2 -DGEMM_L2x1_SUB1: +.LDGEMM_L2x1_SUB1: andi. L, K, 7 - ble DGEMM_L2x1_SAVE + ble .LDGEMM_L2x1_SAVE -DGEMM_L2x1_SUB2: +.LDGEMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 - bgt DGEMM_L2x1_SUB2 + bgt .LDGEMM_L2x1_SUB2 -DGEMM_L2x1_SAVE: +.LDGEMM_L2x1_SAVE: SAVE2x1 -DGEMM_L2x1_END: +.LDGEMM_L2x1_END: slwi T1, K, 4 add B, B, T1 -DGEMM_L2_END: -DGEMM_L1_BEGIN: +.LDGEMM_L2_END: +.LDGEMM_L1_BEGIN: andi. T1, N, 1 - ble DGEMM_L1_END + ble .LDGEMM_L1_END mr CO, C mr AO, A srawi. I, M, 4 - ble DGEMM_L1x16_END + ble .LDGEMM_L1x16_END -DGEMM_L1x16_BEGIN: +.LDGEMM_L1x16_BEGIN: mr BO, B srawi. L, K, 3 - ble DGEMM_L1x16_SUB0 + ble .LDGEMM_L1x16_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L1x16_SUB4 + ble .LDGEMM_L1x16_SUB4 -DGEMM_L1x16_LOOP_START: +.LDGEMM_L1x16_LOOP_START: dcbt AO, PRE LOAD1x16_1 @@ -1145,11 +1181,11 @@ DGEMM_L1x16_LOOP_START: KERNEL1x16_2 addic. L, L, -2 - ble DGEMM_L1x16_LOOP_END + ble .LDGEMM_L1x16_LOOP_END .align 5 -DGEMM_L1x16_LOOP: +.LDGEMM_L1x16_LOOP: dcbt AO, PRE KERNEL1x16_1 @@ -1170,9 +1206,9 @@ DGEMM_L1x16_LOOP: KERNEL1x16_2 addic. L, L, -1 - bgt DGEMM_L1x16_LOOP + bgt .LDGEMM_L1x16_LOOP -DGEMM_L1x16_LOOP_END: +.LDGEMM_L1x16_LOOP_END: dcbt AO, PRE KERNEL1x16_1 @@ -1191,9 +1227,9 @@ DGEMM_L1x16_LOOP_END: KERNEL1x16_1 KERNEL1x16_E2 - b DGEMM_L1x16_SUB1 + b .LDGEMM_L1x16_SUB1 -DGEMM_L1x16_SUB4: +.LDGEMM_L1x16_SUB4: dcbt AO, PRE KERNEL1x16_SUBI1 @@ -1209,53 +1245,53 @@ DGEMM_L1x16_SUB4: KERNEL1x16_SUB1 KERNEL1x16_SUB1 - b DGEMM_L1x16_SUB1 + b .LDGEMM_L1x16_SUB1 -DGEMM_L1x16_SUB0: +.LDGEMM_L1x16_SUB0: andi. L, K, 7 KERNEL1x16_SUBI1 addic. L, L, -1 - ble DGEMM_L1x16_SAVE - b DGEMM_L1x16_SUB2 + ble .LDGEMM_L1x16_SAVE + b .LDGEMM_L1x16_SUB2 -DGEMM_L1x16_SUB1: +.LDGEMM_L1x16_SUB1: andi. L, K, 7 - ble DGEMM_L1x16_SAVE + ble .LDGEMM_L1x16_SAVE -DGEMM_L1x16_SUB2: +.LDGEMM_L1x16_SUB2: KERNEL1x16_SUB1 addic. L, L, -1 - bgt DGEMM_L1x16_SUB2 + bgt .LDGEMM_L1x16_SUB2 -DGEMM_L1x16_SAVE: +.LDGEMM_L1x16_SAVE: SAVE1x16 addic. 
I, I, -1 - bgt DGEMM_L1x16_BEGIN + bgt .LDGEMM_L1x16_BEGIN -DGEMM_L1x16_END: +.LDGEMM_L1x16_END: -DGEMM_L1x8_BEGIN: +.LDGEMM_L1x8_BEGIN: andi. T2, M, 15 - ble DGEMM_L1x1_END + ble .LDGEMM_L1x1_END andi. T1, M, 8 - ble DGEMM_L1x8_END + ble .LDGEMM_L1x8_END mr BO, B srawi. L, K, 3 - ble DGEMM_L1x8_SUB0 + ble .LDGEMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L1x8_SUB4 + ble .LDGEMM_L1x8_SUB4 -DGEMM_L1x8_LOOP_START: +.LDGEMM_L1x8_LOOP_START: LOAD1x8_1 KERNEL1x8_I1 @@ -1269,11 +1305,11 @@ DGEMM_L1x8_LOOP_START: KERNEL1x8_2 addic. L, L, -2 - ble DGEMM_L1x8_LOOP_END + ble .LDGEMM_L1x8_LOOP_END .align 5 -DGEMM_L1x8_LOOP: +.LDGEMM_L1x8_LOOP: KERNEL1x8_1 KERNEL1x8_2 @@ -1286,9 +1322,9 @@ DGEMM_L1x8_LOOP: KERNEL1x8_2 addic. L, L, -1 - bgt DGEMM_L1x8_LOOP + bgt .LDGEMM_L1x8_LOOP -DGEMM_L1x8_LOOP_END: +.LDGEMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_2 @@ -1300,9 +1336,9 @@ DGEMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_E2 - b DGEMM_L1x8_SUB1 + b .LDGEMM_L1x8_SUB1 -DGEMM_L1x8_SUB4: +.LDGEMM_L1x8_SUB4: KERNEL1x8_SUBI1 KERNEL1x8_SUB1 @@ -1314,48 +1350,48 @@ DGEMM_L1x8_SUB4: KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b DGEMM_L1x8_SUB1 + b .LDGEMM_L1x8_SUB1 -DGEMM_L1x8_SUB0: +.LDGEMM_L1x8_SUB0: andi. L, K, 7 KERNEL1x8_SUBI1 addic. L, L, -1 - ble DGEMM_L1x8_SAVE - b DGEMM_L1x8_SUB2 + ble .LDGEMM_L1x8_SAVE + b .LDGEMM_L1x8_SUB2 -DGEMM_L1x8_SUB1: +.LDGEMM_L1x8_SUB1: andi. L, K, 7 - ble DGEMM_L1x8_SAVE + ble .LDGEMM_L1x8_SAVE -DGEMM_L1x8_SUB2: +.LDGEMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 - bgt DGEMM_L1x8_SUB2 + bgt .LDGEMM_L1x8_SUB2 -DGEMM_L1x8_SAVE: +.LDGEMM_L1x8_SAVE: SAVE1x8 -DGEMM_L1x8_END: +.LDGEMM_L1x8_END: -DGEMM_L1x4_BEGIN: +.LDGEMM_L1x4_BEGIN: andi. T1, M, 4 - ble DGEMM_L1x4_END + ble .LDGEMM_L1x4_END mr BO, B srawi. L, K, 3 - ble DGEMM_L1x4_SUB0 + ble .LDGEMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L1x4_SUB4 + ble .LDGEMM_L1x4_SUB4 -DGEMM_L1x4_LOOP_START: +.LDGEMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -1369,11 +1405,11 @@ DGEMM_L1x4_LOOP_START: KERNEL1x4_2 addic. L, L, -2 - ble DGEMM_L1x4_LOOP_END + ble .LDGEMM_L1x4_LOOP_END .align 5 -DGEMM_L1x4_LOOP: +.LDGEMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -1386,9 +1422,9 @@ DGEMM_L1x4_LOOP: KERNEL1x4_2 addic. L, L, -1 - bgt DGEMM_L1x4_LOOP + bgt .LDGEMM_L1x4_LOOP -DGEMM_L1x4_LOOP_END: +.LDGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -1400,9 +1436,9 @@ DGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_E2 - b DGEMM_L1x4_SUB1 + b .LDGEMM_L1x4_SUB1 -DGEMM_L1x4_SUB4: +.LDGEMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -1414,48 +1450,48 @@ DGEMM_L1x4_SUB4: KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b DGEMM_L1x4_SUB1 + b .LDGEMM_L1x4_SUB1 -DGEMM_L1x4_SUB0: +.LDGEMM_L1x4_SUB0: andi. L, K, 7 KERNEL1x4_SUBI1 addic. L, L, -1 - ble DGEMM_L1x4_SAVE - b DGEMM_L1x4_SUB2 + ble .LDGEMM_L1x4_SAVE + b .LDGEMM_L1x4_SUB2 -DGEMM_L1x4_SUB1: +.LDGEMM_L1x4_SUB1: andi. L, K, 7 - ble DGEMM_L1x4_SAVE + ble .LDGEMM_L1x4_SAVE -DGEMM_L1x4_SUB2: +.LDGEMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt DGEMM_L1x4_SUB2 + bgt .LDGEMM_L1x4_SUB2 -DGEMM_L1x4_SAVE: +.LDGEMM_L1x4_SAVE: SAVE1x4 -DGEMM_L1x4_END: +.LDGEMM_L1x4_END: -DGEMM_L1x2_BEGIN: +.LDGEMM_L1x2_BEGIN: andi. T1, M, 2 - ble DGEMM_L1x2_END + ble .LDGEMM_L1x2_END mr BO, B srawi. L, K, 3 - ble DGEMM_L1x2_SUB0 + ble .LDGEMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L1x2_SUB4 + ble .LDGEMM_L1x2_SUB4 -DGEMM_L1x2_LOOP_START: +.LDGEMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -1469,11 +1505,11 @@ DGEMM_L1x2_LOOP_START: KERNEL1x2_2 addic. 
L, L, -2 - ble DGEMM_L1x2_LOOP_END + ble .LDGEMM_L1x2_LOOP_END .align 5 -DGEMM_L1x2_LOOP: +.LDGEMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -1486,9 +1522,9 @@ DGEMM_L1x2_LOOP: KERNEL1x2_2 addic. L, L, -1 - bgt DGEMM_L1x2_LOOP + bgt .LDGEMM_L1x2_LOOP -DGEMM_L1x2_LOOP_END: +.LDGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -1500,9 +1536,9 @@ DGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_E2 - b DGEMM_L1x2_SUB1 + b .LDGEMM_L1x2_SUB1 -DGEMM_L1x2_SUB4: +.LDGEMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -1514,48 +1550,48 @@ DGEMM_L1x2_SUB4: KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b DGEMM_L1x2_SUB1 + b .LDGEMM_L1x2_SUB1 -DGEMM_L1x2_SUB0: +.LDGEMM_L1x2_SUB0: andi. L, K, 7 KERNEL1x2_SUBI1 addic. L, L, -1 - ble DGEMM_L1x2_SAVE - b DGEMM_L1x2_SUB2 + ble .LDGEMM_L1x2_SAVE + b .LDGEMM_L1x2_SUB2 -DGEMM_L1x2_SUB1: +.LDGEMM_L1x2_SUB1: andi. L, K, 7 - ble DGEMM_L1x2_SAVE + ble .LDGEMM_L1x2_SAVE -DGEMM_L1x2_SUB2: +.LDGEMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt DGEMM_L1x2_SUB2 + bgt .LDGEMM_L1x2_SUB2 -DGEMM_L1x2_SAVE: +.LDGEMM_L1x2_SAVE: SAVE1x2 -DGEMM_L1x2_END: +.LDGEMM_L1x2_END: -DGEMM_L1x1_BEGIN: +.LDGEMM_L1x1_BEGIN: andi. T1, M, 1 - ble DGEMM_L1x1_END + ble .LDGEMM_L1x1_END mr BO, B srawi. L, K, 3 - ble DGEMM_L1x1_SUB0 + ble .LDGEMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L1x1_SUB4 + ble .LDGEMM_L1x1_SUB4 -DGEMM_L1x1_LOOP_START: +.LDGEMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -1569,11 +1605,11 @@ DGEMM_L1x1_LOOP_START: KERNEL1x1_2 addic. L, L, -2 - ble DGEMM_L1x1_LOOP_END + ble .LDGEMM_L1x1_LOOP_END .align 5 -DGEMM_L1x1_LOOP: +.LDGEMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -1586,9 +1622,9 @@ DGEMM_L1x1_LOOP: KERNEL1x1_2 addic. L, L, -1 - bgt DGEMM_L1x1_LOOP + bgt .LDGEMM_L1x1_LOOP -DGEMM_L1x1_LOOP_END: +.LDGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -1600,9 +1636,9 @@ DGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_E2 - b DGEMM_L1x1_SUB1 + b .LDGEMM_L1x1_SUB1 -DGEMM_L1x1_SUB4: +.LDGEMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -1614,34 +1650,34 @@ DGEMM_L1x1_SUB4: KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b DGEMM_L1x1_SUB1 + b .LDGEMM_L1x1_SUB1 -DGEMM_L1x1_SUB0: +.LDGEMM_L1x1_SUB0: andi. L, K, 7 KERNEL1x1_SUBI1 addic. L, L, -1 - ble DGEMM_L1x1_SAVE - b DGEMM_L1x1_SUB2 + ble .LDGEMM_L1x1_SAVE + b .LDGEMM_L1x1_SUB2 -DGEMM_L1x1_SUB1: +.LDGEMM_L1x1_SUB1: andi. L, K, 7 - ble DGEMM_L1x1_SAVE + ble .LDGEMM_L1x1_SAVE -DGEMM_L1x1_SUB2: +.LDGEMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 - bgt DGEMM_L1x1_SUB2 + bgt .LDGEMM_L1x1_SUB2 -DGEMM_L1x1_SAVE: +.LDGEMM_L1x1_SAVE: SAVE1x1 -DGEMM_L1x1_END: +.LDGEMM_L1x1_END: -DGEMM_L1_END: +.LDGEMM_L1_END: diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S index d4090985b..27c05e08e 100644 --- a/kernel/power/dgemm_macros_16x4_power8.S +++ b/kernel/power/dgemm_macros_16x4_power8.S @@ -1,3 +1,38 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + /********************************************************************* * Macros for N=4, M=16 * *********************************************************************/ diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S index c892c65d3..2294128a2 100644 --- a/kernel/power/dtrmm_kernel_16x4_power8.S +++ b/kernel/power/dtrmm_kernel_16x4_power8.S @@ -1,3 +1,38 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ @@ -228,11 +263,11 @@ #endif cmpwi cr0, M, 0 - ble L999_H1 + ble .L999_H1 cmpwi cr0, N, 0 - ble L999_H1 + ble .L999_H1 cmpwi cr0, K, 0 - ble L999_H1 + ble .L999_H1 #ifdef __64BIT__ addi ALPHA, SP, 296 @@ -251,7 +286,7 @@ #include "dtrmm_logic_16x4_power8.S" -L999: +.L999: addi r3, 0, 0 lfd f14, 0(SP) diff --git a/kernel/power/dtrmm_logic_16x4_power8.S b/kernel/power/dtrmm_logic_16x4_power8.S index f2886f8d6..a4340c598 100644 --- a/kernel/power/dtrmm_logic_16x4_power8.S +++ b/kernel/power/dtrmm_logic_16x4_power8.S @@ -1,7 +1,44 @@ - srawi. J, N, 2 - ble DTRMM_L4_END +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ -DTRMM_L4_BEGIN: +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + + srawi. J, N, 2 + ble .LDTRMM_L4_END + +.LDTRMM_L4_BEGIN: mr CO, C mr AO, A @@ -13,9 +50,9 @@ DTRMM_L4_BEGIN: #endif srawi. I, M, 4 - ble DTRMM_L4x16_END + ble .LDTRMM_L4x16_END -DTRMM_L4x16_BEGIN: +.LDTRMM_L4x16_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -42,11 +79,11 @@ DTRMM_L4x16_BEGIN: mr KKK, T1 mr K1, T1 srawi. 
L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L4x16_SUB0 + ble .LDTRMM_L4x16_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L4x16_SUB4 + ble .LDTRMM_L4x16_SUB4 -DTRMM_L4x16_LOOP_START: +.LDTRMM_L4x16_LOOP_START: dcbt AO, PRE LOAD4x16_1 @@ -69,11 +106,11 @@ DTRMM_L4x16_LOOP_START: KERNEL4x16_2 addic. L, L, -2 - ble DTRMM_L4x16_LOOP_END + ble .LDTRMM_L4x16_LOOP_END .align 5 -DTRMM_L4x16_LOOP: +.LDTRMM_L4x16_LOOP: dcbt AO, PRE KERNEL4x16_1 @@ -94,9 +131,9 @@ DTRMM_L4x16_LOOP: KERNEL4x16_2 addic. L, L, -1 - bgt DTRMM_L4x16_LOOP + bgt .LDTRMM_L4x16_LOOP -DTRMM_L4x16_LOOP_END: +.LDTRMM_L4x16_LOOP_END: dcbt AO, PRE KERNEL4x16_1 @@ -115,9 +152,9 @@ DTRMM_L4x16_LOOP_END: KERNEL4x16_1 KERNEL4x16_E2 - b DTRMM_L4x16_SUB1 + b .LDTRMM_L4x16_SUB1 -DTRMM_L4x16_SUB4: +.LDTRMM_L4x16_SUB4: dcbt AO, PRE KERNEL4x16_SUBI1 @@ -133,31 +170,31 @@ DTRMM_L4x16_SUB4: KERNEL4x16_SUB1 KERNEL4x16_SUB1 - b DTRMM_L4x16_SUB1 + b .LDTRMM_L4x16_SUB1 -DTRMM_L4x16_SUB0: +.LDTRMM_L4x16_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x16_SUBI1 addic. L, L, -1 - ble DTRMM_L4x16_SAVE - b DTRMM_L4x16_SUB2 + ble .LDTRMM_L4x16_SAVE + b .LDTRMM_L4x16_SUB2 -DTRMM_L4x16_SUB1: +.LDTRMM_L4x16_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L4x16_SAVE + ble .LDTRMM_L4x16_SAVE -DTRMM_L4x16_SUB2: +.LDTRMM_L4x16_SUB2: KERNEL4x16_SUB1 addic. L, L, -1 - bgt DTRMM_L4x16_SUB2 + bgt .LDTRMM_L4x16_SUB2 -DTRMM_L4x16_SAVE: +.LDTRMM_L4x16_SAVE: SAVE4x16 @@ -175,16 +212,16 @@ DTRMM_L4x16_SAVE: addic. I, I, -1 - bgt DTRMM_L4x16_BEGIN + bgt .LDTRMM_L4x16_BEGIN -DTRMM_L4x16_END: +.LDTRMM_L4x16_END: -DTRMM_L4x8_BEGIN: +.LDTRMM_L4x8_BEGIN: andi. T2, M, 15 - ble DTRMM_L4x1_END + ble .LDTRMM_L4x1_END andi. T1, M, 8 - ble DTRMM_L4x8_END + ble .LDTRMM_L4x8_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -210,11 +247,11 @@ DTRMM_L4x8_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L4x8_SUB0 + ble .LDTRMM_L4x8_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L4x8_SUB4 + ble .LDTRMM_L4x8_SUB4 -DTRMM_L4x8_LOOP_START: +.LDTRMM_L4x8_LOOP_START: LOAD4x8_1 KERNEL4x8_I1 @@ -228,11 +265,11 @@ DTRMM_L4x8_LOOP_START: KERNEL4x8_2 addic. L, L, -2 - ble DTRMM_L4x8_LOOP_END + ble .LDTRMM_L4x8_LOOP_END .align 5 -DTRMM_L4x8_LOOP: +.LDTRMM_L4x8_LOOP: KERNEL4x8_1 KERNEL4x8_2 @@ -245,9 +282,9 @@ DTRMM_L4x8_LOOP: KERNEL4x8_2 addic. L, L, -1 - bgt DTRMM_L4x8_LOOP + bgt .LDTRMM_L4x8_LOOP -DTRMM_L4x8_LOOP_END: +.LDTRMM_L4x8_LOOP_END: KERNEL4x8_1 KERNEL4x8_2 @@ -259,9 +296,9 @@ DTRMM_L4x8_LOOP_END: KERNEL4x8_1 KERNEL4x8_E2 - b DTRMM_L4x8_SUB1 + b .LDTRMM_L4x8_SUB1 -DTRMM_L4x8_SUB4: +.LDTRMM_L4x8_SUB4: KERNEL4x8_SUBI1 KERNEL4x8_SUB1 @@ -273,31 +310,31 @@ DTRMM_L4x8_SUB4: KERNEL4x8_SUB1 KERNEL4x8_SUB1 - b DTRMM_L4x8_SUB1 + b .LDTRMM_L4x8_SUB1 -DTRMM_L4x8_SUB0: +.LDTRMM_L4x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x8_SUBI1 addic. L, L, -1 - ble DTRMM_L4x8_SAVE - b DTRMM_L4x8_SUB2 + ble .LDTRMM_L4x8_SAVE + b .LDTRMM_L4x8_SUB2 -DTRMM_L4x8_SUB1: +.LDTRMM_L4x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L4x8_SAVE + ble .LDTRMM_L4x8_SAVE -DTRMM_L4x8_SUB2: +.LDTRMM_L4x8_SUB2: KERNEL4x8_SUB1 addic. L, L, -1 - bgt DTRMM_L4x8_SUB2 + bgt .LDTRMM_L4x8_SUB2 -DTRMM_L4x8_SAVE: +.LDTRMM_L4x8_SAVE: SAVE4x8 @@ -314,12 +351,12 @@ DTRMM_L4x8_SAVE: #endif -DTRMM_L4x8_END: +.LDTRMM_L4x8_END: -DTRMM_L4x4_BEGIN: +.LDTRMM_L4x4_BEGIN: andi. T1, M, 4 - ble DTRMM_L4x4_END + ble .LDTRMM_L4x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -345,11 +382,11 @@ DTRMM_L4x4_BEGIN: mr KKK, T1 mr K1, T1 srawi. 
L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L4x4_SUB0 + ble .LDTRMM_L4x4_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L4x4_SUB4 + ble .LDTRMM_L4x4_SUB4 -DTRMM_L4x4_LOOP_START: +.LDTRMM_L4x4_LOOP_START: LOAD4x4_1 KERNEL4x4_I1 @@ -363,11 +400,11 @@ DTRMM_L4x4_LOOP_START: KERNEL4x4_2 addic. L, L, -2 - ble DTRMM_L4x4_LOOP_END + ble .LDTRMM_L4x4_LOOP_END .align 5 -DTRMM_L4x4_LOOP: +.LDTRMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 @@ -380,9 +417,9 @@ DTRMM_L4x4_LOOP: KERNEL4x4_2 addic. L, L, -1 - bgt DTRMM_L4x4_LOOP + bgt .LDTRMM_L4x4_LOOP -DTRMM_L4x4_LOOP_END: +.LDTRMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_2 @@ -394,9 +431,9 @@ DTRMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_E2 - b DTRMM_L4x4_SUB1 + b .LDTRMM_L4x4_SUB1 -DTRMM_L4x4_SUB4: +.LDTRMM_L4x4_SUB4: KERNEL4x4_SUBI1 KERNEL4x4_SUB1 @@ -408,31 +445,31 @@ DTRMM_L4x4_SUB4: KERNEL4x4_SUB1 KERNEL4x4_SUB1 - b DTRMM_L4x4_SUB1 + b .LDTRMM_L4x4_SUB1 -DTRMM_L4x4_SUB0: +.LDTRMM_L4x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x4_SUBI1 addic. L, L, -1 - ble DTRMM_L4x4_SAVE - b DTRMM_L4x4_SUB2 + ble .LDTRMM_L4x4_SAVE + b .LDTRMM_L4x4_SUB2 -DTRMM_L4x4_SUB1: +.LDTRMM_L4x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L4x4_SAVE + ble .LDTRMM_L4x4_SAVE -DTRMM_L4x4_SUB2: +.LDTRMM_L4x4_SUB2: KERNEL4x4_SUB1 addic. L, L, -1 - bgt DTRMM_L4x4_SUB2 + bgt .LDTRMM_L4x4_SUB2 -DTRMM_L4x4_SAVE: +.LDTRMM_L4x4_SAVE: SAVE4x4 @@ -449,12 +486,12 @@ DTRMM_L4x4_SAVE: #endif -DTRMM_L4x4_END: +.LDTRMM_L4x4_END: -DTRMM_L4x2_BEGIN: +.LDTRMM_L4x2_BEGIN: andi. T1, M, 2 - ble DTRMM_L4x2_END + ble .LDTRMM_L4x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -480,11 +517,11 @@ DTRMM_L4x2_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L4x2_SUB0 + ble .LDTRMM_L4x2_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L4x2_SUB4 + ble .LDTRMM_L4x2_SUB4 -DTRMM_L4x2_LOOP_START: +.LDTRMM_L4x2_LOOP_START: LOAD4x2_1 KERNEL4x2_I1 @@ -498,11 +535,11 @@ DTRMM_L4x2_LOOP_START: KERNEL4x2_2 addic. L, L, -2 - ble DTRMM_L4x2_LOOP_END + ble .LDTRMM_L4x2_LOOP_END .align 5 -DTRMM_L4x2_LOOP: +.LDTRMM_L4x2_LOOP: KERNEL4x2_1 KERNEL4x2_2 @@ -515,9 +552,9 @@ DTRMM_L4x2_LOOP: KERNEL4x2_2 addic. L, L, -1 - bgt DTRMM_L4x2_LOOP + bgt .LDTRMM_L4x2_LOOP -DTRMM_L4x2_LOOP_END: +.LDTRMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_2 @@ -529,9 +566,9 @@ DTRMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_E2 - b DTRMM_L4x2_SUB1 + b .LDTRMM_L4x2_SUB1 -DTRMM_L4x2_SUB4: +.LDTRMM_L4x2_SUB4: KERNEL4x2_SUBI1 KERNEL4x2_SUB1 @@ -543,31 +580,31 @@ DTRMM_L4x2_SUB4: KERNEL4x2_SUB1 KERNEL4x2_SUB1 - b DTRMM_L4x2_SUB1 + b .LDTRMM_L4x2_SUB1 -DTRMM_L4x2_SUB0: +.LDTRMM_L4x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x2_SUBI1 addic. L, L, -1 - ble DTRMM_L4x2_SAVE - b DTRMM_L4x2_SUB2 + ble .LDTRMM_L4x2_SAVE + b .LDTRMM_L4x2_SUB2 -DTRMM_L4x2_SUB1: +.LDTRMM_L4x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L4x2_SAVE + ble .LDTRMM_L4x2_SAVE -DTRMM_L4x2_SUB2: +.LDTRMM_L4x2_SUB2: KERNEL4x2_SUB1 addic. L, L, -1 - bgt DTRMM_L4x2_SUB2 + bgt .LDTRMM_L4x2_SUB2 -DTRMM_L4x2_SAVE: +.LDTRMM_L4x2_SAVE: SAVE4x2 @@ -584,12 +621,12 @@ DTRMM_L4x2_SAVE: #endif -DTRMM_L4x2_END: +.LDTRMM_L4x2_END: -DTRMM_L4x1_BEGIN: +.LDTRMM_L4x1_BEGIN: andi. T1, M, 1 - ble DTRMM_L4x1_END + ble .LDTRMM_L4x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -615,11 +652,11 @@ DTRMM_L4x1_BEGIN: mr KKK, T1 mr K1, T1 srawi. 
L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L4x1_SUB0 + ble .LDTRMM_L4x1_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L4x1_SUB4 + ble .LDTRMM_L4x1_SUB4 -DTRMM_L4x1_LOOP_START: +.LDTRMM_L4x1_LOOP_START: LOAD4x1_1 KERNEL4x1_I1 @@ -633,11 +670,11 @@ DTRMM_L4x1_LOOP_START: KERNEL4x1_2 addic. L, L, -2 - ble DTRMM_L4x1_LOOP_END + ble .LDTRMM_L4x1_LOOP_END .align 5 -DTRMM_L4x1_LOOP: +.LDTRMM_L4x1_LOOP: KERNEL4x1_1 KERNEL4x1_2 @@ -650,9 +687,9 @@ DTRMM_L4x1_LOOP: KERNEL4x1_2 addic. L, L, -1 - bgt DTRMM_L4x1_LOOP + bgt .LDTRMM_L4x1_LOOP -DTRMM_L4x1_LOOP_END: +.LDTRMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_2 @@ -664,9 +701,9 @@ DTRMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_E2 - b DTRMM_L4x1_SUB1 + b .LDTRMM_L4x1_SUB1 -DTRMM_L4x1_SUB4: +.LDTRMM_L4x1_SUB4: KERNEL4x1_SUBI1 KERNEL4x1_SUB1 @@ -678,31 +715,31 @@ DTRMM_L4x1_SUB4: KERNEL4x1_SUB1 KERNEL4x1_SUB1 - b DTRMM_L4x1_SUB1 + b .LDTRMM_L4x1_SUB1 -DTRMM_L4x1_SUB0: +.LDTRMM_L4x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x1_SUBI1 addic. L, L, -1 - ble DTRMM_L4x1_SAVE - b DTRMM_L4x1_SUB2 + ble .LDTRMM_L4x1_SAVE + b .LDTRMM_L4x1_SUB2 -DTRMM_L4x1_SUB1: +.LDTRMM_L4x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L4x1_SAVE + ble .LDTRMM_L4x1_SAVE -DTRMM_L4x1_SUB2: +.LDTRMM_L4x1_SUB2: KERNEL4x1_SUB1 addic. L, L, -1 - bgt DTRMM_L4x1_SUB2 + bgt .LDTRMM_L4x1_SUB2 -DTRMM_L4x1_SAVE: +.LDTRMM_L4x1_SAVE: SAVE4x1 @@ -719,7 +756,7 @@ DTRMM_L4x1_SAVE: #endif -DTRMM_L4x1_END: +.LDTRMM_L4x1_END: slwi T1, K, 5 add B, B, T1 @@ -730,23 +767,23 @@ DTRMM_L4x1_END: addic. J, J, -1 - bgt DTRMM_L4_BEGIN + bgt .LDTRMM_L4_BEGIN andi. T2, N, 3 - ble L999 + ble .L999 -DTRMM_L4_END: +.LDTRMM_L4_END: - b DTRMM_L2_BEGIN + b .LDTRMM_L2_BEGIN -L999_H1: +.L999_H1: - b L999 + b .L999 -DTRMM_L2_BEGIN: +.LDTRMM_L2_BEGIN: andi. T1, N, 2 - ble DTRMM_L2_END + ble .LDTRMM_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 @@ -757,9 +794,9 @@ DTRMM_L2_BEGIN: #endif srawi. I, M, 4 - ble DTRMM_L2x16_END + ble .LDTRMM_L2x16_END -DTRMM_L2x16_BEGIN: +.LDTRMM_L2x16_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -786,11 +823,11 @@ DTRMM_L2x16_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L2x16_SUB0 + ble .LDTRMM_L2x16_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L2x16_SUB4 + ble .LDTRMM_L2x16_SUB4 -DTRMM_L2x16_LOOP_START: +.LDTRMM_L2x16_LOOP_START: dcbt AO, PRE LOAD2x16_1 @@ -813,11 +850,11 @@ DTRMM_L2x16_LOOP_START: KERNEL2x16_2 addic. L, L, -2 - ble DTRMM_L2x16_LOOP_END + ble .LDTRMM_L2x16_LOOP_END .align 5 -DTRMM_L2x16_LOOP: +.LDTRMM_L2x16_LOOP: dcbt AO, PRE KERNEL2x16_1 @@ -838,9 +875,9 @@ DTRMM_L2x16_LOOP: KERNEL2x16_2 addic. L, L, -1 - bgt DTRMM_L2x16_LOOP + bgt .LDTRMM_L2x16_LOOP -DTRMM_L2x16_LOOP_END: +.LDTRMM_L2x16_LOOP_END: dcbt AO, PRE KERNEL2x16_1 @@ -859,9 +896,9 @@ DTRMM_L2x16_LOOP_END: KERNEL2x16_1 KERNEL2x16_E2 - b DTRMM_L2x16_SUB1 + b .LDTRMM_L2x16_SUB1 -DTRMM_L2x16_SUB4: +.LDTRMM_L2x16_SUB4: dcbt AO, PRE KERNEL2x16_SUBI1 @@ -877,31 +914,31 @@ DTRMM_L2x16_SUB4: KERNEL2x16_SUB1 KERNEL2x16_SUB1 - b DTRMM_L2x16_SUB1 + b .LDTRMM_L2x16_SUB1 -DTRMM_L2x16_SUB0: +.LDTRMM_L2x16_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x16_SUBI1 addic. L, L, -1 - ble DTRMM_L2x16_SAVE - b DTRMM_L2x16_SUB2 + ble .LDTRMM_L2x16_SAVE + b .LDTRMM_L2x16_SUB2 -DTRMM_L2x16_SUB1: +.LDTRMM_L2x16_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L2x16_SAVE + ble .LDTRMM_L2x16_SAVE -DTRMM_L2x16_SUB2: +.LDTRMM_L2x16_SUB2: KERNEL2x16_SUB1 addic. L, L, -1 - bgt DTRMM_L2x16_SUB2 + bgt .LDTRMM_L2x16_SUB2 -DTRMM_L2x16_SAVE: +.LDTRMM_L2x16_SAVE: SAVE2x16 @@ -919,16 +956,16 @@ DTRMM_L2x16_SAVE: addic. 
I, I, -1 - bgt DTRMM_L2x16_BEGIN + bgt .LDTRMM_L2x16_BEGIN -DTRMM_L2x16_END: +.LDTRMM_L2x16_END: -DTRMM_L2x8_BEGIN: +.LDTRMM_L2x8_BEGIN: andi. T2, M, 15 - ble DTRMM_L2x1_END + ble .LDTRMM_L2x1_END andi. T1, M, 8 - ble DTRMM_L2x8_END + ble .LDTRMM_L2x8_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -954,11 +991,11 @@ DTRMM_L2x8_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L2x8_SUB0 + ble .LDTRMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L2x8_SUB4 + ble .LDTRMM_L2x8_SUB4 -DTRMM_L2x8_LOOP_START: +.LDTRMM_L2x8_LOOP_START: LOAD2x8_1 KERNEL2x8_I1 @@ -972,11 +1009,11 @@ DTRMM_L2x8_LOOP_START: KERNEL2x8_2 addic. L, L, -2 - ble DTRMM_L2x8_LOOP_END + ble .LDTRMM_L2x8_LOOP_END .align 5 -DTRMM_L2x8_LOOP: +.LDTRMM_L2x8_LOOP: KERNEL2x8_1 KERNEL2x8_2 @@ -989,9 +1026,9 @@ DTRMM_L2x8_LOOP: KERNEL2x8_2 addic. L, L, -1 - bgt DTRMM_L2x8_LOOP + bgt .LDTRMM_L2x8_LOOP -DTRMM_L2x8_LOOP_END: +.LDTRMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_2 @@ -1003,9 +1040,9 @@ DTRMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_E2 - b DTRMM_L2x8_SUB1 + b .LDTRMM_L2x8_SUB1 -DTRMM_L2x8_SUB4: +.LDTRMM_L2x8_SUB4: KERNEL2x8_SUBI1 KERNEL2x8_SUB1 @@ -1017,31 +1054,31 @@ DTRMM_L2x8_SUB4: KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b DTRMM_L2x8_SUB1 + b .LDTRMM_L2x8_SUB1 -DTRMM_L2x8_SUB0: +.LDTRMM_L2x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x8_SUBI1 addic. L, L, -1 - ble DTRMM_L2x8_SAVE - b DTRMM_L2x8_SUB2 + ble .LDTRMM_L2x8_SAVE + b .LDTRMM_L2x8_SUB2 -DTRMM_L2x8_SUB1: +.LDTRMM_L2x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L2x8_SAVE + ble .LDTRMM_L2x8_SAVE -DTRMM_L2x8_SUB2: +.LDTRMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 - bgt DTRMM_L2x8_SUB2 + bgt .LDTRMM_L2x8_SUB2 -DTRMM_L2x8_SAVE: +.LDTRMM_L2x8_SAVE: SAVE2x8 @@ -1058,12 +1095,12 @@ DTRMM_L2x8_SAVE: #endif -DTRMM_L2x8_END: +.LDTRMM_L2x8_END: -DTRMM_L2x4_BEGIN: +.LDTRMM_L2x4_BEGIN: andi. T1, M, 4 - ble DTRMM_L2x4_END + ble .LDTRMM_L2x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1089,11 +1126,11 @@ DTRMM_L2x4_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L2x4_SUB0 + ble .LDTRMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L2x4_SUB4 + ble .LDTRMM_L2x4_SUB4 -DTRMM_L2x4_LOOP_START: +.LDTRMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -1107,11 +1144,11 @@ DTRMM_L2x4_LOOP_START: KERNEL2x4_2 addic. L, L, -2 - ble DTRMM_L2x4_LOOP_END + ble .LDTRMM_L2x4_LOOP_END .align 5 -DTRMM_L2x4_LOOP: +.LDTRMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -1124,9 +1161,9 @@ DTRMM_L2x4_LOOP: KERNEL2x4_2 addic. L, L, -1 - bgt DTRMM_L2x4_LOOP + bgt .LDTRMM_L2x4_LOOP -DTRMM_L2x4_LOOP_END: +.LDTRMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -1138,9 +1175,9 @@ DTRMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_E2 - b DTRMM_L2x4_SUB1 + b .LDTRMM_L2x4_SUB1 -DTRMM_L2x4_SUB4: +.LDTRMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -1152,31 +1189,31 @@ DTRMM_L2x4_SUB4: KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b DTRMM_L2x4_SUB1 + b .LDTRMM_L2x4_SUB1 -DTRMM_L2x4_SUB0: +.LDTRMM_L2x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x4_SUBI1 addic. L, L, -1 - ble DTRMM_L2x4_SAVE - b DTRMM_L2x4_SUB2 + ble .LDTRMM_L2x4_SAVE + b .LDTRMM_L2x4_SUB2 -DTRMM_L2x4_SUB1: +.LDTRMM_L2x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L2x4_SAVE + ble .LDTRMM_L2x4_SAVE -DTRMM_L2x4_SUB2: +.LDTRMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. 
L, L, -1 - bgt DTRMM_L2x4_SUB2 + bgt .LDTRMM_L2x4_SUB2 -DTRMM_L2x4_SAVE: +.LDTRMM_L2x4_SAVE: SAVE2x4 @@ -1193,12 +1230,12 @@ DTRMM_L2x4_SAVE: #endif -DTRMM_L2x4_END: +.LDTRMM_L2x4_END: -DTRMM_L2x2_BEGIN: +.LDTRMM_L2x2_BEGIN: andi. T1, M, 2 - ble DTRMM_L2x2_END + ble .LDTRMM_L2x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1224,11 +1261,11 @@ DTRMM_L2x2_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L2x2_SUB0 + ble .LDTRMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L2x2_SUB4 + ble .LDTRMM_L2x2_SUB4 -DTRMM_L2x2_LOOP_START: +.LDTRMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -1242,11 +1279,11 @@ DTRMM_L2x2_LOOP_START: KERNEL2x2_2 addic. L, L, -2 - ble DTRMM_L2x2_LOOP_END + ble .LDTRMM_L2x2_LOOP_END .align 5 -DTRMM_L2x2_LOOP: +.LDTRMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -1259,9 +1296,9 @@ DTRMM_L2x2_LOOP: KERNEL2x2_2 addic. L, L, -1 - bgt DTRMM_L2x2_LOOP + bgt .LDTRMM_L2x2_LOOP -DTRMM_L2x2_LOOP_END: +.LDTRMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -1273,9 +1310,9 @@ DTRMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_E2 - b DTRMM_L2x2_SUB1 + b .LDTRMM_L2x2_SUB1 -DTRMM_L2x2_SUB4: +.LDTRMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -1287,31 +1324,31 @@ DTRMM_L2x2_SUB4: KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b DTRMM_L2x2_SUB1 + b .LDTRMM_L2x2_SUB1 -DTRMM_L2x2_SUB0: +.LDTRMM_L2x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x2_SUBI1 addic. L, L, -1 - ble DTRMM_L2x2_SAVE - b DTRMM_L2x2_SUB2 + ble .LDTRMM_L2x2_SAVE + b .LDTRMM_L2x2_SUB2 -DTRMM_L2x2_SUB1: +.LDTRMM_L2x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L2x2_SAVE + ble .LDTRMM_L2x2_SAVE -DTRMM_L2x2_SUB2: +.LDTRMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt DTRMM_L2x2_SUB2 + bgt .LDTRMM_L2x2_SUB2 -DTRMM_L2x2_SAVE: +.LDTRMM_L2x2_SAVE: SAVE2x2 @@ -1328,12 +1365,12 @@ DTRMM_L2x2_SAVE: #endif -DTRMM_L2x2_END: +.LDTRMM_L2x2_END: -DTRMM_L2x1_BEGIN: +.LDTRMM_L2x1_BEGIN: andi. T1, M, 1 - ble DTRMM_L2x1_END + ble .LDTRMM_L2x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1359,11 +1396,11 @@ DTRMM_L2x1_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L2x1_SUB0 + ble .LDTRMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L2x1_SUB4 + ble .LDTRMM_L2x1_SUB4 -DTRMM_L2x1_LOOP_START: +.LDTRMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -1377,11 +1414,11 @@ DTRMM_L2x1_LOOP_START: KERNEL2x1_2 addic. L, L, -2 - ble DTRMM_L2x1_LOOP_END + ble .LDTRMM_L2x1_LOOP_END .align 5 -DTRMM_L2x1_LOOP: +.LDTRMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -1394,9 +1431,9 @@ DTRMM_L2x1_LOOP: KERNEL2x1_2 addic. L, L, -1 - bgt DTRMM_L2x1_LOOP + bgt .LDTRMM_L2x1_LOOP -DTRMM_L2x1_LOOP_END: +.LDTRMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -1408,9 +1445,9 @@ DTRMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_E2 - b DTRMM_L2x1_SUB1 + b .LDTRMM_L2x1_SUB1 -DTRMM_L2x1_SUB4: +.LDTRMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -1422,31 +1459,31 @@ DTRMM_L2x1_SUB4: KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b DTRMM_L2x1_SUB1 + b .LDTRMM_L2x1_SUB1 -DTRMM_L2x1_SUB0: +.LDTRMM_L2x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x1_SUBI1 addic. L, L, -1 - ble DTRMM_L2x1_SAVE - b DTRMM_L2x1_SUB2 + ble .LDTRMM_L2x1_SAVE + b .LDTRMM_L2x1_SUB2 -DTRMM_L2x1_SUB1: +.LDTRMM_L2x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L2x1_SAVE + ble .LDTRMM_L2x1_SAVE -DTRMM_L2x1_SUB2: +.LDTRMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. 
L, L, -1 - bgt DTRMM_L2x1_SUB2 + bgt .LDTRMM_L2x1_SUB2 -DTRMM_L2x1_SAVE: +.LDTRMM_L2x1_SAVE: SAVE2x1 @@ -1463,7 +1500,7 @@ DTRMM_L2x1_SAVE: #endif -DTRMM_L2x1_END: +.LDTRMM_L2x1_END: slwi T1, K, 4 add B, B, T1 @@ -1473,11 +1510,11 @@ DTRMM_L2x1_END: #endif -DTRMM_L2_END: -DTRMM_L1_BEGIN: +.LDTRMM_L2_END: +.LDTRMM_L1_BEGIN: andi. T1, N, 1 - ble DTRMM_L1_END + ble .LDTRMM_L1_END mr CO, C mr AO, A @@ -1486,9 +1523,9 @@ DTRMM_L1_BEGIN: #endif srawi. I, M, 4 - ble DTRMM_L1x16_END + ble .LDTRMM_L1x16_END -DTRMM_L1x16_BEGIN: +.LDTRMM_L1x16_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1515,11 +1552,11 @@ DTRMM_L1x16_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L1x16_SUB0 + ble .LDTRMM_L1x16_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L1x16_SUB4 + ble .LDTRMM_L1x16_SUB4 -DTRMM_L1x16_LOOP_START: +.LDTRMM_L1x16_LOOP_START: dcbt AO, PRE LOAD1x16_1 @@ -1542,11 +1579,11 @@ DTRMM_L1x16_LOOP_START: KERNEL1x16_2 addic. L, L, -2 - ble DTRMM_L1x16_LOOP_END + ble .LDTRMM_L1x16_LOOP_END .align 5 -DTRMM_L1x16_LOOP: +.LDTRMM_L1x16_LOOP: dcbt AO, PRE KERNEL1x16_1 @@ -1567,9 +1604,9 @@ DTRMM_L1x16_LOOP: KERNEL1x16_2 addic. L, L, -1 - bgt DTRMM_L1x16_LOOP + bgt .LDTRMM_L1x16_LOOP -DTRMM_L1x16_LOOP_END: +.LDTRMM_L1x16_LOOP_END: dcbt AO, PRE KERNEL1x16_1 @@ -1588,9 +1625,9 @@ DTRMM_L1x16_LOOP_END: KERNEL1x16_1 KERNEL1x16_E2 - b DTRMM_L1x16_SUB1 + b .LDTRMM_L1x16_SUB1 -DTRMM_L1x16_SUB4: +.LDTRMM_L1x16_SUB4: dcbt AO, PRE KERNEL1x16_SUBI1 @@ -1606,31 +1643,31 @@ DTRMM_L1x16_SUB4: KERNEL1x16_SUB1 KERNEL1x16_SUB1 - b DTRMM_L1x16_SUB1 + b .LDTRMM_L1x16_SUB1 -DTRMM_L1x16_SUB0: +.LDTRMM_L1x16_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x16_SUBI1 addic. L, L, -1 - ble DTRMM_L1x16_SAVE - b DTRMM_L1x16_SUB2 + ble .LDTRMM_L1x16_SAVE + b .LDTRMM_L1x16_SUB2 -DTRMM_L1x16_SUB1: +.LDTRMM_L1x16_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L1x16_SAVE + ble .LDTRMM_L1x16_SAVE -DTRMM_L1x16_SUB2: +.LDTRMM_L1x16_SUB2: KERNEL1x16_SUB1 addic. L, L, -1 - bgt DTRMM_L1x16_SUB2 + bgt .LDTRMM_L1x16_SUB2 -DTRMM_L1x16_SAVE: +.LDTRMM_L1x16_SAVE: SAVE1x16 @@ -1648,16 +1685,16 @@ DTRMM_L1x16_SAVE: addic. I, I, -1 - bgt DTRMM_L1x16_BEGIN + bgt .LDTRMM_L1x16_BEGIN -DTRMM_L1x16_END: +.LDTRMM_L1x16_END: -DTRMM_L1x8_BEGIN: +.LDTRMM_L1x8_BEGIN: andi. T2, M, 15 - ble DTRMM_L1x1_END + ble .LDTRMM_L1x1_END andi. T1, M, 8 - ble DTRMM_L1x8_END + ble .LDTRMM_L1x8_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1683,11 +1720,11 @@ DTRMM_L1x8_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L1x8_SUB0 + ble .LDTRMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L1x8_SUB4 + ble .LDTRMM_L1x8_SUB4 -DTRMM_L1x8_LOOP_START: +.LDTRMM_L1x8_LOOP_START: LOAD1x8_1 KERNEL1x8_I1 @@ -1701,11 +1738,11 @@ DTRMM_L1x8_LOOP_START: KERNEL1x8_2 addic. L, L, -2 - ble DTRMM_L1x8_LOOP_END + ble .LDTRMM_L1x8_LOOP_END .align 5 -DTRMM_L1x8_LOOP: +.LDTRMM_L1x8_LOOP: KERNEL1x8_1 KERNEL1x8_2 @@ -1718,9 +1755,9 @@ DTRMM_L1x8_LOOP: KERNEL1x8_2 addic. L, L, -1 - bgt DTRMM_L1x8_LOOP + bgt .LDTRMM_L1x8_LOOP -DTRMM_L1x8_LOOP_END: +.LDTRMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_2 @@ -1732,9 +1769,9 @@ DTRMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_E2 - b DTRMM_L1x8_SUB1 + b .LDTRMM_L1x8_SUB1 -DTRMM_L1x8_SUB4: +.LDTRMM_L1x8_SUB4: KERNEL1x8_SUBI1 KERNEL1x8_SUB1 @@ -1746,31 +1783,31 @@ DTRMM_L1x8_SUB4: KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b DTRMM_L1x8_SUB1 + b .LDTRMM_L1x8_SUB1 -DTRMM_L1x8_SUB0: +.LDTRMM_L1x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x8_SUBI1 addic. 
L, L, -1 - ble DTRMM_L1x8_SAVE - b DTRMM_L1x8_SUB2 + ble .LDTRMM_L1x8_SAVE + b .LDTRMM_L1x8_SUB2 -DTRMM_L1x8_SUB1: +.LDTRMM_L1x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L1x8_SAVE + ble .LDTRMM_L1x8_SAVE -DTRMM_L1x8_SUB2: +.LDTRMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 - bgt DTRMM_L1x8_SUB2 + bgt .LDTRMM_L1x8_SUB2 -DTRMM_L1x8_SAVE: +.LDTRMM_L1x8_SAVE: SAVE1x8 @@ -1787,12 +1824,12 @@ DTRMM_L1x8_SAVE: #endif -DTRMM_L1x8_END: +.LDTRMM_L1x8_END: -DTRMM_L1x4_BEGIN: +.LDTRMM_L1x4_BEGIN: andi. T1, M, 4 - ble DTRMM_L1x4_END + ble .LDTRMM_L1x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1818,11 +1855,11 @@ DTRMM_L1x4_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L1x4_SUB0 + ble .LDTRMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L1x4_SUB4 + ble .LDTRMM_L1x4_SUB4 -DTRMM_L1x4_LOOP_START: +.LDTRMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -1836,11 +1873,11 @@ DTRMM_L1x4_LOOP_START: KERNEL1x4_2 addic. L, L, -2 - ble DTRMM_L1x4_LOOP_END + ble .LDTRMM_L1x4_LOOP_END .align 5 -DTRMM_L1x4_LOOP: +.LDTRMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -1853,9 +1890,9 @@ DTRMM_L1x4_LOOP: KERNEL1x4_2 addic. L, L, -1 - bgt DTRMM_L1x4_LOOP + bgt .LDTRMM_L1x4_LOOP -DTRMM_L1x4_LOOP_END: +.LDTRMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -1867,9 +1904,9 @@ DTRMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_E2 - b DTRMM_L1x4_SUB1 + b .LDTRMM_L1x4_SUB1 -DTRMM_L1x4_SUB4: +.LDTRMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -1881,31 +1918,31 @@ DTRMM_L1x4_SUB4: KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b DTRMM_L1x4_SUB1 + b .LDTRMM_L1x4_SUB1 -DTRMM_L1x4_SUB0: +.LDTRMM_L1x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x4_SUBI1 addic. L, L, -1 - ble DTRMM_L1x4_SAVE - b DTRMM_L1x4_SUB2 + ble .LDTRMM_L1x4_SAVE + b .LDTRMM_L1x4_SUB2 -DTRMM_L1x4_SUB1: +.LDTRMM_L1x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L1x4_SAVE + ble .LDTRMM_L1x4_SAVE -DTRMM_L1x4_SUB2: +.LDTRMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt DTRMM_L1x4_SUB2 + bgt .LDTRMM_L1x4_SUB2 -DTRMM_L1x4_SAVE: +.LDTRMM_L1x4_SAVE: SAVE1x4 @@ -1922,12 +1959,12 @@ DTRMM_L1x4_SAVE: #endif -DTRMM_L1x4_END: +.LDTRMM_L1x4_END: -DTRMM_L1x2_BEGIN: +.LDTRMM_L1x2_BEGIN: andi. T1, M, 2 - ble DTRMM_L1x2_END + ble .LDTRMM_L1x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1953,11 +1990,11 @@ DTRMM_L1x2_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L1x2_SUB0 + ble .LDTRMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L1x2_SUB4 + ble .LDTRMM_L1x2_SUB4 -DTRMM_L1x2_LOOP_START: +.LDTRMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -1971,11 +2008,11 @@ DTRMM_L1x2_LOOP_START: KERNEL1x2_2 addic. L, L, -2 - ble DTRMM_L1x2_LOOP_END + ble .LDTRMM_L1x2_LOOP_END .align 5 -DTRMM_L1x2_LOOP: +.LDTRMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -1988,9 +2025,9 @@ DTRMM_L1x2_LOOP: KERNEL1x2_2 addic. L, L, -1 - bgt DTRMM_L1x2_LOOP + bgt .LDTRMM_L1x2_LOOP -DTRMM_L1x2_LOOP_END: +.LDTRMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -2002,9 +2039,9 @@ DTRMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_E2 - b DTRMM_L1x2_SUB1 + b .LDTRMM_L1x2_SUB1 -DTRMM_L1x2_SUB4: +.LDTRMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -2016,31 +2053,31 @@ DTRMM_L1x2_SUB4: KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b DTRMM_L1x2_SUB1 + b .LDTRMM_L1x2_SUB1 -DTRMM_L1x2_SUB0: +.LDTRMM_L1x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x2_SUBI1 addic. L, L, -1 - ble DTRMM_L1x2_SAVE - b DTRMM_L1x2_SUB2 + ble .LDTRMM_L1x2_SAVE + b .LDTRMM_L1x2_SUB2 -DTRMM_L1x2_SUB1: +.LDTRMM_L1x2_SUB1: andi. 
L, K1, 7 // K1 & 7 -> L - ble DTRMM_L1x2_SAVE + ble .LDTRMM_L1x2_SAVE -DTRMM_L1x2_SUB2: +.LDTRMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt DTRMM_L1x2_SUB2 + bgt .LDTRMM_L1x2_SUB2 -DTRMM_L1x2_SAVE: +.LDTRMM_L1x2_SAVE: SAVE1x2 @@ -2057,12 +2094,12 @@ DTRMM_L1x2_SAVE: #endif -DTRMM_L1x2_END: +.LDTRMM_L1x2_END: -DTRMM_L1x1_BEGIN: +.LDTRMM_L1x1_BEGIN: andi. T1, M, 1 - ble DTRMM_L1x1_END + ble .LDTRMM_L1x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -2088,11 +2125,11 @@ DTRMM_L1x1_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L1x1_SUB0 + ble .LDTRMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L1x1_SUB4 + ble .LDTRMM_L1x1_SUB4 -DTRMM_L1x1_LOOP_START: +.LDTRMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -2106,11 +2143,11 @@ DTRMM_L1x1_LOOP_START: KERNEL1x1_2 addic. L, L, -2 - ble DTRMM_L1x1_LOOP_END + ble .LDTRMM_L1x1_LOOP_END .align 5 -DTRMM_L1x1_LOOP: +.LDTRMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -2123,9 +2160,9 @@ DTRMM_L1x1_LOOP: KERNEL1x1_2 addic. L, L, -1 - bgt DTRMM_L1x1_LOOP + bgt .LDTRMM_L1x1_LOOP -DTRMM_L1x1_LOOP_END: +.LDTRMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -2137,9 +2174,9 @@ DTRMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_E2 - b DTRMM_L1x1_SUB1 + b .LDTRMM_L1x1_SUB1 -DTRMM_L1x1_SUB4: +.LDTRMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -2151,31 +2188,31 @@ DTRMM_L1x1_SUB4: KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b DTRMM_L1x1_SUB1 + b .LDTRMM_L1x1_SUB1 -DTRMM_L1x1_SUB0: +.LDTRMM_L1x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x1_SUBI1 addic. L, L, -1 - ble DTRMM_L1x1_SAVE - b DTRMM_L1x1_SUB2 + ble .LDTRMM_L1x1_SAVE + b .LDTRMM_L1x1_SUB2 -DTRMM_L1x1_SUB1: +.LDTRMM_L1x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L1x1_SAVE + ble .LDTRMM_L1x1_SAVE -DTRMM_L1x1_SUB2: +.LDTRMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 - bgt DTRMM_L1x1_SUB2 + bgt .LDTRMM_L1x1_SUB2 -DTRMM_L1x1_SAVE: +.LDTRMM_L1x1_SAVE: SAVE1x1 @@ -2192,11 +2229,11 @@ DTRMM_L1x1_SAVE: #endif -DTRMM_L1x1_END: +.LDTRMM_L1x1_END: #if !defined(LEFT) addi KK, KK, 1 // KK += Number of values in B #endif -DTRMM_L1_END: +.LDTRMM_L1_END: diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S index 03957f406..a7665f749 100644 --- a/kernel/power/zgemm_kernel_8x2_power8.S +++ b/kernel/power/zgemm_kernel_8x2_power8.S @@ -1,3 +1,38 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ @@ -233,11 +268,11 @@ #include "zgemm_macros_8x2_power8.S" cmpwi cr0, M, 0 - ble L999 + ble .L999 cmpwi cr0, N, 0 - ble L999 + ble .L999 cmpwi cr0, K, 0 - ble L999 + ble .L999 slwi LDC, LDC, ZBASE_SHIFT li PRE, 256 @@ -260,7 +295,7 @@ #include "zgemm_logic_8x2_power8.S" -L999: +.L999: addi r3, 0, 0 lfd f14, 0(SP) diff --git a/kernel/power/zgemm_logic_8x2_power8.S b/kernel/power/zgemm_logic_8x2_power8.S index e829fd68e..5fcade5bf 100644 --- a/kernel/power/zgemm_logic_8x2_power8.S +++ b/kernel/power/zgemm_logic_8x2_power8.S @@ -1,25 +1,25 @@ srawi. J, N, 1 - ble ZGEMM_L2_END + ble .LZGEMM_L2_END -ZGEMM_L2_BEGIN: +.LZGEMM_L2_BEGIN: mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 srawi. I, M, 3 - ble ZGEMM_L2x8_END + ble .LZGEMM_L2x8_END -ZGEMM_L2x8_BEGIN: +.LZGEMM_L2x8_BEGIN: mr BO, B srawi. L, K, 3 - ble ZGEMM_L2x8_SUB0 + ble .LZGEMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble ZGEMM_L2x8_SUB4 + ble .LZGEMM_L2x8_SUB4 -ZGEMM_L2x8_LOOP_START: +.LZGEMM_L2x8_LOOP_START: dcbt AO, PRE LOAD2x8_1 @@ -42,11 +42,11 @@ ZGEMM_L2x8_LOOP_START: KERNEL2x8_2 addic. L, L, -2 - ble ZGEMM_L2x8_LOOP_END + ble .LZGEMM_L2x8_LOOP_END .align 5 -ZGEMM_L2x8_LOOP: +.LZGEMM_L2x8_LOOP: dcbt AO, PRE KERNEL2x8_1 @@ -67,9 +67,9 @@ ZGEMM_L2x8_LOOP: KERNEL2x8_2 addic. L, L, -1 - bgt ZGEMM_L2x8_LOOP + bgt .LZGEMM_L2x8_LOOP -ZGEMM_L2x8_LOOP_END: +.LZGEMM_L2x8_LOOP_END: dcbt AO, PRE KERNEL2x8_1 @@ -88,9 +88,9 @@ ZGEMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_E2 - b ZGEMM_L2x8_SUB1 + b .LZGEMM_L2x8_SUB1 -ZGEMM_L2x8_SUB4: +.LZGEMM_L2x8_SUB4: dcbt AO, PRE KERNEL2x8_SUBI1 @@ -106,53 +106,53 @@ ZGEMM_L2x8_SUB4: KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b ZGEMM_L2x8_SUB1 + b .LZGEMM_L2x8_SUB1 -ZGEMM_L2x8_SUB0: +.LZGEMM_L2x8_SUB0: andi. L, K, 7 KERNEL2x8_SUBI1 addic. L, L, -1 - ble ZGEMM_L2x8_SAVE - b ZGEMM_L2x8_SUB2 + ble .LZGEMM_L2x8_SAVE + b .LZGEMM_L2x8_SUB2 -ZGEMM_L2x8_SUB1: +.LZGEMM_L2x8_SUB1: andi. L, K, 7 - ble ZGEMM_L2x8_SAVE + ble .LZGEMM_L2x8_SAVE -ZGEMM_L2x8_SUB2: +.LZGEMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 - bgt ZGEMM_L2x8_SUB2 + bgt .LZGEMM_L2x8_SUB2 -ZGEMM_L2x8_SAVE: +.LZGEMM_L2x8_SAVE: SAVE2x8 addic. I, I, -1 - bgt ZGEMM_L2x8_BEGIN + bgt .LZGEMM_L2x8_BEGIN -ZGEMM_L2x8_END: +.LZGEMM_L2x8_END: -ZGEMM_L2x4_BEGIN: +.LZGEMM_L2x4_BEGIN: andi. T2, M, 7 - ble ZGEMM_L2x1_END + ble .LZGEMM_L2x1_END andi. T1, M, 4 - ble ZGEMM_L2x4_END + ble .LZGEMM_L2x4_END mr BO, B srawi. 
L, K, 3 - ble ZGEMM_L2x4_SUB0 + ble .LZGEMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble ZGEMM_L2x4_SUB4 + ble .LZGEMM_L2x4_SUB4 -ZGEMM_L2x4_LOOP_START: +.LZGEMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -166,11 +166,11 @@ ZGEMM_L2x4_LOOP_START: KERNEL2x4_2 addic. L, L, -2 - ble ZGEMM_L2x4_LOOP_END + ble .LZGEMM_L2x4_LOOP_END .align 5 -ZGEMM_L2x4_LOOP: +.LZGEMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -183,9 +183,9 @@ ZGEMM_L2x4_LOOP: KERNEL2x4_2 addic. L, L, -1 - bgt ZGEMM_L2x4_LOOP + bgt .LZGEMM_L2x4_LOOP -ZGEMM_L2x4_LOOP_END: +.LZGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -197,9 +197,9 @@ ZGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_E2 - b ZGEMM_L2x4_SUB1 + b .LZGEMM_L2x4_SUB1 -ZGEMM_L2x4_SUB4: +.LZGEMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -211,48 +211,48 @@ ZGEMM_L2x4_SUB4: KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b ZGEMM_L2x4_SUB1 + b .LZGEMM_L2x4_SUB1 -ZGEMM_L2x4_SUB0: +.LZGEMM_L2x4_SUB0: andi. L, K, 7 KERNEL2x4_SUBI1 addic. L, L, -1 - ble ZGEMM_L2x4_SAVE - b ZGEMM_L2x4_SUB2 + ble .LZGEMM_L2x4_SAVE + b .LZGEMM_L2x4_SUB2 -ZGEMM_L2x4_SUB1: +.LZGEMM_L2x4_SUB1: andi. L, K, 7 - ble ZGEMM_L2x4_SAVE + ble .LZGEMM_L2x4_SAVE -ZGEMM_L2x4_SUB2: +.LZGEMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt ZGEMM_L2x4_SUB2 + bgt .LZGEMM_L2x4_SUB2 -ZGEMM_L2x4_SAVE: +.LZGEMM_L2x4_SAVE: SAVE2x4 -ZGEMM_L2x4_END: +.LZGEMM_L2x4_END: -ZGEMM_L2x2_BEGIN: +.LZGEMM_L2x2_BEGIN: andi. T1, M, 2 - ble ZGEMM_L2x2_END + ble .LZGEMM_L2x2_END mr BO, B srawi. L, K, 3 - ble ZGEMM_L2x2_SUB0 + ble .LZGEMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble ZGEMM_L2x2_SUB4 + ble .LZGEMM_L2x2_SUB4 -ZGEMM_L2x2_LOOP_START: +.LZGEMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -266,11 +266,11 @@ ZGEMM_L2x2_LOOP_START: KERNEL2x2_2 addic. L, L, -2 - ble ZGEMM_L2x2_LOOP_END + ble .LZGEMM_L2x2_LOOP_END .align 5 -ZGEMM_L2x2_LOOP: +.LZGEMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -283,9 +283,9 @@ ZGEMM_L2x2_LOOP: KERNEL2x2_2 addic. L, L, -1 - bgt ZGEMM_L2x2_LOOP + bgt .LZGEMM_L2x2_LOOP -ZGEMM_L2x2_LOOP_END: +.LZGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -297,9 +297,9 @@ ZGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_E2 - b ZGEMM_L2x2_SUB1 + b .LZGEMM_L2x2_SUB1 -ZGEMM_L2x2_SUB4: +.LZGEMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -311,48 +311,48 @@ ZGEMM_L2x2_SUB4: KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b ZGEMM_L2x2_SUB1 + b .LZGEMM_L2x2_SUB1 -ZGEMM_L2x2_SUB0: +.LZGEMM_L2x2_SUB0: andi. L, K, 7 KERNEL2x2_SUBI1 addic. L, L, -1 - ble ZGEMM_L2x2_SAVE - b ZGEMM_L2x2_SUB2 + ble .LZGEMM_L2x2_SAVE + b .LZGEMM_L2x2_SUB2 -ZGEMM_L2x2_SUB1: +.LZGEMM_L2x2_SUB1: andi. L, K, 7 - ble ZGEMM_L2x2_SAVE + ble .LZGEMM_L2x2_SAVE -ZGEMM_L2x2_SUB2: +.LZGEMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt ZGEMM_L2x2_SUB2 + bgt .LZGEMM_L2x2_SUB2 -ZGEMM_L2x2_SAVE: +.LZGEMM_L2x2_SAVE: SAVE2x2 -ZGEMM_L2x2_END: +.LZGEMM_L2x2_END: -ZGEMM_L2x1_BEGIN: +.LZGEMM_L2x1_BEGIN: andi. T1, M, 1 - ble ZGEMM_L2x1_END + ble .LZGEMM_L2x1_END mr BO, B srawi. L, K, 3 - ble ZGEMM_L2x1_SUB0 + ble .LZGEMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble ZGEMM_L2x1_SUB4 + ble .LZGEMM_L2x1_SUB4 -ZGEMM_L2x1_LOOP_START: +.LZGEMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -366,11 +366,11 @@ ZGEMM_L2x1_LOOP_START: KERNEL2x1_2 addic. L, L, -2 - ble ZGEMM_L2x1_LOOP_END + ble .LZGEMM_L2x1_LOOP_END .align 5 -ZGEMM_L2x1_LOOP: +.LZGEMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -383,9 +383,9 @@ ZGEMM_L2x1_LOOP: KERNEL2x1_2 addic. 
L, L, -1 - bgt ZGEMM_L2x1_LOOP + bgt .LZGEMM_L2x1_LOOP -ZGEMM_L2x1_LOOP_END: +.LZGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -397,9 +397,9 @@ ZGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_E2 - b ZGEMM_L2x1_SUB1 + b .LZGEMM_L2x1_SUB1 -ZGEMM_L2x1_SUB4: +.LZGEMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -411,72 +411,72 @@ ZGEMM_L2x1_SUB4: KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b ZGEMM_L2x1_SUB1 + b .LZGEMM_L2x1_SUB1 -ZGEMM_L2x1_SUB0: +.LZGEMM_L2x1_SUB0: andi. L, K, 7 KERNEL2x1_SUBI1 addic. L, L, -1 - ble ZGEMM_L2x1_SAVE - b ZGEMM_L2x1_SUB2 + ble .LZGEMM_L2x1_SAVE + b .LZGEMM_L2x1_SUB2 -ZGEMM_L2x1_SUB1: +.LZGEMM_L2x1_SUB1: andi. L, K, 7 - ble ZGEMM_L2x1_SAVE + ble .LZGEMM_L2x1_SAVE -ZGEMM_L2x1_SUB2: +.LZGEMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 - bgt ZGEMM_L2x1_SUB2 + bgt .LZGEMM_L2x1_SUB2 -ZGEMM_L2x1_SAVE: +.LZGEMM_L2x1_SAVE: SAVE2x1 -ZGEMM_L2x1_END: +.LZGEMM_L2x1_END: slwi T1, K, 5 add B, B, T1 addic. J, J, -1 - bgt ZGEMM_L2_BEGIN + bgt .LZGEMM_L2_BEGIN andi. T2, N, 1 - ble L999 + ble .L999 -ZGEMM_L2_END: +.LZGEMM_L2_END: - b ZGEMM_L1_BEGIN + b .LZGEMM_L1_BEGIN -L999_H1: +.L999_H1: - b L999 + b .L999 -ZGEMM_L1_BEGIN: +.LZGEMM_L1_BEGIN: andi. T1, N, 1 - ble ZGEMM_L1_END + ble .LZGEMM_L1_END mr CO, C mr AO, A srawi. I, M, 3 - ble ZGEMM_L1x8_END + ble .LZGEMM_L1x8_END -ZGEMM_L1x8_BEGIN: +.LZGEMM_L1x8_BEGIN: mr BO, B srawi. L, K, 3 - ble ZGEMM_L1x8_SUB0 + ble .LZGEMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble ZGEMM_L1x8_SUB4 + ble .LZGEMM_L1x8_SUB4 -ZGEMM_L1x8_LOOP_START: +.LZGEMM_L1x8_LOOP_START: dcbt AO, PRE LOAD1x8_1 @@ -499,11 +499,11 @@ ZGEMM_L1x8_LOOP_START: KERNEL1x8_2 addic. L, L, -2 - ble ZGEMM_L1x8_LOOP_END + ble .LZGEMM_L1x8_LOOP_END .align 5 -ZGEMM_L1x8_LOOP: +.LZGEMM_L1x8_LOOP: dcbt AO, PRE KERNEL1x8_1 @@ -524,9 +524,9 @@ ZGEMM_L1x8_LOOP: KERNEL1x8_2 addic. L, L, -1 - bgt ZGEMM_L1x8_LOOP + bgt .LZGEMM_L1x8_LOOP -ZGEMM_L1x8_LOOP_END: +.LZGEMM_L1x8_LOOP_END: dcbt AO, PRE KERNEL1x8_1 @@ -545,9 +545,9 @@ ZGEMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_E2 - b ZGEMM_L1x8_SUB1 + b .LZGEMM_L1x8_SUB1 -ZGEMM_L1x8_SUB4: +.LZGEMM_L1x8_SUB4: dcbt AO, PRE KERNEL1x8_SUBI1 @@ -563,53 +563,53 @@ ZGEMM_L1x8_SUB4: KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b ZGEMM_L1x8_SUB1 + b .LZGEMM_L1x8_SUB1 -ZGEMM_L1x8_SUB0: +.LZGEMM_L1x8_SUB0: andi. L, K, 7 KERNEL1x8_SUBI1 addic. L, L, -1 - ble ZGEMM_L1x8_SAVE - b ZGEMM_L1x8_SUB2 + ble .LZGEMM_L1x8_SAVE + b .LZGEMM_L1x8_SUB2 -ZGEMM_L1x8_SUB1: +.LZGEMM_L1x8_SUB1: andi. L, K, 7 - ble ZGEMM_L1x8_SAVE + ble .LZGEMM_L1x8_SAVE -ZGEMM_L1x8_SUB2: +.LZGEMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 - bgt ZGEMM_L1x8_SUB2 + bgt .LZGEMM_L1x8_SUB2 -ZGEMM_L1x8_SAVE: +.LZGEMM_L1x8_SAVE: SAVE1x8 addic. I, I, -1 - bgt ZGEMM_L1x8_BEGIN + bgt .LZGEMM_L1x8_BEGIN -ZGEMM_L1x8_END: +.LZGEMM_L1x8_END: -ZGEMM_L1x4_BEGIN: +.LZGEMM_L1x4_BEGIN: andi. T2, M, 7 - ble ZGEMM_L1x1_END + ble .LZGEMM_L1x1_END andi. T1, M, 4 - ble ZGEMM_L1x4_END + ble .LZGEMM_L1x4_END mr BO, B srawi. L, K, 3 - ble ZGEMM_L1x4_SUB0 + ble .LZGEMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble ZGEMM_L1x4_SUB4 + ble .LZGEMM_L1x4_SUB4 -ZGEMM_L1x4_LOOP_START: +.LZGEMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -623,11 +623,11 @@ ZGEMM_L1x4_LOOP_START: KERNEL1x4_2 addic. L, L, -2 - ble ZGEMM_L1x4_LOOP_END + ble .LZGEMM_L1x4_LOOP_END .align 5 -ZGEMM_L1x4_LOOP: +.LZGEMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -640,9 +640,9 @@ ZGEMM_L1x4_LOOP: KERNEL1x4_2 addic. 
L, L, -1 - bgt ZGEMM_L1x4_LOOP + bgt .LZGEMM_L1x4_LOOP -ZGEMM_L1x4_LOOP_END: +.LZGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -654,9 +654,9 @@ ZGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_E2 - b ZGEMM_L1x4_SUB1 + b .LZGEMM_L1x4_SUB1 -ZGEMM_L1x4_SUB4: +.LZGEMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -668,48 +668,48 @@ ZGEMM_L1x4_SUB4: KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b ZGEMM_L1x4_SUB1 + b .LZGEMM_L1x4_SUB1 -ZGEMM_L1x4_SUB0: +.LZGEMM_L1x4_SUB0: andi. L, K, 7 KERNEL1x4_SUBI1 addic. L, L, -1 - ble ZGEMM_L1x4_SAVE - b ZGEMM_L1x4_SUB2 + ble .LZGEMM_L1x4_SAVE + b .LZGEMM_L1x4_SUB2 -ZGEMM_L1x4_SUB1: +.LZGEMM_L1x4_SUB1: andi. L, K, 7 - ble ZGEMM_L1x4_SAVE + ble .LZGEMM_L1x4_SAVE -ZGEMM_L1x4_SUB2: +.LZGEMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt ZGEMM_L1x4_SUB2 + bgt .LZGEMM_L1x4_SUB2 -ZGEMM_L1x4_SAVE: +.LZGEMM_L1x4_SAVE: SAVE1x4 -ZGEMM_L1x4_END: +.LZGEMM_L1x4_END: -ZGEMM_L1x2_BEGIN: +.LZGEMM_L1x2_BEGIN: andi. T1, M, 2 - ble ZGEMM_L1x2_END + ble .LZGEMM_L1x2_END mr BO, B srawi. L, K, 3 - ble ZGEMM_L1x2_SUB0 + ble .LZGEMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble ZGEMM_L1x2_SUB4 + ble .LZGEMM_L1x2_SUB4 -ZGEMM_L1x2_LOOP_START: +.LZGEMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -723,11 +723,11 @@ ZGEMM_L1x2_LOOP_START: KERNEL1x2_2 addic. L, L, -2 - ble ZGEMM_L1x2_LOOP_END + ble .LZGEMM_L1x2_LOOP_END .align 5 -ZGEMM_L1x2_LOOP: +.LZGEMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -740,9 +740,9 @@ ZGEMM_L1x2_LOOP: KERNEL1x2_2 addic. L, L, -1 - bgt ZGEMM_L1x2_LOOP + bgt .LZGEMM_L1x2_LOOP -ZGEMM_L1x2_LOOP_END: +.LZGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -754,9 +754,9 @@ ZGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_E2 - b ZGEMM_L1x2_SUB1 + b .LZGEMM_L1x2_SUB1 -ZGEMM_L1x2_SUB4: +.LZGEMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -768,48 +768,48 @@ ZGEMM_L1x2_SUB4: KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b ZGEMM_L1x2_SUB1 + b .LZGEMM_L1x2_SUB1 -ZGEMM_L1x2_SUB0: +.LZGEMM_L1x2_SUB0: andi. L, K, 7 KERNEL1x2_SUBI1 addic. L, L, -1 - ble ZGEMM_L1x2_SAVE - b ZGEMM_L1x2_SUB2 + ble .LZGEMM_L1x2_SAVE + b .LZGEMM_L1x2_SUB2 -ZGEMM_L1x2_SUB1: +.LZGEMM_L1x2_SUB1: andi. L, K, 7 - ble ZGEMM_L1x2_SAVE + ble .LZGEMM_L1x2_SAVE -ZGEMM_L1x2_SUB2: +.LZGEMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt ZGEMM_L1x2_SUB2 + bgt .LZGEMM_L1x2_SUB2 -ZGEMM_L1x2_SAVE: +.LZGEMM_L1x2_SAVE: SAVE1x2 -ZGEMM_L1x2_END: +.LZGEMM_L1x2_END: -ZGEMM_L1x1_BEGIN: +.LZGEMM_L1x1_BEGIN: andi. T1, M, 1 - ble ZGEMM_L1x1_END + ble .LZGEMM_L1x1_END mr BO, B srawi. L, K, 3 - ble ZGEMM_L1x1_SUB0 + ble .LZGEMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble ZGEMM_L1x1_SUB4 + ble .LZGEMM_L1x1_SUB4 -ZGEMM_L1x1_LOOP_START: +.LZGEMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -823,11 +823,11 @@ ZGEMM_L1x1_LOOP_START: KERNEL1x1_2 addic. L, L, -2 - ble ZGEMM_L1x1_LOOP_END + ble .LZGEMM_L1x1_LOOP_END .align 5 -ZGEMM_L1x1_LOOP: +.LZGEMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -840,9 +840,9 @@ ZGEMM_L1x1_LOOP: KERNEL1x1_2 addic. L, L, -1 - bgt ZGEMM_L1x1_LOOP + bgt .LZGEMM_L1x1_LOOP -ZGEMM_L1x1_LOOP_END: +.LZGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -854,9 +854,9 @@ ZGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_E2 - b ZGEMM_L1x1_SUB1 + b .LZGEMM_L1x1_SUB1 -ZGEMM_L1x1_SUB4: +.LZGEMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -868,34 +868,34 @@ ZGEMM_L1x1_SUB4: KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b ZGEMM_L1x1_SUB1 + b .LZGEMM_L1x1_SUB1 -ZGEMM_L1x1_SUB0: +.LZGEMM_L1x1_SUB0: andi. L, K, 7 KERNEL1x1_SUBI1 addic. L, L, -1 - ble ZGEMM_L1x1_SAVE - b ZGEMM_L1x1_SUB2 + ble .LZGEMM_L1x1_SAVE + b .LZGEMM_L1x1_SUB2 -ZGEMM_L1x1_SUB1: +.LZGEMM_L1x1_SUB1: andi. 
L, K, 7 - ble ZGEMM_L1x1_SAVE + ble .LZGEMM_L1x1_SAVE -ZGEMM_L1x1_SUB2: +.LZGEMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 - bgt ZGEMM_L1x1_SUB2 + bgt .LZGEMM_L1x1_SUB2 -ZGEMM_L1x1_SAVE: +.LZGEMM_L1x1_SAVE: SAVE1x1 -ZGEMM_L1x1_END: +.LZGEMM_L1x1_END: -ZGEMM_L1_END: +.LZGEMM_L1_END: diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S index 3e5ea9ce8..701ec65c8 100644 --- a/kernel/power/zgemm_macros_8x2_power8.S +++ b/kernel/power/zgemm_macros_8x2_power8.S @@ -1,3 +1,39 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define XSFADD_R1 xsadddp diff --git a/kernel/power/ztrmm_kernel_8x2_power8.S b/kernel/power/ztrmm_kernel_8x2_power8.S index dbbc8f9ac..8b953765e 100644 --- a/kernel/power/ztrmm_kernel_8x2_power8.S +++ b/kernel/power/ztrmm_kernel_8x2_power8.S @@ -1,3 +1,38 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ @@ -239,11 +274,11 @@ #include "zgemm_macros_8x2_power8.S" cmpwi cr0, M, 0 - ble L999 + ble .L999 cmpwi cr0, N, 0 - ble L999 + ble .L999 cmpwi cr0, K, 0 - ble L999 + ble .L999 slwi LDC, LDC, ZBASE_SHIFT li PRE, 256 @@ -266,7 +301,7 @@ #include "ztrmm_logic_8x2_power8.S" -L999: +.L999: addi r3, 0, 0 lfd f14, 0(SP) diff --git a/kernel/power/ztrmm_logic_8x2_power8.S b/kernel/power/ztrmm_logic_8x2_power8.S index e250dfac5..f422b17b1 100644 --- a/kernel/power/ztrmm_logic_8x2_power8.S +++ b/kernel/power/ztrmm_logic_8x2_power8.S @@ -1,7 +1,43 @@ - srawi. J, N, 1 - ble ZTRMM_L2_END +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ -ZTRMM_L2_BEGIN: +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + srawi. J, N, 1 + ble .LZTRMM_L2_END + +.LZTRMM_L2_BEGIN: mr CO, C mr AO, A @@ -13,9 +49,9 @@ ZTRMM_L2_BEGIN: #endif srawi. I, M, 3 - ble ZTRMM_L2x8_END + ble .LZTRMM_L2x8_END -ZTRMM_L2x8_BEGIN: +.LZTRMM_L2x8_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -42,11 +78,11 @@ ZTRMM_L2x8_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble ZTRMM_L2x8_SUB0 + ble .LZTRMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble ZTRMM_L2x8_SUB4 + ble .LZTRMM_L2x8_SUB4 -ZTRMM_L2x8_LOOP_START: +.LZTRMM_L2x8_LOOP_START: dcbt AO, PRE LOAD2x8_1 @@ -69,11 +105,11 @@ ZTRMM_L2x8_LOOP_START: KERNEL2x8_2 addic. L, L, -2 - ble ZTRMM_L2x8_LOOP_END + ble .LZTRMM_L2x8_LOOP_END .align 5 -ZTRMM_L2x8_LOOP: +.LZTRMM_L2x8_LOOP: dcbt AO, PRE KERNEL2x8_1 @@ -94,9 +130,9 @@ ZTRMM_L2x8_LOOP: KERNEL2x8_2 addic. L, L, -1 - bgt ZTRMM_L2x8_LOOP + bgt .LZTRMM_L2x8_LOOP -ZTRMM_L2x8_LOOP_END: +.LZTRMM_L2x8_LOOP_END: dcbt AO, PRE KERNEL2x8_1 @@ -115,9 +151,9 @@ ZTRMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_E2 - b ZTRMM_L2x8_SUB1 + b .LZTRMM_L2x8_SUB1 -ZTRMM_L2x8_SUB4: +.LZTRMM_L2x8_SUB4: dcbt AO, PRE KERNEL2x8_SUBI1 @@ -133,31 +169,31 @@ ZTRMM_L2x8_SUB4: KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b ZTRMM_L2x8_SUB1 + b .LZTRMM_L2x8_SUB1 -ZTRMM_L2x8_SUB0: +.LZTRMM_L2x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x8_SUBI1 addic. L, L, -1 - ble ZTRMM_L2x8_SAVE - b ZTRMM_L2x8_SUB2 + ble .LZTRMM_L2x8_SAVE + b .LZTRMM_L2x8_SUB2 -ZTRMM_L2x8_SUB1: +.LZTRMM_L2x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble ZTRMM_L2x8_SAVE + ble .LZTRMM_L2x8_SAVE -ZTRMM_L2x8_SUB2: +.LZTRMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 - bgt ZTRMM_L2x8_SUB2 + bgt .LZTRMM_L2x8_SUB2 -ZTRMM_L2x8_SAVE: +.LZTRMM_L2x8_SAVE: SAVE2x8 @@ -175,16 +211,16 @@ ZTRMM_L2x8_SAVE: addic. I, I, -1 - bgt ZTRMM_L2x8_BEGIN + bgt .LZTRMM_L2x8_BEGIN -ZTRMM_L2x8_END: +.LZTRMM_L2x8_END: -ZTRMM_L2x4_BEGIN: +.LZTRMM_L2x4_BEGIN: andi. T2, M, 7 - ble ZTRMM_L2x1_END + ble .LZTRMM_L2x1_END andi. T1, M, 4 - ble ZTRMM_L2x4_END + ble .LZTRMM_L2x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -210,11 +246,11 @@ ZTRMM_L2x4_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble ZTRMM_L2x4_SUB0 + ble .LZTRMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble ZTRMM_L2x4_SUB4 + ble .LZTRMM_L2x4_SUB4 -ZTRMM_L2x4_LOOP_START: +.LZTRMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -228,11 +264,11 @@ ZTRMM_L2x4_LOOP_START: KERNEL2x4_2 addic. L, L, -2 - ble ZTRMM_L2x4_LOOP_END + ble .LZTRMM_L2x4_LOOP_END .align 5 -ZTRMM_L2x4_LOOP: +.LZTRMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -245,9 +281,9 @@ ZTRMM_L2x4_LOOP: KERNEL2x4_2 addic. 
L, L, -1 - bgt ZTRMM_L2x4_LOOP + bgt .LZTRMM_L2x4_LOOP -ZTRMM_L2x4_LOOP_END: +.LZTRMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -259,9 +295,9 @@ ZTRMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_E2 - b ZTRMM_L2x4_SUB1 + b .LZTRMM_L2x4_SUB1 -ZTRMM_L2x4_SUB4: +.LZTRMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -273,31 +309,31 @@ ZTRMM_L2x4_SUB4: KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b ZTRMM_L2x4_SUB1 + b .LZTRMM_L2x4_SUB1 -ZTRMM_L2x4_SUB0: +.LZTRMM_L2x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x4_SUBI1 addic. L, L, -1 - ble ZTRMM_L2x4_SAVE - b ZTRMM_L2x4_SUB2 + ble .LZTRMM_L2x4_SAVE + b .LZTRMM_L2x4_SUB2 -ZTRMM_L2x4_SUB1: +.LZTRMM_L2x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble ZTRMM_L2x4_SAVE + ble .LZTRMM_L2x4_SAVE -ZTRMM_L2x4_SUB2: +.LZTRMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt ZTRMM_L2x4_SUB2 + bgt .LZTRMM_L2x4_SUB2 -ZTRMM_L2x4_SAVE: +.LZTRMM_L2x4_SAVE: SAVE2x4 @@ -314,12 +350,12 @@ ZTRMM_L2x4_SAVE: #endif -ZTRMM_L2x4_END: +.LZTRMM_L2x4_END: -ZTRMM_L2x2_BEGIN: +.LZTRMM_L2x2_BEGIN: andi. T1, M, 2 - ble ZTRMM_L2x2_END + ble .LZTRMM_L2x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -345,11 +381,11 @@ ZTRMM_L2x2_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble ZTRMM_L2x2_SUB0 + ble .LZTRMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble ZTRMM_L2x2_SUB4 + ble .LZTRMM_L2x2_SUB4 -ZTRMM_L2x2_LOOP_START: +.LZTRMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -363,11 +399,11 @@ ZTRMM_L2x2_LOOP_START: KERNEL2x2_2 addic. L, L, -2 - ble ZTRMM_L2x2_LOOP_END + ble .LZTRMM_L2x2_LOOP_END .align 5 -ZTRMM_L2x2_LOOP: +.LZTRMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -380,9 +416,9 @@ ZTRMM_L2x2_LOOP: KERNEL2x2_2 addic. L, L, -1 - bgt ZTRMM_L2x2_LOOP + bgt .LZTRMM_L2x2_LOOP -ZTRMM_L2x2_LOOP_END: +.LZTRMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -394,9 +430,9 @@ ZTRMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_E2 - b ZTRMM_L2x2_SUB1 + b .LZTRMM_L2x2_SUB1 -ZTRMM_L2x2_SUB4: +.LZTRMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -408,31 +444,31 @@ ZTRMM_L2x2_SUB4: KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b ZTRMM_L2x2_SUB1 + b .LZTRMM_L2x2_SUB1 -ZTRMM_L2x2_SUB0: +.LZTRMM_L2x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x2_SUBI1 addic. L, L, -1 - ble ZTRMM_L2x2_SAVE - b ZTRMM_L2x2_SUB2 + ble .LZTRMM_L2x2_SAVE + b .LZTRMM_L2x2_SUB2 -ZTRMM_L2x2_SUB1: +.LZTRMM_L2x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble ZTRMM_L2x2_SAVE + ble .LZTRMM_L2x2_SAVE -ZTRMM_L2x2_SUB2: +.LZTRMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt ZTRMM_L2x2_SUB2 + bgt .LZTRMM_L2x2_SUB2 -ZTRMM_L2x2_SAVE: +.LZTRMM_L2x2_SAVE: SAVE2x2 @@ -449,12 +485,12 @@ ZTRMM_L2x2_SAVE: #endif -ZTRMM_L2x2_END: +.LZTRMM_L2x2_END: -ZTRMM_L2x1_BEGIN: +.LZTRMM_L2x1_BEGIN: andi. T1, M, 1 - ble ZTRMM_L2x1_END + ble .LZTRMM_L2x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -480,11 +516,11 @@ ZTRMM_L2x1_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble ZTRMM_L2x1_SUB0 + ble .LZTRMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble ZTRMM_L2x1_SUB4 + ble .LZTRMM_L2x1_SUB4 -ZTRMM_L2x1_LOOP_START: +.LZTRMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -498,11 +534,11 @@ ZTRMM_L2x1_LOOP_START: KERNEL2x1_2 addic. L, L, -2 - ble ZTRMM_L2x1_LOOP_END + ble .LZTRMM_L2x1_LOOP_END .align 5 -ZTRMM_L2x1_LOOP: +.LZTRMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -515,9 +551,9 @@ ZTRMM_L2x1_LOOP: KERNEL2x1_2 addic. 
L, L, -1 - bgt ZTRMM_L2x1_LOOP + bgt .LZTRMM_L2x1_LOOP -ZTRMM_L2x1_LOOP_END: +.LZTRMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -529,9 +565,9 @@ ZTRMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_E2 - b ZTRMM_L2x1_SUB1 + b .LZTRMM_L2x1_SUB1 -ZTRMM_L2x1_SUB4: +.LZTRMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -543,31 +579,31 @@ ZTRMM_L2x1_SUB4: KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b ZTRMM_L2x1_SUB1 + b .LZTRMM_L2x1_SUB1 -ZTRMM_L2x1_SUB0: +.LZTRMM_L2x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x1_SUBI1 addic. L, L, -1 - ble ZTRMM_L2x1_SAVE - b ZTRMM_L2x1_SUB2 + ble .LZTRMM_L2x1_SAVE + b .LZTRMM_L2x1_SUB2 -ZTRMM_L2x1_SUB1: +.LZTRMM_L2x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble ZTRMM_L2x1_SAVE + ble .LZTRMM_L2x1_SAVE -ZTRMM_L2x1_SUB2: +.LZTRMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 - bgt ZTRMM_L2x1_SUB2 + bgt .LZTRMM_L2x1_SUB2 -ZTRMM_L2x1_SAVE: +.LZTRMM_L2x1_SAVE: SAVE2x1 @@ -584,7 +620,7 @@ ZTRMM_L2x1_SAVE: #endif -ZTRMM_L2x1_END: +.LZTRMM_L2x1_END: slwi T1, K, 5 add B, B, T1 @@ -595,23 +631,23 @@ ZTRMM_L2x1_END: addic. J, J, -1 - bgt ZTRMM_L2_BEGIN + bgt .LZTRMM_L2_BEGIN andi. T2, N, 1 - ble L999 + ble .L999 -ZTRMM_L2_END: +.LZTRMM_L2_END: - b ZTRMM_L1_BEGIN + b .LZTRMM_L1_BEGIN -L999_H1: +.L999_H1: - b L999 + b .L999 -ZTRMM_L1_BEGIN: +.LZTRMM_L1_BEGIN: andi. T1, N, 1 - ble ZTRMM_L1_END + ble .LZTRMM_L1_END mr CO, C mr AO, A @@ -620,9 +656,9 @@ ZTRMM_L1_BEGIN: #endif srawi. I, M, 3 - ble ZTRMM_L1x8_END + ble .LZTRMM_L1x8_END -ZTRMM_L1x8_BEGIN: +.LZTRMM_L1x8_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -649,11 +685,11 @@ ZTRMM_L1x8_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble ZTRMM_L1x8_SUB0 + ble .LZTRMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble ZTRMM_L1x8_SUB4 + ble .LZTRMM_L1x8_SUB4 -ZTRMM_L1x8_LOOP_START: +.LZTRMM_L1x8_LOOP_START: dcbt AO, PRE LOAD1x8_1 @@ -676,11 +712,11 @@ ZTRMM_L1x8_LOOP_START: KERNEL1x8_2 addic. L, L, -2 - ble ZTRMM_L1x8_LOOP_END + ble .LZTRMM_L1x8_LOOP_END .align 5 -ZTRMM_L1x8_LOOP: +.LZTRMM_L1x8_LOOP: dcbt AO, PRE KERNEL1x8_1 @@ -701,9 +737,9 @@ ZTRMM_L1x8_LOOP: KERNEL1x8_2 addic. L, L, -1 - bgt ZTRMM_L1x8_LOOP + bgt .LZTRMM_L1x8_LOOP -ZTRMM_L1x8_LOOP_END: +.LZTRMM_L1x8_LOOP_END: dcbt AO, PRE KERNEL1x8_1 @@ -722,9 +758,9 @@ ZTRMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_E2 - b ZTRMM_L1x8_SUB1 + b .LZTRMM_L1x8_SUB1 -ZTRMM_L1x8_SUB4: +.LZTRMM_L1x8_SUB4: dcbt AO, PRE KERNEL1x8_SUBI1 @@ -740,31 +776,31 @@ ZTRMM_L1x8_SUB4: KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b ZTRMM_L1x8_SUB1 + b .LZTRMM_L1x8_SUB1 -ZTRMM_L1x8_SUB0: +.LZTRMM_L1x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x8_SUBI1 addic. L, L, -1 - ble ZTRMM_L1x8_SAVE - b ZTRMM_L1x8_SUB2 + ble .LZTRMM_L1x8_SAVE + b .LZTRMM_L1x8_SUB2 -ZTRMM_L1x8_SUB1: +.LZTRMM_L1x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble ZTRMM_L1x8_SAVE + ble .LZTRMM_L1x8_SAVE -ZTRMM_L1x8_SUB2: +.LZTRMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 - bgt ZTRMM_L1x8_SUB2 + bgt .LZTRMM_L1x8_SUB2 -ZTRMM_L1x8_SAVE: +.LZTRMM_L1x8_SAVE: SAVE1x8 @@ -782,16 +818,16 @@ ZTRMM_L1x8_SAVE: addic. I, I, -1 - bgt ZTRMM_L1x8_BEGIN + bgt .LZTRMM_L1x8_BEGIN -ZTRMM_L1x8_END: +.LZTRMM_L1x8_END: -ZTRMM_L1x4_BEGIN: +.LZTRMM_L1x4_BEGIN: andi. T2, M, 7 - ble ZTRMM_L1x1_END + ble .LZTRMM_L1x1_END andi. T1, M, 4 - ble ZTRMM_L1x4_END + ble .LZTRMM_L1x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -817,11 +853,11 @@ ZTRMM_L1x4_BEGIN: mr KKK, T1 mr K1, T1 srawi. 
L, K1, 3 // KTEMP / 8 -> L - ble ZTRMM_L1x4_SUB0 + ble .LZTRMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble ZTRMM_L1x4_SUB4 + ble .LZTRMM_L1x4_SUB4 -ZTRMM_L1x4_LOOP_START: +.LZTRMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -835,11 +871,11 @@ ZTRMM_L1x4_LOOP_START: KERNEL1x4_2 addic. L, L, -2 - ble ZTRMM_L1x4_LOOP_END + ble .LZTRMM_L1x4_LOOP_END .align 5 -ZTRMM_L1x4_LOOP: +.LZTRMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -852,9 +888,9 @@ ZTRMM_L1x4_LOOP: KERNEL1x4_2 addic. L, L, -1 - bgt ZTRMM_L1x4_LOOP + bgt .LZTRMM_L1x4_LOOP -ZTRMM_L1x4_LOOP_END: +.LZTRMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -866,9 +902,9 @@ ZTRMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_E2 - b ZTRMM_L1x4_SUB1 + b .LZTRMM_L1x4_SUB1 -ZTRMM_L1x4_SUB4: +.LZTRMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -880,31 +916,31 @@ ZTRMM_L1x4_SUB4: KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b ZTRMM_L1x4_SUB1 + b .LZTRMM_L1x4_SUB1 -ZTRMM_L1x4_SUB0: +.LZTRMM_L1x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x4_SUBI1 addic. L, L, -1 - ble ZTRMM_L1x4_SAVE - b ZTRMM_L1x4_SUB2 + ble .LZTRMM_L1x4_SAVE + b .LZTRMM_L1x4_SUB2 -ZTRMM_L1x4_SUB1: +.LZTRMM_L1x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble ZTRMM_L1x4_SAVE + ble .LZTRMM_L1x4_SAVE -ZTRMM_L1x4_SUB2: +.LZTRMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt ZTRMM_L1x4_SUB2 + bgt .LZTRMM_L1x4_SUB2 -ZTRMM_L1x4_SAVE: +.LZTRMM_L1x4_SAVE: SAVE1x4 @@ -921,12 +957,12 @@ ZTRMM_L1x4_SAVE: #endif -ZTRMM_L1x4_END: +.LZTRMM_L1x4_END: -ZTRMM_L1x2_BEGIN: +.LZTRMM_L1x2_BEGIN: andi. T1, M, 2 - ble ZTRMM_L1x2_END + ble .LZTRMM_L1x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -952,11 +988,11 @@ ZTRMM_L1x2_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble ZTRMM_L1x2_SUB0 + ble .LZTRMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble ZTRMM_L1x2_SUB4 + ble .LZTRMM_L1x2_SUB4 -ZTRMM_L1x2_LOOP_START: +.LZTRMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -970,11 +1006,11 @@ ZTRMM_L1x2_LOOP_START: KERNEL1x2_2 addic. L, L, -2 - ble ZTRMM_L1x2_LOOP_END + ble .LZTRMM_L1x2_LOOP_END .align 5 -ZTRMM_L1x2_LOOP: +.LZTRMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -987,9 +1023,9 @@ ZTRMM_L1x2_LOOP: KERNEL1x2_2 addic. L, L, -1 - bgt ZTRMM_L1x2_LOOP + bgt .LZTRMM_L1x2_LOOP -ZTRMM_L1x2_LOOP_END: +.LZTRMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -1001,9 +1037,9 @@ ZTRMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_E2 - b ZTRMM_L1x2_SUB1 + b .LZTRMM_L1x2_SUB1 -ZTRMM_L1x2_SUB4: +.LZTRMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -1015,31 +1051,31 @@ ZTRMM_L1x2_SUB4: KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b ZTRMM_L1x2_SUB1 + b .LZTRMM_L1x2_SUB1 -ZTRMM_L1x2_SUB0: +.LZTRMM_L1x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x2_SUBI1 addic. L, L, -1 - ble ZTRMM_L1x2_SAVE - b ZTRMM_L1x2_SUB2 + ble .LZTRMM_L1x2_SAVE + b .LZTRMM_L1x2_SUB2 -ZTRMM_L1x2_SUB1: +.LZTRMM_L1x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble ZTRMM_L1x2_SAVE + ble .LZTRMM_L1x2_SAVE -ZTRMM_L1x2_SUB2: +.LZTRMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt ZTRMM_L1x2_SUB2 + bgt .LZTRMM_L1x2_SUB2 -ZTRMM_L1x2_SAVE: +.LZTRMM_L1x2_SAVE: SAVE1x2 @@ -1056,12 +1092,12 @@ ZTRMM_L1x2_SAVE: #endif -ZTRMM_L1x2_END: +.LZTRMM_L1x2_END: -ZTRMM_L1x1_BEGIN: +.LZTRMM_L1x1_BEGIN: andi. T1, M, 1 - ble ZTRMM_L1x1_END + ble .LZTRMM_L1x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1087,11 +1123,11 @@ ZTRMM_L1x1_BEGIN: mr KKK, T1 mr K1, T1 srawi. 
L, K1, 3 // KTEMP / 8 -> L - ble ZTRMM_L1x1_SUB0 + ble .LZTRMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble ZTRMM_L1x1_SUB4 + ble .LZTRMM_L1x1_SUB4 -ZTRMM_L1x1_LOOP_START: +.LZTRMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -1105,11 +1141,11 @@ ZTRMM_L1x1_LOOP_START: KERNEL1x1_2 addic. L, L, -2 - ble ZTRMM_L1x1_LOOP_END + ble .LZTRMM_L1x1_LOOP_END .align 5 -ZTRMM_L1x1_LOOP: +.LZTRMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -1122,9 +1158,9 @@ ZTRMM_L1x1_LOOP: KERNEL1x1_2 addic. L, L, -1 - bgt ZTRMM_L1x1_LOOP + bgt .LZTRMM_L1x1_LOOP -ZTRMM_L1x1_LOOP_END: +.LZTRMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -1136,9 +1172,9 @@ ZTRMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_E2 - b ZTRMM_L1x1_SUB1 + b .LZTRMM_L1x1_SUB1 -ZTRMM_L1x1_SUB4: +.LZTRMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -1150,31 +1186,31 @@ ZTRMM_L1x1_SUB4: KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b ZTRMM_L1x1_SUB1 + b .LZTRMM_L1x1_SUB1 -ZTRMM_L1x1_SUB0: +.LZTRMM_L1x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x1_SUBI1 addic. L, L, -1 - ble ZTRMM_L1x1_SAVE - b ZTRMM_L1x1_SUB2 + ble .LZTRMM_L1x1_SAVE + b .LZTRMM_L1x1_SUB2 -ZTRMM_L1x1_SUB1: +.LZTRMM_L1x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble ZTRMM_L1x1_SAVE + ble .LZTRMM_L1x1_SAVE -ZTRMM_L1x1_SUB2: +.LZTRMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 - bgt ZTRMM_L1x1_SUB2 + bgt .LZTRMM_L1x1_SUB2 -ZTRMM_L1x1_SAVE: +.LZTRMM_L1x1_SAVE: SAVE1x1 @@ -1191,11 +1227,11 @@ ZTRMM_L1x1_SAVE: #endif -ZTRMM_L1x1_END: +.LZTRMM_L1x1_END: #if !defined(LEFT) addi KK, KK, 1 // KK += Number of values in B #endif -ZTRMM_L1_END: +.LZTRMM_L1_END: From d23c7c713cf236fb9ddc92ac63e8af7605f6866b Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 5 Mar 2016 09:34:37 -0500 Subject: [PATCH 18/37] Fixed #789 Fix utest/ctest.h on Mingw. --- utest/ctest.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utest/ctest.h b/utest/ctest.h index 6d859bc4f..a62103ff5 100644 --- a/utest/ctest.h +++ b/utest/ctest.h @@ -58,6 +58,10 @@ struct ctest { #define __CTEST_APPLE #endif +#ifdef __MINGW32__ +#undef CTEST_SEGFAULT +#endif + #if defined(_WIN32) && defined(_MSC_VER) #define __CTEST_MSVC #endif From 53ba1a77c854794aafd368567505d4bd5aa8d7d7 Mon Sep 17 00:00:00 2001 From: Jerome Robert Date: Sat, 5 Mar 2016 19:07:03 +0100 Subject: [PATCH 19/37] ztrmv_L.c: no longer need a 4kB buffer Fix #786 --- driver/level2/ztrmv_L.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/level2/ztrmv_L.c b/driver/level2/ztrmv_L.c index 92c86aec2..2d5fb7802 100644 --- a/driver/level2/ztrmv_L.c +++ b/driver/level2/ztrmv_L.c @@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15); COPY_K(m, b, incb, buffer, 1); } From 8d652f11e7c6a728e54cc57a32a45d5516cc916e Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 6 Mar 2016 08:40:51 +0100 Subject: [PATCH 20/37] updated smallscaling.c to build without C99 or C11 increased the threshold value of nep.in to 40 --- benchmark/smallscaling.c | 13 +++++++++---- lapack-netlib/TESTING/nep.in | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/benchmark/smallscaling.c b/benchmark/smallscaling.c index daed8f3da..9068c61b1 100644 --- a/benchmark/smallscaling.c +++ b/benchmark/smallscaling.c @@ -23,28 +23,32 @@ typedef struct { void * s_create_matrix(int size) { float * r = malloc(size * sizeof(double)); - for(int i = 0; i < size; i++) + int i; + for(i = 
0; i < size; i++) r[i] = 1e3 * i / size; return r; } void * c_create_matrix(int size) { float * r = malloc(size * 2 * sizeof(double)); - for(int i = 0; i < 2 * size; i++) + int i; + for(i = 0; i < 2 * size; i++) r[i] = 1e3 * i / size; return r; } void * z_create_matrix(int size) { double * r = malloc(size * 2 * sizeof(double)); - for(int i = 0; i < 2 * size; i++) + int i; + for(i = 0; i < 2 * size; i++) r[i] = 1e3 * i / size; return r; } void * d_create_matrix(int size) { double * r = malloc(size * sizeof(double)); - for(int i = 0; i < size; i++) + int i; + for(i = 0; i < size; i++) r[i] = 1e3 * i / size; return r; } @@ -188,4 +192,5 @@ int main(int argc, char * argv[]) { size *= inc_factor; } } + return(0); } diff --git a/lapack-netlib/TESTING/nep.in b/lapack-netlib/TESTING/nep.in index ed6869b80..af427fbde 100644 --- a/lapack-netlib/TESTING/nep.in +++ b/lapack-netlib/TESTING/nep.in @@ -10,7 +10,7 @@ NEP: Data file for testing Nonsymmetric Eigenvalue Problem routines 0 5 7 3 200 Values of INIBL (nibble crossover point) 1 2 4 2 1 Values of ISHFTS (number of simultaneous shifts) 0 1 2 0 1 Values of IACC22 (select structured matrix multiply: 0, 1 or 2) -30.0 Threshold value +40.0 Threshold value T Put T to test the error exits 1 Code to interpret the seed NEP 21 From cd5241d0cfffd9f04991cfb994012ec9779183ed Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 6 Mar 2016 09:07:24 +0100 Subject: [PATCH 21/37] modified KERNEL for power, to use the generic DSDOT-KERNEL --- kernel/power/KERNEL | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/power/KERNEL b/kernel/power/KERNEL index 3ec0aaa58..c3c86b310 100644 --- a/kernel/power/KERNEL +++ b/kernel/power/KERNEL @@ -46,3 +46,7 @@ ifndef ZGEMM_BETA ZGEMM_BETA = zgemm_beta.S endif +ifndef DSDOTKERNEL +DSDOTKERNEL = ../generic/dot.c +endif + From 0bbca5e80392625e1af911cc8730c4928b412750 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 6 Mar 2016 11:54:41 +0100 Subject: [PATCH 22/37] removed build of smallscaling, because build on arm, arm64 and power fails --- benchmark/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index bcf3da2cc..6354b956a 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -166,8 +166,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ - ssymm.goto dsymm.goto csymm.goto zsymm.goto \ - smallscaling + ssymm.goto dsymm.goto csymm.goto zsymm.goto acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ From 7d6b68eb4ac44071312c93fb715af844c5ced3e7 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 7 Mar 2016 11:34:58 +0800 Subject: [PATCH 23/37] Refs #786. Revert to default assembly kernel. 
--- kernel/x86_64/KERNEL | 6 +++--- kernel/x86_64/KERNEL.BARCELONA | 3 +++ kernel/x86_64/KERNEL.BULLDOZER | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 4874711bb..97edec0ac 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -389,7 +389,7 @@ DGEMVTKERNEL = dgemv_t.S endif ifndef CGEMVNKERNEL -CGEMVNKERNEL = cgemv_n_4.c +CGEMVNKERNEL = cgemv_n.S endif ifndef CGEMVTKERNEL @@ -397,11 +397,11 @@ CGEMVTKERNEL = cgemv_t_4.c endif ifndef ZGEMVNKERNEL -ZGEMVNKERNEL = zgemv_n_4.c +ZGEMVNKERNEL = zgemv_n.S endif ifndef ZGEMVTKERNEL -ZGEMVTKERNEL = zgemv_t_4.c +ZGEMVTKERNEL = zgemv_t.S endif ifndef QGEMVNKERNEL diff --git a/kernel/x86_64/KERNEL.BARCELONA b/kernel/x86_64/KERNEL.BARCELONA index 70f3d6058..313c62d7c 100644 --- a/kernel/x86_64/KERNEL.BARCELONA +++ b/kernel/x86_64/KERNEL.BARCELONA @@ -1,3 +1,6 @@ +ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVTKERNEL = zgemv_t.S + SGEMMKERNEL = gemm_kernel_8x4_barcelona.S SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 90834d9ca..c8ccae1ea 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -18,7 +18,7 @@ SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c -ZGEMVNKERNEL = zgemv_n_4.c +ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t_4.c DGEMVNKERNEL = dgemv_n_bulldozer.S From acdff55a6ae0915b657be0ea282501d390332175 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 7 Mar 2016 09:39:34 +0100 Subject: [PATCH 24/37] Bugfix for ztrmv --- kernel/x86_64/KERNEL | 6 +++--- kernel/x86_64/KERNEL.BARCELONA | 3 --- kernel/x86_64/KERNEL.BULLDOZER | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 97edec0ac..4874711bb 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -389,7 +389,7 @@ DGEMVTKERNEL = dgemv_t.S endif ifndef CGEMVNKERNEL -CGEMVNKERNEL = cgemv_n.S +CGEMVNKERNEL = cgemv_n_4.c endif ifndef CGEMVTKERNEL @@ -397,11 +397,11 @@ CGEMVTKERNEL = cgemv_t_4.c endif ifndef ZGEMVNKERNEL -ZGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n_4.c endif ifndef ZGEMVTKERNEL -ZGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t_4.c endif ifndef QGEMVNKERNEL diff --git a/kernel/x86_64/KERNEL.BARCELONA b/kernel/x86_64/KERNEL.BARCELONA index 313c62d7c..70f3d6058 100644 --- a/kernel/x86_64/KERNEL.BARCELONA +++ b/kernel/x86_64/KERNEL.BARCELONA @@ -1,6 +1,3 @@ -ZGEMVNKERNEL = zgemv_n_dup.S -ZGEMVTKERNEL = zgemv_t.S - SGEMMKERNEL = gemm_kernel_8x4_barcelona.S SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index c8ccae1ea..90834d9ca 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -18,7 +18,7 @@ SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c -ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVNKERNEL = zgemv_n_4.c ZGEMVTKERNEL = zgemv_t_4.c DGEMVNKERNEL = dgemv_n_bulldozer.S From 26b3b3a3e6e0db12a5d7b24d2cc99a23b02f445b Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 7 Mar 2016 10:10:00 +0100 Subject: [PATCH 25/37] bugfixes form lapack svn for bugs #142 - #155 --- lapack-netlib/LAPACKE/src/lapacke_clantr.c | 6 +- lapack-netlib/LAPACKE/src/lapacke_dlantr.c | 6 +- .../LAPACKE/src/lapacke_dlantr_work.c | 4 +- .../LAPACKE/src/lapacke_dormbr_work.c | 11 +- .../LAPACKE/src/lapacke_dormlq_work.c | 7 
+- lapack-netlib/LAPACKE/src/lapacke_slantr.c | 6 +- .../LAPACKE/src/lapacke_slantr_work.c | 2 +- .../LAPACKE/src/lapacke_sormbr_work.c | 7 +- .../LAPACKE/src/lapacke_sormlq_work.c | 6 +- lapack-netlib/LAPACKE/src/lapacke_zlantr.c | 6 +- .../LAPACKE/src/lapacke_zlantr_work.c | 2 +- lapack-netlib/SRC/cgeev.f | 4 +- lapack-netlib/SRC/cgesvdx.f | 85 ++++++----- lapack-netlib/SRC/cgetc2.f | 20 ++- lapack-netlib/SRC/cggev3.f | 8 +- lapack-netlib/SRC/dgeev.f | 4 +- lapack-netlib/SRC/dgesvdx.f | 74 +++++++--- lapack-netlib/SRC/dgetc2.f | 20 ++- lapack-netlib/SRC/sgeev.f | 4 +- lapack-netlib/SRC/sgesvdx.f | 74 +++++++--- lapack-netlib/SRC/sgetc2.f | 20 ++- lapack-netlib/SRC/zgeev.f | 4 +- lapack-netlib/SRC/zgesvdx.f | 132 ++++++++++-------- lapack-netlib/SRC/zgetc2.f | 20 ++- lapack-netlib/SRC/zggev3.f | 4 +- 25 files changed, 357 insertions(+), 179 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_clantr.c b/lapack-netlib/LAPACKE/src/lapacke_clantr.c index 4ba436753..33e6e57ff 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clantr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clantr.c @@ -51,8 +51,7 @@ float LAPACKE_clantr( int matrix_layout, char norm, char uplo, char diag, } #endif /* Allocate memory for working array(s) */ - if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, 'O' ) ) { + if( LAPACKE_lsame( norm, 'i' ) ) { work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,MAX(m,n)) ); if( work == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; @@ -63,8 +62,7 @@ float LAPACKE_clantr( int matrix_layout, char norm, char uplo, char diag, res = LAPACKE_clantr_work( matrix_layout, norm, uplo, diag, m, n, a, lda, work ); /* Release memory and exit */ - if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, 'O' ) ) { + if( LAPACKE_lsame( norm, 'i' ) ) { LAPACKE_free( work ); } exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlantr.c b/lapack-netlib/LAPACKE/src/lapacke_dlantr.c index 65802f3e3..8fd112084 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlantr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlantr.c @@ -51,8 +51,7 @@ double LAPACKE_dlantr( int matrix_layout, char norm, char uplo, char diag, } #endif /* Allocate memory for working array(s) */ - if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, 'O' ) ) { + if( LAPACKE_lsame( norm, 'i' ) ) { work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,MAX(m,n)) ); if( work == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; @@ -63,8 +62,7 @@ double LAPACKE_dlantr( int matrix_layout, char norm, char uplo, char diag, res = LAPACKE_dlantr_work( matrix_layout, norm, uplo, diag, m, n, a, lda, work ); /* Release memory and exit */ - if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, 'O' ) ) { + if( LAPACKE_lsame( norm, 'i' ) ) { LAPACKE_free( work ); } exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c index 59eef3801..ee5e665ee 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c @@ -38,10 +38,10 @@ double LAPACKE_dlantr_work( int matrix_layout, char norm, char uplo, const double* a, lapack_int lda, double* work ) { lapack_int info = 0; - double res = 0.; + double res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); + res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a, &lda, 
work ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dormbr_work.c b/lapack-netlib/LAPACKE/src/lapacke_dormbr_work.c index 9db92ce98..dcd8842fa 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dormbr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dormbr_work.c @@ -74,11 +74,10 @@ lapack_int LAPACKE_dormbr_work( int matrix_layout, char vect, char side, } /* Allocate memory for temporary array(s) */ if( LAPACKE_lsame( vect, 'q' ) ) { - a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * k ); + a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,k) ); } else { - a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * nq ); + a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,nq) ); } - if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_0; @@ -89,11 +88,7 @@ lapack_int LAPACKE_dormbr_work( int matrix_layout, char vect, char side, goto exit_level_1; } /* Transpose input matrices */ - if( LAPACKE_lsame( vect, 'q' ) ) { - LAPACKE_dge_trans( matrix_layout, nq, k, a, lda, a_t, lda_t ); - } else { - LAPACKE_dge_trans( matrix_layout, k, nq, a, lda, a_t, lda_t ); - } + LAPACKE_dge_trans( matrix_layout, r, MIN(nq,k), a, lda, a_t, lda_t ); LAPACKE_dge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ LAPACK_dormbr( &vect, &side, &trans, &m, &n, &k, a_t, &lda_t, tau, c_t, diff --git a/lapack-netlib/LAPACKE/src/lapacke_dormlq_work.c b/lapack-netlib/LAPACKE/src/lapacke_dormlq_work.c index 2a59cd56a..f46c6d3b1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dormlq_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dormlq_work.c @@ -87,12 +87,7 @@ lapack_int LAPACKE_dormlq_work( int matrix_layout, char side, char trans, goto exit_level_1; } /* Transpose input matrices */ - if( LAPACKE_lsame( side, 'l' ) ){ - LAPACKE_dge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); - } else { - LAPACKE_dge_trans( matrix_layout, k, n, a, lda, a_t, lda_t ); - } - + LAPACKE_dge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); LAPACKE_dge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ LAPACK_dormlq( &side, &trans, &m, &n, &k, a_t, &lda_t, tau, c_t, &ldc_t, diff --git a/lapack-netlib/LAPACKE/src/lapacke_slantr.c b/lapack-netlib/LAPACKE/src/lapacke_slantr.c index 99942b352..23b19b15d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slantr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slantr.c @@ -51,8 +51,7 @@ float LAPACKE_slantr( int matrix_layout, char norm, char uplo, char diag, } #endif /* Allocate memory for working array(s) */ - if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, 'O' ) ) { + if( LAPACKE_lsame( norm, 'i' ) ) { work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,MAX(m,n)) ); if( work == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; @@ -63,8 +62,7 @@ float LAPACKE_slantr( int matrix_layout, char norm, char uplo, char diag, res = LAPACKE_slantr_work( matrix_layout, norm, uplo, diag, m, n, a, lda, work ); /* Release memory and exit */ - if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, 'O' ) ) { + if( LAPACKE_lsame( norm, 'i' ) ) { LAPACKE_free( work ); } exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c index 79c71a00f..92a0e4017 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c @@ -41,7 +41,7 @@ float LAPACKE_slantr_work( int matrix_layout, 
char norm, char uplo, float res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); + res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_sormbr_work.c b/lapack-netlib/LAPACKE/src/lapacke_sormbr_work.c index f7a2ff3d0..34bc41f30 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sormbr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sormbr_work.c @@ -73,8 +73,11 @@ lapack_int LAPACKE_sormbr_work( int matrix_layout, char vect, char side, return (info < 0) ? (info - 1) : info; } /* Allocate memory for temporary array(s) */ - a_t = (float*) - LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,MIN(nq,k)) ); + if( LAPACKE_lsame( vect, 'q' ) ) { + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,k) ); + } else { + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,nq) ); + } if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sormlq_work.c b/lapack-netlib/LAPACKE/src/lapacke_sormlq_work.c index a277436cd..b02a2d100 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sormlq_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sormlq_work.c @@ -72,7 +72,11 @@ lapack_int LAPACKE_sormlq_work( int matrix_layout, char side, char trans, return (info < 0) ? (info - 1) : info; } /* Allocate memory for temporary array(s) */ - a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,m) ); + if( LAPACKE_lsame( side, 'l' ) ) { + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,m) ); + } else { + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) ); + } if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlantr.c b/lapack-netlib/LAPACKE/src/lapacke_zlantr.c index b2c637101..2b645e750 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlantr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlantr.c @@ -51,8 +51,7 @@ double LAPACKE_zlantr( int matrix_layout, char norm, char uplo, char diag, } #endif /* Allocate memory for working array(s) */ - if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, 'O' ) ) { + if( LAPACKE_lsame( norm, 'i' ) ) { work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,MAX(m,n)) ); if( work == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; @@ -63,8 +62,7 @@ double LAPACKE_zlantr( int matrix_layout, char norm, char uplo, char diag, res = LAPACKE_zlantr_work( matrix_layout, norm, uplo, diag, m, n, a, lda, work ); /* Release memory and exit */ - if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, 'O' ) ) { + if( LAPACKE_lsame( norm, 'i' ) ) { LAPACKE_free( work ); } exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c index 5fd9f8442..0988bf6e8 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c @@ -39,7 +39,7 @@ double LAPACKE_zlantr_work( int matrix_layout, char norm, char uplo, double* work ) { lapack_int info = 0; - double res = 0.; + double res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ res = LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); diff --git a/lapack-netlib/SRC/cgeev.f b/lapack-netlib/SRC/cgeev.f index b79b64544..0f48322a8 100644 --- a/lapack-netlib/SRC/cgeev.f +++ 
b/lapack-netlib/SRC/cgeev.f @@ -405,9 +405,9 @@ $ WORK( IWRK ), LWORK-IWRK+1, INFO ) END IF * -* If INFO > 0 from CHSEQR, then quit +* If INFO .NE. 0 from CHSEQR, then quit * - IF( INFO.GT.0 ) + IF( INFO.NE.0 ) $ GO TO 50 * IF( WANTVL .OR. WANTVR ) THEN diff --git a/lapack-netlib/SRC/cgesvdx.f b/lapack-netlib/SRC/cgesvdx.f index 235426ad4..87ea9861d 100644 --- a/lapack-netlib/SRC/cgesvdx.f +++ b/lapack-netlib/SRC/cgesvdx.f @@ -170,7 +170,7 @@ *> vectors, stored columnwise) as specified by RANGE; if *> JOBU = 'N', U is not referenced. *> Note: The user must ensure that UCOL >= NS; if RANGE = 'V', -*> the exact value of NS is not known ILQFin advance and an upper +*> the exact value of NS is not known in advance and an upper *> bound must be used. *> \endverbatim *> @@ -294,8 +294,8 @@ CHARACTER JOBZ, RNGTGK LOGICAL ALLS, INDS, LQUERY, VALS, WANTU, WANTVT INTEGER I, ID, IE, IERR, ILQF, ILTGK, IQRF, ISCL, - $ ITAU, ITAUP, ITAUQ, ITEMP, ITGKZ, IUTGK, - $ J, K, MAXWRK, MINMN, MINWRK, MNTHR + $ ITAU, ITAUP, ITAUQ, ITEMP, ITEMPR, ITGKZ, + $ IUTGK, J, K, MAXWRK, MINMN, MINWRK, MNTHR REAL ABSTOL, ANRM, BIGNUM, EPS, SMLNUM * .. * .. Local Arrays .. @@ -367,8 +367,14 @@ IF( INFO.EQ.0 ) THEN IF( WANTU .AND. LDU.LT.M ) THEN INFO = -15 - ELSE IF( WANTVT .AND. LDVT.LT.MINMN ) THEN - INFO = -16 + ELSE IF( WANTVT ) THEN + IF( INDS ) THEN + IF( LDVT.LT.IU-IL+1 ) THEN + INFO = -17 + END IF + ELSE IF( LDVT.LT.MINMN ) THEN + INFO = -17 + END IF END IF END IF END IF @@ -390,18 +396,24 @@ * * Path 1 (M much larger than N) * - MAXWRK = N + N* - $ ILAENV( 1, 'SGEQRF', ' ', M, N, -1, -1 ) - MAXWRK = MAX( MAXWRK, N*N + N + 2*N* - $ ILAENV( 1, 'SGEBRD', ' ', N, N, -1, -1 ) ) - MINWRK = N*(N+4) + MINWRK = N*(N+5) + MAXWRK = N + N*ILAENV(1,'CGEQRF',' ',M,N,-1,-1) + MAXWRK = MAX(MAXWRK, + $ N*N+2*N+2*N*ILAENV(1,'CGEBRD',' ',N,N,-1,-1)) + IF (WANTU .OR. WANTVT) THEN + MAXWRK = MAX(MAXWRK, + $ N*N+2*N+N*ILAENV(1,'CUNMQR','LN',N,N,N,-1)) + END IF ELSE * * Path 2 (M at least N, but not much larger) * - MAXWRK = 2*N + ( M+N )* - $ ILAENV( 1, 'CGEBRD', ' ', M, N, -1, -1 ) - MINWRK = 2*N + M + MINWRK = 3*N + M + MAXWRK = 2*N + (M+N)*ILAENV(1,'CGEBRD',' ',M,N,-1,-1) + IF (WANTU .OR. WANTVT) THEN + MAXWRK = MAX(MAXWRK, + $ 2*N+N*ILAENV(1,'CUNMQR','LN',N,N,N,-1)) + END IF END IF ELSE MNTHR = ILAENV( 6, 'CGESVD', JOBU // JOBVT, M, N, 0, 0 ) @@ -409,18 +421,25 @@ * * Path 1t (N much larger than M) * - MAXWRK = M + M* - $ ILAENV( 1, 'CGELQF', ' ', M, N, -1, -1 ) - MAXWRK = MAX( MAXWRK, M*M + M + 2*M* - $ ILAENV( 1, 'CGEBRD', ' ', M, M, -1, -1 ) ) - MINWRK = M*(M+4) + MINWRK = M*(M+5) + MAXWRK = M + M*ILAENV(1,'CGELQF',' ',M,N,-1,-1) + MAXWRK = MAX(MAXWRK, + $ M*M+2*M+2*M*ILAENV(1,'CGEBRD',' ',M,M,-1,-1)) + IF (WANTU .OR. WANTVT) THEN + MAXWRK = MAX(MAXWRK, + $ M*M+2*M+M*ILAENV(1,'CUNMQR','LN',M,M,M,-1)) + END IF ELSE * * Path 2t (N greater than M, but not much larger) * - MAXWRK = M*(M*2+19) + ( M+N )* - $ ILAENV( 1, 'CGEBRD', ' ', M, N, -1, -1 ) - MINWRK = 2*M + N +* + MINWRK = 3*M + N + MAXWRK = 2*M + (M+N)*ILAENV(1,'CGEBRD',' ',M,N,-1,-1) + IF (WANTU .OR. WANTVT) THEN + MAXWRK = MAX(MAXWRK, + $ 2*M+M*ILAENV(1,'CUNMQR','LN',M,M,M,-1)) + END IF END IF END IF END IF @@ -518,14 +537,14 @@ CALL CGEBRD( N, N, WORK( IQRF ), N, RWORK( ID ), $ RWORK( IE ), WORK( ITAUQ ), WORK( ITAUP ), $ WORK( ITEMP ), LWORK-ITEMP+1, INFO ) - ITEMP = ITGKZ + N*(N*2+1) + ITEMPR = ITGKZ + N*(N*2+1) * * Solve eigenvalue problem TGK*Z=Z*S. 
* (Workspace: need 2*N*N+14*N) * CALL SBDSVDX( 'U', JOBZ, RNGTGK, N, RWORK( ID ), $ RWORK( IE ), VL, VU, ILTGK, IUTGK, NS, S, - $ RWORK( ITGKZ ), N*2, RWORK( ITEMP ), + $ RWORK( ITGKZ ), N*2, RWORK( ITEMPR ), $ IWORK, INFO) * * If needed, compute left singular vectors. @@ -539,7 +558,7 @@ END DO K = K + N END DO - CALL CLASET( 'A', M-N, N, CZERO, CZERO, U( N+1,1 ), LDU ) + CALL CLASET( 'A', M-N, NS, CZERO, CZERO, U( N+1,1 ), LDU) * * Call CUNMBR to compute QB*UB. * (Workspace in WORK( ITEMP ): need N, prefer N*NB) @@ -594,14 +613,14 @@ CALL CGEBRD( M, N, A, LDA, RWORK( ID ), RWORK( IE ), $ WORK( ITAUQ ), WORK( ITAUP ), WORK( ITEMP ), $ LWORK-ITEMP+1, INFO ) - ITEMP = ITGKZ + N*(N*2+1) + ITEMPR = ITGKZ + N*(N*2+1) * * Solve eigenvalue problem TGK*Z=Z*S. * (Workspace: need 2*N*N+14*N) * CALL SBDSVDX( 'U', JOBZ, RNGTGK, N, RWORK( ID ), $ RWORK( IE ), VL, VU, ILTGK, IUTGK, NS, S, - $ RWORK( ITGKZ ), N*2, RWORK( ITEMP ), + $ RWORK( ITGKZ ), N*2, RWORK( ITEMPR ), $ IWORK, INFO) * * If needed, compute left singular vectors. @@ -615,7 +634,7 @@ END DO K = K + N END DO - CALL CLASET( 'A', M-N, N, CZERO, CZERO, U( N+1,1 ), LDU ) + CALL CLASET( 'A', M-N, NS, CZERO, CZERO, U( N+1,1 ), LDU) * * Call CUNMBR to compute QB*UB. * (Workspace in WORK( ITEMP ): need N, prefer N*NB) @@ -681,14 +700,14 @@ CALL CGEBRD( M, M, WORK( ILQF ), M, RWORK( ID ), $ RWORK( IE ), WORK( ITAUQ ), WORK( ITAUP ), $ WORK( ITEMP ), LWORK-ITEMP+1, INFO ) - ITEMP = ITGKZ + M*(M*2+1) + ITEMPR = ITGKZ + M*(M*2+1) * * Solve eigenvalue problem TGK*Z=Z*S. * (Workspace: need 2*M*M+14*M) * CALL SBDSVDX( 'U', JOBZ, RNGTGK, M, RWORK( ID ), $ RWORK( IE ), VL, VU, ILTGK, IUTGK, NS, S, - $ RWORK( ITGKZ ), M*2, RWORK( ITEMP ), + $ RWORK( ITGKZ ), M*2, RWORK( ITEMPR ), $ IWORK, INFO) * * If needed, compute left singular vectors. @@ -722,7 +741,7 @@ END DO K = K + M END DO - CALL CLASET( 'A', M, N-M, CZERO, CZERO, + CALL CLASET( 'A', NS, N-M, CZERO, CZERO, $ VT( 1,M+1 ), LDVT ) * * Call CUNMBR to compute (VB**T)*(PB**T) @@ -758,14 +777,14 @@ CALL CGEBRD( M, N, A, LDA, RWORK( ID ), RWORK( IE ), $ WORK( ITAUQ ), WORK( ITAUP ), WORK( ITEMP ), $ LWORK-ITEMP+1, INFO ) - ITEMP = ITGKZ + M*(M*2+1) + ITEMPR = ITGKZ + M*(M*2+1) * * Solve eigenvalue problem TGK*Z=Z*S. * (Workspace: need 2*M*M+14*M) * CALL SBDSVDX( 'L', JOBZ, RNGTGK, M, RWORK( ID ), $ RWORK( IE ), VL, VU, ILTGK, IUTGK, NS, S, - $ RWORK( ITGKZ ), M*2, RWORK( ITEMP ), + $ RWORK( ITGKZ ), M*2, RWORK( ITEMPR ), $ IWORK, INFO) * * If needed, compute left singular vectors. @@ -799,7 +818,7 @@ END DO K = K + M END DO - CALL CLASET( 'A', M, N-M, CZERO, CZERO, + CALL CLASET( 'A', NS, N-M, CZERO, CZERO, $ VT( 1,M+1 ), LDVT ) * * Call CUNMBR to compute VB**T * PB**T diff --git a/lapack-netlib/SRC/cgetc2.f b/lapack-netlib/SRC/cgetc2.f index fac6b56820..99eb69d92 100644 --- a/lapack-netlib/SRC/cgetc2.f +++ b/lapack-netlib/SRC/cgetc2.f @@ -145,15 +145,33 @@ INTRINSIC ABS, CMPLX, MAX * .. * .. Executable Statements .. +* + INFO = 0 +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN * * Set constants to control overflow * - INFO = 0 EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) / EPS BIGNUM = ONE / SMLNUM CALL SLABAD( SMLNUM, BIGNUM ) * +* Handle the case N=1 by itself +* + IF( N.EQ.1 ) THEN + IPIV( 1 ) = 1 + JPIV( 1 ) = 1 + IF( ABS( A( 1, 1 ) ).LT.SMLNUM ) THEN + INFO = 1 + A( 1, 1 ) = CMPLX( SMLNUM, ZERO ) + END IF + RETURN + END IF +* * Factorize A using complete pivoting. 
* Set pivots less than SMIN to SMIN * diff --git a/lapack-netlib/SRC/cggev3.f b/lapack-netlib/SRC/cggev3.f index 4a000fe10..decdae509 100644 --- a/lapack-netlib/SRC/cggev3.f +++ b/lapack-netlib/SRC/cggev3.f @@ -339,16 +339,16 @@ $ LDVL, VR, LDVR, WORK, -1, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) CALL CHGEQZ( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, - $ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK, - $ -1, WORK, IERR ) + $ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK, -1, + $ RWORK, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) ELSE CALL CGGHD3( 'N', 'N', N, 1, N, A, LDA, B, LDB, VL, LDVL, $ VR, LDVR, WORK, -1, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) CALL CHGEQZ( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, - $ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK, - $ -1, WORK, IERR ) + $ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK, -1, + $ RWORK, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) END IF WORK( 1 ) = CMPLX( LWKOPT ) diff --git a/lapack-netlib/SRC/dgeev.f b/lapack-netlib/SRC/dgeev.f index dd60db69e..328eaa39c 100644 --- a/lapack-netlib/SRC/dgeev.f +++ b/lapack-netlib/SRC/dgeev.f @@ -418,9 +418,9 @@ $ WORK( IWRK ), LWORK-IWRK+1, INFO ) END IF * -* If INFO > 0 from DHSEQR, then quit +* If INFO .NE. 0 from DHSEQR, then quit * - IF( INFO.GT.0 ) + IF( INFO.NE.0 ) $ GO TO 50 * IF( WANTVL .OR. WANTVR ) THEN diff --git a/lapack-netlib/SRC/dgesvdx.f b/lapack-netlib/SRC/dgesvdx.f index cfa2ff05d..4588083f8 100644 --- a/lapack-netlib/SRC/dgesvdx.f +++ b/lapack-netlib/SRC/dgesvdx.f @@ -169,7 +169,7 @@ *> vectors, stored columnwise) as specified by RANGE; if *> JOBU = 'N', U is not referenced. *> Note: The user must ensure that UCOL >= NS; if RANGE = 'V', -*> the exact value of NS is not known ILQFin advance and an upper +*> the exact value of NS is not known in advance and an upper *> bound must be used. *> \endverbatim *> @@ -357,8 +357,14 @@ IF( INFO.EQ.0 ) THEN IF( WANTU .AND. LDU.LT.M ) THEN INFO = -15 - ELSE IF( WANTVT .AND. 
LDVT.LT.MINMN ) THEN - INFO = -16 + ELSE IF( WANTVT ) THEN + IF( INDS ) THEN + IF( LDVT.LT.IU-IL+1 ) THEN + INFO = -17 + END IF + ELSE IF( LDVT.LT.MINMN ) THEN + INFO = -17 + END IF END IF END IF END IF @@ -380,18 +386,34 @@ * * Path 1 (M much larger than N) * - MAXWRK = N*(N*2+16) + + MAXWRK = N + $ N*ILAENV( 1, 'DGEQRF', ' ', M, N, -1, -1 ) - MAXWRK = MAX( MAXWRK, N*(N*2+20) + 2*N* + MAXWRK = MAX( MAXWRK, N*(N+5) + 2*N* $ ILAENV( 1, 'DGEBRD', ' ', N, N, -1, -1 ) ) - MINWRK = N*(N*2+21) + IF (WANTU) THEN + MAXWRK = MAX(MAXWRK,N*(N*3+6)+N* + $ ILAENV( 1, 'DORMQR', ' ', N, N, -1, -1 ) ) + END IF + IF (WANTVT) THEN + MAXWRK = MAX(MAXWRK,N*(N*3+6)+N* + $ ILAENV( 1, 'DORMLQ', ' ', N, N, -1, -1 ) ) + END IF + MINWRK = N*(N*3+20) ELSE * * Path 2 (M at least N, but not much larger) * - MAXWRK = N*(N*2+19) + ( M+N )* + MAXWRK = 4*N + ( M+N )* $ ILAENV( 1, 'DGEBRD', ' ', M, N, -1, -1 ) - MINWRK = N*(N*2+20) + M + IF (WANTU) THEN + MAXWRK = MAX(MAXWRK,N*(N*2+5)+N* + $ ILAENV( 1, 'DORMQR', ' ', N, N, -1, -1 ) ) + END IF + IF (WANTVT) THEN + MAXWRK = MAX(MAXWRK,N*(N*2+5)+N* + $ ILAENV( 1, 'DORMLQ', ' ', N, N, -1, -1 ) ) + END IF + MINWRK = MAX(N*(N*2+19),4*N+M) END IF ELSE MNTHR = ILAENV( 6, 'DGESVD', JOBU // JOBVT, M, N, 0, 0 ) @@ -399,18 +421,34 @@ * * Path 1t (N much larger than M) * - MAXWRK = M*(M*2+16) + + MAXWRK = M + $ M*ILAENV( 1, 'DGELQF', ' ', M, N, -1, -1 ) - MAXWRK = MAX( MAXWRK, M*(M*2+20) + 2*M* + MAXWRK = MAX( MAXWRK, M*(M+5) + 2*M* $ ILAENV( 1, 'DGEBRD', ' ', M, M, -1, -1 ) ) - MINWRK = M*(M*2+21) + IF (WANTU) THEN + MAXWRK = MAX(MAXWRK,M*(M*3+6)+M* + $ ILAENV( 1, 'DORMQR', ' ', M, M, -1, -1 ) ) + END IF + IF (WANTVT) THEN + MAXWRK = MAX(MAXWRK,M*(M*3+6)+M* + $ ILAENV( 1, 'DORMLQ', ' ', M, M, -1, -1 ) ) + END IF + MINWRK = M*(M*3+20) ELSE * -* Path 2t (N greater than M, but not much larger) +* Path 2t (N at least M, but not much larger) * - MAXWRK = M*(M*2+19) + ( M+N )* + MAXWRK = 4*M + ( M+N )* $ ILAENV( 1, 'DGEBRD', ' ', M, N, -1, -1 ) - MINWRK = M*(M*2+20) + N + IF (WANTU) THEN + MAXWRK = MAX(MAXWRK,M*(M*2+5)+M* + $ ILAENV( 1, 'DORMQR', ' ', M, M, -1, -1 ) ) + END IF + IF (WANTVT) THEN + MAXWRK = MAX(MAXWRK,M*(M*2+5)+M* + $ ILAENV( 1, 'DORMLQ', ' ', M, M, -1, -1 ) ) + END IF + MINWRK = MAX(M*(M*2+19),4*M+N) END IF END IF END IF @@ -522,7 +560,7 @@ CALL DCOPY( N, WORK( J ), 1, U( 1,I ), 1 ) J = J + N*2 END DO - CALL DLASET( 'A', M-N, N, ZERO, ZERO, U( N+1,1 ), LDU ) + CALL DLASET( 'A', M-N, NS, ZERO, ZERO, U( N+1,1 ), LDU ) * * Call DORMBR to compute QB*UB. * (Workspace in WORK( ITEMP ): need N, prefer N*NB) @@ -591,7 +629,7 @@ CALL DCOPY( N, WORK( J ), 1, U( 1,I ), 1 ) J = J + N*2 END DO - CALL DLASET( 'A', M-N, N, ZERO, ZERO, U( N+1,1 ), LDU ) + CALL DLASET( 'A', M-N, NS, ZERO, ZERO, U( N+1,1 ), LDU ) * * Call DORMBR to compute QB*UB. 
* (Workspace in WORK( ITEMP ): need N, prefer N*NB) @@ -687,7 +725,7 @@ CALL DCOPY( M, WORK( J ), 1, VT( I,1 ), LDVT ) J = J + M*2 END DO - CALL DLASET( 'A', M, N-M, ZERO, ZERO, VT( 1,M+1 ), LDVT ) + CALL DLASET( 'A', NS, N-M, ZERO, ZERO, VT( 1,M+1 ), LDVT) * * Call DORMBR to compute (VB**T)*(PB**T) * (Workspace in WORK( ITEMP ): need M, prefer M*NB) @@ -756,7 +794,7 @@ CALL DCOPY( M, WORK( J ), 1, VT( I,1 ), LDVT ) J = J + M*2 END DO - CALL DLASET( 'A', M, N-M, ZERO, ZERO, VT( 1,M+1 ), LDVT ) + CALL DLASET( 'A', NS, N-M, ZERO, ZERO, VT( 1,M+1 ), LDVT) * * Call DORMBR to compute VB**T * PB**T * (Workspace in WORK( ITEMP ): need M, prefer M*NB) diff --git a/lapack-netlib/SRC/dgetc2.f b/lapack-netlib/SRC/dgetc2.f index 7e43a0236..3cd7eeb2b 100644 --- a/lapack-netlib/SRC/dgetc2.f +++ b/lapack-netlib/SRC/dgetc2.f @@ -145,15 +145,33 @@ INTRINSIC ABS, MAX * .. * .. Executable Statements .. +* + INFO = 0 +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN * * Set constants to control overflow * - INFO = 0 EPS = DLAMCH( 'P' ) SMLNUM = DLAMCH( 'S' ) / EPS BIGNUM = ONE / SMLNUM CALL DLABAD( SMLNUM, BIGNUM ) * +* Handle the case N=1 by itself +* + IF( N.EQ.1 ) THEN + IPIV( 1 ) = 1 + JPIV( 1 ) = 1 + IF( ABS( A( 1, 1 ) ).LT.SMLNUM ) THEN + INFO = 1 + A( 1, 1 ) = SMLNUM + END IF + RETURN + END IF +* * Factorize A using complete pivoting. * Set pivots less than SMIN to SMIN. * diff --git a/lapack-netlib/SRC/sgeev.f b/lapack-netlib/SRC/sgeev.f index 89dbe08c8..667de0afe 100644 --- a/lapack-netlib/SRC/sgeev.f +++ b/lapack-netlib/SRC/sgeev.f @@ -418,9 +418,9 @@ $ WORK( IWRK ), LWORK-IWRK+1, INFO ) END IF * -* If INFO > 0 from SHSEQR, then quit +* If INFO .NE. 0 from SHSEQR, then quit * - IF( INFO.GT.0 ) + IF( INFO.NE.0 ) $ GO TO 50 * IF( WANTVL .OR. WANTVR ) THEN diff --git a/lapack-netlib/SRC/sgesvdx.f b/lapack-netlib/SRC/sgesvdx.f index aae8b0764..9128a7c0a 100644 --- a/lapack-netlib/SRC/sgesvdx.f +++ b/lapack-netlib/SRC/sgesvdx.f @@ -169,7 +169,7 @@ *> vectors, stored columnwise) as specified by RANGE; if *> JOBU = 'N', U is not referenced. *> Note: The user must ensure that UCOL >= NS; if RANGE = 'V', -*> the exact value of NS is not known ILQFin advance and an upper +*> the exact value of NS is not known in advance and an upper *> bound must be used. *> \endverbatim *> @@ -357,8 +357,14 @@ IF( INFO.EQ.0 ) THEN IF( WANTU .AND. LDU.LT.M ) THEN INFO = -15 - ELSE IF( WANTVT .AND. 
LDVT.LT.MINMN ) THEN - INFO = -16 + ELSE IF( WANTVT ) THEN + IF( INDS ) THEN + IF( LDVT.LT.IU-IL+1 ) THEN + INFO = -17 + END IF + ELSE IF( LDVT.LT.MINMN ) THEN + INFO = -17 + END IF END IF END IF END IF @@ -380,18 +386,34 @@ * * Path 1 (M much larger than N) * - MAXWRK = N*(N*2+16) + + MAXWRK = N + $ N*ILAENV( 1, 'SGEQRF', ' ', M, N, -1, -1 ) - MAXWRK = MAX( MAXWRK, N*(N*2+20) + 2*N* + MAXWRK = MAX( MAXWRK, N*(N+5) + 2*N* $ ILAENV( 1, 'SGEBRD', ' ', N, N, -1, -1 ) ) - MINWRK = N*(N*2+21) + IF (WANTU) THEN + MAXWRK = MAX(MAXWRK,N*(N*3+6)+N* + $ ILAENV( 1, 'SORMQR', ' ', N, N, -1, -1 ) ) + END IF + IF (WANTVT) THEN + MAXWRK = MAX(MAXWRK,N*(N*3+6)+N* + $ ILAENV( 1, 'SORMLQ', ' ', N, N, -1, -1 ) ) + END IF + MINWRK = N*(N*3+20) ELSE * * Path 2 (M at least N, but not much larger) * - MAXWRK = N*(N*2+19) + ( M+N )* + MAXWRK = 4*N + ( M+N )* $ ILAENV( 1, 'SGEBRD', ' ', M, N, -1, -1 ) - MINWRK = N*(N*2+20) + M + IF (WANTU) THEN + MAXWRK = MAX(MAXWRK,N*(N*2+5)+N* + $ ILAENV( 1, 'SORMQR', ' ', N, N, -1, -1 ) ) + END IF + IF (WANTVT) THEN + MAXWRK = MAX(MAXWRK,N*(N*2+5)+N* + $ ILAENV( 1, 'SORMLQ', ' ', N, N, -1, -1 ) ) + END IF + MINWRK = MAX(N*(N*2+19),4*N+M) END IF ELSE MNTHR = ILAENV( 6, 'SGESVD', JOBU // JOBVT, M, N, 0, 0 ) @@ -399,18 +421,34 @@ * * Path 1t (N much larger than M) * - MAXWRK = M*(M*2+16) + + MAXWRK = M + $ M*ILAENV( 1, 'SGELQF', ' ', M, N, -1, -1 ) - MAXWRK = MAX( MAXWRK, M*(M*2+20) + 2*M* + MAXWRK = MAX( MAXWRK, M*(M+5) + 2*M* $ ILAENV( 1, 'SGEBRD', ' ', M, M, -1, -1 ) ) - MINWRK = M*(M*2+21) + IF (WANTU) THEN + MAXWRK = MAX(MAXWRK,M*(M*3+6)+M* + $ ILAENV( 1, 'SORMQR', ' ', M, M, -1, -1 ) ) + END IF + IF (WANTVT) THEN + MAXWRK = MAX(MAXWRK,M*(M*3+6)+M* + $ ILAENV( 1, 'SORMLQ', ' ', M, M, -1, -1 ) ) + END IF + MINWRK = M*(M*3+20) ELSE * -* Path 2t (N greater than M, but not much larger) +* Path 2t (N at least M, but not much larger) * - MAXWRK = M*(M*2+19) + ( M+N )* + MAXWRK = 4*M + ( M+N )* $ ILAENV( 1, 'SGEBRD', ' ', M, N, -1, -1 ) - MINWRK = M*(M*2+20) + N + IF (WANTU) THEN + MAXWRK = MAX(MAXWRK,M*(M*2+5)+M* + $ ILAENV( 1, 'SORMQR', ' ', M, M, -1, -1 ) ) + END IF + IF (WANTVT) THEN + MAXWRK = MAX(MAXWRK,M*(M*2+5)+M* + $ ILAENV( 1, 'SORMLQ', ' ', M, M, -1, -1 ) ) + END IF + MINWRK = MAX(M*(M*2+19),4*M+N) END IF END IF END IF @@ -522,7 +560,7 @@ CALL SCOPY( N, WORK( J ), 1, U( 1,I ), 1 ) J = J + N*2 END DO - CALL SLASET( 'A', M-N, N, ZERO, ZERO, U( N+1,1 ), LDU ) + CALL SLASET( 'A', M-N, NS, ZERO, ZERO, U( N+1,1 ), LDU ) * * Call SORMBR to compute QB*UB. * (Workspace in WORK( ITEMP ): need N, prefer N*NB) @@ -591,7 +629,7 @@ CALL SCOPY( N, WORK( J ), 1, U( 1,I ), 1 ) J = J + N*2 END DO - CALL SLASET( 'A', M-N, N, ZERO, ZERO, U( N+1,1 ), LDU ) + CALL SLASET( 'A', M-N, NS, ZERO, ZERO, U( N+1,1 ), LDU ) * * Call SORMBR to compute QB*UB. 
* (Workspace in WORK( ITEMP ): need N, prefer N*NB) @@ -687,7 +725,7 @@ CALL SCOPY( M, WORK( J ), 1, VT( I,1 ), LDVT ) J = J + M*2 END DO - CALL SLASET( 'A', M, N-M, ZERO, ZERO, VT( 1,M+1 ), LDVT ) + CALL SLASET( 'A', NS, N-M, ZERO, ZERO, VT( 1,M+1 ), LDVT) * * Call SORMBR to compute (VB**T)*(PB**T) * (Workspace in WORK( ITEMP ): need M, prefer M*NB) @@ -756,7 +794,7 @@ CALL SCOPY( M, WORK( J ), 1, VT( I,1 ), LDVT ) J = J + M*2 END DO - CALL SLASET( 'A', M, N-M, ZERO, ZERO, VT( 1,M+1 ), LDVT ) + CALL SLASET( 'A', NS, N-M, ZERO, ZERO, VT( 1,M+1 ), LDVT) * * Call SORMBR to compute VB**T * PB**T * (Workspace in WORK( ITEMP ): need M, prefer M*NB) diff --git a/lapack-netlib/SRC/sgetc2.f b/lapack-netlib/SRC/sgetc2.f index 3c3880d4e..598446519 100644 --- a/lapack-netlib/SRC/sgetc2.f +++ b/lapack-netlib/SRC/sgetc2.f @@ -145,15 +145,33 @@ INTRINSIC ABS, MAX * .. * .. Executable Statements .. +* + INFO = 0 +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN * * Set constants to control overflow * - INFO = 0 EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) / EPS BIGNUM = ONE / SMLNUM CALL SLABAD( SMLNUM, BIGNUM ) * +* Handle the case N=1 by itself +* + IF( N.EQ.1 ) THEN + IPIV( 1 ) = 1 + JPIV( 1 ) = 1 + IF( ABS( A( 1, 1 ) ).LT.SMLNUM ) THEN + INFO = 1 + A( 1, 1 ) = SMLNUM + END IF + RETURN + END IF +* * Factorize A using complete pivoting. * Set pivots less than SMIN to SMIN. * diff --git a/lapack-netlib/SRC/zgeev.f b/lapack-netlib/SRC/zgeev.f index d4520805f..a518b4cd9 100644 --- a/lapack-netlib/SRC/zgeev.f +++ b/lapack-netlib/SRC/zgeev.f @@ -404,9 +404,9 @@ $ WORK( IWRK ), LWORK-IWRK+1, INFO ) END IF * -* If INFO > 0 from ZHSEQR, then quit +* If INFO .NE. 0 from ZHSEQR, then quit * - IF( INFO.GT.0 ) + IF( INFO.NE.0 ) $ GO TO 50 * IF( WANTVL .OR. WANTVR ) THEN diff --git a/lapack-netlib/SRC/zgesvdx.f b/lapack-netlib/SRC/zgesvdx.f index 6f7d5ba04..c9509e458 100644 --- a/lapack-netlib/SRC/zgesvdx.f +++ b/lapack-netlib/SRC/zgesvdx.f @@ -36,27 +36,30 @@ * .. * * -* Purpose -* ======= -* -* ZGESVDX computes the singular value decomposition (SVD) of a complex -* M-by-N matrix A, optionally computing the left and/or right singular -* vectors. The SVD is written -* -* A = U * SIGMA * transpose(V) -* -* where SIGMA is an M-by-N matrix which is zero except for its -* min(m,n) diagonal elements, U is an M-by-M unitary matrix, and -* V is an N-by-N unitary matrix. The diagonal elements of SIGMA -* are the singular values of A; they are real and non-negative, and -* are returned in descending order. The first min(m,n) columns of -* U and V are the left and right singular vectors of A. -* -* ZGESVDX uses an eigenvalue problem for obtaining the SVD, which -* allows for the computation of a subset of singular values and -* vectors. See DBDSVDX for details. -* -* Note that the routine returns V**T, not V. +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZGESVDX computes the singular value decomposition (SVD) of a complex +*> M-by-N matrix A, optionally computing the left and/or right singular +*> vectors. The SVD is written +*> +*> A = U * SIGMA * transpose(V) +*> +*> where SIGMA is an M-by-N matrix which is zero except for its +*> min(m,n) diagonal elements, U is an M-by-M unitary matrix, and +*> V is an N-by-N unitary matrix. The diagonal elements of SIGMA +*> are the singular values of A; they are real and non-negative, and +*> are returned in descending order. The first min(m,n) columns of +*> U and V are the left and right singular vectors of A. 
+*> +*> ZGESVDX uses an eigenvalue problem for obtaining the SVD, which +*> allows for the computation of a subset of singular values and +*> vectors. See DBDSVDX for details. +*> +*> Note that the routine returns V**T, not V. +*> \endverbatim * * Arguments: * ========== @@ -107,7 +110,7 @@ *> *> \param[in,out] A *> \verbatim -*> A is COMPLEX array, dimension (LDA,N) +*> A is COMPLEX*16 array, dimension (LDA,N) *> On entry, the M-by-N matrix A. *> On exit, the contents of A are destroyed. *> \endverbatim @@ -167,7 +170,7 @@ *> vectors, stored columnwise) as specified by RANGE; if *> JOBU = 'N', U is not referenced. *> Note: The user must ensure that UCOL >= NS; if RANGE = 'V', -*> the exact value of NS is not known ILQFin advance and an upper +*> the exact value of NS is not known in advance and an upper *> bound must be used. *> \endverbatim *> @@ -291,8 +294,8 @@ CHARACTER JOBZ, RNGTGK LOGICAL ALLS, INDS, LQUERY, VALS, WANTU, WANTVT INTEGER I, ID, IE, IERR, ILQF, ILTGK, IQRF, ISCL, - $ ITAU, ITAUP, ITAUQ, ITEMP, ITGKZ, IUTGK, - $ J, K, MAXWRK, MINMN, MINWRK, MNTHR + $ ITAU, ITAUP, ITAUQ, ITEMP, ITEMPR, ITGKZ, + $ IUTGK, J, K, MAXWRK, MINMN, MINWRK, MNTHR DOUBLE PRECISION ABSTOL, ANRM, BIGNUM, EPS, SMLNUM * .. * .. Local Arrays .. @@ -364,8 +367,14 @@ IF( INFO.EQ.0 ) THEN IF( WANTU .AND. LDU.LT.M ) THEN INFO = -15 - ELSE IF( WANTVT .AND. LDVT.LT.MINMN ) THEN - INFO = -16 + ELSE IF( WANTVT ) THEN + IF( INDS ) THEN + IF( LDVT.LT.IU-IL+1 ) THEN + INFO = -17 + END IF + ELSE IF( LDVT.LT.MINMN ) THEN + INFO = -17 + END IF END IF END IF END IF @@ -387,18 +396,24 @@ * * Path 1 (M much larger than N) * - MAXWRK = N + N* - $ ILAENV( 1, 'DGEQRF', ' ', M, N, -1, -1 ) - MAXWRK = MAX( MAXWRK, N*N + N + 2*N* - $ ILAENV( 1, 'DGEBRD', ' ', N, N, -1, -1 ) ) - MINWRK = N*(N+4) + MINWRK = N*(N+5) + MAXWRK = N + N*ILAENV(1,'ZGEQRF',' ',M,N,-1,-1) + MAXWRK = MAX(MAXWRK, + $ N*N+2*N+2*N*ILAENV(1,'ZGEBRD',' ',N,N,-1,-1)) + IF (WANTU .OR. WANTVT) THEN + MAXWRK = MAX(MAXWRK, + $ N*N+2*N+N*ILAENV(1,'ZUNMQR','LN',N,N,N,-1)) + END IF ELSE * * Path 2 (M at least N, but not much larger) * - MAXWRK = 2*N + ( M+N )* - $ ILAENV( 1, 'ZGEBRD', ' ', M, N, -1, -1 ) - MINWRK = 2*N + M + MINWRK = 3*N + M + MAXWRK = 2*N + (M+N)*ILAENV(1,'ZGEBRD',' ',M,N,-1,-1) + IF (WANTU .OR. WANTVT) THEN + MAXWRK = MAX(MAXWRK, + $ 2*N+N*ILAENV(1,'ZUNMQR','LN',N,N,N,-1)) + END IF END IF ELSE MNTHR = ILAENV( 6, 'ZGESVD', JOBU // JOBVT, M, N, 0, 0 ) @@ -406,18 +421,25 @@ * * Path 1t (N much larger than M) * - MAXWRK = M + M* - $ ILAENV( 1, 'ZGELQF', ' ', M, N, -1, -1 ) - MAXWRK = MAX( MAXWRK, M*M + M + 2*M* - $ ILAENV( 1, 'ZGEBRD', ' ', M, M, -1, -1 ) ) - MINWRK = M*(M+4) + MINWRK = M*(M+5) + MAXWRK = M + M*ILAENV(1,'ZGELQF',' ',M,N,-1,-1) + MAXWRK = MAX(MAXWRK, + $ M*M+2*M+2*M*ILAENV(1,'ZGEBRD',' ',M,M,-1,-1)) + IF (WANTU .OR. WANTVT) THEN + MAXWRK = MAX(MAXWRK, + $ M*M+2*M+M*ILAENV(1,'ZUNMQR','LN',M,M,M,-1)) + END IF ELSE * * Path 2t (N greater than M, but not much larger) * - MAXWRK = M*(M*2+19) + ( M+N )* - $ ILAENV( 1, 'ZGEBRD', ' ', M, N, -1, -1 ) - MINWRK = 2*M + N +* + MINWRK = 3*M + N + MAXWRK = 2*M + (M+N)*ILAENV(1,'ZGEBRD',' ',M,N,-1,-1) + IF (WANTU .OR. WANTVT) THEN + MAXWRK = MAX(MAXWRK, + $ 2*M+M*ILAENV(1,'ZUNMQR','LN',M,M,M,-1)) + END IF END IF END IF END IF @@ -515,14 +537,14 @@ CALL ZGEBRD( N, N, WORK( IQRF ), N, RWORK( ID ), $ RWORK( IE ), WORK( ITAUQ ), WORK( ITAUP ), $ WORK( ITEMP ), LWORK-ITEMP+1, INFO ) - ITEMP = ITGKZ + N*(N*2+1) + ITEMPR = ITGKZ + N*(N*2+1) * * Solve eigenvalue problem TGK*Z=Z*S. 
* (Workspace: need 2*N*N+14*N) * CALL DBDSVDX( 'U', JOBZ, RNGTGK, N, RWORK( ID ), $ RWORK( IE ), VL, VU, ILTGK, IUTGK, NS, S, - $ RWORK( ITGKZ ), N*2, RWORK( ITEMP ), + $ RWORK( ITGKZ ), N*2, RWORK( ITEMPR ), $ IWORK, INFO) * * If needed, compute left singular vectors. @@ -536,7 +558,7 @@ END DO K = K + N END DO - CALL ZLASET( 'A', M-N, N, CZERO, CZERO, U( N+1,1 ), LDU ) + CALL ZLASET( 'A', M-N, NS, CZERO, CZERO, U( N+1,1 ), LDU) * * Call ZUNMBR to compute QB*UB. * (Workspace in WORK( ITEMP ): need N, prefer N*NB) @@ -591,14 +613,14 @@ CALL ZGEBRD( M, N, A, LDA, RWORK( ID ), RWORK( IE ), $ WORK( ITAUQ ), WORK( ITAUP ), WORK( ITEMP ), $ LWORK-ITEMP+1, INFO ) - ITEMP = ITGKZ + N*(N*2+1) + ITEMPR = ITGKZ + N*(N*2+1) * * Solve eigenvalue problem TGK*Z=Z*S. * (Workspace: need 2*N*N+14*N) * CALL DBDSVDX( 'U', JOBZ, RNGTGK, N, RWORK( ID ), $ RWORK( IE ), VL, VU, ILTGK, IUTGK, NS, S, - $ RWORK( ITGKZ ), N*2, RWORK( ITEMP ), + $ RWORK( ITGKZ ), N*2, RWORK( ITEMPR ), $ IWORK, INFO) * * If needed, compute left singular vectors. @@ -612,7 +634,7 @@ END DO K = K + N END DO - CALL ZLASET( 'A', M-N, N, CZERO, CZERO, U( N+1,1 ), LDU ) + CALL ZLASET( 'A', M-N, NS, CZERO, CZERO, U( N+1,1 ), LDU) * * Call ZUNMBR to compute QB*UB. * (Workspace in WORK( ITEMP ): need N, prefer N*NB) @@ -678,14 +700,14 @@ CALL ZGEBRD( M, M, WORK( ILQF ), M, RWORK( ID ), $ RWORK( IE ), WORK( ITAUQ ), WORK( ITAUP ), $ WORK( ITEMP ), LWORK-ITEMP+1, INFO ) - ITEMP = ITGKZ + M*(M*2+1) + ITEMPR = ITGKZ + M*(M*2+1) * * Solve eigenvalue problem TGK*Z=Z*S. * (Workspace: need 2*M*M+14*M) * CALL DBDSVDX( 'U', JOBZ, RNGTGK, M, RWORK( ID ), $ RWORK( IE ), VL, VU, ILTGK, IUTGK, NS, S, - $ RWORK( ITGKZ ), M*2, RWORK( ITEMP ), + $ RWORK( ITGKZ ), M*2, RWORK( ITEMPR ), $ IWORK, INFO) * * If needed, compute left singular vectors. @@ -719,7 +741,7 @@ END DO K = K + M END DO - CALL ZLASET( 'A', M, N-M, CZERO, CZERO, + CALL ZLASET( 'A', NS, N-M, CZERO, CZERO, $ VT( 1,M+1 ), LDVT ) * * Call ZUNMBR to compute (VB**T)*(PB**T) @@ -755,14 +777,14 @@ CALL ZGEBRD( M, N, A, LDA, RWORK( ID ), RWORK( IE ), $ WORK( ITAUQ ), WORK( ITAUP ), WORK( ITEMP ), $ LWORK-ITEMP+1, INFO ) - ITEMP = ITGKZ + M*(M*2+1) + ITEMPR = ITGKZ + M*(M*2+1) * * Solve eigenvalue problem TGK*Z=Z*S. * (Workspace: need 2*M*M+14*M) * CALL DBDSVDX( 'L', JOBZ, RNGTGK, M, RWORK( ID ), $ RWORK( IE ), VL, VU, ILTGK, IUTGK, NS, S, - $ RWORK( ITGKZ ), M*2, RWORK( ITEMP ), + $ RWORK( ITGKZ ), M*2, RWORK( ITEMPR ), $ IWORK, INFO) * * If needed, compute left singular vectors. @@ -796,7 +818,7 @@ END DO K = K + M END DO - CALL ZLASET( 'A', M, N-M, CZERO, CZERO, + CALL ZLASET( 'A', NS, N-M, CZERO, CZERO, $ VT( 1,M+1 ), LDVT ) * * Call ZUNMBR to compute VB**T * PB**T diff --git a/lapack-netlib/SRC/zgetc2.f b/lapack-netlib/SRC/zgetc2.f index 3179612f5..bf59415b5 100644 --- a/lapack-netlib/SRC/zgetc2.f +++ b/lapack-netlib/SRC/zgetc2.f @@ -145,15 +145,33 @@ INTRINSIC ABS, DCMPLX, MAX * .. * .. Executable Statements .. +* + INFO = 0 +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN * * Set constants to control overflow * - INFO = 0 EPS = DLAMCH( 'P' ) SMLNUM = DLAMCH( 'S' ) / EPS BIGNUM = ONE / SMLNUM CALL DLABAD( SMLNUM, BIGNUM ) * +* Handle the case N=1 by itself +* + IF( N.EQ.1 ) THEN + IPIV( 1 ) = 1 + JPIV( 1 ) = 1 + IF( ABS( A( 1, 1 ) ).LT.SMLNUM ) THEN + INFO = 1 + A( 1, 1 ) = DCMPLX( SMLNUM, ZERO ) + END IF + RETURN + END IF +* * Factorize A using complete pivoting. 
* Set pivots less than SMIN to SMIN * diff --git a/lapack-netlib/SRC/zggev3.f b/lapack-netlib/SRC/zggev3.f index 1c4e832af..78337fd07 100644 --- a/lapack-netlib/SRC/zggev3.f +++ b/lapack-netlib/SRC/zggev3.f @@ -340,7 +340,7 @@ LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) CALL ZHGEQZ( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK, -1, - $ WORK, IERR ) + $ RWORK, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) ELSE CALL ZGGHD3( JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, VL, @@ -348,7 +348,7 @@ LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) CALL ZHGEQZ( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK, -1, - $ WORK, IERR ) + $ RWORK, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) END IF WORK( 1 ) = DCMPLX( LWKOPT ) From 10c2ebdfc5c60f495c5c34387d9a3c3fbadee843 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 7 Mar 2016 10:34:04 +0100 Subject: [PATCH 26/37] BUGFIX: removed fixes for bugs #148 and #149, because info for xerbla is wrong --- lapack-netlib/SRC/cgesvdx.f | 85 +++++++++-------------- lapack-netlib/SRC/dgesvdx.f | 74 +++++--------------- lapack-netlib/SRC/sgesvdx.f | 74 +++++--------------- lapack-netlib/SRC/zgesvdx.f | 132 +++++++++++++++--------------------- 4 files changed, 124 insertions(+), 241 deletions(-) diff --git a/lapack-netlib/SRC/cgesvdx.f b/lapack-netlib/SRC/cgesvdx.f index 87ea9861d..235426ad4 100644 --- a/lapack-netlib/SRC/cgesvdx.f +++ b/lapack-netlib/SRC/cgesvdx.f @@ -170,7 +170,7 @@ *> vectors, stored columnwise) as specified by RANGE; if *> JOBU = 'N', U is not referenced. *> Note: The user must ensure that UCOL >= NS; if RANGE = 'V', -*> the exact value of NS is not known in advance and an upper +*> the exact value of NS is not known ILQFin advance and an upper *> bound must be used. *> \endverbatim *> @@ -294,8 +294,8 @@ CHARACTER JOBZ, RNGTGK LOGICAL ALLS, INDS, LQUERY, VALS, WANTU, WANTVT INTEGER I, ID, IE, IERR, ILQF, ILTGK, IQRF, ISCL, - $ ITAU, ITAUP, ITAUQ, ITEMP, ITEMPR, ITGKZ, - $ IUTGK, J, K, MAXWRK, MINMN, MINWRK, MNTHR + $ ITAU, ITAUP, ITAUQ, ITEMP, ITGKZ, IUTGK, + $ J, K, MAXWRK, MINMN, MINWRK, MNTHR REAL ABSTOL, ANRM, BIGNUM, EPS, SMLNUM * .. * .. Local Arrays .. @@ -367,14 +367,8 @@ IF( INFO.EQ.0 ) THEN IF( WANTU .AND. LDU.LT.M ) THEN INFO = -15 - ELSE IF( WANTVT ) THEN - IF( INDS ) THEN - IF( LDVT.LT.IU-IL+1 ) THEN - INFO = -17 - END IF - ELSE IF( LDVT.LT.MINMN ) THEN - INFO = -17 - END IF + ELSE IF( WANTVT .AND. LDVT.LT.MINMN ) THEN + INFO = -16 END IF END IF END IF @@ -396,24 +390,18 @@ * * Path 1 (M much larger than N) * - MINWRK = N*(N+5) - MAXWRK = N + N*ILAENV(1,'CGEQRF',' ',M,N,-1,-1) - MAXWRK = MAX(MAXWRK, - $ N*N+2*N+2*N*ILAENV(1,'CGEBRD',' ',N,N,-1,-1)) - IF (WANTU .OR. WANTVT) THEN - MAXWRK = MAX(MAXWRK, - $ N*N+2*N+N*ILAENV(1,'CUNMQR','LN',N,N,N,-1)) - END IF + MAXWRK = N + N* + $ ILAENV( 1, 'SGEQRF', ' ', M, N, -1, -1 ) + MAXWRK = MAX( MAXWRK, N*N + N + 2*N* + $ ILAENV( 1, 'SGEBRD', ' ', N, N, -1, -1 ) ) + MINWRK = N*(N+4) ELSE * * Path 2 (M at least N, but not much larger) * - MINWRK = 3*N + M - MAXWRK = 2*N + (M+N)*ILAENV(1,'CGEBRD',' ',M,N,-1,-1) - IF (WANTU .OR. 
WANTVT) THEN - MAXWRK = MAX(MAXWRK, - $ 2*N+N*ILAENV(1,'CUNMQR','LN',N,N,N,-1)) - END IF + MAXWRK = 2*N + ( M+N )* + $ ILAENV( 1, 'CGEBRD', ' ', M, N, -1, -1 ) + MINWRK = 2*N + M END IF ELSE MNTHR = ILAENV( 6, 'CGESVD', JOBU // JOBVT, M, N, 0, 0 ) @@ -421,25 +409,18 @@ * * Path 1t (N much larger than M) * - MINWRK = M*(M+5) - MAXWRK = M + M*ILAENV(1,'CGELQF',' ',M,N,-1,-1) - MAXWRK = MAX(MAXWRK, - $ M*M+2*M+2*M*ILAENV(1,'CGEBRD',' ',M,M,-1,-1)) - IF (WANTU .OR. WANTVT) THEN - MAXWRK = MAX(MAXWRK, - $ M*M+2*M+M*ILAENV(1,'CUNMQR','LN',M,M,M,-1)) - END IF + MAXWRK = M + M* + $ ILAENV( 1, 'CGELQF', ' ', M, N, -1, -1 ) + MAXWRK = MAX( MAXWRK, M*M + M + 2*M* + $ ILAENV( 1, 'CGEBRD', ' ', M, M, -1, -1 ) ) + MINWRK = M*(M+4) ELSE * * Path 2t (N greater than M, but not much larger) * -* - MINWRK = 3*M + N - MAXWRK = 2*M + (M+N)*ILAENV(1,'CGEBRD',' ',M,N,-1,-1) - IF (WANTU .OR. WANTVT) THEN - MAXWRK = MAX(MAXWRK, - $ 2*M+M*ILAENV(1,'CUNMQR','LN',M,M,M,-1)) - END IF + MAXWRK = M*(M*2+19) + ( M+N )* + $ ILAENV( 1, 'CGEBRD', ' ', M, N, -1, -1 ) + MINWRK = 2*M + N END IF END IF END IF @@ -537,14 +518,14 @@ CALL CGEBRD( N, N, WORK( IQRF ), N, RWORK( ID ), $ RWORK( IE ), WORK( ITAUQ ), WORK( ITAUP ), $ WORK( ITEMP ), LWORK-ITEMP+1, INFO ) - ITEMPR = ITGKZ + N*(N*2+1) + ITEMP = ITGKZ + N*(N*2+1) * * Solve eigenvalue problem TGK*Z=Z*S. * (Workspace: need 2*N*N+14*N) * CALL SBDSVDX( 'U', JOBZ, RNGTGK, N, RWORK( ID ), $ RWORK( IE ), VL, VU, ILTGK, IUTGK, NS, S, - $ RWORK( ITGKZ ), N*2, RWORK( ITEMPR ), + $ RWORK( ITGKZ ), N*2, RWORK( ITEMP ), $ IWORK, INFO) * * If needed, compute left singular vectors. @@ -558,7 +539,7 @@ END DO K = K + N END DO - CALL CLASET( 'A', M-N, NS, CZERO, CZERO, U( N+1,1 ), LDU) + CALL CLASET( 'A', M-N, N, CZERO, CZERO, U( N+1,1 ), LDU ) * * Call CUNMBR to compute QB*UB. * (Workspace in WORK( ITEMP ): need N, prefer N*NB) @@ -613,14 +594,14 @@ CALL CGEBRD( M, N, A, LDA, RWORK( ID ), RWORK( IE ), $ WORK( ITAUQ ), WORK( ITAUP ), WORK( ITEMP ), $ LWORK-ITEMP+1, INFO ) - ITEMPR = ITGKZ + N*(N*2+1) + ITEMP = ITGKZ + N*(N*2+1) * * Solve eigenvalue problem TGK*Z=Z*S. * (Workspace: need 2*N*N+14*N) * CALL SBDSVDX( 'U', JOBZ, RNGTGK, N, RWORK( ID ), $ RWORK( IE ), VL, VU, ILTGK, IUTGK, NS, S, - $ RWORK( ITGKZ ), N*2, RWORK( ITEMPR ), + $ RWORK( ITGKZ ), N*2, RWORK( ITEMP ), $ IWORK, INFO) * * If needed, compute left singular vectors. @@ -634,7 +615,7 @@ END DO K = K + N END DO - CALL CLASET( 'A', M-N, NS, CZERO, CZERO, U( N+1,1 ), LDU) + CALL CLASET( 'A', M-N, N, CZERO, CZERO, U( N+1,1 ), LDU ) * * Call CUNMBR to compute QB*UB. * (Workspace in WORK( ITEMP ): need N, prefer N*NB) @@ -700,14 +681,14 @@ CALL CGEBRD( M, M, WORK( ILQF ), M, RWORK( ID ), $ RWORK( IE ), WORK( ITAUQ ), WORK( ITAUP ), $ WORK( ITEMP ), LWORK-ITEMP+1, INFO ) - ITEMPR = ITGKZ + M*(M*2+1) + ITEMP = ITGKZ + M*(M*2+1) * * Solve eigenvalue problem TGK*Z=Z*S. * (Workspace: need 2*M*M+14*M) * CALL SBDSVDX( 'U', JOBZ, RNGTGK, M, RWORK( ID ), $ RWORK( IE ), VL, VU, ILTGK, IUTGK, NS, S, - $ RWORK( ITGKZ ), M*2, RWORK( ITEMPR ), + $ RWORK( ITGKZ ), M*2, RWORK( ITEMP ), $ IWORK, INFO) * * If needed, compute left singular vectors. 
@@ -741,7 +722,7 @@ END DO K = K + M END DO - CALL CLASET( 'A', NS, N-M, CZERO, CZERO, + CALL CLASET( 'A', M, N-M, CZERO, CZERO, $ VT( 1,M+1 ), LDVT ) * * Call CUNMBR to compute (VB**T)*(PB**T) @@ -777,14 +758,14 @@ CALL CGEBRD( M, N, A, LDA, RWORK( ID ), RWORK( IE ), $ WORK( ITAUQ ), WORK( ITAUP ), WORK( ITEMP ), $ LWORK-ITEMP+1, INFO ) - ITEMPR = ITGKZ + M*(M*2+1) + ITEMP = ITGKZ + M*(M*2+1) * * Solve eigenvalue problem TGK*Z=Z*S. * (Workspace: need 2*M*M+14*M) * CALL SBDSVDX( 'L', JOBZ, RNGTGK, M, RWORK( ID ), $ RWORK( IE ), VL, VU, ILTGK, IUTGK, NS, S, - $ RWORK( ITGKZ ), M*2, RWORK( ITEMPR ), + $ RWORK( ITGKZ ), M*2, RWORK( ITEMP ), $ IWORK, INFO) * * If needed, compute left singular vectors. @@ -818,7 +799,7 @@ END DO K = K + M END DO - CALL CLASET( 'A', NS, N-M, CZERO, CZERO, + CALL CLASET( 'A', M, N-M, CZERO, CZERO, $ VT( 1,M+1 ), LDVT ) * * Call CUNMBR to compute VB**T * PB**T diff --git a/lapack-netlib/SRC/dgesvdx.f b/lapack-netlib/SRC/dgesvdx.f index 4588083f8..cfa2ff05d 100644 --- a/lapack-netlib/SRC/dgesvdx.f +++ b/lapack-netlib/SRC/dgesvdx.f @@ -169,7 +169,7 @@ *> vectors, stored columnwise) as specified by RANGE; if *> JOBU = 'N', U is not referenced. *> Note: The user must ensure that UCOL >= NS; if RANGE = 'V', -*> the exact value of NS is not known in advance and an upper +*> the exact value of NS is not known ILQFin advance and an upper *> bound must be used. *> \endverbatim *> @@ -357,14 +357,8 @@ IF( INFO.EQ.0 ) THEN IF( WANTU .AND. LDU.LT.M ) THEN INFO = -15 - ELSE IF( WANTVT ) THEN - IF( INDS ) THEN - IF( LDVT.LT.IU-IL+1 ) THEN - INFO = -17 - END IF - ELSE IF( LDVT.LT.MINMN ) THEN - INFO = -17 - END IF + ELSE IF( WANTVT .AND. LDVT.LT.MINMN ) THEN + INFO = -16 END IF END IF END IF @@ -386,34 +380,18 @@ * * Path 1 (M much larger than N) * - MAXWRK = N + + MAXWRK = N*(N*2+16) + $ N*ILAENV( 1, 'DGEQRF', ' ', M, N, -1, -1 ) - MAXWRK = MAX( MAXWRK, N*(N+5) + 2*N* + MAXWRK = MAX( MAXWRK, N*(N*2+20) + 2*N* $ ILAENV( 1, 'DGEBRD', ' ', N, N, -1, -1 ) ) - IF (WANTU) THEN - MAXWRK = MAX(MAXWRK,N*(N*3+6)+N* - $ ILAENV( 1, 'DORMQR', ' ', N, N, -1, -1 ) ) - END IF - IF (WANTVT) THEN - MAXWRK = MAX(MAXWRK,N*(N*3+6)+N* - $ ILAENV( 1, 'DORMLQ', ' ', N, N, -1, -1 ) ) - END IF - MINWRK = N*(N*3+20) + MINWRK = N*(N*2+21) ELSE * * Path 2 (M at least N, but not much larger) * - MAXWRK = 4*N + ( M+N )* + MAXWRK = N*(N*2+19) + ( M+N )* $ ILAENV( 1, 'DGEBRD', ' ', M, N, -1, -1 ) - IF (WANTU) THEN - MAXWRK = MAX(MAXWRK,N*(N*2+5)+N* - $ ILAENV( 1, 'DORMQR', ' ', N, N, -1, -1 ) ) - END IF - IF (WANTVT) THEN - MAXWRK = MAX(MAXWRK,N*(N*2+5)+N* - $ ILAENV( 1, 'DORMLQ', ' ', N, N, -1, -1 ) ) - END IF - MINWRK = MAX(N*(N*2+19),4*N+M) + MINWRK = N*(N*2+20) + M END IF ELSE MNTHR = ILAENV( 6, 'DGESVD', JOBU // JOBVT, M, N, 0, 0 ) @@ -421,34 +399,18 @@ * * Path 1t (N much larger than M) * - MAXWRK = M + + MAXWRK = M*(M*2+16) + $ M*ILAENV( 1, 'DGELQF', ' ', M, N, -1, -1 ) - MAXWRK = MAX( MAXWRK, M*(M+5) + 2*M* + MAXWRK = MAX( MAXWRK, M*(M*2+20) + 2*M* $ ILAENV( 1, 'DGEBRD', ' ', M, M, -1, -1 ) ) - IF (WANTU) THEN - MAXWRK = MAX(MAXWRK,M*(M*3+6)+M* - $ ILAENV( 1, 'DORMQR', ' ', M, M, -1, -1 ) ) - END IF - IF (WANTVT) THEN - MAXWRK = MAX(MAXWRK,M*(M*3+6)+M* - $ ILAENV( 1, 'DORMLQ', ' ', M, M, -1, -1 ) ) - END IF - MINWRK = M*(M*3+20) + MINWRK = M*(M*2+21) ELSE * -* Path 2t (N at least M, but not much larger) +* Path 2t (N greater than M, but not much larger) * - MAXWRK = 4*M + ( M+N )* + MAXWRK = M*(M*2+19) + ( M+N )* $ ILAENV( 1, 'DGEBRD', ' ', M, N, -1, -1 ) - IF (WANTU) THEN - MAXWRK = 
MAX(MAXWRK,M*(M*2+5)+M* - $ ILAENV( 1, 'DORMQR', ' ', M, M, -1, -1 ) ) - END IF - IF (WANTVT) THEN - MAXWRK = MAX(MAXWRK,M*(M*2+5)+M* - $ ILAENV( 1, 'DORMLQ', ' ', M, M, -1, -1 ) ) - END IF - MINWRK = MAX(M*(M*2+19),4*M+N) + MINWRK = M*(M*2+20) + N END IF END IF END IF @@ -560,7 +522,7 @@ CALL DCOPY( N, WORK( J ), 1, U( 1,I ), 1 ) J = J + N*2 END DO - CALL DLASET( 'A', M-N, NS, ZERO, ZERO, U( N+1,1 ), LDU ) + CALL DLASET( 'A', M-N, N, ZERO, ZERO, U( N+1,1 ), LDU ) * * Call DORMBR to compute QB*UB. * (Workspace in WORK( ITEMP ): need N, prefer N*NB) @@ -629,7 +591,7 @@ CALL DCOPY( N, WORK( J ), 1, U( 1,I ), 1 ) J = J + N*2 END DO - CALL DLASET( 'A', M-N, NS, ZERO, ZERO, U( N+1,1 ), LDU ) + CALL DLASET( 'A', M-N, N, ZERO, ZERO, U( N+1,1 ), LDU ) * * Call DORMBR to compute QB*UB. * (Workspace in WORK( ITEMP ): need N, prefer N*NB) @@ -725,7 +687,7 @@ CALL DCOPY( M, WORK( J ), 1, VT( I,1 ), LDVT ) J = J + M*2 END DO - CALL DLASET( 'A', NS, N-M, ZERO, ZERO, VT( 1,M+1 ), LDVT) + CALL DLASET( 'A', M, N-M, ZERO, ZERO, VT( 1,M+1 ), LDVT ) * * Call DORMBR to compute (VB**T)*(PB**T) * (Workspace in WORK( ITEMP ): need M, prefer M*NB) @@ -794,7 +756,7 @@ CALL DCOPY( M, WORK( J ), 1, VT( I,1 ), LDVT ) J = J + M*2 END DO - CALL DLASET( 'A', NS, N-M, ZERO, ZERO, VT( 1,M+1 ), LDVT) + CALL DLASET( 'A', M, N-M, ZERO, ZERO, VT( 1,M+1 ), LDVT ) * * Call DORMBR to compute VB**T * PB**T * (Workspace in WORK( ITEMP ): need M, prefer M*NB) diff --git a/lapack-netlib/SRC/sgesvdx.f b/lapack-netlib/SRC/sgesvdx.f index 9128a7c0a..aae8b0764 100644 --- a/lapack-netlib/SRC/sgesvdx.f +++ b/lapack-netlib/SRC/sgesvdx.f @@ -169,7 +169,7 @@ *> vectors, stored columnwise) as specified by RANGE; if *> JOBU = 'N', U is not referenced. *> Note: The user must ensure that UCOL >= NS; if RANGE = 'V', -*> the exact value of NS is not known in advance and an upper +*> the exact value of NS is not known ILQFin advance and an upper *> bound must be used. *> \endverbatim *> @@ -357,14 +357,8 @@ IF( INFO.EQ.0 ) THEN IF( WANTU .AND. LDU.LT.M ) THEN INFO = -15 - ELSE IF( WANTVT ) THEN - IF( INDS ) THEN - IF( LDVT.LT.IU-IL+1 ) THEN - INFO = -17 - END IF - ELSE IF( LDVT.LT.MINMN ) THEN - INFO = -17 - END IF + ELSE IF( WANTVT .AND. 
LDVT.LT.MINMN ) THEN + INFO = -16 END IF END IF END IF @@ -386,34 +380,18 @@ * * Path 1 (M much larger than N) * - MAXWRK = N + + MAXWRK = N*(N*2+16) + $ N*ILAENV( 1, 'SGEQRF', ' ', M, N, -1, -1 ) - MAXWRK = MAX( MAXWRK, N*(N+5) + 2*N* + MAXWRK = MAX( MAXWRK, N*(N*2+20) + 2*N* $ ILAENV( 1, 'SGEBRD', ' ', N, N, -1, -1 ) ) - IF (WANTU) THEN - MAXWRK = MAX(MAXWRK,N*(N*3+6)+N* - $ ILAENV( 1, 'SORMQR', ' ', N, N, -1, -1 ) ) - END IF - IF (WANTVT) THEN - MAXWRK = MAX(MAXWRK,N*(N*3+6)+N* - $ ILAENV( 1, 'SORMLQ', ' ', N, N, -1, -1 ) ) - END IF - MINWRK = N*(N*3+20) + MINWRK = N*(N*2+21) ELSE * * Path 2 (M at least N, but not much larger) * - MAXWRK = 4*N + ( M+N )* + MAXWRK = N*(N*2+19) + ( M+N )* $ ILAENV( 1, 'SGEBRD', ' ', M, N, -1, -1 ) - IF (WANTU) THEN - MAXWRK = MAX(MAXWRK,N*(N*2+5)+N* - $ ILAENV( 1, 'SORMQR', ' ', N, N, -1, -1 ) ) - END IF - IF (WANTVT) THEN - MAXWRK = MAX(MAXWRK,N*(N*2+5)+N* - $ ILAENV( 1, 'SORMLQ', ' ', N, N, -1, -1 ) ) - END IF - MINWRK = MAX(N*(N*2+19),4*N+M) + MINWRK = N*(N*2+20) + M END IF ELSE MNTHR = ILAENV( 6, 'SGESVD', JOBU // JOBVT, M, N, 0, 0 ) @@ -421,34 +399,18 @@ * * Path 1t (N much larger than M) * - MAXWRK = M + + MAXWRK = M*(M*2+16) + $ M*ILAENV( 1, 'SGELQF', ' ', M, N, -1, -1 ) - MAXWRK = MAX( MAXWRK, M*(M+5) + 2*M* + MAXWRK = MAX( MAXWRK, M*(M*2+20) + 2*M* $ ILAENV( 1, 'SGEBRD', ' ', M, M, -1, -1 ) ) - IF (WANTU) THEN - MAXWRK = MAX(MAXWRK,M*(M*3+6)+M* - $ ILAENV( 1, 'SORMQR', ' ', M, M, -1, -1 ) ) - END IF - IF (WANTVT) THEN - MAXWRK = MAX(MAXWRK,M*(M*3+6)+M* - $ ILAENV( 1, 'SORMLQ', ' ', M, M, -1, -1 ) ) - END IF - MINWRK = M*(M*3+20) + MINWRK = M*(M*2+21) ELSE * -* Path 2t (N at least M, but not much larger) +* Path 2t (N greater than M, but not much larger) * - MAXWRK = 4*M + ( M+N )* + MAXWRK = M*(M*2+19) + ( M+N )* $ ILAENV( 1, 'SGEBRD', ' ', M, N, -1, -1 ) - IF (WANTU) THEN - MAXWRK = MAX(MAXWRK,M*(M*2+5)+M* - $ ILAENV( 1, 'SORMQR', ' ', M, M, -1, -1 ) ) - END IF - IF (WANTVT) THEN - MAXWRK = MAX(MAXWRK,M*(M*2+5)+M* - $ ILAENV( 1, 'SORMLQ', ' ', M, M, -1, -1 ) ) - END IF - MINWRK = MAX(M*(M*2+19),4*M+N) + MINWRK = M*(M*2+20) + N END IF END IF END IF @@ -560,7 +522,7 @@ CALL SCOPY( N, WORK( J ), 1, U( 1,I ), 1 ) J = J + N*2 END DO - CALL SLASET( 'A', M-N, NS, ZERO, ZERO, U( N+1,1 ), LDU ) + CALL SLASET( 'A', M-N, N, ZERO, ZERO, U( N+1,1 ), LDU ) * * Call SORMBR to compute QB*UB. * (Workspace in WORK( ITEMP ): need N, prefer N*NB) @@ -629,7 +591,7 @@ CALL SCOPY( N, WORK( J ), 1, U( 1,I ), 1 ) J = J + N*2 END DO - CALL SLASET( 'A', M-N, NS, ZERO, ZERO, U( N+1,1 ), LDU ) + CALL SLASET( 'A', M-N, N, ZERO, ZERO, U( N+1,1 ), LDU ) * * Call SORMBR to compute QB*UB. * (Workspace in WORK( ITEMP ): need N, prefer N*NB) @@ -725,7 +687,7 @@ CALL SCOPY( M, WORK( J ), 1, VT( I,1 ), LDVT ) J = J + M*2 END DO - CALL SLASET( 'A', NS, N-M, ZERO, ZERO, VT( 1,M+1 ), LDVT) + CALL SLASET( 'A', M, N-M, ZERO, ZERO, VT( 1,M+1 ), LDVT ) * * Call SORMBR to compute (VB**T)*(PB**T) * (Workspace in WORK( ITEMP ): need M, prefer M*NB) @@ -794,7 +756,7 @@ CALL SCOPY( M, WORK( J ), 1, VT( I,1 ), LDVT ) J = J + M*2 END DO - CALL SLASET( 'A', NS, N-M, ZERO, ZERO, VT( 1,M+1 ), LDVT) + CALL SLASET( 'A', M, N-M, ZERO, ZERO, VT( 1,M+1 ), LDVT ) * * Call SORMBR to compute VB**T * PB**T * (Workspace in WORK( ITEMP ): need M, prefer M*NB) diff --git a/lapack-netlib/SRC/zgesvdx.f b/lapack-netlib/SRC/zgesvdx.f index c9509e458..6f7d5ba04 100644 --- a/lapack-netlib/SRC/zgesvdx.f +++ b/lapack-netlib/SRC/zgesvdx.f @@ -36,30 +36,27 @@ * .. 
* * -*> \par Purpose: -* ============= -*> -*> \verbatim -*> -*> ZGESVDX computes the singular value decomposition (SVD) of a complex -*> M-by-N matrix A, optionally computing the left and/or right singular -*> vectors. The SVD is written -*> -*> A = U * SIGMA * transpose(V) -*> -*> where SIGMA is an M-by-N matrix which is zero except for its -*> min(m,n) diagonal elements, U is an M-by-M unitary matrix, and -*> V is an N-by-N unitary matrix. The diagonal elements of SIGMA -*> are the singular values of A; they are real and non-negative, and -*> are returned in descending order. The first min(m,n) columns of -*> U and V are the left and right singular vectors of A. -*> -*> ZGESVDX uses an eigenvalue problem for obtaining the SVD, which -*> allows for the computation of a subset of singular values and -*> vectors. See DBDSVDX for details. -*> -*> Note that the routine returns V**T, not V. -*> \endverbatim +* Purpose +* ======= +* +* ZGESVDX computes the singular value decomposition (SVD) of a complex +* M-by-N matrix A, optionally computing the left and/or right singular +* vectors. The SVD is written +* +* A = U * SIGMA * transpose(V) +* +* where SIGMA is an M-by-N matrix which is zero except for its +* min(m,n) diagonal elements, U is an M-by-M unitary matrix, and +* V is an N-by-N unitary matrix. The diagonal elements of SIGMA +* are the singular values of A; they are real and non-negative, and +* are returned in descending order. The first min(m,n) columns of +* U and V are the left and right singular vectors of A. +* +* ZGESVDX uses an eigenvalue problem for obtaining the SVD, which +* allows for the computation of a subset of singular values and +* vectors. See DBDSVDX for details. +* +* Note that the routine returns V**T, not V. * * Arguments: * ========== @@ -110,7 +107,7 @@ *> *> \param[in,out] A *> \verbatim -*> A is COMPLEX*16 array, dimension (LDA,N) +*> A is COMPLEX array, dimension (LDA,N) *> On entry, the M-by-N matrix A. *> On exit, the contents of A are destroyed. *> \endverbatim @@ -170,7 +167,7 @@ *> vectors, stored columnwise) as specified by RANGE; if *> JOBU = 'N', U is not referenced. *> Note: The user must ensure that UCOL >= NS; if RANGE = 'V', -*> the exact value of NS is not known in advance and an upper +*> the exact value of NS is not known ILQFin advance and an upper *> bound must be used. *> \endverbatim *> @@ -294,8 +291,8 @@ CHARACTER JOBZ, RNGTGK LOGICAL ALLS, INDS, LQUERY, VALS, WANTU, WANTVT INTEGER I, ID, IE, IERR, ILQF, ILTGK, IQRF, ISCL, - $ ITAU, ITAUP, ITAUQ, ITEMP, ITEMPR, ITGKZ, - $ IUTGK, J, K, MAXWRK, MINMN, MINWRK, MNTHR + $ ITAU, ITAUP, ITAUQ, ITEMP, ITGKZ, IUTGK, + $ J, K, MAXWRK, MINMN, MINWRK, MNTHR DOUBLE PRECISION ABSTOL, ANRM, BIGNUM, EPS, SMLNUM * .. * .. Local Arrays .. @@ -367,14 +364,8 @@ IF( INFO.EQ.0 ) THEN IF( WANTU .AND. LDU.LT.M ) THEN INFO = -15 - ELSE IF( WANTVT ) THEN - IF( INDS ) THEN - IF( LDVT.LT.IU-IL+1 ) THEN - INFO = -17 - END IF - ELSE IF( LDVT.LT.MINMN ) THEN - INFO = -17 - END IF + ELSE IF( WANTVT .AND. LDVT.LT.MINMN ) THEN + INFO = -16 END IF END IF END IF @@ -396,24 +387,18 @@ * * Path 1 (M much larger than N) * - MINWRK = N*(N+5) - MAXWRK = N + N*ILAENV(1,'ZGEQRF',' ',M,N,-1,-1) - MAXWRK = MAX(MAXWRK, - $ N*N+2*N+2*N*ILAENV(1,'ZGEBRD',' ',N,N,-1,-1)) - IF (WANTU .OR. 
WANTVT) THEN - MAXWRK = MAX(MAXWRK, - $ N*N+2*N+N*ILAENV(1,'ZUNMQR','LN',N,N,N,-1)) - END IF + MAXWRK = N + N* + $ ILAENV( 1, 'DGEQRF', ' ', M, N, -1, -1 ) + MAXWRK = MAX( MAXWRK, N*N + N + 2*N* + $ ILAENV( 1, 'DGEBRD', ' ', N, N, -1, -1 ) ) + MINWRK = N*(N+4) ELSE * * Path 2 (M at least N, but not much larger) * - MINWRK = 3*N + M - MAXWRK = 2*N + (M+N)*ILAENV(1,'ZGEBRD',' ',M,N,-1,-1) - IF (WANTU .OR. WANTVT) THEN - MAXWRK = MAX(MAXWRK, - $ 2*N+N*ILAENV(1,'ZUNMQR','LN',N,N,N,-1)) - END IF + MAXWRK = 2*N + ( M+N )* + $ ILAENV( 1, 'ZGEBRD', ' ', M, N, -1, -1 ) + MINWRK = 2*N + M END IF ELSE MNTHR = ILAENV( 6, 'ZGESVD', JOBU // JOBVT, M, N, 0, 0 ) @@ -421,25 +406,18 @@ * * Path 1t (N much larger than M) * - MINWRK = M*(M+5) - MAXWRK = M + M*ILAENV(1,'ZGELQF',' ',M,N,-1,-1) - MAXWRK = MAX(MAXWRK, - $ M*M+2*M+2*M*ILAENV(1,'ZGEBRD',' ',M,M,-1,-1)) - IF (WANTU .OR. WANTVT) THEN - MAXWRK = MAX(MAXWRK, - $ M*M+2*M+M*ILAENV(1,'ZUNMQR','LN',M,M,M,-1)) - END IF + MAXWRK = M + M* + $ ILAENV( 1, 'ZGELQF', ' ', M, N, -1, -1 ) + MAXWRK = MAX( MAXWRK, M*M + M + 2*M* + $ ILAENV( 1, 'ZGEBRD', ' ', M, M, -1, -1 ) ) + MINWRK = M*(M+4) ELSE * * Path 2t (N greater than M, but not much larger) * -* - MINWRK = 3*M + N - MAXWRK = 2*M + (M+N)*ILAENV(1,'ZGEBRD',' ',M,N,-1,-1) - IF (WANTU .OR. WANTVT) THEN - MAXWRK = MAX(MAXWRK, - $ 2*M+M*ILAENV(1,'ZUNMQR','LN',M,M,M,-1)) - END IF + MAXWRK = M*(M*2+19) + ( M+N )* + $ ILAENV( 1, 'ZGEBRD', ' ', M, N, -1, -1 ) + MINWRK = 2*M + N END IF END IF END IF @@ -537,14 +515,14 @@ CALL ZGEBRD( N, N, WORK( IQRF ), N, RWORK( ID ), $ RWORK( IE ), WORK( ITAUQ ), WORK( ITAUP ), $ WORK( ITEMP ), LWORK-ITEMP+1, INFO ) - ITEMPR = ITGKZ + N*(N*2+1) + ITEMP = ITGKZ + N*(N*2+1) * * Solve eigenvalue problem TGK*Z=Z*S. * (Workspace: need 2*N*N+14*N) * CALL DBDSVDX( 'U', JOBZ, RNGTGK, N, RWORK( ID ), $ RWORK( IE ), VL, VU, ILTGK, IUTGK, NS, S, - $ RWORK( ITGKZ ), N*2, RWORK( ITEMPR ), + $ RWORK( ITGKZ ), N*2, RWORK( ITEMP ), $ IWORK, INFO) * * If needed, compute left singular vectors. @@ -558,7 +536,7 @@ END DO K = K + N END DO - CALL ZLASET( 'A', M-N, NS, CZERO, CZERO, U( N+1,1 ), LDU) + CALL ZLASET( 'A', M-N, N, CZERO, CZERO, U( N+1,1 ), LDU ) * * Call ZUNMBR to compute QB*UB. * (Workspace in WORK( ITEMP ): need N, prefer N*NB) @@ -613,14 +591,14 @@ CALL ZGEBRD( M, N, A, LDA, RWORK( ID ), RWORK( IE ), $ WORK( ITAUQ ), WORK( ITAUP ), WORK( ITEMP ), $ LWORK-ITEMP+1, INFO ) - ITEMPR = ITGKZ + N*(N*2+1) + ITEMP = ITGKZ + N*(N*2+1) * * Solve eigenvalue problem TGK*Z=Z*S. * (Workspace: need 2*N*N+14*N) * CALL DBDSVDX( 'U', JOBZ, RNGTGK, N, RWORK( ID ), $ RWORK( IE ), VL, VU, ILTGK, IUTGK, NS, S, - $ RWORK( ITGKZ ), N*2, RWORK( ITEMPR ), + $ RWORK( ITGKZ ), N*2, RWORK( ITEMP ), $ IWORK, INFO) * * If needed, compute left singular vectors. @@ -634,7 +612,7 @@ END DO K = K + N END DO - CALL ZLASET( 'A', M-N, NS, CZERO, CZERO, U( N+1,1 ), LDU) + CALL ZLASET( 'A', M-N, N, CZERO, CZERO, U( N+1,1 ), LDU ) * * Call ZUNMBR to compute QB*UB. * (Workspace in WORK( ITEMP ): need N, prefer N*NB) @@ -700,14 +678,14 @@ CALL ZGEBRD( M, M, WORK( ILQF ), M, RWORK( ID ), $ RWORK( IE ), WORK( ITAUQ ), WORK( ITAUP ), $ WORK( ITEMP ), LWORK-ITEMP+1, INFO ) - ITEMPR = ITGKZ + M*(M*2+1) + ITEMP = ITGKZ + M*(M*2+1) * * Solve eigenvalue problem TGK*Z=Z*S. * (Workspace: need 2*M*M+14*M) * CALL DBDSVDX( 'U', JOBZ, RNGTGK, M, RWORK( ID ), $ RWORK( IE ), VL, VU, ILTGK, IUTGK, NS, S, - $ RWORK( ITGKZ ), M*2, RWORK( ITEMPR ), + $ RWORK( ITGKZ ), M*2, RWORK( ITEMP ), $ IWORK, INFO) * * If needed, compute left singular vectors. 
@@ -741,7 +719,7 @@ END DO K = K + M END DO - CALL ZLASET( 'A', NS, N-M, CZERO, CZERO, + CALL ZLASET( 'A', M, N-M, CZERO, CZERO, $ VT( 1,M+1 ), LDVT ) * * Call ZUNMBR to compute (VB**T)*(PB**T) @@ -777,14 +755,14 @@ CALL ZGEBRD( M, N, A, LDA, RWORK( ID ), RWORK( IE ), $ WORK( ITAUQ ), WORK( ITAUP ), WORK( ITEMP ), $ LWORK-ITEMP+1, INFO ) - ITEMPR = ITGKZ + M*(M*2+1) + ITEMP = ITGKZ + M*(M*2+1) * * Solve eigenvalue problem TGK*Z=Z*S. * (Workspace: need 2*M*M+14*M) * CALL DBDSVDX( 'L', JOBZ, RNGTGK, M, RWORK( ID ), $ RWORK( IE ), VL, VU, ILTGK, IUTGK, NS, S, - $ RWORK( ITGKZ ), M*2, RWORK( ITEMPR ), + $ RWORK( ITGKZ ), M*2, RWORK( ITEMP ), $ IWORK, INFO) * * If needed, compute left singular vectors. @@ -818,7 +796,7 @@ END DO K = K + M END DO - CALL ZLASET( 'A', NS, N-M, CZERO, CZERO, + CALL ZLASET( 'A', M, N-M, CZERO, CZERO, $ VT( 1,M+1 ), LDVT ) * * Call ZUNMBR to compute VB**T * PB**T From c99cc41cbdab147f5700cd654abd45662e6796f8 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 9 Mar 2016 14:02:03 +0100 Subject: [PATCH 27/37] Added optimized zgemv_n kernel for bulldozer, piledriver and steamroller --- kernel/x86_64/KERNEL.PILEDRIVER | 2 +- kernel/x86_64/KERNEL.STEAMROLLER | 2 +- kernel/x86_64/zgemv_n_4.c | 3 +- kernel/x86_64/zgemv_n_microk_bulldozer-4.c | 514 +++++++++++++++++++++ 4 files changed, 518 insertions(+), 3 deletions(-) create mode 100644 kernel/x86_64/zgemv_n_microk_bulldozer-4.c diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 6c726a6e9..3ad142063 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -11,7 +11,7 @@ ZAXPYKERNEL = zaxpy.c SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c -ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVNKERNEL = zgemv_n_4.c ZGEMVTKERNEL = zgemv_t_4.c DGEMVNKERNEL = dgemv_n_bulldozer.S diff --git a/kernel/x86_64/KERNEL.STEAMROLLER b/kernel/x86_64/KERNEL.STEAMROLLER index 5291cc624..f14c82303 100644 --- a/kernel/x86_64/KERNEL.STEAMROLLER +++ b/kernel/x86_64/KERNEL.STEAMROLLER @@ -24,7 +24,7 @@ SGEMVTKERNEL = sgemv_t_4.c DGEMVNKERNEL = dgemv_n_4.c DGEMVTKERNEL = dgemv_t_4.c -ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVNKERNEL = zgemv_t_4.c ZGEMVTKERNEL = zgemv_t_4.c DCOPYKERNEL = dcopy_bulldozer.S diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index 5ace6123b..63e49f2af 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -34,9 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zgemv_n_microk_haswell-4.c" #elif defined(SANDYBRIDGE) #include "zgemv_n_microk_sandy-4.c" +#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#include "zgemv_n_microk_bulldozer-4.c" #endif - #define NBMAX 1024 #ifndef HAVE_KERNEL_4x4 diff --git a/kernel/x86_64/zgemv_n_microk_bulldozer-4.c b/kernel/x86_64/zgemv_n_microk_bulldozer-4.c new file mode 100644 index 000000000..f367ad607 --- /dev/null +++ b/kernel/x86_64/zgemv_n_microk_bulldozer-4.c @@ -0,0 +1,514 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + if ( n > 384 ) + { + + __asm__ __volatile__ + ( + + "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0 + "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 + "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1 + "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1 + "vbroadcastsd 32(%2), %%ymm4 \n\t" // real part x2 + "vbroadcastsd 40(%2), %%ymm5 \n\t" // imag part x2 + "vbroadcastsd 48(%2), %%ymm6 \n\t" // real part x3 + "vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3 + + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 512(%4,%0,8) \n\t" + + "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 + "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 + + "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1 + "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1 + + "prefetcht0 512(%5,%0,8) \n\t" + + "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vfmaddpd %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddpd %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "prefetcht0 512(%6,%0,8) \n\t" + + "vfmaddpd %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmaddpd %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a2 + "vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a2 + + "vfmaddpd %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddpd %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, 
a_i[1] * x_i + + "vmovups (%7,%0,8), %%ymm10 \n\t" // 2 complex values form a3 + "vmovups 32(%7,%0,8), %%ymm11 \n\t" // 2 complex values form a3 + + "vfmaddpd %%ymm14, %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmaddpd %%ymm15, %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "prefetcht0 512(%7,%0,8) \n\t" + + "vfmaddpd %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddpd %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vfmaddpd %%ymm14, %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmaddpd %%ymm15, %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vmovups (%3,%0,8), %%ymm10 \n\t" + "vmovups 32(%3,%0,8), %%ymm11 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" +#else + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t" + "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t" + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" +#endif + + "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" + + "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y + "vmovups %%ymm13, 32(%3,%0,8) \n\t" + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + } + else + { + + __asm__ __volatile__ + ( + + "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0 + "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 + "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1 + "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1 + "vbroadcastsd 32(%2), %%ymm4 \n\t" // real part x2 + "vbroadcastsd 40(%2), %%ymm5 \n\t" // imag part x2 + "vbroadcastsd 48(%2), %%ymm6 \n\t" // real part x3 + "vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3 + + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 + "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 + + "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1 + "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1 + + + "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vfmaddpd %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddpd %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + + "vfmaddpd %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, 
a_i[3] * x_r + "vfmaddpd %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a2 + "vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a2 + + "vfmaddpd %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddpd %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vmovups (%7,%0,8), %%ymm10 \n\t" // 2 complex values form a3 + "vmovups 32(%7,%0,8), %%ymm11 \n\t" // 2 complex values form a3 + + "vfmaddpd %%ymm14, %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmaddpd %%ymm15, %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + + "vfmaddpd %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddpd %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vfmaddpd %%ymm14, %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmaddpd %%ymm15, %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vmovups (%3,%0,8), %%ymm10 \n\t" + "vmovups 32(%3,%0,8), %%ymm11 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" +#else + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t" + "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t" + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" +#endif + + "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" + + "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y + "vmovups %%ymm13, 32(%3,%0,8) \n\t" + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + + + } + + +} + +#define HAVE_KERNEL_4x2 1 +static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0 + "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 + "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1 + "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1 + + + // ".align 16 \n\t" + "1: \n\t" + "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 + "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 + + "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1 + "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1 + + "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // 
a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vfmaddpd %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddpd %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vfmaddpd %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmaddpd %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + + "vmovups (%3,%0,8), %%ymm10 \n\t" + "vmovups 32(%3,%0,8), %%ymm11 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" +#else + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t" + "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t" + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" +#endif + + "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" + + "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y + "vmovups %%ymm13, 32(%3,%0,8) \n\t" + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]) // 5 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +#define HAVE_KERNEL_4x1 1 +static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0 + "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 + + // ".align 16 \n\t" + "1: \n\t" + "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 + "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 + + "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vmovups (%3,%0,8), %%ymm10 \n\t" + "vmovups 32(%3,%0,8), %%ymm11 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" +#else + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t" + "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t" + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" +#endif + + "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" + + "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y + "vmovups %%ymm13, 32(%3,%0,8) \n\t" + + "addq $8 , %0 
\n\t" + "subq $4 , %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + + +#define HAVE_KERNEL_ADDY 1 + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) +{ + BLASLONG i; + + if ( inc_dest != 2 ) + { + + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i Date: Wed, 9 Mar 2016 15:48:29 +0100 Subject: [PATCH 28/37] modified common.h for piledriver --- common.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common.h b/common.h index 6b65c37d1..e045e42b2 100644 --- a/common.h +++ b/common.h @@ -332,12 +332,13 @@ typedef int blasint; #endif #endif - +/* #ifdef PILEDRIVER #ifndef YIELDING #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); #endif #endif +*/ /* #ifdef STEAMROLLER From 05196a8497091fd0ed7a3d45d0bd105fc3199bbb Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 9 Mar 2016 12:50:07 -0500 Subject: [PATCH 29/37] Refs #716. Only call getenv at init function. --- driver/others/CMakeLists.txt | 1 + driver/others/Makefile | 2 +- driver/others/blas_server.c | 25 +++----- driver/others/memory.c | 16 +++-- driver/others/openblas_env.c | 84 +++++++++++++++++++++++++++ driver/others/openblas_error_handle.c | 8 +-- driver/others/parameter.c | 7 +-- 7 files changed, 110 insertions(+), 33 deletions(-) create mode 100644 driver/others/openblas_env.c diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index b2af55e36..b361f2a97 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -33,6 +33,7 @@ set(COMMON_SOURCES xerbla.c openblas_set_num_threads.c openblas_error_handle.c + openblas_env.c openblas_get_num_procs.c openblas_get_num_threads.c ) diff --git a/driver/others/Makefile b/driver/others/Makefile index ed145cee8..c41b24e1f 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -1,7 +1,7 @@ TOPDIR = ../.. include ../../Makefile.system -COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) +COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) openblas_env.$(SUFFIX) #COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index c3bf80173..42cadf4b5 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -92,6 +92,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif +extern unsigned int openblas_thread_timeout(); + #ifdef SMP_SERVER #undef MONITOR @@ -524,6 +526,7 @@ static int blas_monitor(void *arg){ int blas_thread_init(void){ BLASLONG i; int ret; + int thread_timeout_env; #ifdef NEED_STACKATTR pthread_attr_t attr; #endif @@ -540,22 +543,12 @@ int blas_thread_init(void){ if (!blas_server_avail){ - env_var_t p; - - if (readenv(p,"THREAD_TIMEOUT")) { - thread_timeout = atoi(p); - if (thread_timeout < 4) thread_timeout = 4; - if (thread_timeout > 30) thread_timeout = 30; - thread_timeout = (1 << thread_timeout); - }else{ - if (readenv(p,"GOTO_THREAD_TIMEOUT")) { - thread_timeout = atoi(p); - if (thread_timeout < 4) thread_timeout = 4; - if (thread_timeout > 30) thread_timeout = 30; - thread_timeout = (1 << thread_timeout); - } - } - + thread_timeout_env=openblas_thread_timeout(); + if (thread_timeout_env>0) { + if (thread_timeout_env < 4) thread_timeout_env = 4; + if (thread_timeout_env > 30) thread_timeout_env = 30; + thread_timeout = (1 << thread_timeout_env); + } for(i = 0; i < blas_num_threads - 1; i++){ diff --git a/driver/others/memory.c b/driver/others/memory.c index e0761d784..e64781740 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -294,8 +294,11 @@ void openblas_fork_handler() #endif } +extern int openblas_num_threads_env(); +extern int openblas_goto_num_threads_env(); +extern int openblas_omp_num_threads_env(); + int blas_get_cpu_number(void){ - env_var_t p; #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif @@ -310,18 +313,18 @@ int blas_get_cpu_number(void){ blas_goto_num = 0; #ifndef USE_OPENMP - if (readenv(p,"OPENBLAS_NUM_THREADS")) blas_goto_num = atoi(p); + blas_goto_num=openblas_num_threads_env(); if (blas_goto_num < 0) blas_goto_num = 0; if (blas_goto_num == 0) { - if (readenv(p,"GOTO_NUM_THREADS")) blas_goto_num = atoi(p); - if (blas_goto_num < 0) blas_goto_num = 0; + blas_goto_num=openblas_goto_num_threads_env(); + if (blas_goto_num < 0) blas_goto_num = 0; } #endif blas_omp_num = 0; - if (readenv(p,"OMP_NUM_THREADS")) blas_omp_num = atoi(p); + blas_omp_num=openblas_omp_num_threads_env(); if (blas_omp_num < 0) blas_omp_num = 0; if (blas_goto_num > 0) blas_num_threads = blas_goto_num; @@ -1340,6 +1343,7 @@ static void gotoblas_memory_init(void) { /* Initialization for all function; this function should be called before main */ static int gotoblas_initialized = 0; +extern void openblas_read_env(); void CONSTRUCTOR gotoblas_init(void) { @@ -1349,6 +1353,8 @@ void CONSTRUCTOR gotoblas_init(void) { openblas_fork_handler(); #endif + openblas_read_env(); + #ifdef PROFILE moncontrol (0); #endif diff --git a/driver/others/openblas_env.c b/driver/others/openblas_env.c new file mode 100644 index 000000000..64ece9515 --- /dev/null +++ b/driver/others/openblas_env.c @@ -0,0 +1,84 @@ +/*************************************************************************** +Copyright (c) 2011-2016, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + 3. 
Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*****************************************************************************/ + +#include "common.h" + +static int openblas_env_verbose=0; +static unsigned int openblas_env_thread_timeout=0; +static int openblas_env_block_factor=0; +static int openblas_env_openblas_num_threads=0; +static int openblas_env_goto_num_threads=0; +static int openblas_env_omp_num_threads=0; + +int openblas_verbose() { return openblas_env_verbose;} +unsigned int openblas_thread_timeout() { return openblas_env_thread_timeout;} +int openblas_block_factor() { return openblas_env_block_factor;} +int openblas_num_threads_env() { return openblas_env_openblas_num_threads;} +int openblas_goto_num_threads_env() { return openblas_env_goto_num_threads;} +int openblas_omp_num_threads_env() { return openblas_env_omp_num_threads;} + +void openblas_read_env() { + int ret=0; + env_var_t p; + if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p); + if(ret<0) ret=0; + openblas_env_verbose=ret; + + ret=0; + if (readenv(p,"OPENBLAS_BLOCK_FACTOR")) ret = atoi(p); + if(ret<0) ret=0; + openblas_env_block_factor=ret; + + ret=0; + if (readenv(p,"OPENBLAS_THREAD_TIMEOUT")) ret = atoi(p); + if(ret<0) ret=0; + openblas_env_thread_timeout=(unsigned int)ret; + + ret=0; + if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p); + if(ret<0) ret=0; + openblas_env_openblas_num_threads=ret; + + ret=0; + if (readenv(p,"GOTO_NUM_THREADS")) ret = atoi(p); + if(ret<0) ret=0; + openblas_env_goto_num_threads=ret; + + ret=0; + if (readenv(p,"OMP_NUM_THREADS")) ret = atoi(p); + if(ret<0) ret=0; + openblas_env_omp_num_threads=ret; + +} + + diff --git a/driver/others/openblas_error_handle.c b/driver/others/openblas_error_handle.c index f32a54452..9ac72c15d 100644 --- a/driver/others/openblas_error_handle.c +++ b/driver/others/openblas_error_handle.c @@ -33,13 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
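(Usage sketch for the accessors introduced in openblas_env.c above, not part of the diff: each one returns the value cached by openblas_read_env(), which gotoblas_init() now runs once at load time; unset or negative variables are stored as 0, so callers only need a simple "greater than zero" test.)

	openblas_read_env();                        /* normally called from gotoblas_init() */
	int verbose  = openblas_verbose();          /* cached OPENBLAS_VERBOSE               */
	int factor   = openblas_block_factor();     /* cached OPENBLAS_BLOCK_FACTOR          */
	int nthreads = openblas_num_threads_env();  /* cached OPENBLAS_NUM_THREADS           */
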
#include "common.h" -int openblas_verbose() { - int ret=0; - env_var_t p; - if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p); - if(ret<0) ret=0; - return ret; -} +extern int openblas_verbose(); void openblas_warning(int verbose, const char * msg) { int current_verbose; diff --git a/driver/others/parameter.c b/driver/others/parameter.c index d741f2fb9..f4b1a80ad 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -40,6 +40,7 @@ #include #include "common.h" +extern int openblas_block_factor(); int get_L2_size(void); #define DEFAULT_GEMM_P 128 @@ -249,7 +250,6 @@ int get_L2_size(void){ void blas_set_parameter(void){ - env_var_t p; int factor; #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) int size = 16; @@ -468,9 +468,8 @@ void blas_set_parameter(void){ #endif #endif - - if (readenv(p,"GOTO_BLOCK_FACTOR")) { - factor = atoi(p); + factor=openblas_block_factor(); + if (factor>0) { if (factor < 10) factor = 10; if (factor > 200) factor = 200; From 68eb4fa329964527987db43ed28cad46a9a98368 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 9 Mar 2016 14:52:47 -0500 Subject: [PATCH 30/37] Add missing openblas_env makefile. --- driver/others/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/driver/others/Makefile b/driver/others/Makefile index c41b24e1f..e61ba7bc8 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -118,6 +118,9 @@ openblas_get_parallel.$(SUFFIX) : openblas_get_parallel.c openblas_error_handle.$(SUFFIX) : openblas_error_handle.c $(CC) $(CFLAGS) -c $< -o $(@F) +openblas_env.$(SUFFIX) : openblas_env.c + $(CC) $(CFLAGS) -c $< -o $(@F) + blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h $(CC) $(CFLAGS) -c $< -o $(@F) From fdf291be30dfe3312cc7e7039a770a9e30f4c515 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Thu, 10 Mar 2016 09:42:07 +0100 Subject: [PATCH 31/37] Added optimized cgemv_n and cgemv_t kernels for bulldozer, piledriver and steamroller --- kernel/x86_64/cgemv_n_microk_bulldozer-4.c | 541 +++++++++++++++++++++ kernel/x86_64/cgemv_t_microk_bulldozer-4.c | 541 +++++++++++++++++++++ 2 files changed, 1082 insertions(+) create mode 100644 kernel/x86_64/cgemv_n_microk_bulldozer-4.c create mode 100644 kernel/x86_64/cgemv_t_microk_bulldozer-4.c diff --git a/kernel/x86_64/cgemv_n_microk_bulldozer-4.c b/kernel/x86_64/cgemv_n_microk_bulldozer-4.c new file mode 100644 index 000000000..a74b41269 --- /dev/null +++ b/kernel/x86_64/cgemv_n_microk_bulldozer-4.c @@ -0,0 +1,541 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + BLASLONG register n1 = n & -8 ; + BLASLONG register n2 = n & 4 ; + + __asm__ __volatile__ + ( + + "vbroadcastss (%2), %%ymm0 \n\t" // real part x0 + "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0 + "vbroadcastss 8(%2), %%ymm2 \n\t" // real part x1 + "vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1 + "vbroadcastss 16(%2), %%ymm4 \n\t" // real part x2 + "vbroadcastss 20(%2), %%ymm5 \n\t" // imag part x2 + "vbroadcastss 24(%2), %%ymm6 \n\t" // real part x3 + "vbroadcastss 28(%2), %%ymm7 \n\t" // imag part x3 + + "cmpq $0 , %1 \n\t" + "je 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 384(%4,%0,4) \n\t" + "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 + "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0 + + "prefetcht0 384(%5,%0,4) \n\t" + "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1 + "vmovups 32(%5,%0,4), %%ymm11 \n\t" // 4 complex values form a1 + + "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "prefetcht0 384(%6,%0,4) \n\t" + "vmovups (%6,%0,4), %%ymm8 \n\t" // 4 complex values form a2 + "vmovups 32(%6,%0,4), %%ymm9 \n\t" // 4 complex values form a2 + + "vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vfmaddps %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmaddps %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "prefetcht0 384(%7,%0,4) \n\t" + "vmovups (%7,%0,4), %%ymm10 \n\t" // 4 complex values form a3 + "vmovups 32(%7,%0,4), %%ymm11 \n\t" // 4 complex values form a3 + + "vfmaddps %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddps %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vfmaddps %%ymm14, %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmaddps %%ymm15, %%ymm9 , %%ymm5, 
%%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vfmaddps %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddps %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vfmaddps %%ymm14, %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmaddps %%ymm15, %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "prefetcht0 384(%3,%0,4) \n\t" + "vmovups (%3,%0,4), %%ymm10 \n\t" + "vmovups 32(%3,%0,4), %%ymm11 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t" +#else + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" + "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" +#endif + + "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t" + + "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y + "vmovups %%ymm13, 32(%3,%0,4) \n\t" + + "addq $16, %0 \n\t" + "subq $8 , %1 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "cmpq $4, %8 \n\t" + "jne 3f \n\t" + + "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 + "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1 + + "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vmovups (%6,%0,4), %%ymm8 \n\t" // 4 complex values form a2 + "vmovups (%7,%0,4), %%ymm10 \n\t" // 4 complex values form a3 + + "vfmaddps %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddps %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vfmaddps %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddps %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vmovups (%3,%0,4), %%ymm10 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" +#else + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" +#endif + + "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" + + "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y + + "3: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n1), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (n2) // 8 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +#define HAVE_KERNEL_4x2 1 +static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT 
*y) __attribute__ ((noinline)); + +static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + BLASLONG register n1 = n & -8 ; + BLASLONG register n2 = n & 4 ; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vbroadcastss (%2), %%ymm0 \n\t" // real part x0 + "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0 + "vbroadcastss 8(%2), %%ymm2 \n\t" // real part x1 + "vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1 + + "cmpq $0 , %1 \n\t" + "je 2f \n\t" + + // ".align 16 \n\t" + "1: \n\t" + "prefetcht0 384(%4,%0,4) \n\t" + "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 + "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0 + + "prefetcht0 384(%5,%0,4) \n\t" + "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1 + "vmovups 32(%5,%0,4), %%ymm11 \n\t" // 4 complex values form a1 + + "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vfmaddps %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmaddps %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "prefetcht0 384(%3,%0,4) \n\t" + "vmovups (%3,%0,4), %%ymm10 \n\t" + "vmovups 32(%3,%0,4), %%ymm11 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t" +#else + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" + "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" +#endif + + "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t" + + "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y + "vmovups %%ymm13, 32(%3,%0,4) \n\t" + + "addq $16, %0 \n\t" + "subq $8 , %1 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "cmpq $4, %6 \n\t" + "jne 3f \n\t" + + "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 + "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1 + + "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vmovups (%3,%0,4), %%ymm10 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" +#else + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm8 , 
%%ymm8 \n\t" +#endif + + "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" + + "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y + + "3: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n1), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (n2) // 6 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +#define HAVE_KERNEL_4x1 1 +static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + BLASLONG register n1 = n & -8 ; + BLASLONG register n2 = n & 4 ; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vbroadcastss (%2), %%ymm0 \n\t" // real part x0 + "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0 + + "cmpq $0 , %1 \n\t" + "je 2f \n\t" + + // ".align 16 \n\t" + "1: \n\t" + "prefetcht0 384(%4,%0,4) \n\t" + "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 + "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0 + + "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "prefetcht0 384(%3,%0,4) \n\t" + "vmovups (%3,%0,4), %%ymm10 \n\t" + "vmovups 32(%3,%0,4), %%ymm11 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t" +#else + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" + "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" +#endif + + "addq $16, %0 \n\t" + "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t" + + "subq $8 , %1 \n\t" + "vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y + "vmovups %%ymm13,-32(%3,%0,4) \n\t" + + "jnz 1b \n\t" + + "2: \n\t" + + "cmpq $4, %5 \n\t" + "jne 3f \n\t" + + "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 + + "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vmovups (%3,%0,4), %%ymm10 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" +#else + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" +#endif + + "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" + + "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y + + "3: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n1), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap), // 4 + "r" (n2) // 5 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", 
"%xmm14", "%xmm15", + "memory" + ); + +} + + +#define HAVE_KERNEL_ADDY 1 + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) +{ + BLASLONG i; + + if ( inc_dest != 2 ) + { + + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i Date: Thu, 10 Mar 2016 11:10:38 +0100 Subject: [PATCH 32/37] FIX: forgot the add the files cgemv_n_4.c and cgemv_t_4.c --- kernel/x86_64/cgemv_n_4.c | 2 ++ kernel/x86_64/cgemv_t_4.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c index ff8058549..d60e4475d 100644 --- a/kernel/x86_64/cgemv_n_4.c +++ b/kernel/x86_64/cgemv_n_4.c @@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(HASWELL) #include "cgemv_n_microk_haswell-4.c" +#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#include "cgemv_n_microk_bulldozer-4.c" #endif diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c index b383a4869..b558164ff 100644 --- a/kernel/x86_64/cgemv_t_4.c +++ b/kernel/x86_64/cgemv_t_4.c @@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(HASWELL) #include "cgemv_t_microk_haswell-4.c" +#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#include "cgemv_t_microk_bulldozer-4.c" #endif #define NBMAX 2048 From 323c237e7b5b5fa37ceb69a3bd8869749ca263e4 Mon Sep 17 00:00:00 2001 From: Jerome Robert Date: Thu, 10 Mar 2016 20:24:41 +0100 Subject: [PATCH 33/37] Fix smallscaling compilation Also revert 0bbca5e --- benchmark/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index 6354b956a..11d3c5bec 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -166,7 +166,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ - ssymm.goto dsymm.goto csymm.goto zsymm.goto + ssymm.goto dsymm.goto csymm.goto zsymm.goto \ + smallscaling acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ @@ -2133,7 +2134,7 @@ zgemm3m.$(SUFFIX) : gemm3m.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ smallscaling: smallscaling.c ../$(LIBNAME) - $(CC) $(CFLAGS) -lpthread -fopenmp -lm -o $(@F) $^ + $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm clean :: @rm -f *.goto *.mkl *.acml *.atlas *.veclib From d5e1255ca7a60ccb2f06045e8348d379b0dc13f1 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Sun, 13 Mar 2016 18:56:21 -0400 Subject: [PATCH 34/37] Don't pass REALNAME to `.end` Putting the procedure there is an MSVC-ism, where it is optional. GCC silently ignores and Clang errors, so it is best to remove this. 
--- common_x86_64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_x86_64.h b/common_x86_64.h index da9afc0e4..11937b415 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -396,7 +396,7 @@ REALNAME: #define PROFCODE -#define EPILOGUE .end REALNAME +#define EPILOGUE .end #endif #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI) From 7aa1ad4923a4b8f00d9f3f1e33d069fb04eff3f3 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Mon, 14 Mar 2016 19:33:21 +0530 Subject: [PATCH 35/37] Functional Assembly Kernels for CortexA57 Adding functional (non-optimized) kernels for Cortex-A57 with the following layouts. SGEMM - 16x4, 8x8 CGEMM - 8x4 DGEMM - 8x4, 4x8 --- kernel/arm64/KERNEL.CORTEXA57 | 57 +- kernel/arm64/cgemm_kernel_8x4.S | 2044 ++++++++++++++++++++++ kernel/arm64/ctrmm_kernel_8x4.S | 2425 ++++++++++++++++++++++++++ kernel/arm64/dgemm_kernel_4x8.S | 1689 ++++++++++++++++++ kernel/arm64/dgemm_kernel_8x4.S | 1570 +++++++++++++++++ kernel/arm64/dtrmm_kernel_4x8.S | 2026 ++++++++++++++++++++++ kernel/arm64/dtrmm_kernel_8x4.S | 1849 ++++++++++++++++++++ kernel/arm64/sgemm_kernel_16x4.S | 1987 +++++++++++++++++++++ kernel/arm64/sgemm_kernel_8x8.S | 2305 ++++++++++++++++++++++++ kernel/arm64/strmm_kernel_16x4.S | 2431 ++++++++++++++++++++++++++ kernel/arm64/strmm_kernel_8x8.S | 2795 ++++++++++++++++++++++++++++++ 11 files changed, 21161 insertions(+), 17 deletions(-) create mode 100755 kernel/arm64/cgemm_kernel_8x4.S create mode 100755 kernel/arm64/ctrmm_kernel_8x4.S create mode 100755 kernel/arm64/dgemm_kernel_4x8.S create mode 100755 kernel/arm64/dgemm_kernel_8x4.S create mode 100755 kernel/arm64/dtrmm_kernel_4x8.S create mode 100755 kernel/arm64/dtrmm_kernel_8x4.S create mode 100644 kernel/arm64/sgemm_kernel_16x4.S create mode 100644 kernel/arm64/sgemm_kernel_8x8.S create mode 100755 kernel/arm64/strmm_kernel_16x4.S create mode 100755 kernel/arm64/strmm_kernel_8x8.S diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57 index 7c8eeeea7..64666f05b 100644 --- a/kernel/arm64/KERNEL.CORTEXA57 +++ b/kernel/arm64/KERNEL.CORTEXA57 @@ -60,32 +60,55 @@ DGEMVTKERNEL = gemv_t.S CGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t.S -STRMMKERNEL = strmm_kernel_4x4.S -DTRMMKERNEL = dtrmm_kernel_4x4.S -CTRMMKERNEL = ctrmm_kernel_4x4.S -ZTRMMKERNEL = ztrmm_kernel_4x4.S - -SGEMMKERNEL = sgemm_kernel_4x4.S -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = dgemm_kernel_4x4.S -DGEMMONCOPY = ../generic/gemm_ncopy_4.c -DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +endif +DGEMMONCOPY = 
../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -CGEMMKERNEL = cgemm_kernel_4x4.S -CGEMMONCOPY = ../generic/zgemm_ncopy_4.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o -ZGEMMKERNEL = zgemm_kernel_4x4.S -ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy.o +ZGEMMITCOPYOBJ = zgemm_itcopy.o +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o diff --git a/kernel/arm64/cgemm_kernel_8x4.S b/kernel/arm64/cgemm_kernel_8x4.S new file mode 100755 index 000000000..40b98cee2 --- /dev/null +++ b/kernel/arm64/cgemm_kernel_8x4.S @@ -0,0 +1,2044 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 + +#define alpha0_R s10 +#define alphaV0_R v10.s[0] +#define alpha0_I s11 +#define alphaV0_I v11.s[0] + +#define alpha1_R s14 +#define alphaV1_R v14.s[0] +#define alpha1_I s15 +#define alphaV1_I v15.s[0] + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA0_00_R, pA0_01_R, pA0_02_R, pA0_03_R +//v01 ALPHA_I -> pA0_00_I, pA0_01_I, pA0_02_I, pA0_03_I +//v02 pA0_04_R, pA0_05_R, pA0_06_R, pA0_07_R +//v03 pA0_04_I, pA0_05_I, pA0_06_I, pA0_07_I +//v04 pA1_00_R, pA1_01_R, pA1_02_R, pA1_03_R +//v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I +//v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R +//v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I +//v08 must save pB0_00_R, pB0_01_R, pB0_02_R, pB0_03_R +//v09 must save pB0_00_I, pB0_01_I, pB0_02_I, pB0_03_I +//v10 must save ALPHA0_R +//v11 must save ALPHA0_I +//v12 must save pB1_00_R, pB1_01_R, pB1_02_R, pB1_03_R +//v13 must save pB1_00_I, pB1_01_I, pB1_02_I, pB1_03_I +//v14 must save ALPHA1_R +//v15 must save ALPHA1_I +//v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R +//v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I +//v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R +//v19 pC_04_I, pC_05_I, pC_06_I, pC_07_I +//v20 pC_08_R, pC_09_R, pC_10_R, pC_11_R +//v21 pC_08_I, pC_09_I, pC_10_I, pC_11_I +//v22 pC_12_R, pC_13_R, pC_14_R, pC_15_R +//v23 pC_12_I, pC_13_I, pC_14_I, pC_15_I +//v24 pC_16_R, pC_17_R, pC_18_R, pC_19_R +//v25 pC_16_I, pC_17_I, pC_18_I, pC_19_I +//v26 pC_20_R, pC_21_R, pC_22_R, pC_23_R +//v27 pC_20_I, pC_21_I, pC_22_I, pC_23_I +//v28 pC_24_R, pC_25_R, pC_26_R, pC_27_R +//v29 pC_24_I, pC_25_I, pC_26_I, pC_27_I +//v30 pC_28_R, pC_29_R, pC_30_R, pC_31_R +//v31 pC_28_I, pC_29_I, pC_30_I, pC_31_I + + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT8x4 + fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + 
fmov s23, s18 + fmov s24, wzr + fmov s25, s16 + fmov s26, s17 + fmov s27, s18 + fmov s28, wzr + fmov s29, s16 + fmov s30, s17 + fmov s31, s18 +.endm + +.macro KERNEL8x4_I + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + + fmul v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v17.16b, v17.16b, v17.16b + fmls v17.4s, v0.4s, v9.4s[0] +#else + fmul v17.4s, v0.4s, v9.4s[0] +#endif + OP_ir v17.4s, v1.4s, v8.4s[0] + + fmul v18.4s, v2.4s, v8.4s[0] + OP_ii v18.4s, v3.4s, v9.4s[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v19.16b, v19.16b, v19.16b + fmls v19.4s, v2.4s, v9.4s[0] +#else + fmul v19.4s, v2.4s, v9.4s[0] +#endif + OP_ir v19.4s, v3.4s, v8.4s[0] + + fmul v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v21.16b, v21.16b, v21.16b + fmls v21.4s, v0.4s, v9.4s[1] +#else + fmul v21.4s, v0.4s, v9.4s[1] +#endif + OP_ir v21.4s, v1.4s, v8.4s[1] + + fmul v22.4s, v2.4s, v8.4s[1] + OP_ii v22.4s, v3.4s, v9.4s[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v23.16b, v23.16b, v23.16b + fmls v23.4s, v2.4s, v9.4s[1] +#else + fmul v23.4s, v2.4s, v9.4s[1] +#endif + OP_ir v23.4s, v3.4s, v8.4s[1] + + fmul v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v25.16b, v25.16b, v25.16b + fmls v25.4s, v0.4s, v9.4s[2] +#else + fmul v25.4s, v0.4s, v9.4s[2] +#endif + OP_ir v25.4s, v1.4s, v8.4s[2] + + fmul v26.4s, v2.4s, v8.4s[2] + OP_ii v26.4s, v3.4s, v9.4s[2] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v27.16b, v27.16b, v27.16b + fmls v27.4s, v2.4s, v9.4s[2] +#else + fmul v27.4s, v2.4s, v9.4s[2] +#endif + OP_ir v27.4s, v3.4s, v8.4s[2] + + fmul v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v29.16b, v29.16b, v29.16b + fmls v29.4s, v0.4s, v9.4s[3] +#else + fmul v29.4s, v0.4s, v9.4s[3] +#endif + OP_ir v29.4s, v1.4s, v8.4s[3] + + fmul v30.4s, v2.4s, v8.4s[3] + OP_ii v30.4s, v3.4s, v9.4s[3] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v31.16b, v31.16b, v31.16b + fmls v31.4s, v2.4s, v9.4s[3] +#else + fmul v31.4s, v2.4s, v9.4s[3] +#endif + OP_ir v31.4s, v3.4s, v8.4s[3] + + ld2 {v12.4s, v13.4s}, [pB] + add pB, pB, #32 + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL8x4_M1 + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + OP_rr v18.4s, v2.4s, v8.4s[0] + OP_ii v18.4s, v3.4s, v9.4s[0] + OP_ri v19.4s, v2.4s, v9.4s[0] + OP_ir v19.4s, v3.4s, v8.4s[0] + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + OP_rr v22.4s, v2.4s, v8.4s[1] + 
OP_ii v22.4s, v3.4s, v9.4s[1] + OP_ri v23.4s, v2.4s, v9.4s[1] + OP_ir v23.4s, v3.4s, v8.4s[1] + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + OP_rr v26.4s, v2.4s, v8.4s[2] + OP_ii v26.4s, v3.4s, v9.4s[2] + OP_ri v27.4s, v2.4s, v9.4s[2] + OP_ir v27.4s, v3.4s, v8.4s[2] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] + + OP_rr v30.4s, v2.4s, v8.4s[3] + OP_ii v30.4s, v3.4s, v9.4s[3] + OP_ri v31.4s, v2.4s, v9.4s[3] + OP_ir v31.4s, v3.4s, v8.4s[3] + + ld2 {v12.4s, v13.4s}, [pB] // For next round + add pB, pB, #32 + ld2 {v4.4s, v5.4s}, [pA] // For next round + add pA, pA, #32 + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL8x4_M2 + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + OP_rr v18.4s, v6.4s, v12.4s[0] + OP_ii v18.4s, v7.4s, v13.4s[0] + OP_ri v19.4s, v6.4s, v13.4s[0] + OP_ir v19.4s, v7.4s, v12.4s[0] + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + OP_rr v22.4s, v6.4s, v12.4s[1] + OP_ii v22.4s, v7.4s, v13.4s[1] + OP_ri v23.4s, v6.4s, v13.4s[1] + OP_ir v23.4s, v7.4s, v12.4s[1] + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + OP_rr v26.4s, v6.4s, v12.4s[2] + OP_ii v26.4s, v7.4s, v13.4s[2] + OP_ri v27.4s, v6.4s, v13.4s[2] + OP_ir v27.4s, v7.4s, v12.4s[2] + + OP_rr v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] + + OP_rr v30.4s, v6.4s, v12.4s[3] + OP_ii v30.4s, v7.4s, v13.4s[3] + OP_ri v31.4s, v6.4s, v13.4s[3] + OP_ir v31.4s, v7.4s, v12.4s[3] + + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL8x4_E + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + OP_rr v18.4s, v6.4s, v12.4s[0] + OP_ii v18.4s, v7.4s, v13.4s[0] + OP_ri v19.4s, v6.4s, v13.4s[0] + OP_ir v19.4s, v7.4s, v12.4s[0] + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + OP_rr v22.4s, v6.4s, v12.4s[1] + OP_ii v22.4s, v7.4s, v13.4s[1] + OP_ri v23.4s, v6.4s, v13.4s[1] + OP_ir v23.4s, v7.4s, v12.4s[1] + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + OP_rr v26.4s, v6.4s, v12.4s[2] + OP_ii v26.4s, v7.4s, v13.4s[2] + OP_ri v27.4s, v6.4s, v13.4s[2] + OP_ir v27.4s, v7.4s, v12.4s[2] + + OP_rr v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] + + OP_rr v30.4s, v6.4s, v12.4s[3] + OP_ii v30.4s, v7.4s, v13.4s[3] + OP_ri v31.4s, v6.4s, v13.4s[3] + OP_ir v31.4s, v7.4s, v12.4s[3] + +.endm + +.macro KERNEL8x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + OP_rr v18.4s, v2.4s, v8.4s[0] + OP_ii v18.4s, v3.4s, v9.4s[0] + OP_ri v19.4s, v2.4s, v9.4s[0] + OP_ir v19.4s, v3.4s, v8.4s[0] + 
+ OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + OP_rr v22.4s, v2.4s, v8.4s[1] + OP_ii v22.4s, v3.4s, v9.4s[1] + OP_ri v23.4s, v2.4s, v9.4s[1] + OP_ir v23.4s, v3.4s, v8.4s[1] + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + OP_rr v26.4s, v2.4s, v8.4s[2] + OP_ii v26.4s, v3.4s, v9.4s[2] + OP_ri v27.4s, v2.4s, v9.4s[2] + OP_ir v27.4s, v3.4s, v8.4s[2] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] + + OP_rr v30.4s, v2.4s, v8.4s[3] + OP_ii v30.4s, v3.4s, v9.4s[3] + OP_ri v31.4s, v2.4s, v9.4s[3] + OP_ir v31.4s, v3.4s, v8.4s[3] + +.endm + +.macro SAVE8x4 + mov pCRow1, pCRow0 + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmla v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + ld2 {v2.4s, v3.4s}, [pCRow2] + fmla v2.4s, v18.4s, alphaV0_R + fmls v2.4s, v19.4s, alphaV0_I + fmla v3.4s, v18.4s, alphaV1_I + fmla v3.4s, v19.4s, alphaV1_R + st2 {v2.4s, v3.4s}, [pCRow2] + + add pCRow1, pCRow1, LDC + + ld2 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmla v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + ld2 {v6.4s, v7.4s}, [pCRow2] + fmla v6.4s, v22.4s, alphaV0_R + fmls v6.4s, v23.4s, alphaV0_I + fmla v7.4s, v22.4s, alphaV1_I + fmla v7.4s, v23.4s, alphaV1_R + st2 {v6.4s, v7.4s}, [pCRow2] + + add pCRow1, pCRow1, LDC + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v24.4s, alphaV0_R + fmls v0.4s, v25.4s, alphaV0_I + fmla v1.4s, v24.4s, alphaV1_I + fmla v1.4s, v25.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + ld2 {v2.4s, v3.4s}, [pCRow2] + fmla v2.4s, v26.4s, alphaV0_R + fmls v2.4s, v27.4s, alphaV0_I + fmla v3.4s, v26.4s, alphaV1_I + fmla v3.4s, v27.4s, alphaV1_R + st2 {v2.4s, v3.4s}, [pCRow2] + + add pCRow1, pCRow1, LDC + + ld2 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v28.4s, alphaV0_R + fmls v4.4s, v29.4s, alphaV0_I + fmla v5.4s, v28.4s, alphaV1_I + fmla v5.4s, v29.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + ld2 {v6.4s, v7.4s}, [pCRow2] + fmla v6.4s, v30.4s, alphaV0_R + fmls v6.4s, v31.4s, alphaV0_I + fmla v7.4s, v30.4s, alphaV1_I + fmla v7.4s, v31.4s, alphaV1_R + st2 {v6.4s, v7.4s}, [pCRow2] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_I + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + fmul v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v17.16b, v17.16b, v17.16b + fmls v17.4s, v0.4s, v9.4s[0] +#else + fmul v17.4s, v0.4s, v9.4s[0] +#endif + OP_ir v17.4s, v1.4s, v8.4s[0] + + fmul v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v21.16b, v21.16b, v21.16b + fmls v21.4s, v0.4s, v9.4s[1] +#else + fmul v21.4s, 
v0.4s, v9.4s[1] +#endif + OP_ir v21.4s, v1.4s, v8.4s[1] + + fmul v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v25.16b, v25.16b, v25.16b + fmls v25.4s, v0.4s, v9.4s[2] +#else + fmul v25.4s, v0.4s, v9.4s[2] +#endif + OP_ir v25.4s, v1.4s, v8.4s[2] + + fmul v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v29.16b, v29.16b, v29.16b + fmls v29.4s, v0.4s, v9.4s[3] +#else + fmul v29.4s, v0.4s, v9.4s[3] +#endif + OP_ir v29.4s, v1.4s, v8.4s[3] + + ld2 {v12.4s, v13.4s}, [pB] + add pB, pB, #32 + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL4x4_M1 + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + ld2 {v12.4s, v13.4s}, [pB] // For next round + add pB, pB, #32 + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + ld2 {v4.4s, v5.4s}, [pA] // For next round + add pA, pA, #32 + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + prfm PLDL1KEEP, [pA, #512] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] +.endm + +.macro KERNEL4x4_M2 + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + ld2 {v8.4s, v9.4s}, [pB] // For next round + add pB, pB, #32 + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + ld2 {v0.4s, v1.4s}, [pA] // For next round + add pA, pA, #32 + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + prfm PLDL1KEEP, [pB, #512] + + OP_rr v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] +.endm + +.macro KERNEL4x4_E + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + OP_rr v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] +.endm + +.macro KERNEL4x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] +.endm + +.macro SAVE4x4 + mov pCRow1, pCRow0 + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v16.4s, 
alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmla v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmla v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v24.4s, alphaV0_R + fmls v0.4s, v25.4s, alphaV0_I + fmla v1.4s, v24.4s, alphaV1_I + fmla v1.4s, v25.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v28.4s, alphaV0_R + fmls v4.4s, v29.4s, alphaV0_I + fmla v5.4s, v28.4s, alphaV1_I + fmla v5.4s, v29.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 + fmov s24, s16 + fmov s25, s17 + fmov s28, s16 + fmov s29, s17 +.endm + +.macro KERNEL2x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.4s[0] + OP_ii v16.2s, v1.2s, v9.4s[0] + OP_ri v17.2s, v0.2s, v9.4s[0] + OP_ir v17.2s, v1.2s, v8.4s[0] + + OP_rr v20.2s, v0.2s, v8.4s[1] + OP_ii v20.2s, v1.2s, v9.4s[1] + OP_ri v21.2s, v0.2s, v9.4s[1] + OP_ir v21.2s, v1.2s, v8.4s[1] + + OP_rr v24.2s, v0.2s, v8.4s[2] + OP_ii v24.2s, v1.2s, v9.4s[2] + OP_ri v25.2s, v0.2s, v9.4s[2] + OP_ir v25.2s, v1.2s, v8.4s[2] + + OP_rr v28.2s, v0.2s, v8.4s[3] + OP_ii v28.2s, v1.2s, v9.4s[3] + OP_ri v29.2s, v0.2s, v9.4s[3] + OP_ir v29.2s, v1.2s, v8.4s[3] +.endm + +.macro SAVE2x4 + mov pCRow1, pCRow0 + + ld2 {v0.2s, v1.2s}, [pCRow1] + fmla v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmla v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.2s, v5.2s}, [pCRow1] + fmla v4.2s, v20.2s, alphaV0_R + fmls v4.2s, v21.2s, alphaV0_I + fmla v5.2s, v20.2s, alphaV1_I + fmla v5.2s, v21.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v0.2s, v1.2s}, [pCRow1] + fmla v0.2s, v24.2s, alphaV0_R + fmls v0.2s, v25.2s, alphaV0_I + fmla v1.2s, v24.2s, alphaV1_I + fmla v1.2s, v25.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.2s, v5.2s}, [pCRow1] + fmla v4.2s, v28.2s, alphaV0_R + fmls v4.2s, v29.2s, alphaV0_I + fmla v5.2s, v28.2s, alphaV1_I + fmla v5.2s, v29.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 + fmov s24, s16 + fmov s25, s17 + fmov s28, s16 + fmov s29, s17 +.endm + +.macro KERNEL1x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.4s[0] + OP_ii s16, s1, v9.4s[0] + OP_ri s17, s0, v9.4s[0] + OP_ir s17, s1, v8.4s[0] + + OP_rr s20, s0, v8.4s[1] + OP_ii s20, s1, v9.4s[1] + OP_ri s21, s0, v9.4s[1] + OP_ir s21, s1, v8.4s[1] + + OP_rr s24, s0, v8.4s[2] + OP_ii s24, s1, v9.4s[2] + OP_ri s25, s0, v9.4s[2] + OP_ir s25, s1, v8.4s[2] + + OP_rr s28, s0, v8.4s[3] + OP_ii s28, s1, v9.4s[3] + OP_ri s29, s0, v9.4s[3] + OP_ir s29, s1, v8.4s[3] +.endm + +.macro SAVE1x4 + mov pCRow1, pCRow0 + + ld2 {v0.s, v1.s}[0], [pCRow1] + fmla s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmla s1, 
s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.s, v5.s}[0], [pCRow1] + fmla s4, s20, alphaV0_R + fmls s4, s21, alphaV0_I + fmla s5, s20, alphaV1_I + fmla s5, s21, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v0.s, v1.s}[0], [pCRow1] + fmla s0, s24, alphaV0_R + fmls s0, s25, alphaV0_I + fmla s1, s24, alphaV1_I + fmla s1, s25, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.s, v5.s}[0], [pCRow1] + fmla s4, s28, alphaV0_R + fmls s4, s29, alphaV0_I + fmla s5, s28, alphaV1_I + fmla s5, s29, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 +.endm + +.macro KERNEL8x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.2s[0] + OP_ii v16.4s, v1.4s, v9.2s[0] + OP_ri v17.4s, v0.4s, v9.2s[0] + OP_ir v17.4s, v1.4s, v8.2s[0] + + OP_rr v18.4s, v2.4s, v8.2s[0] + OP_ii v18.4s, v3.4s, v9.2s[0] + OP_ri v19.4s, v2.4s, v9.2s[0] + OP_ir v19.4s, v3.4s, v8.2s[0] + + OP_rr v20.4s, v0.4s, v8.2s[1] + OP_ii v20.4s, v1.4s, v9.2s[1] + OP_ri v21.4s, v0.4s, v9.2s[1] + OP_ir v21.4s, v1.4s, v8.2s[1] + + OP_rr v22.4s, v2.4s, v8.2s[1] + OP_ii v22.4s, v3.4s, v9.2s[1] + OP_ri v23.4s, v2.4s, v9.2s[1] + OP_ir v23.4s, v3.4s, v8.2s[1] +.endm + +.macro SAVE8x2 + mov pCRow1, pCRow0 + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmla v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + ld2 {v2.4s, v3.4s}, [pCRow2] + fmla v2.4s, v18.4s, alphaV0_R + fmls v2.4s, v19.4s, alphaV0_I + fmla v3.4s, v18.4s, alphaV1_I + fmla v3.4s, v19.4s, alphaV1_R + st2 {v2.4s, v3.4s}, [pCRow2] + + add pCRow1, pCRow1, LDC + + ld2 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmla v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + ld2 {v6.4s, v7.4s}, [pCRow2] + fmla v6.4s, v22.4s, alphaV0_R + fmls v6.4s, v23.4s, alphaV0_I + fmla v7.4s, v22.4s, alphaV1_I + fmla v7.4s, v23.4s, alphaV1_R + st2 {v6.4s, v7.4s}, [pCRow2] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 +.endm + +.macro KERNEL4x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.2s[0] + OP_ii v16.4s, v1.4s, v9.2s[0] + OP_ri v17.4s, v0.4s, v9.2s[0] + OP_ir v17.4s, v1.4s, v8.2s[0] + + OP_rr v20.4s, v0.4s, v8.2s[1] + OP_ii v20.4s, v1.4s, v9.2s[1] + OP_ri v21.4s, v0.4s, v9.2s[1] + OP_ir v21.4s, v1.4s, v8.2s[1] +.endm + +.macro SAVE4x2 + mov pCRow1, pCRow0 + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmla v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmla v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] 
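+	// (Explanatory comment, not part of the original submission.)  The store
+	// sequence above merges the accumulators into C with complex alpha scaling:
+	// the real plane is updated as C_r += AB_r*alpha_r - AB_i*alpha_i via
+	// fmla/fmls with alphaV0_R/alphaV0_I, the imaginary plane as
+	// C_i += AB_r*alpha_i + AB_i*alpha_r via alphaV1_I/alphaV1_R, while ld2/st2
+	// keep real and imaginary parts de-interleaved in separate vector registers.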
+ + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 +.endm + +.macro KERNEL2x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.2s[0] + OP_ii v16.2s, v1.2s, v9.2s[0] + OP_ri v17.2s, v0.2s, v9.2s[0] + OP_ir v17.2s, v1.2s, v8.2s[0] + + OP_rr v20.2s, v0.2s, v8.2s[1] + OP_ii v20.2s, v1.2s, v9.2s[1] + OP_ri v21.2s, v0.2s, v9.2s[1] + OP_ir v21.2s, v1.2s, v8.2s[1] +.endm + +.macro SAVE2x2 + mov pCRow1, pCRow0 + + ld2 {v0.2s, v1.2s}, [pCRow1] + fmla v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmla v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.2s, v5.2s}, [pCRow1] + fmla v4.2s, v20.2s, alphaV0_R + fmls v4.2s, v21.2s, alphaV0_I + fmla v5.2s, v20.2s, alphaV1_I + fmla v5.2s, v21.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, wzr + fmov s21, wzr +.endm + +.macro KERNEL1x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.2s[0] + OP_ii s16, s1, v9.2s[0] + OP_ri s17, s0, v9.2s[0] + OP_ir s17, s1, v8.2s[0] + + OP_rr s20, s0, v8.2s[1] + OP_ii s20, s1, v9.2s[1] + OP_ri s21, s0, v9.2s[1] + OP_ir s21, s1, v8.2s[1] +.endm + +.macro SAVE1x2 + mov pCRow1, pCRow0 + + ld2 {v0.s, v1.s}[0], [pCRow1] + fmla s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmla s1, s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.s, v5.s}[0], [pCRow1] + fmla s4, s20, alphaV0_R + fmls s4, s21, alphaV0_I + fmla s5, s20, alphaV1_I + fmla s5, s21, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 +.endm + +.macro KERNEL8x1_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v8.4s[1] + OP_ri v17.4s, v0.4s, v8.4s[1] + OP_ir v17.4s, v1.4s, v8.4s[0] + + OP_rr v18.4s, v2.4s, v8.4s[0] + OP_ii v18.4s, v3.4s, v8.4s[1] + OP_ri v19.4s, v2.4s, v8.4s[1] + OP_ir v19.4s, v3.4s, v8.4s[0] +.endm + +.macro SAVE8x1 + mov pCRow1, pCRow0 + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmla v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, #32 + + ld2 {v2.4s, v3.4s}, [pCRow1] + fmla v2.4s, v18.4s, alphaV0_R + fmls v2.4s, v19.4s, alphaV0_I + fmla v3.4s, v18.4s, alphaV1_I + fmla v3.4s, v19.4s, alphaV1_R + st2 {v2.4s, v3.4s}, [pCRow1] + + add pCRow0, pCRow0, #64 +.endm + + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] +.endm + +.macro SAVE4x1 + mov pCRow1, pCRow0 + + ld2 {v0.4s, v1.4s}, 
[pCRow1] + fmla v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmla v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL2x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] +.endm + +.macro SAVE2x1 + mov pCRow1, pCRow0 + + ld2 {v0.2s, v1.2s}, [pCRow1] + fmla v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmla v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL1x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] +.endm + +.macro SAVE1x1 + mov pCRow1, pCRow0 + + ld2 {v0.s, v1.s}[0], [pCRow1] + fmla s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmla s1, s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0_R, s0 + fmov alpha0_I, s1 + fmov alpha1_R, s0 + fmov alpha1_I, s1 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble cgemm_kernel_L2_BEGIN + +/******************************************************************************/ + +cgemm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + + mov pA, origPA // pA = start of A array + +cgemm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble cgemm_kernel_L4_M4_BEGIN + +cgemm_kernel_L4_M8_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt cgemm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 // subtract 2 + ble cgemm_kernel_L4_M8_22a + .align 5 + +cgemm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M8_22 + + +cgemm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b cgemm_kernel_L4_M8_44 + +cgemm_kernel_L4_M8_32: + + tst counterL, #1 + ble cgemm_kernel_L4_M8_40 + + KERNEL8x4_I + + KERNEL8x4_E + + b cgemm_kernel_L4_M8_44 + +cgemm_kernel_L4_M8_40: + + INIT8x4 + +cgemm_kernel_L4_M8_44: + + ands counterL , origK, #1 + ble cgemm_kernel_L4_M8_100 + +cgemm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +cgemm_kernel_L4_M8_100: + + SAVE8x4 + +cgemm_kernel_L4_M8_END: + subs counterI, counterI, #1 + bne cgemm_kernel_L4_M8_20 + +cgemm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble cgemm_kernel_L4_END + + tst counterI, #4 + ble cgemm_kernel_L4_M2_BEGIN + + +cgemm_kernel_L4_M4_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt cgemm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble cgemm_kernel_L4_M4_22a + .align 5 + + +cgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M4_22 + +cgemm_kernel_L4_M4_22a: + KERNEL4x4_M1 + KERNEL4x4_E + b cgemm_kernel_L4_M4_44 +cgemm_kernel_L4_M4_32: + tst counterL, #1 + ble cgemm_kernel_L4_M4_40 + KERNEL4x4_I + KERNEL4x4_E + b cgemm_kernel_L4_M4_44 +cgemm_kernel_L4_M4_40: + + INIT4x4 + +cgemm_kernel_L4_M4_44: + ands counterL , origK, #1 + ble cgemm_kernel_L4_M4_100 + +cgemm_kernel_L4_M4_46: + KERNEL4x4_SUB + +cgemm_kernel_L4_M4_100: + + SAVE4x4 + +cgemm_kernel_L4_M4_END: + +cgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble cgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble cgemm_kernel_L4_M1_BEGIN + +cgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble cgemm_kernel_L4_M2_40 + +cgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M2_22 + + +cgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L4_M2_100 + +cgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M2_42 + +cgemm_kernel_L4_M2_100: + + SAVE2x4 + +cgemm_kernel_L4_M2_END: + + +cgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble cgemm_kernel_L4_END + +cgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble cgemm_kernel_L4_M1_40 + +cgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M1_22 + + +cgemm_kernel_L4_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L4_M1_100 + +cgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M1_42 + +cgemm_kernel_L4_M1_100: + + SAVE1x4 + + +cgemm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + + subs counterJ, counterJ , #1 // j-- + 
bgt cgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +cgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble cgemm_kernel_L999 // error, N was less than 4? + + tst counterJ , #2 + ble cgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + + +cgemm_kernel_L2_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble cgemm_kernel_L2_M4_BEGIN + +cgemm_kernel_L2_M8_20: + + INIT8x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble cgemm_kernel_L2_M8_40 + .align 5 + +cgemm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M8_22 + + +cgemm_kernel_L2_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L2_M8_100 + +cgemm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M8_42 + +cgemm_kernel_L2_M8_100: + + SAVE8x2 + +cgemm_kernel_L2_M8_END: + + subs counterI, counterI, #1 + bgt cgemm_kernel_L2_M8_20 + +cgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble cgemm_kernel_L2_END + + tst counterI, #4 // counterI = counterI / 2 + ble cgemm_kernel_L2_M2_BEGIN + +cgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble cgemm_kernel_L2_M4_40 + .align 5 + +cgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M4_22 + + +cgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L2_M4_100 + +cgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M4_42 + +cgemm_kernel_L2_M4_100: + + SAVE4x2 + +cgemm_kernel_L2_M4_END: + +cgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble cgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble cgemm_kernel_L2_M1_BEGIN + +cgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble cgemm_kernel_L2_M2_40 + +cgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M2_22 + + +cgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L2_M2_100 + +cgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M2_42 + +cgemm_kernel_L2_M2_100: + + SAVE2x2 + +cgemm_kernel_L2_M2_END: + + +cgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble cgemm_kernel_L2_END + +cgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble cgemm_kernel_L2_M1_40 + +cgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M1_22 + + +cgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble 
cgemm_kernel_L2_M1_100 + +cgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M1_42 + +cgemm_kernel_L2_M1_100: + + SAVE1x2 + + +cgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ + +cgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble cgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + + +cgemm_kernel_L1_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble cgemm_kernel_L1_M4_BEGIN + +cgemm_kernel_L1_M8_20: + + INIT8x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble cgemm_kernel_L1_M8_40 + .align 5 + +cgemm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M8_22 + + +cgemm_kernel_L1_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L1_M8_100 + +cgemm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M8_42 + +cgemm_kernel_L1_M8_100: + + SAVE8x1 + +cgemm_kernel_L1_M8_END: + + subs counterI, counterI, #1 + bgt cgemm_kernel_L1_M8_20 + +cgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble cgemm_kernel_L1_END + + tst counterI, #4 // counterI = counterI / 2 + ble cgemm_kernel_L1_M2_BEGIN + + +cgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble cgemm_kernel_L1_M4_40 + .align 5 + +cgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M4_22 + + +cgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L1_M4_100 + +cgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M4_42 + +cgemm_kernel_L1_M4_100: + + SAVE4x1 + +cgemm_kernel_L1_M4_END: + + +cgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble cgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble cgemm_kernel_L1_M1_BEGIN + +cgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble cgemm_kernel_L1_M2_40 + +cgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M2_22 + + +cgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L1_M2_100 + +cgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M2_42 + +cgemm_kernel_L1_M2_100: + + SAVE2x1 + +cgemm_kernel_L1_M2_END: + + +cgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble cgemm_kernel_L1_END + +cgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble cgemm_kernel_L1_M1_40 + +cgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + 
+ subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M1_22 + + +cgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L1_M1_100 + +cgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M1_42 + +cgemm_kernel_L1_M1_100: + + SAVE1x1 + + +cgemm_kernel_L1_END: + + +cgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S new file mode 100755 index 000000000..3131541d4 --- /dev/null +++ b/kernel/arm64/ctrmm_kernel_8x4.S @@ -0,0 +1,2425 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 s1 X3 x4 x5 x6 x7*/ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0, FLOAT alpha1,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 +#define tempOffset x17 +#define tempK x18 + +#define alpha0_R s10 +#define alphaV0_R v10.s[0] +#define alpha0_I s11 +#define alphaV0_I v11.s[0] + +#define alpha1_R s14 +#define alphaV1_R v14.s[0] +#define alpha1_I s15 +#define alphaV1_I v15.s[0] + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 tempOffset +// 18 must save tempK +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA0_00_R, pA0_01_R, pA0_02_R, pA0_03_R +//v01 ALPHA_I -> pA0_00_I, pA0_01_I, pA0_02_I, pA0_03_I +//v02 pA0_04_R, pA0_05_R, pA0_06_R, pA0_07_R +//v03 pA0_04_I, pA0_05_I, pA0_06_I, pA0_07_I +//v04 pA1_00_R, pA1_01_R, pA1_02_R, pA1_03_R +//v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I +//v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R +//v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I +//v08 must save pB0_00_R, pB0_01_R, pB0_02_R, pB0_03_R +//v09 must save pB0_00_I, pB0_01_I, pB0_02_I, pB0_03_I +//v10 must save ALPHA0_R +//v11 must save ALPHA0_I +//v12 must save pB1_00_R, pB1_01_R, pB1_02_R, pB1_03_R +//v13 must save pB1_00_I, pB1_01_I, pB1_02_I, pB1_03_I +//v14 must save ALPHA1_R +//v15 must save ALPHA1_I +//v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R +//v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I +//v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R +//v19 pC_04_I, pC_05_I, pC_06_I, pC_07_I +//v20 pC_08_R, pC_09_R, pC_10_R, pC_11_R +//v21 pC_08_I, pC_09_I, pC_10_I, pC_11_I +//v22 pC_12_R, pC_13_R, pC_14_R, pC_15_R +//v23 pC_12_I, pC_13_I, pC_14_I, pC_15_I +//v24 pC_16_R, pC_17_R, pC_18_R, pC_19_R +//v25 pC_16_I, pC_17_I, pC_18_I, pC_19_I +//v26 pC_20_R, pC_21_R, pC_22_R, pC_23_R +//v27 pC_20_I, pC_21_I, pC_22_I, pC_23_I +//v28 pC_24_R, pC_25_R, pC_26_R, pC_27_R +//v29 pC_24_I, pC_25_I, pC_26_I, pC_27_I +//v30 pC_28_R, pC_29_R, pC_30_R, pC_31_R +//v31 pC_28_I, pC_29_I, pC_30_I, pC_31_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT8x4 + 
fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 + fmov s24, wzr + fmov s25, s16 + fmov s26, s17 + fmov s27, s18 + fmov s28, wzr + fmov s29, s16 + fmov s30, s17 + fmov s31, s18 +.endm + +.macro KERNEL8x4_I + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + + fmul v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v17.16b, v17.16b, v17.16b + fmls v17.4s, v0.4s, v9.4s[0] +#else + fmul v17.4s, v0.4s, v9.4s[0] +#endif + OP_ir v17.4s, v1.4s, v8.4s[0] + + fmul v18.4s, v2.4s, v8.4s[0] + OP_ii v18.4s, v3.4s, v9.4s[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v19.16b, v19.16b, v19.16b + fmls v19.4s, v2.4s, v9.4s[0] +#else + fmul v19.4s, v2.4s, v9.4s[0] +#endif + OP_ir v19.4s, v3.4s, v8.4s[0] + + fmul v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v21.16b, v21.16b, v21.16b + fmls v21.4s, v0.4s, v9.4s[1] +#else + fmul v21.4s, v0.4s, v9.4s[1] +#endif + OP_ir v21.4s, v1.4s, v8.4s[1] + + fmul v22.4s, v2.4s, v8.4s[1] + OP_ii v22.4s, v3.4s, v9.4s[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v23.16b, v23.16b, v23.16b + fmls v23.4s, v2.4s, v9.4s[1] +#else + fmul v23.4s, v2.4s, v9.4s[1] +#endif + OP_ir v23.4s, v3.4s, v8.4s[1] + + fmul v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v25.16b, v25.16b, v25.16b + fmls v25.4s, v0.4s, v9.4s[2] +#else + fmul v25.4s, v0.4s, v9.4s[2] +#endif + OP_ir v25.4s, v1.4s, v8.4s[2] + + fmul v26.4s, v2.4s, v8.4s[2] + OP_ii v26.4s, v3.4s, v9.4s[2] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v27.16b, v27.16b, v27.16b + fmls v27.4s, v2.4s, v9.4s[2] +#else + fmul v27.4s, v2.4s, v9.4s[2] +#endif + OP_ir v27.4s, v3.4s, v8.4s[2] + + fmul v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v29.16b, v29.16b, v29.16b + fmls v29.4s, v0.4s, v9.4s[3] +#else + fmul v29.4s, v0.4s, v9.4s[3] +#endif + OP_ir v29.4s, v1.4s, v8.4s[3] + + fmul v30.4s, v2.4s, v8.4s[3] + OP_ii v30.4s, v3.4s, v9.4s[3] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v31.16b, v31.16b, v31.16b + fmls v31.4s, v2.4s, v9.4s[3] +#else + fmul v31.4s, v2.4s, v9.4s[3] +#endif + OP_ir v31.4s, v3.4s, v8.4s[3] + + ld2 {v12.4s, v13.4s}, [pB] + add pB, pB, #32 + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL8x4_M1 + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + OP_rr v18.4s, v2.4s, v8.4s[0] + OP_ii v18.4s, v3.4s, v9.4s[0] + OP_ri v19.4s, v2.4s, v9.4s[0] + OP_ir v19.4s, v3.4s, v8.4s[0] + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, 
v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + OP_rr v22.4s, v2.4s, v8.4s[1] + OP_ii v22.4s, v3.4s, v9.4s[1] + OP_ri v23.4s, v2.4s, v9.4s[1] + OP_ir v23.4s, v3.4s, v8.4s[1] + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + OP_rr v26.4s, v2.4s, v8.4s[2] + OP_ii v26.4s, v3.4s, v9.4s[2] + OP_ri v27.4s, v2.4s, v9.4s[2] + OP_ir v27.4s, v3.4s, v8.4s[2] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] + + OP_rr v30.4s, v2.4s, v8.4s[3] + OP_ii v30.4s, v3.4s, v9.4s[3] + OP_ri v31.4s, v2.4s, v9.4s[3] + OP_ir v31.4s, v3.4s, v8.4s[3] + + ld2 {v12.4s, v13.4s}, [pB] // For next round + add pB, pB, #32 + ld2 {v4.4s, v5.4s}, [pA] // For next round + add pA, pA, #32 + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL8x4_M2 + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + OP_rr v18.4s, v6.4s, v12.4s[0] + OP_ii v18.4s, v7.4s, v13.4s[0] + OP_ri v19.4s, v6.4s, v13.4s[0] + OP_ir v19.4s, v7.4s, v12.4s[0] + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + OP_rr v22.4s, v6.4s, v12.4s[1] + OP_ii v22.4s, v7.4s, v13.4s[1] + OP_ri v23.4s, v6.4s, v13.4s[1] + OP_ir v23.4s, v7.4s, v12.4s[1] + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + OP_rr v26.4s, v6.4s, v12.4s[2] + OP_ii v26.4s, v7.4s, v13.4s[2] + OP_ri v27.4s, v6.4s, v13.4s[2] + OP_ir v27.4s, v7.4s, v12.4s[2] + + OP_rr v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] + + OP_rr v30.4s, v6.4s, v12.4s[3] + OP_ii v30.4s, v7.4s, v13.4s[3] + OP_ri v31.4s, v6.4s, v13.4s[3] + OP_ir v31.4s, v7.4s, v12.4s[3] + + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL8x4_E + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + OP_rr v18.4s, v6.4s, v12.4s[0] + OP_ii v18.4s, v7.4s, v13.4s[0] + OP_ri v19.4s, v6.4s, v13.4s[0] + OP_ir v19.4s, v7.4s, v12.4s[0] + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + OP_rr v22.4s, v6.4s, v12.4s[1] + OP_ii v22.4s, v7.4s, v13.4s[1] + OP_ri v23.4s, v6.4s, v13.4s[1] + OP_ir v23.4s, v7.4s, v12.4s[1] + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + OP_rr v26.4s, v6.4s, v12.4s[2] + OP_ii v26.4s, v7.4s, v13.4s[2] + OP_ri v27.4s, v6.4s, v13.4s[2] + OP_ir v27.4s, v7.4s, v12.4s[2] + + OP_rr v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] + + OP_rr v30.4s, v6.4s, v12.4s[3] + OP_ii v30.4s, v7.4s, v13.4s[3] + OP_ri v31.4s, v6.4s, v13.4s[3] + OP_ir v31.4s, v7.4s, v12.4s[3] + +.endm + +.macro KERNEL8x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + OP_rr 
v18.4s, v2.4s, v8.4s[0] + OP_ii v18.4s, v3.4s, v9.4s[0] + OP_ri v19.4s, v2.4s, v9.4s[0] + OP_ir v19.4s, v3.4s, v8.4s[0] + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + OP_rr v22.4s, v2.4s, v8.4s[1] + OP_ii v22.4s, v3.4s, v9.4s[1] + OP_ri v23.4s, v2.4s, v9.4s[1] + OP_ir v23.4s, v3.4s, v8.4s[1] + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + OP_rr v26.4s, v2.4s, v8.4s[2] + OP_ii v26.4s, v3.4s, v9.4s[2] + OP_ri v27.4s, v2.4s, v9.4s[2] + OP_ir v27.4s, v3.4s, v8.4s[2] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] + + OP_rr v30.4s, v2.4s, v8.4s[3] + OP_ii v30.4s, v3.4s, v9.4s[3] + OP_ri v31.4s, v2.4s, v9.4s[3] + OP_ir v31.4s, v3.4s, v8.4s[3] + +.endm + +.macro SAVE8x4 + mov pCRow1, pCRow0 + + + fmul v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmul v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + + fmul v2.4s, v18.4s, alphaV0_R + fmls v2.4s, v19.4s, alphaV0_I + fmul v3.4s, v18.4s, alphaV1_I + fmla v3.4s, v19.4s, alphaV1_R + st2 {v2.4s, v3.4s}, [pCRow2] + + add pCRow1, pCRow1, LDC + + + fmul v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmul v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + + fmul v6.4s, v22.4s, alphaV0_R + fmls v6.4s, v23.4s, alphaV0_I + fmul v7.4s, v22.4s, alphaV1_I + fmla v7.4s, v23.4s, alphaV1_R + st2 {v6.4s, v7.4s}, [pCRow2] + + add pCRow1, pCRow1, LDC + + + fmul v0.4s, v24.4s, alphaV0_R + fmls v0.4s, v25.4s, alphaV0_I + fmul v1.4s, v24.4s, alphaV1_I + fmla v1.4s, v25.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + + fmul v2.4s, v26.4s, alphaV0_R + fmls v2.4s, v27.4s, alphaV0_I + fmul v3.4s, v26.4s, alphaV1_I + fmla v3.4s, v27.4s, alphaV1_R + st2 {v2.4s, v3.4s}, [pCRow2] + + add pCRow1, pCRow1, LDC + + + fmul v4.4s, v28.4s, alphaV0_R + fmls v4.4s, v29.4s, alphaV0_I + fmul v5.4s, v28.4s, alphaV1_I + fmla v5.4s, v29.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + + fmul v6.4s, v30.4s, alphaV0_R + fmls v6.4s, v31.4s, alphaV0_I + fmul v7.4s, v30.4s, alphaV1_I + fmla v7.4s, v31.4s, alphaV1_R + st2 {v6.4s, v7.4s}, [pCRow2] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_I + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + fmul v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v17.16b, v17.16b, v17.16b + fmls v17.4s, v0.4s, v9.4s[0] +#else + fmul v17.4s, v0.4s, v9.4s[0] +#endif + OP_ir v17.4s, v1.4s, v8.4s[0] + + fmul v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v21.16b, v21.16b, v21.16b + fmls v21.4s, v0.4s, v9.4s[1] +#else + fmul v21.4s, v0.4s, v9.4s[1] +#endif + OP_ir v21.4s, v1.4s, v8.4s[1] + + fmul v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, 
v9.4s[2] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v25.16b, v25.16b, v25.16b + fmls v25.4s, v0.4s, v9.4s[2] +#else + fmul v25.4s, v0.4s, v9.4s[2] +#endif + OP_ir v25.4s, v1.4s, v8.4s[2] + + fmul v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v29.16b, v29.16b, v29.16b + fmls v29.4s, v0.4s, v9.4s[3] +#else + fmul v29.4s, v0.4s, v9.4s[3] +#endif + OP_ir v29.4s, v1.4s, v8.4s[3] + + ld2 {v12.4s, v13.4s}, [pB] + add pB, pB, #32 + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL4x4_M1 + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + ld2 {v12.4s, v13.4s}, [pB] // For next round + add pB, pB, #32 + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + ld2 {v4.4s, v5.4s}, [pA] // For next round + add pA, pA, #32 + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + prfm PLDL1KEEP, [pA, #512] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] +.endm + +.macro KERNEL4x4_M2 + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + ld2 {v8.4s, v9.4s}, [pB] // For next round + add pB, pB, #32 + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + ld2 {v0.4s, v1.4s}, [pA] // For next round + add pA, pA, #32 + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + prfm PLDL1KEEP, [pB, #512] + + OP_rr v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] +.endm + +.macro KERNEL4x4_E + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + OP_rr v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] +.endm + +.macro KERNEL4x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] +.endm + +.macro SAVE4x4 + mov pCRow1, pCRow0 + + + fmul v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmul v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + 
add pCRow1, pCRow1, LDC + + + fmul v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmul v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul v0.4s, v24.4s, alphaV0_R + fmls v0.4s, v25.4s, alphaV0_I + fmul v1.4s, v24.4s, alphaV1_I + fmla v1.4s, v25.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul v4.4s, v28.4s, alphaV0_R + fmls v4.4s, v29.4s, alphaV0_I + fmul v5.4s, v28.4s, alphaV1_I + fmla v5.4s, v29.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 + fmov s24, s16 + fmov s25, s17 + fmov s28, s16 + fmov s29, s17 +.endm + +.macro KERNEL2x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.4s[0] + OP_ii v16.2s, v1.2s, v9.4s[0] + OP_ri v17.2s, v0.2s, v9.4s[0] + OP_ir v17.2s, v1.2s, v8.4s[0] + + OP_rr v20.2s, v0.2s, v8.4s[1] + OP_ii v20.2s, v1.2s, v9.4s[1] + OP_ri v21.2s, v0.2s, v9.4s[1] + OP_ir v21.2s, v1.2s, v8.4s[1] + + OP_rr v24.2s, v0.2s, v8.4s[2] + OP_ii v24.2s, v1.2s, v9.4s[2] + OP_ri v25.2s, v0.2s, v9.4s[2] + OP_ir v25.2s, v1.2s, v8.4s[2] + + OP_rr v28.2s, v0.2s, v8.4s[3] + OP_ii v28.2s, v1.2s, v9.4s[3] + OP_ri v29.2s, v0.2s, v9.4s[3] + OP_ir v29.2s, v1.2s, v8.4s[3] +.endm + +.macro SAVE2x4 + mov pCRow1, pCRow0 + + + fmul v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmul v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul v4.2s, v20.2s, alphaV0_R + fmls v4.2s, v21.2s, alphaV0_I + fmul v5.2s, v20.2s, alphaV1_I + fmla v5.2s, v21.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul v0.2s, v24.2s, alphaV0_R + fmls v0.2s, v25.2s, alphaV0_I + fmul v1.2s, v24.2s, alphaV1_I + fmla v1.2s, v25.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul v4.2s, v28.2s, alphaV0_R + fmls v4.2s, v29.2s, alphaV0_I + fmul v5.2s, v28.2s, alphaV1_I + fmla v5.2s, v29.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 + fmov s24, s16 + fmov s25, s17 + fmov s28, s16 + fmov s29, s17 +.endm + +.macro KERNEL1x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.4s[0] + OP_ii s16, s1, v9.4s[0] + OP_ri s17, s0, v9.4s[0] + OP_ir s17, s1, v8.4s[0] + + OP_rr s20, s0, v8.4s[1] + OP_ii s20, s1, v9.4s[1] + OP_ri s21, s0, v9.4s[1] + OP_ir s21, s1, v8.4s[1] + + OP_rr s24, s0, v8.4s[2] + OP_ii s24, s1, v9.4s[2] + OP_ri s25, s0, v9.4s[2] + OP_ir s25, s1, v8.4s[2] + + OP_rr s28, s0, v8.4s[3] + OP_ii s28, s1, v9.4s[3] + OP_ri s29, s0, v9.4s[3] + OP_ir s29, s1, v8.4s[3] +.endm + +.macro SAVE1x4 + mov pCRow1, pCRow0 + + + fmul s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmul s1, s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul s4, s20, alphaV0_R + fmls s4, s21, alphaV0_I + fmul s5, s20, alphaV1_I + fmla s5, s21, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul s0, s24, alphaV0_R + fmls s0, s25, alphaV0_I + fmul s1, s24, alphaV1_I + fmla s1, s25, 
alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul s4, s28, alphaV0_R + fmls s4, s29, alphaV0_I + fmul s5, s28, alphaV1_I + fmla s5, s29, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 +.endm + +.macro KERNEL8x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.2s[0] + OP_ii v16.4s, v1.4s, v9.2s[0] + OP_ri v17.4s, v0.4s, v9.2s[0] + OP_ir v17.4s, v1.4s, v8.2s[0] + + OP_rr v18.4s, v2.4s, v8.2s[0] + OP_ii v18.4s, v3.4s, v9.2s[0] + OP_ri v19.4s, v2.4s, v9.2s[0] + OP_ir v19.4s, v3.4s, v8.2s[0] + + OP_rr v20.4s, v0.4s, v8.2s[1] + OP_ii v20.4s, v1.4s, v9.2s[1] + OP_ri v21.4s, v0.4s, v9.2s[1] + OP_ir v21.4s, v1.4s, v8.2s[1] + + OP_rr v22.4s, v2.4s, v8.2s[1] + OP_ii v22.4s, v3.4s, v9.2s[1] + OP_ri v23.4s, v2.4s, v9.2s[1] + OP_ir v23.4s, v3.4s, v8.2s[1] +.endm + +.macro SAVE8x2 + mov pCRow1, pCRow0 + + + fmul v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmul v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + + fmul v2.4s, v18.4s, alphaV0_R + fmls v2.4s, v19.4s, alphaV0_I + fmul v3.4s, v18.4s, alphaV1_I + fmla v3.4s, v19.4s, alphaV1_R + st2 {v2.4s, v3.4s}, [pCRow2] + + add pCRow1, pCRow1, LDC + + + fmul v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmul v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + + fmul v6.4s, v22.4s, alphaV0_R + fmls v6.4s, v23.4s, alphaV0_I + fmul v7.4s, v22.4s, alphaV1_I + fmla v7.4s, v23.4s, alphaV1_R + st2 {v6.4s, v7.4s}, [pCRow2] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 +.endm + +.macro KERNEL4x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.2s[0] + OP_ii v16.4s, v1.4s, v9.2s[0] + OP_ri v17.4s, v0.4s, v9.2s[0] + OP_ir v17.4s, v1.4s, v8.2s[0] + + OP_rr v20.4s, v0.4s, v8.2s[1] + OP_ii v20.4s, v1.4s, v9.2s[1] + OP_ri v21.4s, v0.4s, v9.2s[1] + OP_ir v21.4s, v1.4s, v8.2s[1] +.endm + +.macro SAVE4x2 + mov pCRow1, pCRow0 + + + fmul v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmul v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmul v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 +.endm + +.macro KERNEL2x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.2s[0] + OP_ii v16.2s, v1.2s, v9.2s[0] + OP_ri v17.2s, v0.2s, v9.2s[0] + OP_ir v17.2s, v1.2s, v8.2s[0] + + OP_rr v20.2s, v0.2s, v8.2s[1] + OP_ii v20.2s, v1.2s, v9.2s[1] + OP_ri v21.2s, v0.2s, v9.2s[1] + OP_ir v21.2s, v1.2s, v8.2s[1] +.endm + +.macro SAVE2x2 + mov pCRow1, pCRow0 + 
+ + fmul v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmul v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul v4.2s, v20.2s, alphaV0_R + fmls v4.2s, v21.2s, alphaV0_I + fmul v5.2s, v20.2s, alphaV1_I + fmla v5.2s, v21.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, wzr + fmov s21, wzr +.endm + +.macro KERNEL1x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.2s[0] + OP_ii s16, s1, v9.2s[0] + OP_ri s17, s0, v9.2s[0] + OP_ir s17, s1, v8.2s[0] + + OP_rr s20, s0, v8.2s[1] + OP_ii s20, s1, v9.2s[1] + OP_ri s21, s0, v9.2s[1] + OP_ir s21, s1, v8.2s[1] +.endm + +.macro SAVE1x2 + mov pCRow1, pCRow0 + + + fmul s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmul s1, s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul s4, s20, alphaV0_R + fmls s4, s21, alphaV0_I + fmul s5, s20, alphaV1_I + fmla s5, s21, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 +.endm + +.macro KERNEL8x1_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v8.4s[1] + OP_ri v17.4s, v0.4s, v8.4s[1] + OP_ir v17.4s, v1.4s, v8.4s[0] + + OP_rr v18.4s, v2.4s, v8.4s[0] + OP_ii v18.4s, v3.4s, v8.4s[1] + OP_ri v19.4s, v2.4s, v8.4s[1] + OP_ir v19.4s, v3.4s, v8.4s[0] +.endm + +.macro SAVE8x1 + mov pCRow1, pCRow0 + + + fmul v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmul v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, #32 + + + fmul v2.4s, v18.4s, alphaV0_R + fmls v2.4s, v19.4s, alphaV0_I + fmul v3.4s, v18.4s, alphaV1_I + fmla v3.4s, v19.4s, alphaV1_R + st2 {v2.4s, v3.4s}, [pCRow1] + + add pCRow0, pCRow0, #64 +.endm + + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] +.endm + +.macro SAVE4x1 + mov pCRow1, pCRow0 + + + fmul v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmul v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL2x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] +.endm + +.macro SAVE2x1 + mov pCRow1, pCRow0 + + + fmul v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmul v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 + +.endm + 
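
For readers of this patch, a minimal scalar sketch in C (illustrative names, not taken from the source) of what one 1x1 micro-tile computes in the plain NN case, where OP_rr/OP_ri/OP_ir expand to fmla and OP_ii to fmls, may be helpful. Note that, unlike the cgemm SAVE macros earlier in this series, the trmm SAVE macros use fmul to overwrite C rather than ld2 plus fmla to accumulate into it, since TRMM does not read the previous contents of C.

    /* Sketch only: names and the NN assumption are illustrative, not from the patch. */
    static void ctrmm_tile_1x1_nn(int K, const float *A, const float *B,
                                  float *C, float alpha_r, float alpha_i)
    {
        float acc_r = 0.0f, acc_i = 0.0f;              /* s16, s17 */
        for (int k = 0; k < K; k++) {
            float a_r = A[2 * k], a_i = A[2 * k + 1];  /* ld2 {v0.s, v1.s}[0], [pA] */
            float b_r = B[2 * k], b_i = B[2 * k + 1];  /* ld2 {v8.s, v9.s}[0], [pB] */
            acc_r += a_r * b_r - a_i * b_i;            /* OP_rr, OP_ii */
            acc_i += a_r * b_i + a_i * b_r;            /* OP_ri, OP_ir */
        }
        /* SAVE1x1: scale by complex alpha and store; overwrite, no load of C. */
        C[0] = acc_r * alpha_r - acc_i * alpha_i;
        C[1] = acc_r * alpha_i + acc_i * alpha_r;
    }

The wider 8x4, 4x4 and 2x2 macros apply the same arithmetic lane-wise on the v-registers, with the OP_* aliases flipping between fmla and fmls according to which of the NN/NR/RN/RR conjugation variants is being compiled.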
+/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL1x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] +.endm + +.macro SAVE1x1 + mov pCRow1, pCRow0 + + + fmul s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmul s1, s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0_R, s0 + fmov alpha0_I, s1 + fmov alpha1_R, s0 + fmov alpha1_I, s1 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + +#if !defined(LEFT) + neg tempOffset, offset +#endif + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble ctrmm_kernel_L2_BEGIN + +/******************************************************************************/ + +ctrmm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = start of A array + +ctrmm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble ctrmm_kernel_L4_M4_BEGIN + +ctrmm_kernel_L4_M8_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #6 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt ctrmm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 // subtract 2 + ble ctrmm_kernel_L4_M8_22a + .align 5 + +ctrmm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M8_22 + + +ctrmm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b ctrmm_kernel_L4_M8_44 + +ctrmm_kernel_L4_M8_32: + + tst counterL, #1 + ble ctrmm_kernel_L4_M8_40 + + KERNEL8x4_I + + KERNEL8x4_E + + b ctrmm_kernel_L4_M8_44 + +ctrmm_kernel_L4_M8_40: + + INIT8x4 + +ctrmm_kernel_L4_M8_44: + + ands counterL , tempK, #1 + ble ctrmm_kernel_L4_M8_100 + +ctrmm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +ctrmm_kernel_L4_M8_100: + + SAVE8x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +ctrmm_kernel_L4_M8_END: + subs counterI, counterI, #1 + bne ctrmm_kernel_L4_M8_20 + +ctrmm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble ctrmm_kernel_L4_END + + tst counterI, #4 + ble ctrmm_kernel_L4_M2_BEGIN + +ctrmm_kernel_L4_M4_20: + + INIT4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L4_M4_40 + +ctrmm_kernel_L4_M4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M4_22 + + +ctrmm_kernel_L4_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L4_M4_100 + +ctrmm_kernel_L4_M4_42: + + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M4_42 + +ctrmm_kernel_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +ctrmm_kernel_L4_M4_END: + + +ctrmm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble ctrmm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble ctrmm_kernel_L4_M1_BEGIN + +ctrmm_kernel_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L4_M2_40 + +ctrmm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + 
KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M2_22 + + +ctrmm_kernel_L4_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L4_M2_100 + +ctrmm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M2_42 + +ctrmm_kernel_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +ctrmm_kernel_L4_M2_END: + + +ctrmm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble ctrmm_kernel_L4_END + +ctrmm_kernel_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L4_M1_40 + +ctrmm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M1_22 + + +ctrmm_kernel_L4_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L4_M1_100 + +ctrmm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M1_42 + +ctrmm_kernel_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +ctrmm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt ctrmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble ctrmm_kernel_L999 // error, N was less than 4? 
+ + tst counterJ , #2 + ble ctrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +ctrmm_kernel_L2_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble ctrmm_kernel_L2_M4_BEGIN + +ctrmm_kernel_L2_M8_20: + + INIT8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #6 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble ctrmm_kernel_L2_M8_40 + .align 5 + +ctrmm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M8_22 + + +ctrmm_kernel_L2_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L2_M8_100 + +ctrmm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M8_42 + +ctrmm_kernel_L2_M8_100: + + SAVE8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +ctrmm_kernel_L2_M8_END: + + subs counterI, counterI, #1 + bgt ctrmm_kernel_L2_M8_20 + +ctrmm_kernel_L2_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble ctrmm_kernel_L2_END + + tst counterI, #4 // counterI = counterI / 2 + ble ctrmm_kernel_L2_M2_BEGIN + +ctrmm_kernel_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble ctrmm_kernel_L2_M4_40 + .align 5 + +ctrmm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M4_22 + + +ctrmm_kernel_L2_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L2_M4_100 + +ctrmm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M4_42 + +ctrmm_kernel_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +ctrmm_kernel_L2_M4_END: + + +ctrmm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble ctrmm_kernel_L2_END + + tst counterI, #2 
// counterI = counterI / 2 + ble ctrmm_kernel_L2_M1_BEGIN + +ctrmm_kernel_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble ctrmm_kernel_L2_M2_40 + +ctrmm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M2_22 + + +ctrmm_kernel_L2_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L2_M2_100 + +ctrmm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M2_42 + +ctrmm_kernel_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +ctrmm_kernel_L2_M2_END: + + +ctrmm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble ctrmm_kernel_L2_END + +ctrmm_kernel_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble ctrmm_kernel_L2_M1_40 + +ctrmm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M1_22 + + +ctrmm_kernel_L2_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L2_M1_100 + +ctrmm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M1_42 + +ctrmm_kernel_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +ctrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ + +ctrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble ctrmm_kernel_L999 // done + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +ctrmm_kernel_L1_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = 
counterI / 8 + cmp counterI, #0 + ble ctrmm_kernel_L1_M4_BEGIN + +ctrmm_kernel_L1_M8_20: + + INIT8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #6 + add pA, pA, temp + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L1_M8_40 + .align 5 + +ctrmm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M8_22 + + +ctrmm_kernel_L1_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L1_M8_100 + +ctrmm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M8_42 + +ctrmm_kernel_L1_M8_100: + + SAVE8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +ctrmm_kernel_L1_M8_END: + + subs counterI, counterI, #1 + bgt ctrmm_kernel_L1_M8_20 + +ctrmm_kernel_L1_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble ctrmm_kernel_L1_END + + tst counterI, #4 // counterI = counterI / 2 + ble ctrmm_kernel_L1_M2_BEGIN + +ctrmm_kernel_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L1_M4_40 + .align 5 + +ctrmm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M4_22 + + +ctrmm_kernel_L1_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L1_M4_100 + +ctrmm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M4_42 + +ctrmm_kernel_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +ctrmm_kernel_L1_M4_END: + +ctrmm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble ctrmm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble ctrmm_kernel_L1_M1_BEGIN + +ctrmm_kernel_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, 
tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L1_M2_40 + +ctrmm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M2_22 + + +ctrmm_kernel_L1_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L1_M2_100 + +ctrmm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M2_42 + +ctrmm_kernel_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +ctrmm_kernel_L1_M2_END: + + +ctrmm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble ctrmm_kernel_L1_END + +ctrmm_kernel_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L1_M1_40 + +ctrmm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M1_22 + + +ctrmm_kernel_L1_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L1_M1_100 + +ctrmm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M1_42 + +ctrmm_kernel_L1_M1_100: + + SAVE1x1 + + +ctrmm_kernel_L1_END: + + +ctrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dgemm_kernel_4x8.S b/kernel/arm64/dgemm_kernel_4x8.S new file mode 100755 index 000000000..88e9a773d --- /dev/null +++ b/kernel/arm64/dgemm_kernel_4x8.S @@ -0,0 +1,1689 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 + +#define alpha0 d2 +#define alphaV0 v2.d[0] +#define alpha1 d3 +#define alphaV1 v3.d[0] +#define alpha2 d6 +#define alphaV2 v6.d[0] +#define alpha3 d7 +#define alphaV3 v7.d[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA00, pA01 +//v01 pA02, pA03 +//v02 ALPHA0 +//v03 ALPHA1 +//v04 pA10, pA11 +//v05 pA12, pA13 +//v06 ALPHA2 +//v07 ALPHA3 +//v08 must save pB0_0, pB0_1 +//v09 must save pB0_2, pB0_3 +//v10 must save pB0_4, pB0_5 +//v11 must save pB0_6, pB0_7 +//v12 must save pB1_0, pB1_1 +//v13 must save pB1_2, pB1_3 +//v14 must save pB1_4, pB1_5 +//v15 must save pB1_6, pB1_7 +//v16 must save C00, C01 +//v17 must save C02, C03 +//v18 C04, C05 +//v19 C06, C07 +//v20 C10, C11 +//v21 C12, C13 +//v22 C14, C15 +//v23 C16, C17 +//v24 C20, C21 +//v25 C22, C23 +//v26 C24, C25 +//v27 C26, C27 +//v28 C30, C31 +//v29 C32, C33 +//v30 C34, C35 +//v31 C36, C37 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT4x8 + fmov d16, xzr + fmov d17, xzr + fmov d18, xzr + fmov d19, d16 + fmov d20, xzr + fmov d21, d16 + fmov d22, d17 + fmov d23, d18 + fmov d24, xzr + fmov d25, d16 + fmov d26, d17 + fmov d27, d18 + fmov d28, xzr + fmov d29, d16 + fmov d30, d17 + fmov d31, d18 +.endm + +.macro KERNEL4x8_I + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, 
#32 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld1 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + fmul v16.2d, v0.2d, v8.2d[0] + fmul v17.2d, v1.2d, v8.2d[0] + fmul v18.2d, v0.2d, v8.2d[1] + fmul v19.2d, v1.2d, v8.2d[1] + + fmul v20.2d, v0.2d, v9.2d[0] + fmul v21.2d, v1.2d, v9.2d[0] + fmul v22.2d, v0.2d, v9.2d[1] + fmul v23.2d, v1.2d, v9.2d[1] + + fmul v24.2d, v0.2d, v10.2d[0] + fmul v25.2d, v1.2d, v10.2d[0] + fmul v26.2d, v0.2d, v10.2d[1] + fmul v27.2d, v1.2d, v10.2d[1] + + fmul v28.2d, v0.2d, v11.2d[0] + fmul v29.2d, v1.2d, v11.2d[0] + fmul v30.2d, v0.2d, v11.2d[1] + fmul v31.2d, v1.2d, v11.2d[1] + + ld1 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + ld1 {v4.2d, v5.2d}, [pA] + add pA, pA, #32 + ld1 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 +.endm + +.macro KERNEL4x8_M1 + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v18.2d, v0.2d, v8.2d[1] + fmla v19.2d, v1.2d, v8.2d[1] + + fmla v20.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v9.2d[0] + fmla v22.2d, v0.2d, v9.2d[1] + fmla v23.2d, v1.2d, v9.2d[1] + + fmla v24.2d, v0.2d, v10.2d[0] + fmla v25.2d, v1.2d, v10.2d[0] + fmla v26.2d, v0.2d, v10.2d[1] + fmla v27.2d, v1.2d, v10.2d[1] + + fmla v28.2d, v0.2d, v11.2d[0] + fmla v29.2d, v1.2d, v11.2d[0] + fmla v30.2d, v0.2d, v11.2d[1] + fmla v31.2d, v1.2d, v11.2d[1] + + ld1 {v12.2d, v13.2d}, [pB] // For next round + add pB, pB, #32 + ld1 {v4.2d, v5.2d}, [pA] // For next round + add pA, pA, #32 + ld1 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 + + prfm PLDL1KEEP, [pA, #512] +.endm + +.macro KERNEL4x8_M2 + fmla v16.2d, v4.2d, v12.2d[0] + fmla v17.2d, v5.2d, v12.2d[0] + fmla v18.2d, v4.2d, v12.2d[1] + fmla v19.2d, v5.2d, v12.2d[1] + + fmla v20.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v13.2d[0] + fmla v22.2d, v4.2d, v13.2d[1] + fmla v23.2d, v5.2d, v13.2d[1] + + fmla v24.2d, v4.2d, v14.2d[0] + fmla v25.2d, v5.2d, v14.2d[0] + fmla v26.2d, v4.2d, v14.2d[1] + fmla v27.2d, v5.2d, v14.2d[1] + + fmla v28.2d, v4.2d, v15.2d[0] + fmla v29.2d, v5.2d, v15.2d[0] + fmla v30.2d, v4.2d, v15.2d[1] + fmla v31.2d, v5.2d, v15.2d[1] + + ld1 {v8.2d, v9.2d}, [pB] // For next round + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] // For next round + add pA, pA, #32 + ld1 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + prfm PLDL1KEEP, [pB, #512] +.endm + +.macro KERNEL4x8_E + fmla v16.2d, v4.2d, v12.2d[0] + fmla v17.2d, v5.2d, v12.2d[0] + fmla v18.2d, v4.2d, v12.2d[1] + fmla v19.2d, v5.2d, v12.2d[1] + + fmla v20.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v13.2d[0] + fmla v22.2d, v4.2d, v13.2d[1] + fmla v23.2d, v5.2d, v13.2d[1] + + fmla v24.2d, v4.2d, v14.2d[0] + fmla v25.2d, v5.2d, v14.2d[0] + fmla v26.2d, v4.2d, v14.2d[1] + fmla v27.2d, v5.2d, v14.2d[1] + + fmla v28.2d, v4.2d, v15.2d[0] + fmla v29.2d, v5.2d, v15.2d[0] + fmla v30.2d, v4.2d, v15.2d[1] + fmla v31.2d, v5.2d, v15.2d[1] +.endm + +.macro KERNEL4x8_SUB + ld1 {v8.2d, v9.2d}, [pB] // For next round + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] // For next round + add pA, pA, #32 + ld1 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v18.2d, v0.2d, v8.2d[1] + fmla v19.2d, v1.2d, v8.2d[1] + + fmla v20.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v9.2d[0] + fmla v22.2d, v0.2d, v9.2d[1] + fmla v23.2d, v1.2d, v9.2d[1] + + fmla v24.2d, v0.2d, v10.2d[0] + fmla v25.2d, v1.2d, v10.2d[0] + fmla v26.2d, v0.2d, v10.2d[1] + fmla v27.2d, v1.2d, v10.2d[1] + + fmla v28.2d, v0.2d, v11.2d[0] + fmla v29.2d, v1.2d, v11.2d[0] + fmla v30.2d, v0.2d, v11.2d[1] + fmla v31.2d, v1.2d, v11.2d[1] +.endm + +.macro SAVE4x8 + add pCRow1, 
pCRow0, LDC + + ld1 {v8.2d, v9.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + fmla v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v10.2d, v11.2d}, [pCRow1] + fmla v10.2d, v18.2d, alphaV2 + fmla v11.2d, v19.2d, alphaV3 + st1 {v10.2d, v11.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v12.2d, v13.2d}, [pCRow2] + fmla v12.2d, v20.2d, alphaV0 + fmla v13.2d, v21.2d, alphaV1 + st1 {v12.2d, v13.2d}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v14.2d, v15.2d}, [pCRow1] + fmla v14.2d, v22.2d, alphaV2 + fmla v15.2d, v23.2d, alphaV3 + st1 {v14.2d, v15.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v8.2d, v9.2d}, [pCRow2] + fmla v8.2d, v24.2d, alphaV0 + fmla v9.2d, v25.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v10.2d, v11.2d}, [pCRow1] + fmla v10.2d, v26.2d, alphaV2 + fmla v11.2d, v27.2d, alphaV3 + st1 {v10.2d, v11.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v12.2d, v13.2d}, [pCRow2] + fmla v12.2d, v28.2d, alphaV0 + fmla v13.2d, v29.2d, alphaV1 + st1 {v12.2d, v13.2d}, [pCRow2] + + ld1 {v14.2d, v15.2d}, [pCRow1] + fmla v14.2d, v30.2d, alphaV2 + fmla v15.2d, v31.2d, alphaV3 + st1 {v14.2d, v15.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x8 + fmov d16, xzr + fmov d18, xzr + fmov d20, xzr + fmov d22, d16 + fmov d24, xzr + fmov d26, d16 + fmov d28, xzr + fmov d30, d16 +.endm + +.macro KERNEL2x8_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d}, [pA] + add pA, pA, #16 + ld1 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v18.2d, v0.2d, v8.2d[1] + + fmla v20.2d, v0.2d, v9.2d[0] + fmla v22.2d, v0.2d, v9.2d[1] + + fmla v24.2d, v0.2d, v10.2d[0] + fmla v26.2d, v0.2d, v10.2d[1] + + fmla v28.2d, v0.2d, v11.2d[0] + fmla v30.2d, v0.2d, v11.2d[1] +.endm + +.macro SAVE2x8 + add pCRow1, pCRow0, LDC + + ld1 {v8.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v10.2d}, [pCRow1] + fmla v10.2d, v18.2d, alphaV2 + st1 {v10.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v12.2d}, [pCRow2] + fmla v12.2d, v20.2d, alphaV0 + st1 {v12.2d}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v14.2d}, [pCRow1] + fmla v14.2d, v22.2d, alphaV2 + st1 {v14.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v8.2d}, [pCRow2] + fmla v8.2d, v24.2d, alphaV0 + st1 {v8.2d}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v10.2d}, [pCRow1] + fmla v10.2d, v26.2d, alphaV2 + st1 {v10.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v12.2d}, [pCRow2] + fmla v12.2d, v28.2d, alphaV0 + st1 {v12.2d}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v14.2d}, [pCRow1] + fmla v14.2d, v30.2d, alphaV2 + st1 {v14.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x8 + fmov d16, xzr + fmov d20, xzr + fmov d24, xzr + fmov d28, xzr +.endm + +.macro KERNEL1x8_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ldr d0, [pA] + add pA, pA, #8 + ld1 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + fmla v16.2d, v8.2d, v0.d[0] + fmla v20.2d, v9.2d, v0.d[0] + fmla v24.2d, v10.2d, v0.d[0] + fmla v28.2d, v11.2d, v0.d[0] +.endm + +.macro SAVE1x8 + add pCRow1, pCRow0, LDC + + ld1 {v8.d}[0], [pCRow0] + ld1 {v8.d}[1], [pCRow1] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + ld1 {v10.d}[0], [pCRow2] + ld1 
{v10.d}[1], [pCRow1] + fmla v10.2d, v20.2d, alphaV1 + st1 {v10.d}[0], [pCRow2] + st1 {v10.d}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + ld1 {v12.d}[0], [pCRow2] + ld1 {v12.d}[1], [pCRow1] + fmla v12.2d, v24.2d, alphaV2 + st1 {v12.d}[0], [pCRow2] + st1 {v12.d}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + ld1 {v14.d}[0], [pCRow2] + ld1 {v14.d}[1], [pCRow1] + fmla v14.2d, v28.2d, alphaV3 + st1 {v14.d}[0], [pCRow2] + st1 {v14.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x4 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 + fmov d24, d17 + fmov d25, d16 + fmov d28, d17 + fmov d29, d16 +.endm + +.macro KERNEL4x4_I + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmul v16.2d, v0.2d, v8.2d[0] + fmul v29.2d, v1.2d, v9.2d[1] + + fmul v20.2d, v0.2d, v8.2d[1] + fmul v25.2d, v1.2d, v9.2d[0] + + fmul v24.2d, v0.2d, v9.2d[0] + fmul v21.2d, v1.2d, v8.2d[1] + + fmul v28.2d, v0.2d, v9.2d[1] + fmul v17.2d, v1.2d, v8.2d[0] + + ld1 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + ld1 {v4.2d, v5.2d}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL4x4_M1 + fmla v16.2d, v0.2d, v8.2d[0] + fmla v29.2d, v1.2d, v9.2d[1] + + ld1 {v12.2d, v13.2d}, [pB] // For next round + add pB, pB, #32 + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v25.2d, v1.2d, v9.2d[0] + + ld1 {v4.2d, v5.2d}, [pA] // For next round + add pA, pA, #32 + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v8.2d[1] + + prfm PLDL1KEEP, [pA, #512] + + fmla v28.2d, v0.2d, v9.2d[1] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro KERNEL4x4_M2 + fmla v16.2d, v4.2d, v12.2d[0] + fmla v29.2d, v5.2d, v13.2d[1] + + ld1 {v8.2d, v9.2d}, [pB] // For next round + add pB, pB, #32 + + fmla v20.2d, v4.2d, v12.2d[1] + fmla v25.2d, v5.2d, v13.2d[0] + + ld1 {v0.2d, v1.2d}, [pA] // For next round + add pA, pA, #32 + + fmla v24.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v12.2d[1] + + prfm PLDL1KEEP, [pB, #512] + + fmla v28.2d, v4.2d, v13.2d[1] + fmla v17.2d, v5.2d, v12.2d[0] +.endm + +.macro KERNEL4x4_E + fmla v16.2d, v4.2d, v12.2d[0] + fmla v29.2d, v5.2d, v13.2d[1] + + fmla v20.2d, v4.2d, v12.2d[1] + fmla v25.2d, v5.2d, v13.2d[0] + + fmla v24.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v12.2d[1] + + fmla v28.2d, v4.2d, v13.2d[1] + fmla v17.2d, v5.2d, v12.2d[0] +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v29.2d, v1.2d, v9.2d[1] + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v25.2d, v1.2d, v9.2d[0] + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v8.2d[1] + + fmla v28.2d, v0.2d, v9.2d[1] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x4 + ld1 {v8.2d, v9.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + fmla v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + ld1 {v12.2d, v13.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV2 + fmla v13.2d, v21.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + ld1 {v8.2d, v9.2d}, [pCRow2] + fmla v8.2d, v24.2d, alphaV0 + fmla v9.2d, v25.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + ld1 {v12.2d, v13.2d}, [pCRow1] + fmla v12.2d, v28.2d, alphaV2 + fmla v13.2d, v29.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + 
fmov d16, xzr + fmov d20, d16 + fmov d24, d20 + fmov d28, d16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.2d[0] + fmla v28.2d, v0.2d, v9.2d[1] +.endm + +.macro SAVE2x4 + ld1 {v8.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + ld1 {v12.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV1 + st1 {v12.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + ld1 {v8.2d}, [pCRow2] + fmla v8.2d, v24.2d, alphaV2 + st1 {v8.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + ld1 {v12.2d}, [pCRow1] + fmla v12.2d, v28.2d, alphaV3 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL1x4_SUB + ldr d0, [pA] + add pA, pA, #8 + + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + + fmla v16.2d, v8.2d, v0.d[0] + fmla v20.2d, v9.2d, v0.d[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + + ld1 {v8.d}[0], [pCRow0] + ld1 {v8.d}[1], [pCRow1] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + ld1 {v12.d}[0], [pCRow2] + ld1 {v12.d}[1], [pCRow1] + fmla v12.2d, v20.2d, alphaV1 + st1 {v12.d}[0], [pCRow2] + st1 {v12.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v21.2d, v1.2d, v8.2d[1] +.endm + +.macro SAVE4x2 + ld1 {v8.2d, v9.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + fmla v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + ld1 {v12.2d, v13.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV2 + fmla v13.2d, v21.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] +.endm + +.macro SAVE2x2 + ld1 {v8.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1 , pCRow0, LDC + + ld1 {v12.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV1 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov d16, xzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2d} , [pB] + add pB , pB, #16 + + ldr d0 , [pA] + add pA, pA, #8 + + fmla v16.2d, v8.2d, v0.2d[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + + ld1 {v8.d}[0], [pCRow0] + ld1 {v8.d}[1], [pCRow1] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov d16, xzr + fmov d17, d16 +.endm + +.macro KERNEL4x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v0.2d, v1.2d}, [pA] + add pA , pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + 
fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x1 + ld1 {v8.2d, v9.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + fmla v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + fmov d16, xzr +.endm + +.macro KERNEL2x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v0.2d}, [pA] + add pA , pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] +.endm + +.macro SAVE2x1 + ld1 {v8.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov d16, xzr +.endm + +.macro KERNEL1x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ldr d0, [pA] + add pA , pA, #8 + + fmadd d16, d0, d8, d16 +.endm + +.macro SAVE1x1 + ldr d8, [pCRow0] + fmadd d8, d16, alpha0, d8 + str d8, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, d0 + fmov alpha1, d0 + fmov alpha2, d0 + fmov alpha3, d0 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble dgemm_kernel_L4_BEGIN + +/******************************************************************************/ + +dgemm_kernel_L8_BEGIN: + + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #3 + + mov pA, origPA // pA = start of A array + +dgemm_kernel_L8_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble dgemm_kernel_L8_M2_BEGIN + +dgemm_kernel_L8_M4_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt dgemm_kernel_L8_M4_32 + + KERNEL4x8_I // do one in the K + KERNEL4x8_M2 // do another in the K + + subs counterL, counterL, #2 + ble dgemm_kernel_L8_M4_22a + .align 5 + +dgemm_kernel_L8_M4_22: + + KERNEL4x8_M1 + KERNEL4x8_M2 + + subs counterL, counterL, #1 + bgt dgemm_kernel_L8_M4_22 + + +dgemm_kernel_L8_M4_22a: + + KERNEL4x8_M1 + KERNEL4x8_E + + b dgemm_kernel_L8_M4_44 + +dgemm_kernel_L8_M4_32: + + tst counterL, #1 + ble dgemm_kernel_L8_M4_40 + + KERNEL4x8_I + + KERNEL4x8_E + + b dgemm_kernel_L8_M4_44 + + +dgemm_kernel_L8_M4_40: + + INIT4x8 + +dgemm_kernel_L8_M4_44: + + ands counterL , origK, #1 + ble dgemm_kernel_L8_M4_100 + +dgemm_kernel_L8_M4_46: + + KERNEL4x8_SUB + +dgemm_kernel_L8_M4_100: + + SAVE4x8 + +dgemm_kernel_L8_M4_END: + subs counterI, counterI, #1 + bne dgemm_kernel_L8_M4_20 + +dgemm_kernel_L8_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dgemm_kernel_L8_END + + tst counterI, #2 // counterI = counterI / 2 + ble dgemm_kernel_L8_M1_BEGIN + +dgemm_kernel_L8_M2_20: + + INIT2x8 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L8_M2_40 + +dgemm_kernel_L8_M2_22: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L8_M2_22 + + +dgemm_kernel_L8_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L8_M2_100 + +dgemm_kernel_L8_M2_42: + + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L8_M2_42 + +dgemm_kernel_L8_M2_100: + + SAVE2x8 + +dgemm_kernel_L8_M2_END: + + +dgemm_kernel_L8_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dgemm_kernel_L8_END + +dgemm_kernel_L8_M1_20: + + INIT1x8 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L8_M1_40 + +dgemm_kernel_L8_M1_22: + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L8_M1_22 + + +dgemm_kernel_L8_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L8_M1_100 + +dgemm_kernel_L8_M1_42: + + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L8_M1_42 + +dgemm_kernel_L8_M1_100: + + SAVE1x8 + +dgemm_kernel_L8_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 8 * 8 + + subs counterJ, counterJ , #1 // j-- + bgt dgemm_kernel_L8_BEGIN + + +/******************************************************************************/ + +dgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #7 + ble dgemm_kernel_L999 + + tst counterJ , #4 + ble dgemm_kernel_L2_BEGIN + + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + + mov pA, origPA // pA = start of A array + +dgemm_kernel_L4_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble dgemm_kernel_L4_M2_BEGIN + +dgemm_kernel_L4_M4_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt dgemm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble dgemm_kernel_L4_M4_22a + .align 5 + +dgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M4_22 + + +dgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b dgemm_kernel_L4_M4_44 + +dgemm_kernel_L4_M4_32: + + tst counterL, #1 + ble dgemm_kernel_L4_M4_40 + + KERNEL4x4_I + + KERNEL4x4_E + + b dgemm_kernel_L4_M4_44 + + +dgemm_kernel_L4_M4_40: + + INIT4x4 + +dgemm_kernel_L4_M4_44: + + ands counterL , origK, #1 + ble dgemm_kernel_L4_M4_100 + +dgemm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +dgemm_kernel_L4_M4_100: + + SAVE4x4 + +dgemm_kernel_L4_M4_END: + subs counterI, counterI, #1 + bne dgemm_kernel_L4_M4_20 + +dgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble dgemm_kernel_L4_M1_BEGIN + +dgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L4_M2_40 + +dgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M2_22 + + +dgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L4_M2_100 + +dgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M2_42 + +dgemm_kernel_L4_M2_100: + + SAVE2x4 + +dgemm_kernel_L4_M2_END: + + +dgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dgemm_kernel_L4_END + +dgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L4_M1_40 + +dgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M1_22 + + +dgemm_kernel_L4_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L4_M1_100 + +dgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M1_42 + +dgemm_kernel_L4_M1_100: + + SAVE1x4 + +dgemm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + +/******************************************************************************/ + +dgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble dgemm_kernel_L999 // error, N was less than 4? 
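For orientation: every M-width case in these driver loops handles the inner dimension the same way — the unrolled body runs K/8 times ("asr counterL, origK, #3") and a scalar loop mops up the K%8 leftovers ("ands counterL, origK, #7"). A minimal C sketch of that control flow follows; kernel_step() is a hypothetical stand-in for one KERNELnxm_SUB expansion, not a name from the patch.

/* Unroll-by-8 K loop plus remainder, mirroring the driver loops above.
 * kernel_step() is a hypothetical stand-in for one KERNELnxm_SUB. */
#include <assert.h>
#include <stddef.h>

static size_t steps_taken;                       /* rank-1 updates performed */
static void kernel_step(void) { steps_taken++; }

static void k_loop(size_t K)
{
    size_t counterL;

    /* counterL = K / 8: each trip expands eight KERNELnxm_SUBs */
    for (counterL = K >> 3; counterL != 0; counterL--) {
        kernel_step(); kernel_step(); kernel_step(); kernel_step();
        kernel_step(); kernel_step(); kernel_step(); kernel_step();
    }

    /* counterL = K % 8: remainder, one step per trip */
    for (counterL = K & 7; counterL != 0; counterL--)
        kernel_step();
}

int main(void)
{
    k_loop(27);                  /* 3 unrolled trips + 3 remainder steps */
    assert(steps_taken == 27);
    return 0;
}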
+ + tst counterJ , #2 + ble dgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + + +dgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble dgemm_kernel_L2_M2_BEGIN + +dgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dgemm_kernel_L2_M4_40 + .align 5 + +dgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M4_22 + + +dgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L2_M4_100 + +dgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M4_42 + +dgemm_kernel_L2_M4_100: + + SAVE4x2 + +dgemm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt dgemm_kernel_L2_M4_20 + + +dgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble dgemm_kernel_L2_M1_BEGIN + +dgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dgemm_kernel_L2_M2_40 + +dgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M2_22 + + +dgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L2_M2_100 + +dgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M2_42 + +dgemm_kernel_L2_M2_100: + + SAVE2x2 + +dgemm_kernel_L2_M2_END: + + +dgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dgemm_kernel_L2_END + +dgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble dgemm_kernel_L2_M1_40 + +dgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M1_22 + + +dgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L2_M1_100 + +dgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M1_42 + +dgemm_kernel_L2_M1_100: + + SAVE1x2 + +dgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ + +dgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble dgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + +dgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble dgemm_kernel_L1_M2_BEGIN + +dgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L1_M4_40 + .align 5 + +dgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M4_22 + + 
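For reference, each INIT/KERNEL/SAVE triple defined earlier in this file performs the usual packed-GEMM tile update: an MR-by-NR block of A*B is accumulated in registers over K, then merged into column-major C as C += alpha * acc (the ld1 / fmla-with-alphaV / st1 sequences in the SAVE macros). Below is a scalar C model of the 4x8 case, assuming the packed panel layouts implied by the pointer increments (4 doubles of A and 8 doubles of B per k step); the function and array names are illustrative only.

/* Scalar model of one 4x8 micro-tile: INIT4x8 zeroes the accumulators,
 * the KERNEL4x8 steps add rank-1 updates from the packed panels, and
 * SAVE4x8 merges the result into column-major C as C += alpha * acc. */
#include <stdio.h>

#define MR 4
#define NR 8

static void dgemm_tile_4x8(long K, double alpha,
                           const double *A,    /* packed: A[k*MR + i] */
                           const double *B,    /* packed: B[k*NR + j] */
                           double *C, long ldc)
{
    double acc[MR][NR] = {{0.0}};               /* INIT4x8             */

    for (long k = 0; k < K; k++)                /* KERNEL4x8_SUB steps */
        for (long j = 0; j < NR; j++)
            for (long i = 0; i < MR; i++)
                acc[i][j] += A[k*MR + i] * B[k*NR + j];

    for (long j = 0; j < NR; j++)               /* SAVE4x8             */
        for (long i = 0; i < MR; i++)
            C[i + j*ldc] += alpha * acc[i][j];
}

int main(void)
{
    double A[2 * MR], B[2 * NR], C[MR * NR] = {0.0};
    for (int i = 0; i < 2 * MR; i++) A[i] = i + 1.0;
    for (int j = 0; j < 2 * NR; j++) B[j] = 1.0;

    dgemm_tile_4x8(2, 1.0, A, B, C, MR);        /* K = 2, ldc = MR     */
    printf("C[0] = %g\n", C[0]);                /* 1 + 5 = 6           */
    return 0;
}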
+dgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L1_M4_100 + +dgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M4_42 + +dgemm_kernel_L1_M4_100: + + SAVE4x1 + +dgemm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt dgemm_kernel_L1_M4_20 + + +dgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble dgemm_kernel_L1_M1_BEGIN + +dgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L1_M2_40 + +dgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M2_22 + + +dgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L1_M2_100 + +dgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M2_42 + +dgemm_kernel_L1_M2_100: + + SAVE2x1 + +dgemm_kernel_L1_M2_END: + + +dgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dgemm_kernel_L1_END + +dgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L1_M1_40 + +dgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M1_22 + + +dgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L1_M1_100 + +dgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M1_42 + +dgemm_kernel_L1_M1_100: + + SAVE1x1 + + +dgemm_kernel_L1_END: + + +dgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dgemm_kernel_8x4.S b/kernel/arm64/dgemm_kernel_8x4.S new file mode 100755 index 000000000..a607fecc4 --- /dev/null +++ b/kernel/arm64/dgemm_kernel_8x4.S @@ -0,0 +1,1570 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define alpha x17 + +#define alpha0 d10 +#define alphaV0 v10.d[0] +#define alpha1 d11 +#define alphaV1 v11.d[0] +#define alpha2 d14 +#define alphaV2 v14.d[0] +#define alpha3 d15 +#define alphaV3 v15.d[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0, pA0_1 +//v01 pA0_2, pA0_3 +//v02 pA0_4, pA0_5 +//v03 pA0_6, pA0_7 +//v04 pA1_0, pA1_1 +//v05 pA1_2, pA1_3 +//v06 pA1_4, pA1_5 +//v07 pA1_6, pA1_7 +//v08 must save pB0_0, pB0_1 +//v09 must save pB0_2, pB0_3 +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save pB1_0, pB1_1 +//v13 must save pB1_2, pB1_3 +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01 +//v17 must save C02, C03 +//v18 C04, C05 +//v19 C06, C07 +//v20 C10, C11 +//v21 C12, C13 +//v22 C14, C15 +//v23 C16, C17 +//v24 C20, C21 +//v25 C22, C23 +//v26 C24, C25 +//v27 C26, C27 +//v28 C30, C31 +//v29 C32, C33 +//v30 C34, C35 +//v31 C36, C37 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT8x4 + fmov d16, xzr + fmov d17, xzr + fmov d18, d16 + fmov d19, xzr + fmov d20, xzr + fmov d21, d16 + fmov d22, d17 + fmov d23, d18 + fmov d24, xzr + fmov d25, d16 + fmov d26, d17 + fmov d27, d18 + fmov d28, xzr + fmov d29, d16 + fmov d30, d17 + fmov d31, d18 +.endm + +.macro KERNEL8x4_I + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld1 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + ldp d8, d9, [pB] + add pB, pB, #16 + ldp d10, d11, [pB] + add pB, pB, #16 + + fmul v16.2d, v0.2d, v8.2d[0] + fmul v17.2d, v1.2d, v8.2d[0] + + fmul v18.2d, v2.2d, v8.2d[0] + fmul v19.2d, v3.2d, v8.2d[0] + + fmul v20.2d, v0.2d, v9.2d[0] + fmul v21.2d, v1.2d, v9.2d[0] + + fmul v22.2d, v2.2d, 
v9.2d[0] + fmul v23.2d, v3.2d, v9.2d[0] + + fmul v24.2d, v0.2d, v10.2d[0] + fmul v25.2d, v1.2d, v10.2d[0] + + fmul v26.2d, v2.2d, v10.2d[0] + fmul v27.2d, v3.2d, v10.2d[0] + + fmul v28.2d, v0.2d, v11.2d[0] + fmul v29.2d, v1.2d, v11.2d[0] + + fmul v30.2d, v2.2d, v11.2d[0] + fmul v31.2d, v3.2d, v11.2d[0] + + ld1 {v4.2d, v5.2d}, [pA] + add pA, pA, #32 + ld1 {v6.2d, v7.2d}, [pA] + add pA, pA, #32 + ldp d12, d13, [pB] + add pB, pB, #16 + ldp d14, d15, [pB] + add pB, pB, #16 +.endm + +.macro KERNEL8x4_M1 + fmla v16.2d, v0.2d, v8.2d[0] + fmla v21.2d, v1.2d, v9.2d[0] + fmla v26.2d, v2.2d, v10.2d[0] + fmla v31.2d, v3.2d, v11.2d[0] + + ld1 {v4.2d}, [pA], #16 + + fmla v20.2d, v0.2d, v9.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + + ld1 {v5.2d}, [pA], #16 + + fmla v30.2d, v2.2d, v11.2d[0] + fmla v27.2d, v3.2d, v10.2d[0] + + ldp d12, d13, [pB] + add pB, pB, #16 + + fmla v28.2d, v0.2d, v11.2d[0] + fmla v25.2d, v1.2d, v10.2d[0] + + ldp d14, d15, [pB] + add pB, pB, #16 + + fmla v18.2d, v2.2d, v8.2d[0] + fmla v23.2d, v3.2d, v9.2d[0] + + ld1 {v6.2d}, [pA], #16 + + fmla v24.2d, v0.2d, v10.2d[0] + fmla v29.2d, v1.2d, v11.2d[0] + + ld1 {v7.2d}, [pA], #16 + + fmla v22.2d, v2.2d, v9.2d[0] + fmla v19.2d, v3.2d, v8.2d[0] + + prfm PLDL1KEEP, [pA, #224] + prfm PLDL1KEEP, [pA, #224+64] +.endm + +.macro KERNEL8x4_M2 + fmla v16.2d, v4.2d, v12.2d[0] + fmla v21.2d, v5.2d, v13.2d[0] + fmla v26.2d, v6.2d, v14.2d[0] + fmla v31.2d, v7.2d, v15.2d[0] + + ld1 {v0.2d}, [pA], #16 + + fmla v20.2d, v4.2d, v13.2d[0] + fmla v17.2d, v5.2d, v12.2d[0] + + ld1 {v1.2d}, [pA], #16 + + fmla v30.2d, v6.2d, v15.2d[0] + fmla v27.2d, v7.2d, v14.2d[0] + + ldp d8, d9, [pB] + add pB, pB, #16 + + fmla v28.2d, v4.2d, v15.2d[0] + fmla v25.2d, v5.2d, v14.2d[0] + + ldp d10, d11, [pB] + add pB, pB, #16 + + fmla v22.2d, v6.2d, v13.2d[0] + fmla v19.2d, v7.2d, v12.2d[0] + + ld1 {v2.2d}, [pA], #16 + + fmla v24.2d, v4.2d, v14.2d[0] + fmla v29.2d, v5.2d, v15.2d[0] + + ld1 {v3.2d}, [pA], #16 + + fmla v18.2d, v6.2d, v12.2d[0] + fmla v23.2d, v7.2d, v13.2d[0] + + prfm PLDL1KEEP, [pB, #640] +.endm + +.macro KERNEL8x4_E + fmla v16.2d, v4.2d, v12.2d[0] + fmla v17.2d, v5.2d, v12.2d[0] + fmla v18.2d, v6.2d, v12.2d[0] + fmla v19.2d, v7.2d, v12.2d[0] + fmla v20.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v13.2d[0] + fmla v22.2d, v6.2d, v13.2d[0] + fmla v23.2d, v7.2d, v13.2d[0] + fmla v24.2d, v4.2d, v14.2d[0] + fmla v25.2d, v5.2d, v14.2d[0] + fmla v26.2d, v6.2d, v14.2d[0] + fmla v27.2d, v7.2d, v14.2d[0] + fmla v28.2d, v4.2d, v15.2d[0] + fmla v29.2d, v5.2d, v15.2d[0] + fmla v30.2d, v6.2d, v15.2d[0] + fmla v31.2d, v7.2d, v15.2d[0] +.endm + +.macro KERNEL8x4_SUB + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld1 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + ldp d8, d9, [pB] + add pB, pB, #16 + ldp d10, d11, [pB] + add pB, pB, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v18.2d, v2.2d, v8.2d[0] + fmla v19.2d, v3.2d, v8.2d[0] + + fmla v20.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v9.2d[0] + fmla v22.2d, v2.2d, v9.2d[0] + fmla v23.2d, v3.2d, v9.2d[0] + + fmla v24.2d, v0.2d, v10.2d[0] + fmla v25.2d, v1.2d, v10.2d[0] + fmla v26.2d, v2.2d, v10.2d[0] + fmla v27.2d, v3.2d, v10.2d[0] + + fmla v28.2d, v0.2d, v11.2d[0] + fmla v29.2d, v1.2d, v11.2d[0] + fmla v30.2d, v2.2d, v11.2d[0] + fmla v31.2d, v3.2d, v11.2d[0] +.endm + +.macro SAVE8x4 + fmov alpha0, alpha + + ld1 {v0.2d, v1.2d}, [pCRow0] + fmla v0.2d, v16.2d, alphaV0 + fmla v1.2d, v17.2d, alphaV0 + st1 {v0.2d, v1.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 + + ld1 {v2.2d, v3.2d}, [pCRow0] + fmla v2.2d, v18.2d, 
alphaV0 + fmla v3.2d, v19.2d, alphaV0 + st1 {v2.2d, v3.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 + + ld1 {v4.2d, v5.2d}, [pCRow1] + fmla v4.2d, v20.2d, alphaV0 + fmla v5.2d, v21.2d, alphaV0 + st1 {v4.2d, v5.2d}, [pCRow1] + + add pCRow1, pCRow1, #32 + + ld1 {v6.2d, v7.2d}, [pCRow1] + fmla v6.2d, v22.2d, alphaV0 + fmla v7.2d, v23.2d, alphaV0 + st1 {v6.2d, v7.2d}, [pCRow1] + + add pCRow1, pCRow1, #32 + + ld1 {v0.2d, v1.2d}, [pCRow2] + fmla v0.2d, v24.2d, alphaV0 + fmla v1.2d, v25.2d, alphaV0 + st1 {v0.2d, v1.2d}, [pCRow2] + + add pCRow2, pCRow2, #32 + ld1 {v2.2d, v3.2d}, [pCRow2] + fmla v2.2d, v26.2d, alphaV0 + fmla v3.2d, v27.2d, alphaV0 + st1 {v2.2d, v3.2d}, [pCRow2] + + add pCRow2, pCRow2, #32 + + ld1 {v4.2d, v5.2d}, [pCRow3] + fmla v4.2d, v28.2d, alphaV0 + fmla v5.2d, v29.2d, alphaV0 + st1 {v4.2d, v5.2d}, [pCRow3] + + add pCRow3, pCRow3, #32 + + ld1 {v6.2d, v7.2d}, [pCRow3] + fmla v6.2d, v30.2d, alphaV0 + fmla v7.2d, v31.2d, alphaV0 + st1 {v6.2d, v7.2d}, [pCRow3] + + add pCRow3, pCRow3, #32 + + prfm PLDL2KEEP, [pCRow0, #128] + prfm PLDL2KEEP, [pCRow1, #128] + prfm PLDL2KEEP, [pCRow2, #128] + prfm PLDL2KEEP, [pCRow3, #128] +.endm + +/******************************************************************************/ + +.macro INIT4x4 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 + fmov d24, d17 + fmov d25, d16 + fmov d28, d17 + fmov d29, d16 +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v29.2d, v1.2d, v9.2d[1] + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v25.2d, v1.2d, v9.2d[0] + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v8.2d[1] + + fmla v28.2d, v0.2d, v9.2d[1] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x4 + ld1 {v8.2d, v9.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + fmla v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + ld1 {v12.2d, v13.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV2 + fmla v13.2d, v21.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + ld1 {v8.2d, v9.2d}, [pCRow2] + fmla v8.2d, v24.2d, alphaV0 + fmla v9.2d, v25.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + ld1 {v12.2d, v13.2d}, [pCRow1] + fmla v12.2d, v28.2d, alphaV2 + fmla v13.2d, v29.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT2x4 + fmov d16, xzr + fmov d20, d16 + fmov d24, d20 + fmov d28, d16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.2d[0] + fmla v28.2d, v0.2d, v9.2d[1] +.endm + +.macro SAVE2x4 + ld1 {v8.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + ld1 {v12.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV1 + st1 {v12.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + ld1 {v8.2d}, [pCRow2] + fmla v8.2d, v24.2d, alphaV2 + st1 {v8.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + ld1 {v12.2d}, [pCRow1] + fmla v12.2d, v28.2d, alphaV3 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL1x4_SUB + ldr d0, [pA] + add pA, pA, #8 + + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + + fmla v16.2d, v8.2d, 
v0.d[0] + fmla v20.2d, v9.2d, v0.d[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + + ld1 {v8.d}[0], [pCRow0] + ld1 {v8.d}[1], [pCRow1] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + ld1 {v12.d}[0], [pCRow2] + ld1 {v12.d}[1], [pCRow1] + fmla v12.2d, v20.2d, alphaV1 + st1 {v12.d}[0], [pCRow2] + st1 {v12.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov d16, xzr + fmov d17, xzr + fmov d18, d16 + fmov d19, d17 + fmov d20, xzr + fmov d21, d16 + fmov d22, d17 + fmov d23, d18 +.endm + +.macro KERNEL8x2_SUB + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld1 {v8.2d}, [pB] + add pB, pB, #16 + ld1 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v18.2d, v2.2d, v8.2d[0] + fmla v19.2d, v3.2d, v8.2d[0] + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v21.2d, v1.2d, v8.2d[1] + fmla v22.2d, v2.2d, v8.2d[1] + fmla v23.2d, v3.2d, v8.2d[1] +.endm + +.macro SAVE8x2 + add pCRow1, pCRow0, LDC + + ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] + fmla v0.2d, v16.2d, alphaV0 + fmla v1.2d, v17.2d, alphaV1 + fmla v2.2d, v18.2d, alphaV2 + fmla v3.2d, v19.2d, alphaV3 + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] + + ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] + fmla v4.2d, v20.2d, alphaV0 + fmla v5.2d, v21.2d, alphaV1 + fmla v6.2d, v22.2d, alphaV2 + fmla v7.2d, v23.2d, alphaV3 + st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v21.2d, v1.2d, v8.2d[1] +.endm + +.macro SAVE4x2 + ld1 {v8.2d, v9.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + fmla v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + ld1 {v12.2d, v13.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV2 + fmla v13.2d, v21.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] +.endm + +.macro SAVE2x2 + ld1 {v8.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1 , pCRow0, LDC + + ld1 {v12.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV1 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov d16, xzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2d} , [pB] + add pB , pB, #16 + + ldr d0 , [pA] + add pA, pA, #8 + + fmla v16.2d, v8.2d, v0.2d[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + + ld1 {v8.d}[0], [pCRow0] + ld1 {v8.d}[1], [pCRow1] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov d16, xzr + fmov d17, xzr + fmov d18, d16 + fmov d19, d17 
+.endm + +.macro KERNEL8x1_SUB + ld1 {v0.2d, v1.2d}, [pA] + add pA , pA, #32 + + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v18.2d, v2.2d, v8.2d[0] + fmla v19.2d, v3.2d, v8.2d[0] +.endm + +.macro SAVE8x1 + ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] + fmla v0.2d, v16.2d, alphaV0 + fmla v1.2d, v17.2d, alphaV1 + fmla v2.2d, v18.2d, alphaV2 + fmla v3.2d, v19.2d, alphaV3 + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] + + add pCRow0, pCRow0, #64 +.endm + + +/******************************************************************************/ + +.macro INIT4x1 + fmov d16, xzr + fmov d17, d16 +.endm + +.macro KERNEL4x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v0.2d, v1.2d}, [pA] + add pA , pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x1 + ld1 {v8.2d, v9.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + fmla v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + + +/******************************************************************************/ + +.macro INIT2x1 + fmov d16, xzr +.endm + +.macro KERNEL2x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v0.2d}, [pA] + add pA , pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] +.endm + +.macro SAVE2x1 + ld1 {v8.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov d16, xzr +.endm + +.macro KERNEL1x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ldr d0, [pA] + add pA , pA, #8 + + fmadd d16, d0, d8, d16 +.endm + +.macro SAVE1x1 + ldr d8, [pCRow0] + fmadd d8, d16, alpha0, d8 + str d8, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha, d0 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble dgemm_kernel_L2_BEGIN + +/******************************************************************************/ + +dgemm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + add pC, pCRow3, LDC + + mov pA, origPA // pA = start of A array + +dgemm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble dgemm_kernel_L4_M4_BEGIN + +dgemm_kernel_L4_M8_20: + + mov pB, origPB + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? 
+ blt dgemm_kernel_L4_M8_32 + + KERNEL8x4_I + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble dgemm_kernel_L4_M8_22a + .align 5 + +dgemm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M8_22 + + +dgemm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_E + + b dgemm_kernel_L4_M8_44 + +dgemm_kernel_L4_M8_32: + + tst counterL, #1 + ble dgemm_kernel_L4_M8_40 + + KERNEL8x4_I + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_E + + b dgemm_kernel_L4_M8_44 + +dgemm_kernel_L4_M8_40: + + INIT8x4 + +dgemm_kernel_L4_M8_44: + + ands counterL , origK, #7 + ble dgemm_kernel_L4_M8_100 + +dgemm_kernel_L4_M8_46: + + KERNEL8x4_SUB + + subs counterL, counterL, #1 + bne dgemm_kernel_L4_M8_46 + +dgemm_kernel_L4_M8_100: + + SAVE8x4 + +dgemm_kernel_L4_M8_END: + subs counterI, counterI, #1 + bne dgemm_kernel_L4_M8_20 + +dgemm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble dgemm_kernel_L4_END + + tst counterI, #4 + ble dgemm_kernel_L4_M2_BEGIN + +dgemm_kernel_L4_M4_20: + + INIT4x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L4_M4_40 + +dgemm_kernel_L4_M4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M4_22 + +dgemm_kernel_L4_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L4_M4_100 + +dgemm_kernel_L4_M4_42: + + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M4_42 + +dgemm_kernel_L4_M4_100: + + SAVE4x4 + +dgemm_kernel_L4_M4_END: + + +dgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble dgemm_kernel_L4_M1_BEGIN + +dgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L4_M2_40 + +dgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M2_22 + + +dgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L4_M2_100 + +dgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M2_42 + +dgemm_kernel_L4_M2_100: + + SAVE2x4 + +dgemm_kernel_L4_M2_END: + + +dgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dgemm_kernel_L4_END + +dgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L4_M1_40 + +dgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M1_22 + + +dgemm_kernel_L4_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L4_M1_100 + +dgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M1_42 + 
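+// K remainder (origK & 7) finished for this 1x4 tile; SAVE1x4 below scales the
+// accumulators by alpha and adds them into the four C columns at pCRow0,
+// pCRow0+LDC, pCRow0+2*LDC and pCRow0+3*LDC.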
+dgemm_kernel_L4_M1_100: + + SAVE1x4 + +dgemm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + + subs counterJ, counterJ , #1 // j-- + bgt dgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +dgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble dgemm_kernel_L999 // error, N was less than 4? + + tst counterJ , #2 + ble dgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + +dgemm_kernel_L2_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble dgemm_kernel_L2_M4_BEGIN + +dgemm_kernel_L2_M8_20: + + INIT8x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dgemm_kernel_L2_M8_40 + .align 5 + +dgemm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M8_22 + + +dgemm_kernel_L2_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L2_M8_100 + +dgemm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M8_42 + +dgemm_kernel_L2_M8_100: + + SAVE8x2 + +dgemm_kernel_L2_M8_END: + + subs counterI, counterI, #1 + bgt dgemm_kernel_L2_M8_20 + +dgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble dgemm_kernel_L2_END + + tst counterI, #4 // counterI = counterI / 2 + ble dgemm_kernel_L2_M2_BEGIN + +dgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dgemm_kernel_L2_M4_40 + .align 5 + +dgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M4_22 + + +dgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L2_M4_100 + +dgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M4_42 + +dgemm_kernel_L2_M4_100: + + SAVE4x2 + +dgemm_kernel_L2_M4_END: + + +dgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble dgemm_kernel_L2_M1_BEGIN + +dgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dgemm_kernel_L2_M2_40 + +dgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M2_22 + + +dgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L2_M2_100 + +dgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M2_42 + +dgemm_kernel_L2_M2_100: + + SAVE2x2 + +dgemm_kernel_L2_M2_END: + + +dgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dgemm_kernel_L2_END + +dgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble dgemm_kernel_L2_M1_40 + +dgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + 
KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M1_22 + + +dgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L2_M1_100 + +dgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M1_42 + +dgemm_kernel_L2_M1_100: + + SAVE1x2 + +dgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ + +dgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble dgemm_kernel_L999 // done + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + +dgemm_kernel_L1_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble dgemm_kernel_L1_M4_BEGIN + +dgemm_kernel_L1_M8_20: + + INIT8x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L1_M8_40 + .align 5 + +dgemm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M8_22 + + +dgemm_kernel_L1_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L1_M8_100 + +dgemm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M8_42 + +dgemm_kernel_L1_M8_100: + + SAVE8x1 + +dgemm_kernel_L1_M8_END: + + subs counterI, counterI, #1 + bgt dgemm_kernel_L1_M8_20 + +dgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble dgemm_kernel_L1_END + + tst counterI, #4 // counterI = counterI / 2 + ble dgemm_kernel_L1_M2_BEGIN + +dgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L1_M4_40 + .align 5 + +dgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M4_22 + + +dgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L1_M4_100 + +dgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M4_42 + +dgemm_kernel_L1_M4_100: + + SAVE4x1 + +dgemm_kernel_L1_M4_END: + +dgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble dgemm_kernel_L1_M1_BEGIN + +dgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L1_M2_40 + +dgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M2_22 + + +dgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L1_M2_100 + +dgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M2_42 + +dgemm_kernel_L1_M2_100: + + SAVE2x1 + +dgemm_kernel_L1_M2_END: + + +dgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dgemm_kernel_L1_END + +dgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble 
dgemm_kernel_L1_M1_40 + +dgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M1_22 + + +dgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L1_M1_100 + +dgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M1_42 + +dgemm_kernel_L1_M1_100: + + SAVE1x1 + + +dgemm_kernel_L1_END: + + +dgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dtrmm_kernel_4x8.S b/kernel/arm64/dtrmm_kernel_4x8.S new file mode 100755 index 000000000..eb7397faa --- /dev/null +++ b/kernel/arm64/dtrmm_kernel_4x8.S @@ -0,0 +1,2026 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 x7*/ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 +#define tempOffset x17 +#define tempK x18 + +#define alpha0 d2 +#define alphaV0 v2.d[0] +#define alpha1 d3 +#define alphaV1 v3.d[0] +#define alpha2 d6 +#define alphaV2 v6.d[0] +#define alpha3 d7 +#define alphaV3 v7.d[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 tempOffset +// 18 must save tempK +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA00, pA01 +//v01 pA02, pA03 +//v02 ALPHA0 +//v03 ALPHA1 +//v04 pA10, pA11 +//v05 pA12, pA13 +//v06 ALPHA2 +//v07 ALPHA3 +//v08 must save pB0_0, pB0_1 +//v09 must save pB0_2, pB0_3 +//v10 must save pB0_4, pB0_5 +//v11 must save pB0_6, pB0_7 +//v12 must save pB1_0, pB1_1 +//v13 must save pB1_2, pB1_3 +//v14 must save pB1_4, pB1_5 +//v15 must save pB1_6, pB1_7 +//v16 must save C00, C01 +//v17 must save C02, C03 +//v18 C04, C05 +//v19 C06, C07 +//v20 C10, C11 +//v21 C12, C13 +//v22 C14, C15 +//v23 C16, C17 +//v24 C20, C21 +//v25 C22, C23 +//v26 C24, C25 +//v27 C26, C27 +//v28 C30, C31 +//v29 C32, C33 +//v30 C34, C35 +//v31 C36, C37 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT4x8 + fmov d16, xzr + fmov d17, xzr + fmov d18, xzr + fmov d19, d16 + fmov d20, xzr + fmov d21, d16 + fmov d22, d17 + fmov d23, d18 + fmov d24, xzr + fmov d25, d16 + fmov d26, d17 + fmov d27, d18 + fmov d28, xzr + fmov d29, d16 + fmov d30, d17 + fmov d31, d18 +.endm + +.macro KERNEL4x8_I + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld1 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + fmul v16.2d, v0.2d, v8.2d[0] + fmul v17.2d, v1.2d, v8.2d[0] + fmul v18.2d, v0.2d, v8.2d[1] + fmul v19.2d, v1.2d, v8.2d[1] + + fmul v20.2d, v0.2d, v9.2d[0] + fmul v21.2d, v1.2d, v9.2d[0] + fmul v22.2d, v0.2d, v9.2d[1] + fmul v23.2d, v1.2d, v9.2d[1] + + fmul v24.2d, v0.2d, v10.2d[0] + fmul v25.2d, v1.2d, v10.2d[0] + fmul v26.2d, v0.2d, v10.2d[1] + fmul v27.2d, v1.2d, v10.2d[1] + + fmul v28.2d, v0.2d, v11.2d[0] + fmul v29.2d, v1.2d, v11.2d[0] + fmul v30.2d, v0.2d, v11.2d[1] + fmul v31.2d, v1.2d, v11.2d[1] + + ld1 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + ld1 {v4.2d, v5.2d}, [pA] + add pA, pA, #32 + ld1 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 +.endm + +.macro KERNEL4x8_M1 + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v18.2d, v0.2d, v8.2d[1] + fmla v19.2d, v1.2d, v8.2d[1] + + fmla v20.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v9.2d[0] + fmla v22.2d, v0.2d, v9.2d[1] + fmla v23.2d, v1.2d, v9.2d[1] + + fmla v24.2d, v0.2d, v10.2d[0] 
+ fmla v25.2d, v1.2d, v10.2d[0] + fmla v26.2d, v0.2d, v10.2d[1] + fmla v27.2d, v1.2d, v10.2d[1] + + fmla v28.2d, v0.2d, v11.2d[0] + fmla v29.2d, v1.2d, v11.2d[0] + fmla v30.2d, v0.2d, v11.2d[1] + fmla v31.2d, v1.2d, v11.2d[1] + + ld1 {v12.2d, v13.2d}, [pB] // For next round + add pB, pB, #32 + ld1 {v4.2d, v5.2d}, [pA] // For next round + add pA, pA, #32 + ld1 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 + + prfm PLDL1KEEP, [pA, #512] +.endm + +.macro KERNEL4x8_M2 + fmla v16.2d, v4.2d, v12.2d[0] + fmla v17.2d, v5.2d, v12.2d[0] + fmla v18.2d, v4.2d, v12.2d[1] + fmla v19.2d, v5.2d, v12.2d[1] + + fmla v20.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v13.2d[0] + fmla v22.2d, v4.2d, v13.2d[1] + fmla v23.2d, v5.2d, v13.2d[1] + + fmla v24.2d, v4.2d, v14.2d[0] + fmla v25.2d, v5.2d, v14.2d[0] + fmla v26.2d, v4.2d, v14.2d[1] + fmla v27.2d, v5.2d, v14.2d[1] + + fmla v28.2d, v4.2d, v15.2d[0] + fmla v29.2d, v5.2d, v15.2d[0] + fmla v30.2d, v4.2d, v15.2d[1] + fmla v31.2d, v5.2d, v15.2d[1] + + ld1 {v8.2d, v9.2d}, [pB] // For next round + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] // For next round + add pA, pA, #32 + ld1 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + prfm PLDL1KEEP, [pB, #512] +.endm + +.macro KERNEL4x8_E + fmla v16.2d, v4.2d, v12.2d[0] + fmla v17.2d, v5.2d, v12.2d[0] + fmla v18.2d, v4.2d, v12.2d[1] + fmla v19.2d, v5.2d, v12.2d[1] + + fmla v20.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v13.2d[0] + fmla v22.2d, v4.2d, v13.2d[1] + fmla v23.2d, v5.2d, v13.2d[1] + + fmla v24.2d, v4.2d, v14.2d[0] + fmla v25.2d, v5.2d, v14.2d[0] + fmla v26.2d, v4.2d, v14.2d[1] + fmla v27.2d, v5.2d, v14.2d[1] + + fmla v28.2d, v4.2d, v15.2d[0] + fmla v29.2d, v5.2d, v15.2d[0] + fmla v30.2d, v4.2d, v15.2d[1] + fmla v31.2d, v5.2d, v15.2d[1] +.endm + +.macro KERNEL4x8_SUB + ld1 {v8.2d, v9.2d}, [pB] // For next round + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] // For next round + add pA, pA, #32 + ld1 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v18.2d, v0.2d, v8.2d[1] + fmla v19.2d, v1.2d, v8.2d[1] + + fmla v20.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v9.2d[0] + fmla v22.2d, v0.2d, v9.2d[1] + fmla v23.2d, v1.2d, v9.2d[1] + + fmla v24.2d, v0.2d, v10.2d[0] + fmla v25.2d, v1.2d, v10.2d[0] + fmla v26.2d, v0.2d, v10.2d[1] + fmla v27.2d, v1.2d, v10.2d[1] + + fmla v28.2d, v0.2d, v11.2d[0] + fmla v29.2d, v1.2d, v11.2d[0] + fmla v30.2d, v0.2d, v11.2d[1] + fmla v31.2d, v1.2d, v11.2d[1] +.endm + +.macro SAVE4x8 + add pCRow1, pCRow0, LDC + + fmul v8.2d, v16.2d, alphaV0 + fmul v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow2, pCRow1, LDC + + fmul v10.2d, v18.2d, alphaV2 + fmul v11.2d, v19.2d, alphaV3 + st1 {v10.2d, v11.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v12.2d, v20.2d, alphaV0 + fmul v13.2d, v21.2d, alphaV1 + st1 {v12.2d, v13.2d}, [pCRow2] + + add pCRow2, pCRow1, LDC + + fmul v14.2d, v22.2d, alphaV2 + fmul v15.2d, v23.2d, alphaV3 + st1 {v14.2d, v15.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v8.2d, v24.2d, alphaV0 + fmul v9.2d, v25.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow2] + + add pCRow2, pCRow1, LDC + + fmul v10.2d, v26.2d, alphaV2 + fmul v11.2d, v27.2d, alphaV3 + st1 {v10.2d, v11.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v12.2d, v28.2d, alphaV0 + fmul v13.2d, v29.2d, alphaV1 + st1 {v12.2d, v13.2d}, [pCRow2] + + fmul v14.2d, v30.2d, alphaV2 + fmul v15.2d, v31.2d, alphaV3 + st1 {v14.2d, v15.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + 
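+// Note: unlike the dgemm SAVE macros, the trmm SAVE macros here do not read C
+// first; each tile is written as alpha * (A*B) with fmul + st1 rather than
+// accumulated with fmla, since trmm has no beta*C term and the output block is
+// fully overwritten.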
+/******************************************************************************/ + +.macro INIT2x8 + fmov d16, xzr + fmov d18, xzr + fmov d20, xzr + fmov d22, d16 + fmov d24, xzr + fmov d26, d16 + fmov d28, xzr + fmov d30, d16 +.endm + +.macro KERNEL2x8_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d}, [pA] + add pA, pA, #16 + ld1 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v18.2d, v0.2d, v8.2d[1] + + fmla v20.2d, v0.2d, v9.2d[0] + fmla v22.2d, v0.2d, v9.2d[1] + + fmla v24.2d, v0.2d, v10.2d[0] + fmla v26.2d, v0.2d, v10.2d[1] + + fmla v28.2d, v0.2d, v11.2d[0] + fmla v30.2d, v0.2d, v11.2d[1] +.endm + +.macro SAVE2x8 + add pCRow1, pCRow0, LDC + + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow2, pCRow1, LDC + + fmul v10.2d, v18.2d, alphaV2 + st1 {v10.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v12.2d, v20.2d, alphaV0 + st1 {v12.2d}, [pCRow2] + + add pCRow2, pCRow1, LDC + + fmul v14.2d, v22.2d, alphaV2 + st1 {v14.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v8.2d, v24.2d, alphaV0 + st1 {v8.2d}, [pCRow2] + + add pCRow2, pCRow1, LDC + + fmul v10.2d, v26.2d, alphaV2 + st1 {v10.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v12.2d, v28.2d, alphaV0 + st1 {v12.2d}, [pCRow2] + + add pCRow2, pCRow1, LDC + + fmul v14.2d, v30.2d, alphaV2 + st1 {v14.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x8 + fmov d16, xzr + fmov d20, xzr + fmov d24, xzr + fmov d28, xzr +.endm + +.macro KERNEL1x8_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ldr d0, [pA] + add pA, pA, #8 + ld1 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + fmla v16.2d, v8.2d, v0.d[0] + fmla v20.2d, v9.2d, v0.d[0] + fmla v24.2d, v10.2d, v0.d[0] + fmla v28.2d, v11.2d, v0.d[0] +.endm + +.macro SAVE1x8 + add pCRow1, pCRow0, LDC + + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + fmul v10.2d, v20.2d, alphaV1 + st1 {v10.d}[0], [pCRow2] + st1 {v10.d}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + fmul v12.2d, v24.2d, alphaV2 + st1 {v12.d}[0], [pCRow2] + st1 {v12.d}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + fmul v14.2d, v28.2d, alphaV3 + st1 {v14.d}[0], [pCRow2] + st1 {v14.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x4 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 + fmov d24, d17 + fmov d25, d16 + fmov d28, d17 + fmov d29, d16 +.endm + +.macro KERNEL4x4_I + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmul v16.2d, v0.2d, v8.2d[0] + fmul v29.2d, v1.2d, v9.2d[1] + + fmul v20.2d, v0.2d, v8.2d[1] + fmul v25.2d, v1.2d, v9.2d[0] + + fmul v24.2d, v0.2d, v9.2d[0] + fmul v21.2d, v1.2d, v8.2d[1] + + fmul v28.2d, v0.2d, v9.2d[1] + fmul v17.2d, v1.2d, v8.2d[0] + + ld1 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + ld1 {v4.2d, v5.2d}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL4x4_M1 + fmla v16.2d, v0.2d, v8.2d[0] + fmla v29.2d, v1.2d, v9.2d[1] + + ld1 {v12.2d, v13.2d}, [pB] // For next round + add pB, pB, #32 + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v25.2d, v1.2d, v9.2d[0] + + ld1 {v4.2d, v5.2d}, [pA] // For next round + add pA, pA, #32 + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v8.2d[1] + + prfm PLDL1KEEP, [pA, #512] + + fmla v28.2d, v0.2d, v9.2d[1] + fmla 
v17.2d, v1.2d, v8.2d[0] +.endm + +.macro KERNEL4x4_M2 + fmla v16.2d, v4.2d, v12.2d[0] + fmla v29.2d, v5.2d, v13.2d[1] + + ld1 {v8.2d, v9.2d}, [pB] // For next round + add pB, pB, #32 + + fmla v20.2d, v4.2d, v12.2d[1] + fmla v25.2d, v5.2d, v13.2d[0] + + ld1 {v0.2d, v1.2d}, [pA] // For next round + add pA, pA, #32 + + fmla v24.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v12.2d[1] + + prfm PLDL1KEEP, [pB, #512] + + fmla v28.2d, v4.2d, v13.2d[1] + fmla v17.2d, v5.2d, v12.2d[0] +.endm + +.macro KERNEL4x4_E + fmla v16.2d, v4.2d, v12.2d[0] + fmla v29.2d, v5.2d, v13.2d[1] + + fmla v20.2d, v4.2d, v12.2d[1] + fmla v25.2d, v5.2d, v13.2d[0] + + fmla v24.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v12.2d[1] + + fmla v28.2d, v4.2d, v13.2d[1] + fmla v17.2d, v5.2d, v12.2d[0] +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v29.2d, v1.2d, v9.2d[1] + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v25.2d, v1.2d, v9.2d[0] + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v8.2d[1] + + fmla v28.2d, v0.2d, v9.2d[1] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x4 + fmul v8.2d, v16.2d, alphaV0 + fmul v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV2 + fmul v13.2d, v21.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2d, v24.2d, alphaV0 + fmul v9.2d, v25.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2d, v28.2d, alphaV2 + fmul v13.2d, v29.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov d16, xzr + fmov d20, d16 + fmov d24, d20 + fmov d28, d16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.2d[0] + fmla v28.2d, v0.2d, v9.2d[1] +.endm + +.macro SAVE2x4 + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV1 + st1 {v12.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2d, v24.2d, alphaV2 + st1 {v8.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2d, v28.2d, alphaV3 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL1x4_SUB + ldr d0, [pA] + add pA, pA, #8 + + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + + fmla v16.2d, v8.2d, v0.d[0] + fmla v20.2d, v9.2d, v0.d[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + fmul v12.2d, v20.2d, alphaV1 + st1 {v12.d}[0], [pCRow2] + st1 {v12.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v21.2d, v1.2d, v8.2d[1] +.endm + +.macro SAVE4x2 + fmul v8.2d, v16.2d, alphaV0 + fmul v9.2d, v17.2d, alphaV1 + 
st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV2 + fmul v13.2d, v21.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] +.endm + +.macro SAVE2x2 + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1 , pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV1 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov d16, xzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2d} , [pB] + add pB , pB, #16 + + ldr d0 , [pA] + add pA, pA, #8 + + fmla v16.2d, v8.2d, v0.2d[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov d16, xzr + fmov d17, d16 +.endm + +.macro KERNEL4x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v0.2d, v1.2d}, [pA] + add pA , pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x1 + fmul v8.2d, v16.2d, alphaV0 + fmul v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + fmov d16, xzr +.endm + +.macro KERNEL2x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v0.2d}, [pA] + add pA , pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] +.endm + +.macro SAVE2x1 + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov d16, xzr +.endm + +.macro KERNEL1x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ldr d0, [pA] + add pA , pA, #8 + + fmadd d16, d0, d8, d16 +.endm + +.macro SAVE1x1 + fmul d8, d16, alpha0 + str d8, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, d0 + fmov alpha1, d0 + fmov alpha2, d0 + fmov alpha3, d0 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble dtrmm_kernel_L4_BEGIN + +/******************************************************************************/ + +dtrmm_kernel_L8_BEGIN: + + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #3 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +dtrmm_kernel_L8_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // 
counterI = counterI / 4 + cmp counterI, #0 + ble dtrmm_kernel_L8_M2_BEGIN + +dtrmm_kernel_L8_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #6 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL, tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt dtrmm_kernel_L8_M4_32 + + KERNEL4x8_I // do one in the K + KERNEL4x8_M2 // do another in the K + + subs counterL, counterL, #2 + ble dtrmm_kernel_L8_M4_22a + .align 5 + +dtrmm_kernel_L8_M4_22: + + KERNEL4x8_M1 + KERNEL4x8_M2 + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L8_M4_22 + + +dtrmm_kernel_L8_M4_22a: + + KERNEL4x8_M1 + KERNEL4x8_E + + b dtrmm_kernel_L8_M4_44 + +dtrmm_kernel_L8_M4_32: + + tst counterL, #1 + ble dtrmm_kernel_L8_M4_40 + + KERNEL4x8_I + + KERNEL4x8_E + + b dtrmm_kernel_L8_M4_44 + + +dtrmm_kernel_L8_M4_40: + + INIT4x8 + +dtrmm_kernel_L8_M4_44: + + ands counterL, tempK, #1 + ble dtrmm_kernel_L8_M4_100 + +dtrmm_kernel_L8_M4_46: + + KERNEL4x8_SUB + +dtrmm_kernel_L8_M4_100: + + SAVE4x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #6 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +dtrmm_kernel_L8_M4_END: + subs counterI, counterI, #1 + bne dtrmm_kernel_L8_M4_20 + +dtrmm_kernel_L8_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dtrmm_kernel_L8_END + + tst counterI, #2 // counterI = counterI / 2 + ble dtrmm_kernel_L8_M1_BEGIN + +dtrmm_kernel_L8_M2_20: + + INIT2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pA, pA, temp + lsl temp, tempOffset, #6 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL, tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L8_M2_40 + +dtrmm_kernel_L8_M2_22: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L8_M2_22 + + +dtrmm_kernel_L8_M2_40: + + ands counterL, tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L8_M2_100 + +dtrmm_kernel_L8_M2_42: + + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L8_M2_42 + +dtrmm_kernel_L8_M2_100: + + SAVE2x8 + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #6 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +dtrmm_kernel_L8_M2_END: + + +dtrmm_kernel_L8_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dtrmm_kernel_L8_END + +dtrmm_kernel_L8_M1_20: + + INIT1x8 + +#if (defined(LEFT) && defined(TRANSA)) || 
(!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pA, pA, temp + lsl temp, tempOffset, #6 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL, tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L8_M1_40 + +dtrmm_kernel_L8_M1_22: + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L8_M1_22 + + +dtrmm_kernel_L8_M1_40: + + ands counterL, tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L8_M1_100 + +dtrmm_kernel_L8_M1_42: + + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L8_M1_42 + +dtrmm_kernel_L8_M1_100: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #6 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +dtrmm_kernel_L8_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 8 * 8 + +#if !defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt dtrmm_kernel_L8_BEGIN + + +/******************************************************************************/ + +dtrmm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #7 + ble dtrmm_kernel_L999 + + tst counterJ , #4 + ble dtrmm_kernel_L2_BEGIN + + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +dtrmm_kernel_L4_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble dtrmm_kernel_L4_M2_BEGIN + +dtrmm_kernel_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL, tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt dtrmm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble dtrmm_kernel_L4_M4_22a + .align 5 + +dtrmm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M4_22 + + +dtrmm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b dtrmm_kernel_L4_M4_44 + +dtrmm_kernel_L4_M4_32: + + tst counterL, #1 + ble dtrmm_kernel_L4_M4_40 + + KERNEL4x4_I + + KERNEL4x4_E + + b dtrmm_kernel_L4_M4_44 + + +dtrmm_kernel_L4_M4_40: + + INIT4x4 + +dtrmm_kernel_L4_M4_44: + + ands counterL , tempK, #1 + ble dtrmm_kernel_L4_M4_100 + +dtrmm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +dtrmm_kernel_L4_M4_100: + + SAVE4x4 +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +dtrmm_kernel_L4_M4_END: + subs counterI, counterI, #1 + bne dtrmm_kernel_L4_M4_20 + +dtrmm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dtrmm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble dtrmm_kernel_L4_M1_BEGIN + +dtrmm_kernel_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L4_M2_40 + +dtrmm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M2_22 + + +dtrmm_kernel_L4_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L4_M2_100 + +dtrmm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M2_42 + +dtrmm_kernel_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif +dtrmm_kernel_L4_M2_END: + + +dtrmm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dtrmm_kernel_L4_END + +dtrmm_kernel_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L4_M1_40 + +dtrmm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs 
counterL, counterL, #1 + bgt dtrmm_kernel_L4_M1_22 + + +dtrmm_kernel_L4_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L4_M1_100 + +dtrmm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M1_42 + +dtrmm_kernel_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +dtrmm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +/******************************************************************************/ + +dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble dtrmm_kernel_L999 // error, N was less than 4? + + tst counterJ , #2 + ble dtrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + + +dtrmm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble dtrmm_kernel_L2_M2_BEGIN + +dtrmm_kernel_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dtrmm_kernel_L2_M4_40 + .align 5 + +dtrmm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M4_22 + + +dtrmm_kernel_L2_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L2_M4_100 + +dtrmm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M4_42 + +dtrmm_kernel_L2_M4_100: + + SAVE4x2 +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +dtrmm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt dtrmm_kernel_L2_M4_20 + + +dtrmm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dtrmm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble dtrmm_kernel_L2_M1_BEGIN + +dtrmm_kernel_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // 
counterL = counterL / 8 + cmp counterL,#0 + ble dtrmm_kernel_L2_M2_40 + +dtrmm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M2_22 + + +dtrmm_kernel_L2_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L2_M2_100 + +dtrmm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M2_42 + +dtrmm_kernel_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif +dtrmm_kernel_L2_M2_END: + + +dtrmm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dtrmm_kernel_L2_END + +dtrmm_kernel_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble dtrmm_kernel_L2_M1_40 + +dtrmm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M1_22 + + +dtrmm_kernel_L2_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L2_M1_100 + +dtrmm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M1_42 + +dtrmm_kernel_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +dtrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ + +dtrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble dtrmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +dtrmm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble dtrmm_kernel_L1_M2_BEGIN + +dtrmm_kernel_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // 
counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L1_M4_40 + .align 5 + +dtrmm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M4_22 + + +dtrmm_kernel_L1_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L1_M4_100 + +dtrmm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M4_42 + +dtrmm_kernel_L1_M4_100: + + SAVE4x1 +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +dtrmm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt dtrmm_kernel_L1_M4_20 + + +dtrmm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dtrmm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble dtrmm_kernel_L1_M1_BEGIN + +dtrmm_kernel_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L1_M2_40 + +dtrmm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M2_22 + + +dtrmm_kernel_L1_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L1_M2_100 + +dtrmm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M2_42 + +dtrmm_kernel_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif +dtrmm_kernel_L1_M2_END: + + +dtrmm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dtrmm_kernel_L1_END + +dtrmm_kernel_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L1_M1_40 + +dtrmm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M1_22 + + +dtrmm_kernel_L1_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble 
dtrmm_kernel_L1_M1_100 + +dtrmm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M1_42 + +dtrmm_kernel_L1_M1_100: + + SAVE1x1 + + +dtrmm_kernel_L1_END: + + +dtrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S new file mode 100755 index 000000000..6890505bd --- /dev/null +++ b/kernel/arm64/dtrmm_kernel_8x4.S @@ -0,0 +1,1849 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 x7*/ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 +#define tempOffset x17 +#define tempK x18 + +#define alpha0 d10 +#define alphaV0 v10.d[0] +#define alpha1 d11 +#define alphaV1 v11.d[0] +#define alpha2 d14 +#define alphaV2 v14.d[0] +#define alpha3 d15 +#define alphaV3 v15.d[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 tempOffset +// 18 must save tempK +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0, pA0_1 +//v01 pA0_2, pA0_3 +//v02 pA0_4, pA0_5 +//v03 pA0_6, pA0_7 +//v04 pA1_0, pA1_1 +//v05 pA1_2, pA1_3 +//v06 pA1_4, pA1_5 +//v07 pA1_6, pA1_7 +//v08 must save pB0_0, pB0_1 +//v09 must save pB0_2, pB0_3 +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save pB1_0, pB1_1 +//v13 must save pB1_2, pB1_3 +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01 +//v17 must save C02, C03 +//v18 C04, C05 +//v19 C06, C07 +//v20 C10, C11 +//v21 C12, C13 +//v22 C14, C15 +//v23 C16, C17 +//v24 C20, C21 +//v25 C22, C23 +//v26 C24, C25 +//v27 C26, C27 +//v28 C30, C31 +//v29 C32, C33 +//v30 C34, C35 +//v31 C36, C37 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT8x4 + fmov d16, xzr + fmov d17, xzr + fmov d18, d16 + fmov d19, xzr + fmov d20, xzr + fmov d21, d16 + fmov d22, d17 + fmov d23, d18 + fmov d24, xzr + fmov d25, d16 + fmov d26, d17 + fmov d27, d18 + fmov d28, xzr + fmov d29, d16 + fmov d30, d17 + fmov d31, d18 +.endm + +.macro KERNEL8x4_I + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + fmul v16.2d, v0.2d, v8.2d[0] + fmul v17.2d, v1.2d, v8.2d[0] + fmul v18.2d, v2.2d, v8.2d[0] + fmul v19.2d, v3.2d, v8.2d[0] + + fmul v20.2d, v0.2d, v8.2d[1] + fmul v21.2d, v1.2d, v8.2d[1] + fmul v22.2d, v2.2d, v8.2d[1] + fmul v23.2d, v3.2d, v8.2d[1] + + fmul v24.2d, v0.2d, v9.2d[0] + fmul v25.2d, v1.2d, v9.2d[0] + fmul v26.2d, v2.2d, v9.2d[0] + fmul v27.2d, v3.2d, v9.2d[0] + + fmul v28.2d, v0.2d, v9.2d[1] + fmul v29.2d, v1.2d, v9.2d[1] + fmul v30.2d, v2.2d, v9.2d[1] + fmul v31.2d, v3.2d, v9.2d[1] + + ld1 {v4.2d, v5.2d}, [pA] + add pA, pA, #32 + ld1 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + ld1 {v6.2d, v7.2d}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL8x4_M1 + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v18.2d, v2.2d, v8.2d[0] + fmla v19.2d, v3.2d, v8.2d[0] + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v21.2d, v1.2d, v8.2d[1] + fmla v22.2d, v2.2d, v8.2d[1] + fmla v23.2d, v3.2d, v8.2d[1] + + fmla v24.2d, v0.2d, 
v9.2d[0] + fmla v25.2d, v1.2d, v9.2d[0] + fmla v26.2d, v2.2d, v9.2d[0] + fmla v27.2d, v3.2d, v9.2d[0] + + fmla v28.2d, v0.2d, v9.2d[1] + fmla v29.2d, v1.2d, v9.2d[1] + fmla v30.2d, v2.2d, v9.2d[1] + fmla v31.2d, v3.2d, v9.2d[1] + + ld1 {v4.2d, v5.2d}, [pA] + add pA, pA, #32 + ld1 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + ld1 {v6.2d, v7.2d}, [pA] + add pA, pA, #32 + + prfm PLDL1KEEP, [pA, #512] +.endm + +.macro KERNEL8x4_M2 + fmla v16.2d, v4.2d, v12.2d[0] + fmla v17.2d, v5.2d, v12.2d[0] + fmla v18.2d, v6.2d, v12.2d[0] + fmla v19.2d, v7.2d, v12.2d[0] + + fmla v20.2d, v4.2d, v12.2d[1] + fmla v21.2d, v5.2d, v12.2d[1] + fmla v22.2d, v6.2d, v12.2d[1] + fmla v23.2d, v7.2d, v12.2d[1] + + fmla v24.2d, v4.2d, v13.2d[0] + fmla v25.2d, v5.2d, v13.2d[0] + fmla v26.2d, v6.2d, v13.2d[0] + fmla v27.2d, v7.2d, v13.2d[0] + + fmla v28.2d, v4.2d, v13.2d[1] + fmla v29.2d, v5.2d, v13.2d[1] + fmla v30.2d, v6.2d, v13.2d[1] + fmla v31.2d, v7.2d, v13.2d[1] + + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + prfm PLDL1KEEP, [pB, #512] +.endm + +.macro KERNEL8x4_E + fmla v16.2d, v4.2d, v12.2d[0] + fmla v17.2d, v5.2d, v12.2d[0] + fmla v18.2d, v6.2d, v12.2d[0] + fmla v19.2d, v7.2d, v12.2d[0] + + fmla v20.2d, v4.2d, v12.2d[1] + fmla v21.2d, v5.2d, v12.2d[1] + fmla v22.2d, v6.2d, v12.2d[1] + fmla v23.2d, v7.2d, v12.2d[1] + + fmla v24.2d, v4.2d, v13.2d[0] + fmla v25.2d, v5.2d, v13.2d[0] + fmla v26.2d, v6.2d, v13.2d[0] + fmla v27.2d, v7.2d, v13.2d[0] + + fmla v28.2d, v4.2d, v13.2d[1] + fmla v29.2d, v5.2d, v13.2d[1] + fmla v30.2d, v6.2d, v13.2d[1] + fmla v31.2d, v7.2d, v13.2d[1] +.endm + +.macro KERNEL8x4_SUB + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v18.2d, v2.2d, v8.2d[0] + fmla v19.2d, v3.2d, v8.2d[0] + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v21.2d, v1.2d, v8.2d[1] + fmla v22.2d, v2.2d, v8.2d[1] + fmla v23.2d, v3.2d, v8.2d[1] + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v25.2d, v1.2d, v9.2d[0] + fmla v26.2d, v2.2d, v9.2d[0] + fmla v27.2d, v3.2d, v9.2d[0] + + fmla v28.2d, v0.2d, v9.2d[1] + fmla v29.2d, v1.2d, v9.2d[1] + fmla v30.2d, v2.2d, v9.2d[1] + fmla v31.2d, v3.2d, v9.2d[1] +.endm + +.macro SAVE8x4 + add pCRow1, pCRow0, LDC + + fmul v0.2d, v16.2d, alphaV0 + fmul v1.2d, v17.2d, alphaV1 + fmul v2.2d, v18.2d, alphaV2 + fmul v3.2d, v19.2d, alphaV3 + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] + + add pCRow2, pCRow1, LDC + + fmul v4.2d, v20.2d, alphaV0 + fmul v5.2d, v21.2d, alphaV1 + fmul v6.2d, v22.2d, alphaV2 + fmul v7.2d, v23.2d, alphaV3 + st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v0.2d, v24.2d, alphaV0 + fmul v1.2d, v25.2d, alphaV1 + fmul v2.2d, v26.2d, alphaV2 + fmul v3.2d, v27.2d, alphaV3 + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow2] + + fmul v4.2d, v28.2d, alphaV0 + fmul v5.2d, v29.2d, alphaV1 + fmul v6.2d, v30.2d, alphaV2 + fmul v7.2d, v31.2d, alphaV3 + st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT4x4 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 + fmov d24, d17 + fmov d25, d16 + fmov d28, d17 + fmov d29, d16 +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v29.2d, v1.2d, 
v9.2d[1] + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v25.2d, v1.2d, v9.2d[0] + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v8.2d[1] + + fmla v28.2d, v0.2d, v9.2d[1] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x4 + fmul v8.2d, v16.2d, alphaV0 + fmul v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV2 + fmul v13.2d, v21.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2d, v24.2d, alphaV0 + fmul v9.2d, v25.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2d, v28.2d, alphaV2 + fmul v13.2d, v29.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT2x4 + fmov d16, xzr + fmov d20, d16 + fmov d24, d20 + fmov d28, d16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.2d[0] + fmla v28.2d, v0.2d, v9.2d[1] +.endm + +.macro SAVE2x4 + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV1 + st1 {v12.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2d, v24.2d, alphaV2 + st1 {v8.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2d, v28.2d, alphaV3 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL1x4_SUB + ldr d0, [pA] + add pA, pA, #8 + + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + + fmla v16.2d, v8.2d, v0.d[0] + fmla v20.2d, v9.2d, v0.d[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + fmul v12.2d, v20.2d, alphaV1 + st1 {v12.d}[0], [pCRow2] + st1 {v12.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov d16, xzr + fmov d17, xzr + fmov d18, d16 + fmov d19, d17 + fmov d20, xzr + fmov d21, d16 + fmov d22, d17 + fmov d23, d18 +.endm + +.macro KERNEL8x2_SUB + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld1 {v8.2d}, [pB] + add pB, pB, #16 + ld1 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v18.2d, v2.2d, v8.2d[0] + fmla v19.2d, v3.2d, v8.2d[0] + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v21.2d, v1.2d, v8.2d[1] + fmla v22.2d, v2.2d, v8.2d[1] + fmla v23.2d, v3.2d, v8.2d[1] +.endm + +.macro SAVE8x2 + add pCRow1, pCRow0, LDC + + fmul v0.2d, v16.2d, alphaV0 + fmul v1.2d, v17.2d, alphaV1 + fmul v2.2d, v18.2d, alphaV2 + fmul v3.2d, v19.2d, alphaV3 + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] + + fmul v4.2d, v20.2d, alphaV0 + fmul v5.2d, v21.2d, alphaV1 + fmul v6.2d, v22.2d, alphaV2 + fmul v7.2d, v23.2d, alphaV3 + st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + 
fmla v21.2d, v1.2d, v8.2d[1] +.endm + +.macro SAVE4x2 + fmul v8.2d, v16.2d, alphaV0 + fmul v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV2 + fmul v13.2d, v21.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] +.endm + +.macro SAVE2x2 + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1 , pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV1 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov d16, xzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2d} , [pB] + add pB , pB, #16 + + ldr d0 , [pA] + add pA, pA, #8 + + fmla v16.2d, v8.2d, v0.2d[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov d16, xzr + fmov d17, xzr + fmov d18, d16 + fmov d19, d17 +.endm + +.macro KERNEL8x1_SUB + ld1 {v0.2d, v1.2d}, [pA] + add pA , pA, #32 + + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v18.2d, v2.2d, v8.2d[0] + fmla v19.2d, v3.2d, v8.2d[0] +.endm + +.macro SAVE8x1 + fmul v0.2d, v16.2d, alphaV0 + fmul v1.2d, v17.2d, alphaV1 + fmul v2.2d, v18.2d, alphaV2 + fmul v3.2d, v19.2d, alphaV3 + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] + + add pCRow0, pCRow0, #64 +.endm + + +/******************************************************************************/ + +.macro INIT4x1 + fmov d16, xzr + fmov d17, d16 +.endm + +.macro KERNEL4x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v0.2d, v1.2d}, [pA] + add pA , pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x1 + fmul v8.2d, v16.2d, alphaV0 + fmul v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + + +/******************************************************************************/ + +.macro INIT2x1 + fmov d16, xzr +.endm + +.macro KERNEL2x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v0.2d}, [pA] + add pA , pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] +.endm + +.macro SAVE2x1 + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov d16, xzr +.endm + +.macro KERNEL1x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ldr d0, [pA] + add pA , pA, #8 + + fmadd d16, d0, d8, d16 +.endm + +.macro SAVE1x1 + fmul d8, d16, alpha0 + str d8, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, 
#(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, d0 + fmov alpha1, d0 + fmov alpha2, d0 + fmov alpha3, d0 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + +#if !defined(LEFT) + neg tempOffset, offset +#endif + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble dtrmm_kernel_L2_BEGIN + +/******************************************************************************/ + +dtrmm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = start of A array + +dtrmm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble dtrmm_kernel_L4_M4_BEGIN + +dtrmm_kernel_L4_M8_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #6 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt dtrmm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 // subtract 2 + ble dtrmm_kernel_L4_M8_22a + .align 5 + +dtrmm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M8_22 + + +dtrmm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b dtrmm_kernel_L4_M8_44 + +dtrmm_kernel_L4_M8_32: + + tst counterL, #1 + ble dtrmm_kernel_L4_M8_40 + + KERNEL8x4_I + + KERNEL8x4_E + + b dtrmm_kernel_L4_M8_44 + +dtrmm_kernel_L4_M8_40: + + INIT8x4 + +dtrmm_kernel_L4_M8_44: + + ands counterL , tempK, #1 + ble dtrmm_kernel_L4_M8_100 + +dtrmm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +dtrmm_kernel_L4_M8_100: + + SAVE8x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +dtrmm_kernel_L4_M8_END: + subs counterI, counterI, #1 + bne dtrmm_kernel_L4_M8_20 + +dtrmm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble dtrmm_kernel_L4_END + + tst counterI, #4 + ble dtrmm_kernel_L4_M2_BEGIN + +dtrmm_kernel_L4_M4_20: + + INIT4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L4_M4_40 + +dtrmm_kernel_L4_M4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M4_22 + + +dtrmm_kernel_L4_M4_40: + + ands counterL , tempK, #7 // counterL = counterL 
% 8 + ble dtrmm_kernel_L4_M4_100 + +dtrmm_kernel_L4_M4_42: + + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M4_42 + +dtrmm_kernel_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +dtrmm_kernel_L4_M4_END: + + +dtrmm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dtrmm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble dtrmm_kernel_L4_M1_BEGIN + +dtrmm_kernel_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L4_M2_40 + +dtrmm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M2_22 + + +dtrmm_kernel_L4_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L4_M2_100 + +dtrmm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M2_42 + +dtrmm_kernel_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +dtrmm_kernel_L4_M2_END: + + +dtrmm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dtrmm_kernel_L4_END + +dtrmm_kernel_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L4_M1_40 + +dtrmm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M1_22 + + +dtrmm_kernel_L4_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L4_M1_100 + +dtrmm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M1_42 + +dtrmm_kernel_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if 
defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +dtrmm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt dtrmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble dtrmm_kernel_L999 // error, N was less than 4? + + tst counterJ , #2 + ble dtrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +dtrmm_kernel_L2_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble dtrmm_kernel_L2_M4_BEGIN + +dtrmm_kernel_L2_M8_20: + + INIT8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #6 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dtrmm_kernel_L2_M8_40 + .align 5 + +dtrmm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M8_22 + + +dtrmm_kernel_L2_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L2_M8_100 + +dtrmm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M8_42 + +dtrmm_kernel_L2_M8_100: + + SAVE8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +dtrmm_kernel_L2_M8_END: + + subs counterI, counterI, #1 + bgt dtrmm_kernel_L2_M8_20 + +dtrmm_kernel_L2_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble dtrmm_kernel_L2_END + + tst counterI, #4 // counterI = counterI / 2 + ble dtrmm_kernel_L2_M2_BEGIN + +dtrmm_kernel_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dtrmm_kernel_L2_M4_40 + .align 5 + +dtrmm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M4_22 + + +dtrmm_kernel_L2_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L2_M4_100 + +dtrmm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M4_42 + 
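+// Store the finished 4x2 tile, then do the TRMM offset bookkeeping:
+// depending on LEFT/TRANSA, pA and pB are stepped past the K values this
+// tile skipped, and tempOffset advances by 4 (the tile height) when LEFT
+// is defined.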
+dtrmm_kernel_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +dtrmm_kernel_L2_M4_END: + + +dtrmm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dtrmm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble dtrmm_kernel_L2_M1_BEGIN + +dtrmm_kernel_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dtrmm_kernel_L2_M2_40 + +dtrmm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M2_22 + + +dtrmm_kernel_L2_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L2_M2_100 + +dtrmm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M2_42 + +dtrmm_kernel_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +dtrmm_kernel_L2_M2_END: + + +dtrmm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dtrmm_kernel_L2_END + +dtrmm_kernel_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble dtrmm_kernel_L2_M1_40 + +dtrmm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M1_22 + + +dtrmm_kernel_L2_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L2_M1_100 + +dtrmm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M1_42 + +dtrmm_kernel_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +dtrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, 
tempOffset, #2 +#endif + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ + +dtrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble dtrmm_kernel_L999 // done + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +dtrmm_kernel_L1_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble dtrmm_kernel_L1_M4_BEGIN + +dtrmm_kernel_L1_M8_20: + + INIT8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #6 + add pA, pA, temp + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L1_M8_40 + .align 5 + +dtrmm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M8_22 + + +dtrmm_kernel_L1_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L1_M8_100 + +dtrmm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M8_42 + +dtrmm_kernel_L1_M8_100: + + SAVE8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +dtrmm_kernel_L1_M8_END: + + subs counterI, counterI, #1 + bgt dtrmm_kernel_L1_M8_20 + +dtrmm_kernel_L1_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble dtrmm_kernel_L1_END + + tst counterI, #4 // counterI = counterI / 2 + ble dtrmm_kernel_L1_M2_BEGIN + +dtrmm_kernel_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L1_M4_40 + .align 5 + +dtrmm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M4_22 + + +dtrmm_kernel_L1_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L1_M4_100 + +dtrmm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M4_42 + +dtrmm_kernel_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, 
tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +dtrmm_kernel_L1_M4_END: + +dtrmm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dtrmm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble dtrmm_kernel_L1_M1_BEGIN + +dtrmm_kernel_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L1_M2_40 + +dtrmm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M2_22 + + +dtrmm_kernel_L1_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L1_M2_100 + +dtrmm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M2_42 + +dtrmm_kernel_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +dtrmm_kernel_L1_M2_END: + + +dtrmm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dtrmm_kernel_L1_END + +dtrmm_kernel_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L1_M1_40 + +dtrmm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M1_22 + + +dtrmm_kernel_L1_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L1_M1_100 + +dtrmm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M1_42 + +dtrmm_kernel_L1_M1_100: + + SAVE1x1 + + +dtrmm_kernel_L1_END: + + +dtrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/sgemm_kernel_16x4.S b/kernel/arm64/sgemm_kernel_16x4.S new file mode 100644 index 000000000..22b55b01c --- /dev/null +++ b/kernel/arm64/sgemm_kernel_16x4.S @@ -0,0 +1,1987 @@ 
+/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 + +#define alpha0 s10 +#define alphaV0 v10.s[0] +#define alpha1 s11 +#define alphaV1 v11.s[0] +#define alpha2 s14 +#define alphaV2 v14.s[0] +#define alpha3 s15 +#define alphaV3 v15.s[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_00, pA0_01, pA0_02, pA0_03 +//v01 pA0_04, pA0_05, pA0_06, pA0_07 +//v02 pA0_08, pA0_09, pA0_10, pA0_11 +//v03 pA0_12, pA0_13, pA0_14, pA0_15 +//v04 pA1_00, pA1_01, pA1_02, pA1_03 +//v05 pA1_04, pA1_05, pA1_06, pA1_07 +//v06 pA1_08, pA1_09, pA1_10, pA1_11 +//v07 pA1_12, pA1_13, pA1_14, pA1_15 +//v08 must save pB00, pB01 +//v09 must save pB02, pB03 +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save pB10, pB11 +//v13 must save pB12, pB13 +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01, C02, C03 +//v17 must save C04, C05, C06, C07 +//v18 C08, C09, C10, C11 +//v19 C12, C13, C14, C15 +//v20 C16, C17, C18, C19 +//v21 C20, C21, C22, C23 +//v22 C24, C25, C26, C27 +//v23 C28, C29, C30, 
C31 +//v24 C32, C33, C34, C35 +//v25 C36, C37, C38, C39 +//v26 C40, C41, C42, C43 +//v27 C44, C45, C46, C47 +//v28 C48, C49, C50, C51 +//v29 C52, C53, C54, C55 +//v30 C56, C57, C58, C59 +//v31 C60, C61, C62, C63 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT16x4 + fmov s16, wzr + fmov s17, wzr + fmov s18, s16 + fmov s19, s17 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 + fmov s24, wzr + fmov s25, s16 + fmov s26, s17 + fmov s27, s18 + fmov s28, wzr + fmov s29, s16 + fmov s30, s17 + fmov s31, s18 +.endm + +.macro KERNEL16x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 + + fmul v16.4s, v0.4s, v8.2s[0] + fmul v17.4s, v1.4s, v8.2s[0] + fmul v18.4s, v2.4s, v8.2s[0] + fmul v19.4s, v3.4s, v8.2s[0] + + fmul v20.4s, v0.4s, v8.2s[1] + fmul v21.4s, v1.4s, v8.2s[1] + fmul v22.4s, v2.4s, v8.2s[1] + fmul v23.4s, v3.4s, v8.2s[1] + + fmul v24.4s, v0.4s, v9.2s[0] + fmul v25.4s, v1.4s, v9.2s[0] + fmul v26.4s, v2.4s, v9.2s[0] + fmul v27.4s, v3.4s, v9.2s[0] + + fmul v28.4s, v0.4s, v9.2s[1] + fmul v29.4s, v1.4s, v9.2s[1] + fmul v30.4s, v2.4s, v9.2s[1] + fmul v31.4s, v3.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 + ld1 {v6.4s}, [pA] + add pA, pA, #16 + ld1 {v7.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL16x4_M1 + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v18.4s, v2.4s, v8.2s[0] + fmla v19.4s, v3.4s, v8.2s[0] + + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v22.4s, v2.4s, v8.2s[1] + fmla v23.4s, v3.4s, v8.2s[1] + + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v26.4s, v2.4s, v9.2s[0] + fmla v27.4s, v3.4s, v9.2s[0] + + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] + fmla v30.4s, v2.4s, v9.2s[1] + fmla v31.4s, v3.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 + ld1 {v6.4s}, [pA] + add pA, pA, #16 + ld1 {v7.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL16x4_M2 + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v18.4s, v6.4s, v12.2s[0] + fmla v19.4s, v7.4s, v12.2s[0] + + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v22.4s, v6.4s, v12.2s[1] + fmla v23.4s, v7.4s, v12.2s[1] + + fmla v24.4s, v4.4s, v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v26.4s, v6.4s, v13.2s[0] + fmla v27.4s, v7.4s, v13.2s[0] + + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] + fmla v30.4s, v6.4s, v13.2s[1] + fmla v31.4s, v7.4s, v13.2s[1] + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL16x4_E + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v18.4s, v6.4s, v12.2s[0] + fmla v19.4s, v7.4s, v12.2s[0] + + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v22.4s, v6.4s, v12.2s[1] + fmla v23.4s, v7.4s, v12.2s[1] + + fmla v24.4s, v4.4s, v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v26.4s, v6.4s, v13.2s[0] + fmla v27.4s, v7.4s, v13.2s[0] + + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, 
v5.4s, v13.2s[1] + fmla v30.4s, v6.4s, v13.2s[1] + fmla v31.4s, v7.4s, v13.2s[1] +.endm + +.macro KERNEL16x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v18.4s, v2.4s, v8.2s[0] + fmla v19.4s, v3.4s, v8.2s[0] + + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v22.4s, v2.4s, v8.2s[1] + fmla v23.4s, v3.4s, v8.2s[1] + + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v26.4s, v2.4s, v9.2s[0] + fmla v27.4s, v3.4s, v9.2s[0] + + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] + fmla v30.4s, v2.4s, v9.2s[1] + fmla v31.4s, v3.4s, v9.2s[1] +.endm + +.macro SAVE16x4 + add pCRow1, pCRow0, LDC + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + fmla v2.4s, v18.4s, alphaV2 + fmla v3.4s, v19.4s, alphaV3 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0 + fmla v5.4s, v21.4s, alphaV1 + fmla v6.4s, v22.4s, alphaV2 + fmla v7.4s, v23.4s, alphaV3 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2] + fmla v0.4s, v24.4s, alphaV0 + fmla v1.4s, v25.4s, alphaV1 + fmla v2.4s, v26.4s, alphaV2 + fmla v3.4s, v27.4s, alphaV3 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2] + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmla v4.4s, v28.4s, alphaV0 + fmla v5.4s, v29.4s, alphaV1 + fmla v6.4s, v30.4s, alphaV2 + fmla v7.4s, v31.4s, alphaV3 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT8x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, wzr + fmov s21, s16 + fmov s24, wzr + fmov s25, s16 + fmov s28, wzr + fmov s29, s16 +.endm + +.macro KERNEL8x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmul v16.4s, v0.4s, v8.2s[0] + fmul v17.4s, v1.4s, v8.2s[0] + fmul v20.4s, v0.4s, v8.2s[1] + fmul v21.4s, v1.4s, v8.2s[1] + fmul v24.4s, v0.4s, v9.2s[0] + fmul v25.4s, v1.4s, v9.2s[0] + fmul v28.4s, v0.4s, v9.2s[1] + fmul v29.4s, v1.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_M1 + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_M2 + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v24.4s, v4.4s, v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_E + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v24.4s, v4.4s, 
v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] +.endm + +.macro KERNEL8x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] +.endm + +.macro SAVE8x4 + add pCRow1, pCRow0, LDC + + ld1 {v0.4s, v1.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0 + fmla v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v0.4s, v1.4s}, [pCRow2] + fmla v0.4s, v24.4s, alphaV0 + fmla v1.4s, v25.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow2] + + ld1 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v28.4s, alphaV0 + fmla v5.4s, v29.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmul v16.2s, v0.2s, v8.2s[0] + fmul v29.2s, v1.2s, v9.2s[1] + + fmul v20.2s, v0.2s, v8.2s[1] + fmul v25.2s, v1.2s, v9.2s[0] + + fmul v24.2s, v0.2s, v9.2s[0] + fmul v21.2s, v1.2s, v8.2s[1] + + fmul v28.2s, v0.2s, v9.2s[1] + fmul v17.2s, v1.2s, v8.2s[0] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.2s, v5.2s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x4_M1 + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + + ld1 {v4.2s, v5.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + + prfm PLDL1KEEP, [pB, #512] + + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro KERNEL4x4_M2 + fmla v16.2s, v4.2s, v12.2s[0] + fmla v29.2s, v5.2s, v13.2s[1] + + ld1 {v8.2s, v9.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v4.2s, v12.2s[1] + fmla v25.2s, v5.2s, v13.2s[0] + + ld1 {v0.2s, v1.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v4.2s, v13.2s[0] + fmla v21.2s, v5.2s, v12.2s[1] + + prfm PLDL1KEEP, [pA, #512] + + fmla v28.2s, v4.2s, v13.2s[1] + fmla v17.2s, v5.2s, v12.2s[0] +.endm + +.macro KERNEL4x4_E + fmla v16.2s, v4.2s, v12.2s[0] + fmla v29.2s, v5.2s, v13.2s[1] + + fmla v20.2s, v4.2s, v12.2s[1] + fmla v25.2s, v5.2s, v13.2s[0] + + fmla v24.2s, v4.2s, v13.2s[0] + fmla v21.2s, v5.2s, v12.2s[1] + + fmla v28.2s, v4.2s, v13.2s[1] + fmla v17.2s, v5.2s, v12.2s[0] +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] + + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro SAVE4x4 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, 
[pCRow0] + + add pCRow1, pCRow0, LDC + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV2 + fmla v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + ld1 {v8.2s, v9.2s}, [pCRow2] + fmla v8.2s, v24.2s, alphaV0 + fmla v9.2s, v25.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v28.2s, alphaV2 + fmla v13.2s, v29.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s20, s16 + fmov s24, s20 + fmov s28, s16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.2s[0] + fmla v28.2s, v0.2s, v9.2s[1] +.endm + +.macro SAVE2x4 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + ld1 {v8.2s}, [pCRow2] + fmla v8.2s, v24.2s, alphaV2 + st1 {v8.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v28.2s, alphaV3 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL1x4_SUB + ldr s0, [pA] + add pA, pA, #4 + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + + fmla v16.2s, v8.2s, v0.s[0] + fmla v20.2s, v9.2s, v0.s[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + ld1 {v8.s}[0], [pCRow0] + ld1 {v8.s}[1], [pCRow1] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + ld1 {v12.s}[0], [pCRow2] + ld1 {v12.s}[1], [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT16x2 + fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 + fmov s20, wzr + fmov s21, s16 + fmov s22, wzr + fmov s23, s16 +.endm + +.macro KERNEL16x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v18.4s, v2.4s, v8.2s[0] + fmla v19.4s, v3.4s, v8.2s[0] + + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v22.4s, v2.4s, v8.2s[1] + fmla v23.4s, v3.4s, v8.2s[1] +.endm + +.macro SAVE16x2 + add pCRow1, pCRow0, LDC + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + fmla v2.4s, v18.4s, alphaV2 + fmla v3.4s, v19.4s, alphaV3 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0 + fmla v5.4s, v21.4s, alphaV1 + fmla v6.4s, v22.4s, alphaV2 + fmla v7.4s, v23.4s, alphaV3 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL8x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.4s}, 
[pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] +.endm + +.macro SAVE8x2 + add pCRow1, pCRow0, LDC + + ld1 {v0.4s, v1.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0 + fmla v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v21.2s, v1.2s, v8.2s[1] +.endm + +.macro SAVE4x2 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV2 + fmla v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] +.endm + +.macro SAVE2x2 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1 , pCRow0, LDC + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2s} , [pB] + add pB , pB, #8 + + ldr s0 , [pA] + add pA, pA, #4 + + fmla v16.2s, v8.2s, v0.2s[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + ld1 {v8.s}[0], [pCRow0] + ld1 {v8.s}[1], [pCRow1] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT16x1 + fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 +.endm + +.macro KERNEL16x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v18.4s, v2.4s, v8.2s[0] + fmla v19.4s, v3.4s, v8.2s[0] +.endm + +.macro SAVE16x1 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + fmla v2.4s, v18.4s, alphaV2 + fmla v3.4s, v19.4s, alphaV3 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL8x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] +.endm + +.macro SAVE8x1 + ld1 {v0.4s, v1.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, 
alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s, v1.2s}, [pA] + add pA , pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro SAVE4x1 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr +.endm + +.macro KERNEL2x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s}, [pA] + add pA , pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] +.endm + +.macro SAVE2x1 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr +.endm + +.macro KERNEL1x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ldr s0, [pA] + add pA , pA, #4 + + fmadd s16, s0, s8, s16 +.endm + +.macro SAVE1x1 + ldr s8, [pCRow0] + fmla s8, s16, alphaV0 + str s8, [pCRow0] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + +sgemm_kernel_begin: + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, s0 + fmov alpha1, s0 + fmov alpha2, s0 + fmov alpha3, s0 + + lsl LDC, LDC, #2 // ldc = ldc * 4 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble sgemm_kernel_L2_BEGIN + +/******************************************************************************/ + +sgemm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + + mov pA, origPA // pA = start of A array + +sgemm_kernel_L4_M16_BEGIN: + + mov counterI, origM + asr counterI, counterI, #4 // counterI = counterI / 16 + cmp counterI, #0 + ble sgemm_kernel_L4_M8_BEGIN + +sgemm_kernel_L4_M16_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
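+// The unrolled 16x4 loop below is software pipelined: KERNEL16x4_I issues
+// the first loads and fmuls, M1/M2 alternate the multiplies with the loads
+// for the next step, and E drains the final pair. It needs at least two
+// K pairs (K >= 4); shorter K uses the fallback at sgemm_kernel_L4_M16_32.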
+ blt sgemm_kernel_L4_M16_32 + + KERNEL16x4_I // do one in the K + KERNEL16x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble sgemm_kernel_L4_M16_22a + .align 5 + +sgemm_kernel_L4_M16_22: + + KERNEL16x4_M1 + KERNEL16x4_M2 + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M16_22 + +sgemm_kernel_L4_M16_22a: + + KERNEL16x4_M1 + KERNEL16x4_E + + b sgemm_kernel_L4_M16_44 + +sgemm_kernel_L4_M16_32: + + tst counterL, #1 + ble sgemm_kernel_L4_M16_40 + + KERNEL16x4_I + KERNEL16x4_E + + b sgemm_kernel_L4_M16_44 + +sgemm_kernel_L4_M16_40: + + INIT16x4 + +sgemm_kernel_L4_M16_44: + + ands counterL , origK, #1 + ble sgemm_kernel_L4_M16_100 + +sgemm_kernel_L4_M16_46: + + KERNEL16x4_SUB + +sgemm_kernel_L4_M16_100: + + SAVE16x4 + +sgemm_kernel_L4_M16_END: + subs counterI, counterI, #1 + bne sgemm_kernel_L4_M16_20 + +//------------------------------------------------------------------------------ + +sgemm_kernel_L4_M8_BEGIN: + + mov counterI, origM + tst counterI , #15 + ble sgemm_kernel_L4_END + + tst counterI, #8 + ble sgemm_kernel_L4_M4_BEGIN + +sgemm_kernel_L4_M8_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt sgemm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble sgemm_kernel_L4_M8_22a + .align 5 + +sgemm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M8_22 + +sgemm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b sgemm_kernel_L4_M8_44 + +sgemm_kernel_L4_M8_32: + + tst counterL, #1 + ble sgemm_kernel_L4_M8_40 + + KERNEL8x4_I + KERNEL8x4_E + + b sgemm_kernel_L4_M8_44 + +sgemm_kernel_L4_M8_40: + + INIT8x4 + +sgemm_kernel_L4_M8_44: + + ands counterL , origK, #1 + ble sgemm_kernel_L4_M8_100 + +sgemm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +sgemm_kernel_L4_M8_100: + + SAVE8x4 + +sgemm_kernel_L4_M8_END: + +//------------------------------------------------------------------------------ + +sgemm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble sgemm_kernel_L4_END + + tst counterI, #4 + ble sgemm_kernel_L4_M2_BEGIN + +sgemm_kernel_L4_M4_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt sgemm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble sgemm_kernel_L4_M4_22a + .align 5 + +sgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M4_22 + +sgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b sgemm_kernel_L4_M4_44 + +sgemm_kernel_L4_M4_32: + + tst counterL, #1 + ble sgemm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b sgemm_kernel_L4_M4_44 + +sgemm_kernel_L4_M4_40: + + INIT4x4 + +sgemm_kernel_L4_M4_44: + + ands counterL , origK, #1 + ble sgemm_kernel_L4_M4_100 + +sgemm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +sgemm_kernel_L4_M4_100: + + SAVE4x4 + +sgemm_kernel_L4_M4_END: + +//------------------------------------------------------------------------------ + +sgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble sgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble sgemm_kernel_L4_M1_BEGIN + +sgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L4_M2_40 + +sgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M2_22 + + +sgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L4_M2_100 + +sgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M2_42 + +sgemm_kernel_L4_M2_100: + + SAVE2x4 + +sgemm_kernel_L4_M2_END: + + +sgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble sgemm_kernel_L4_END + +sgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L4_M1_40 + +sgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M1_22 + + +sgemm_kernel_L4_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L4_M1_100 + +sgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M1_42 + +sgemm_kernel_L4_M1_100: + + SAVE1x4 + +sgemm_kernel_L4_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 + + subs counterJ, counterJ , #1 // j-- + bgt sgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +sgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble sgemm_kernel_L999 + + tst counterJ , #2 + ble sgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + +sgemm_kernel_L2_M16_BEGIN: + + mov counterI, origM + asr counterI, counterI, #4 // counterI = counterI / 16 + cmp counterI,#0 + ble sgemm_kernel_L2_M8_BEGIN + +sgemm_kernel_L2_M16_20: + + INIT16x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble sgemm_kernel_L2_M16_40 + .align 5 + +sgemm_kernel_L2_M16_22: + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M16_22 + + +sgemm_kernel_L2_M16_40: + + ands counterL , origK, #7 // counterL = counterL % 
8 + ble sgemm_kernel_L2_M16_100 + +sgemm_kernel_L2_M16_42: + + KERNEL16x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M16_42 + +sgemm_kernel_L2_M16_100: + + SAVE16x2 + +sgemm_kernel_L2_M16_END: + + subs counterI, counterI, #1 + bgt sgemm_kernel_L2_M16_20 + +//------------------------------------------------------------------------------ + +sgemm_kernel_L2_M8_BEGIN: + mov counterI, origM + tst counterI , #15 + ble sgemm_kernel_L2_END + + tst counterI, #8 + ble sgemm_kernel_L2_M4_BEGIN + +sgemm_kernel_L2_M8_20: + + INIT8x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble sgemm_kernel_L2_M8_40 + .align 5 + +sgemm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M8_22 + + +sgemm_kernel_L2_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L2_M8_100 + +sgemm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M8_42 + +sgemm_kernel_L2_M8_100: + + SAVE8x2 + +sgemm_kernel_L2_M8_END: + +//------------------------------------------------------------------------------ + +sgemm_kernel_L2_M4_BEGIN: + mov counterI, origM + tst counterI , #7 + ble sgemm_kernel_L2_END + + tst counterI, #4 + ble sgemm_kernel_L2_M2_BEGIN + +sgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble sgemm_kernel_L2_M4_40 + .align 5 + +sgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M4_22 + + +sgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L2_M4_100 + +sgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M4_42 + +sgemm_kernel_L2_M4_100: + + SAVE4x2 + +sgemm_kernel_L2_M4_END: + +//------------------------------------------------------------------------------ + + +sgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble sgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble sgemm_kernel_L2_M1_BEGIN + +sgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble sgemm_kernel_L2_M2_40 + +sgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M2_22 + + +sgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L2_M2_100 + +sgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M2_42 + +sgemm_kernel_L2_M2_100: + + SAVE2x2 + +sgemm_kernel_L2_M2_END: + + +sgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble sgemm_kernel_L2_END + +sgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble sgemm_kernel_L2_M1_40 + +sgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M1_22 + + +sgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = 
counterL % 8 + ble sgemm_kernel_L2_M1_100 + +sgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M1_42 + +sgemm_kernel_L2_M1_100: + + SAVE1x2 + +sgemm_kernel_L2_END: + + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ + +sgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble sgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + +sgemm_kernel_L1_M16_BEGIN: + + mov counterI, origM + asr counterI, counterI, #4 // counterI = counterI / 16 + cmp counterI, #0 + ble sgemm_kernel_L1_M8_BEGIN + +sgemm_kernel_L1_M16_20: + + INIT16x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M16_40 + .align 5 + +sgemm_kernel_L1_M16_22: + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M16_22 + + +sgemm_kernel_L1_M16_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M16_100 + +sgemm_kernel_L1_M16_42: + + KERNEL16x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M16_42 + +sgemm_kernel_L1_M16_100: + + SAVE16x1 + +sgemm_kernel_L1_M16_END: + + subs counterI, counterI, #1 + bgt sgemm_kernel_L1_M16_20 + +//------------------------------------------------------------------------------ + +sgemm_kernel_L1_M8_BEGIN: + + mov counterI, origM + tst counterI , #15 + ble sgemm_kernel_L1_END + + tst counterI, #8 + ble sgemm_kernel_L1_M4_BEGIN + +sgemm_kernel_L1_M8_20: + + INIT8x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M8_40 + .align 5 + +sgemm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M8_22 + + +sgemm_kernel_L1_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M8_100 + +sgemm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M8_42 + +sgemm_kernel_L1_M8_100: + + SAVE8x1 + +sgemm_kernel_L1_M8_END: + +//------------------------------------------------------------------------------ + +sgemm_kernel_L1_M4_BEGIN: + mov counterI, origM + tst counterI , #7 + ble sgemm_kernel_L1_END + + tst counterI, #4 + ble sgemm_kernel_L1_M2_BEGIN + +sgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M4_40 + .align 5 + +sgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M4_22 + + +sgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M4_100 + +sgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M4_42 + +sgemm_kernel_L1_M4_100: + + SAVE4x1 + +sgemm_kernel_L1_M4_END: + +//------------------------------------------------------------------------------ + +sgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble sgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble 
sgemm_kernel_L1_M1_BEGIN + +sgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M2_40 + +sgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M2_22 + + +sgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M2_100 + +sgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M2_42 + +sgemm_kernel_L1_M2_100: + + SAVE2x1 + +sgemm_kernel_L1_M2_END: + + +sgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble sgemm_kernel_L1_END + +sgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M1_40 + +sgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M1_22 + + +sgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M1_100 + +sgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M1_42 + +sgemm_kernel_L1_M1_100: + + SAVE1x1 + +sgemm_kernel_L1_END: + +sgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/sgemm_kernel_8x8.S b/kernel/arm64/sgemm_kernel_8x8.S new file mode 100644 index 000000000..ac690e4d4 --- /dev/null +++ b/kernel/arm64/sgemm_kernel_8x8.S @@ -0,0 +1,2305 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 + +#define alpha0 s10 +#define alphaV0 v10.s[0] +#define alpha1 s11 +#define alphaV1 v11.s[0] +#define alpha2 s14 +#define alphaV2 v14.s[0] +#define alpha3 s15 +#define alphaV3 v15.s[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0, pA0_1, pA0_2, pA0_3 +//v01 pA0_4, pA0_5, pA0_6, pA0_7 +//v02 pA1_0, pA1_1, pA1_2, pA1_3 +//v03 pA1_4, pA1_5, pA1_6, pA1_7 +//v04 pB0_0, pB0_1, pB0_2, pB0_3 +//v05 pB0_4, pB0_5, pB0_6, pB0_7 +//v06 pB1_0, pB1_1, pB1_2, pB1_3 +//v07 pB1_4, pB1_5, pB1_6, pB1_7 +//v08 must save +//v09 must save +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save +//v13 must save +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01, C02, C03 +//v17 must save C04, C05, C06, C07 +//v18 C08, C09, C10, C11 +//v19 C12, C13, C14, C15 +//v20 C16, C17, C18, C19 +//v21 C20, C21, C22, C23 +//v22 C24, C25, C26, C27 +//v23 C28, C29, C30, C31 +//v24 C32, C33, C34, C35 +//v25 C36, C37, C38, C39 +//v26 C40, C41, C42, C43 +//v27 C44, C45, C46, C47 +//v28 C48, C49, C50, C51 +//v29 C52, C53, C54, C55 +//v30 C56, C57, C58, C59 +//v31 C60, C61, C62, C63 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT8x8 + fmov s16, wzr + fmov s17, wzr + fmov s18, s16 + fmov s19, s17 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 + fmov s24, wzr + fmov s25, s16 + fmov s26, s17 + fmov s27, s18 + fmov s28, wzr + fmov s29, s16 + fmov s30, s17 + fmov s31, s18 +.endm + +.macro KERNEL8x8_I + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmul v16.4s, v0.4s, v4.4s[0] + fmul v17.4s, v1.4s, v4.4s[0] + fmul v18.4s, v0.4s, v4.4s[1] + fmul v19.4s, v1.4s, v4.4s[1] + fmul v20.4s, v0.4s, v4.4s[2] + fmul v21.4s, v1.4s, v4.4s[2] + fmul v22.4s, v0.4s, v4.4s[3] + fmul v23.4s, v1.4s, v4.4s[3] + fmul v24.4s, v0.4s, 
v5.4s[0] + fmul v25.4s, v1.4s, v5.4s[0] + fmul v26.4s, v0.4s, v5.4s[1] + fmul v27.4s, v1.4s, v5.4s[1] + fmul v28.4s, v0.4s, v5.4s[2] + fmul v29.4s, v1.4s, v5.4s[2] + fmul v30.4s, v0.4s, v5.4s[3] + fmul v31.4s, v1.4s, v5.4s[3] + + ld1 {v6.4s}, [pB] + add pB, pB, #16 + ld1 {v7.4s}, [pB] + add pB, pB, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x8_M1 + fmla v16.4s, v0.4s, v4.4s[0] + fmla v17.4s, v1.4s, v4.4s[0] + fmla v18.4s, v0.4s, v4.4s[1] + fmla v19.4s, v1.4s, v4.4s[1] + fmla v20.4s, v0.4s, v4.4s[2] + fmla v21.4s, v1.4s, v4.4s[2] + fmla v22.4s, v0.4s, v4.4s[3] + fmla v23.4s, v1.4s, v4.4s[3] + fmla v24.4s, v0.4s, v5.4s[0] + fmla v25.4s, v1.4s, v5.4s[0] + fmla v26.4s, v0.4s, v5.4s[1] + fmla v27.4s, v1.4s, v5.4s[1] + fmla v28.4s, v0.4s, v5.4s[2] + fmla v29.4s, v1.4s, v5.4s[2] + fmla v30.4s, v0.4s, v5.4s[3] + fmla v31.4s, v1.4s, v5.4s[3] + + ld1 {v6.4s}, [pB] + add pB, pB, #16 + ld1 {v7.4s}, [pB] + add pB, pB, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x8_M2 + fmla v16.4s, v2.4s, v6.4s[0] + fmla v17.4s, v3.4s, v6.4s[0] + fmla v18.4s, v2.4s, v6.4s[1] + fmla v19.4s, v3.4s, v6.4s[1] + fmla v20.4s, v2.4s, v6.4s[2] + fmla v21.4s, v3.4s, v6.4s[2] + fmla v22.4s, v2.4s, v6.4s[3] + fmla v23.4s, v3.4s, v6.4s[3] + fmla v24.4s, v2.4s, v7.4s[0] + fmla v25.4s, v3.4s, v7.4s[0] + fmla v26.4s, v2.4s, v7.4s[1] + fmla v27.4s, v3.4s, v7.4s[1] + fmla v28.4s, v2.4s, v7.4s[2] + fmla v29.4s, v3.4s, v7.4s[2] + fmla v30.4s, v2.4s, v7.4s[3] + fmla v31.4s, v3.4s, v7.4s[3] + + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x8_E + fmla v16.4s, v2.4s, v6.4s[0] + fmla v17.4s, v3.4s, v6.4s[0] + fmla v18.4s, v2.4s, v6.4s[1] + fmla v19.4s, v3.4s, v6.4s[1] + fmla v20.4s, v2.4s, v6.4s[2] + fmla v21.4s, v3.4s, v6.4s[2] + fmla v22.4s, v2.4s, v6.4s[3] + fmla v23.4s, v3.4s, v6.4s[3] + fmla v24.4s, v2.4s, v7.4s[0] + fmla v25.4s, v3.4s, v7.4s[0] + fmla v26.4s, v2.4s, v7.4s[1] + fmla v27.4s, v3.4s, v7.4s[1] + fmla v28.4s, v2.4s, v7.4s[2] + fmla v29.4s, v3.4s, v7.4s[2] + fmla v30.4s, v2.4s, v7.4s[3] + fmla v31.4s, v3.4s, v7.4s[3] +.endm + +.macro KERNEL8x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v4.4s[0] + fmla v17.4s, v1.4s, v4.4s[0] + fmla v18.4s, v0.4s, v4.4s[1] + fmla v19.4s, v1.4s, v4.4s[1] + fmla v20.4s, v0.4s, v4.4s[2] + fmla v21.4s, v1.4s, v4.4s[2] + fmla v22.4s, v0.4s, v4.4s[3] + fmla v23.4s, v1.4s, v4.4s[3] + fmla v24.4s, v0.4s, v5.4s[0] + fmla v25.4s, v1.4s, v5.4s[0] + fmla v26.4s, v0.4s, v5.4s[1] + fmla v27.4s, v1.4s, v5.4s[1] + fmla v28.4s, v0.4s, v5.4s[2] + fmla v29.4s, v1.4s, v5.4s[2] + fmla v30.4s, v0.4s, v5.4s[3] + fmla v31.4s, v1.4s, v5.4s[3] +.endm + +.macro SAVE8x8 + add pCRow1, pCRow0, LDC + + ld1 {v0.4s, v1.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v2.4s, v3.4s}, [pCRow1] + fmla v2.4s, v18.4s, alphaV2 + fmla v3.4s, v19.4s, alphaV3 + st1 {v2.4s, v3.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v4.4s, v5.4s}, [pCRow2] + fmla v4.4s, v20.4s, alphaV0 + fmla v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v6.4s, v7.4s}, [pCRow1] + fmla v6.4s, v22.4s, alphaV2 + fmla v7.4s, v23.4s, alphaV3 + st1 
{v6.4s, v7.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v0.4s, v1.4s}, [pCRow2] + fmla v0.4s, v24.4s, alphaV0 + fmla v1.4s, v25.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v2.4s, v3.4s}, [pCRow1] + fmla v2.4s, v26.4s, alphaV2 + fmla v3.4s, v27.4s, alphaV3 + st1 {v2.4s, v3.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v4.4s, v5.4s}, [pCRow2] + fmla v4.4s, v28.4s, alphaV0 + fmla v5.4s, v29.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow2] + + ld1 {v6.4s, v7.4s}, [pCRow1] + fmla v6.4s, v30.4s, alphaV2 + fmla v7.4s, v31.4s, alphaV3 + st1 {v6.4s, v7.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT4x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL4x8_I + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + + fmul v16.4s, v0.4s, v4.4s[0] + fmul v18.4s, v0.4s, v4.4s[1] + fmul v20.4s, v0.4s, v4.4s[2] + fmul v22.4s, v0.4s, v4.4s[3] + fmul v24.4s, v0.4s, v5.4s[0] + fmul v26.4s, v0.4s, v5.4s[1] + fmul v28.4s, v0.4s, v5.4s[2] + fmul v30.4s, v0.4s, v5.4s[3] + + ld1 {v6.4s}, [pB] + add pB, pB, #16 + ld1 {v7.4s}, [pB] + add pB, pB, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x8_M1 + fmla v16.4s, v0.4s, v4.4s[0] + fmla v18.4s, v0.4s, v4.4s[1] + fmla v20.4s, v0.4s, v4.4s[2] + fmla v22.4s, v0.4s, v4.4s[3] + fmla v24.4s, v0.4s, v5.4s[0] + fmla v26.4s, v0.4s, v5.4s[1] + fmla v28.4s, v0.4s, v5.4s[2] + fmla v30.4s, v0.4s, v5.4s[3] + + ld1 {v6.4s}, [pB] + add pB, pB, #16 + ld1 {v7.4s}, [pB] + add pB, pB, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x8_M2 + fmla v16.4s, v2.4s, v6.4s[0] + fmla v18.4s, v2.4s, v6.4s[1] + fmla v20.4s, v2.4s, v6.4s[2] + fmla v22.4s, v2.4s, v6.4s[3] + fmla v24.4s, v2.4s, v7.4s[0] + fmla v26.4s, v2.4s, v7.4s[1] + fmla v28.4s, v2.4s, v7.4s[2] + fmla v30.4s, v2.4s, v7.4s[3] + + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x8_E + fmla v16.4s, v2.4s, v6.4s[0] + fmla v18.4s, v2.4s, v6.4s[1] + fmla v20.4s, v2.4s, v6.4s[2] + fmla v22.4s, v2.4s, v6.4s[3] + fmla v24.4s, v2.4s, v7.4s[0] + fmla v26.4s, v2.4s, v7.4s[1] + fmla v28.4s, v2.4s, v7.4s[2] + fmla v30.4s, v2.4s, v7.4s[3] +.endm + +.macro KERNEL4x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v4.4s[0] + fmla v18.4s, v0.4s, v4.4s[1] + fmla v20.4s, v0.4s, v4.4s[2] + fmla v22.4s, v0.4s, v4.4s[3] + fmla v24.4s, v0.4s, v5.4s[0] + fmla v26.4s, v0.4s, v5.4s[1] + fmla v28.4s, v0.4s, v5.4s[2] + fmla v30.4s, v0.4s, v5.4s[3] +.endm + +.macro SAVE4x8 + add pCRow1, pCRow0, LDC + + ld1 {v0.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + st1 {v0.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v2.4s}, [pCRow1] + fmla v2.4s, v18.4s, alphaV2 + st1 {v2.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v4.4s}, [pCRow2] + fmla v4.4s, v20.4s, alphaV0 + st1 {v4.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v6.4s}, [pCRow1] + fmla v6.4s, v22.4s, alphaV2 + st1 {v6.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v0.4s}, [pCRow2] + fmla v0.4s, v24.4s, alphaV0 + st1 {v0.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v2.4s}, [pCRow1] + fmla v2.4s, v26.4s, alphaV2 + st1 {v2.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 
{v4.4s}, [pCRow2] + fmla v4.4s, v28.4s, alphaV0 + st1 {v4.4s}, [pCRow2] + + ld1 {v6.4s}, [pCRow1] + fmla v6.4s, v30.4s, alphaV2 + st1 {v6.4s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL2x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v4.4s[0] + fmla v18.2s, v0.2s, v4.4s[1] + fmla v20.2s, v0.2s, v4.4s[2] + fmla v22.2s, v0.2s, v4.4s[3] + fmla v24.2s, v0.2s, v5.4s[0] + fmla v26.2s, v0.2s, v5.4s[1] + fmla v28.2s, v0.2s, v5.4s[2] + fmla v30.2s, v0.2s, v5.4s[3] +.endm + +.macro SAVE2x8 + add pCRow1, pCRow0, LDC + + ld1 {v0.2s}, [pCRow0] + fmla v0.2s, v16.2s, alphaV0 + st1 {v0.2s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v2.2s}, [pCRow1] + fmla v2.2s, v18.2s, alphaV2 + st1 {v2.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v4.2s}, [pCRow2] + fmla v4.2s, v20.2s, alphaV0 + st1 {v4.2s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v6.2s}, [pCRow1] + fmla v6.2s, v22.2s, alphaV2 + st1 {v6.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v0.2s}, [pCRow2] + fmla v0.2s, v24.2s, alphaV0 + st1 {v0.2s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v2.2s}, [pCRow1] + fmla v2.2s, v26.2s, alphaV2 + st1 {v2.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v4.2s}, [pCRow2] + fmla v4.2s, v28.2s, alphaV0 + st1 {v4.2s}, [pCRow2] + + ld1 {v6.2s}, [pCRow1] + fmla v6.2s, v30.2s, alphaV2 + st1 {v6.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL1x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ldr s0, [pA] + add pA, pA, #4 + + fmla s16, s0, v4.4s[0] + fmla s18, s0, v4.4s[1] + fmla s20, s0, v4.4s[2] + fmla s22, s0, v4.4s[3] + fmla s24, s0, v5.4s[0] + fmla s26, s0, v5.4s[1] + fmla s28, s0, v5.4s[2] + fmla s30, s0, v5.4s[3] +.endm + +.macro SAVE1x8 + add pCRow1, pCRow0, LDC + + ldr s0, [pCRow0] + fmla s0, s16, alphaV0 + str s0, [pCRow0] + + add pCRow2, pCRow1, LDC + + ldr s2, [pCRow1] + fmla s2, s18, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr s4, [pCRow2] + fmla s4, s20, alphaV0 + str s4, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr s6, [pCRow1] + fmla s6, s22, alphaV2 + str s6, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr s0, [pCRow2] + fmla s0, s24, alphaV0 + str s0, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr s2, [pCRow1] + fmla s2, s26, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr s4, [pCRow2] + fmla s4, s28, alphaV0 + str s4, [pCRow2] + + ldr s6, [pCRow1] + fmla s6, s30, alphaV2 + str s6, [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, wzr + fmov s21, s16 + fmov s24, wzr + fmov s25, s16 + fmov s28, wzr + fmov s29, s16 +.endm + +.macro KERNEL8x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmul v16.4s, v0.4s, v8.2s[0] + fmul v17.4s, v1.4s, v8.2s[0] + fmul v20.4s, v0.4s, v8.2s[1] + fmul v21.4s, v1.4s, v8.2s[1] + fmul 
v24.4s, v0.4s, v9.2s[0] + fmul v25.4s, v1.4s, v9.2s[0] + fmul v28.4s, v0.4s, v9.2s[1] + fmul v29.4s, v1.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_M1 + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_M2 + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v24.4s, v4.4s, v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_E + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v24.4s, v4.4s, v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] +.endm + +.macro KERNEL8x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] +.endm + +.macro SAVE8x4 + add pCRow1, pCRow0, LDC + + ld1 {v0.4s, v1.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0 + fmla v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v0.4s, v1.4s}, [pCRow2] + fmla v0.4s, v24.4s, alphaV0 + fmla v1.4s, v25.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow2] + + ld1 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v28.4s, alphaV0 + fmla v5.4s, v29.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmul v16.2s, v0.2s, v8.2s[0] + fmul v29.2s, v1.2s, v9.2s[1] + + fmul v20.2s, v0.2s, v8.2s[1] + fmul v25.2s, v1.2s, v9.2s[0] + + fmul v24.2s, v0.2s, v9.2s[0] + fmul v21.2s, v1.2s, v8.2s[1] + + fmul v28.2s, v0.2s, v9.2s[1] + fmul v17.2s, v1.2s, v8.2s[0] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.2s, v5.2s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x4_M1 + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + + ld1 {v4.2s, v5.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + + prfm PLDL1KEEP, [pB, #512] + + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro 
KERNEL4x4_M2 + fmla v16.2s, v4.2s, v12.2s[0] + fmla v29.2s, v5.2s, v13.2s[1] + + ld1 {v8.2s, v9.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v4.2s, v12.2s[1] + fmla v25.2s, v5.2s, v13.2s[0] + + ld1 {v0.2s, v1.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v4.2s, v13.2s[0] + fmla v21.2s, v5.2s, v12.2s[1] + + prfm PLDL1KEEP, [pA, #512] + + fmla v28.2s, v4.2s, v13.2s[1] + fmla v17.2s, v5.2s, v12.2s[0] +.endm + +.macro KERNEL4x4_E + fmla v16.2s, v4.2s, v12.2s[0] + fmla v29.2s, v5.2s, v13.2s[1] + + fmla v20.2s, v4.2s, v12.2s[1] + fmla v25.2s, v5.2s, v13.2s[0] + + fmla v24.2s, v4.2s, v13.2s[0] + fmla v21.2s, v5.2s, v12.2s[1] + + fmla v28.2s, v4.2s, v13.2s[1] + fmla v17.2s, v5.2s, v12.2s[0] +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] + + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro SAVE4x4 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV2 + fmla v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + ld1 {v8.2s, v9.2s}, [pCRow2] + fmla v8.2s, v24.2s, alphaV0 + fmla v9.2s, v25.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v28.2s, alphaV2 + fmla v13.2s, v29.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s20, s16 + fmov s24, s20 + fmov s28, s16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.2s[0] + fmla v28.2s, v0.2s, v9.2s[1] +.endm + +.macro SAVE2x4 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + ld1 {v8.2s}, [pCRow2] + fmla v8.2s, v24.2s, alphaV2 + st1 {v8.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v28.2s, alphaV3 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL1x4_SUB + ldr s0, [pA] + add pA, pA, #4 + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + + fmla v16.2s, v8.2s, v0.s[0] + fmla v20.2s, v9.2s, v0.s[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + ld1 {v8.s}[0], [pCRow0] + ld1 {v8.s}[1], [pCRow1] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + ld1 {v12.s}[0], [pCRow2] + ld1 {v12.s}[1], [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL8x2_SUB + ld1 {v8.2s}, [pB] + add 
pB, pB, #8 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] +.endm + +.macro SAVE8x2 + add pCRow1, pCRow0, LDC + + ld1 {v0.4s, v1.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0 + fmla v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v21.2s, v1.2s, v8.2s[1] +.endm + +.macro SAVE4x2 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV2 + fmla v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] +.endm + +.macro SAVE2x2 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1 , pCRow0, LDC + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2s} , [pB] + add pB , pB, #8 + + ldr s0 , [pA] + add pA, pA, #4 + + fmla v16.2s, v8.2s, v0.2s[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + ld1 {v8.s}[0], [pCRow0] + ld1 {v8.s}[1], [pCRow1] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL8x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] +.endm + +.macro SAVE8x1 + ld1 {v0.4s, v1.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s, v1.2s}, [pA] + add pA , pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro SAVE4x1 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr +.endm + +.macro KERNEL2x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 
{v0.2s}, [pA] + add pA , pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] +.endm + +.macro SAVE2x1 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr +.endm + +.macro KERNEL1x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ldr s0, [pA] + add pA , pA, #4 + + fmadd s16, s0, s8, s16 +.endm + +.macro SAVE1x1 + ldr s8, [pCRow0] + fmla s8, s16, alphaV0 + str s8, [pCRow0] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + +sgemm_kernel_begin: + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, s0 + fmov alpha1, s0 + fmov alpha2, s0 + fmov alpha3, s0 + + lsl LDC, LDC, #2 // ldc = ldc * 4 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble sgemm_kernel_L4_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +sgemm_kernel_L8_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #3 + + mov pA, origPA // pA = start of A array + +/******************************************************************************/ + +sgemm_kernel_L8_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble sgemm_kernel_L8_M4_BEGIN + +sgemm_kernel_L8_M8_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt sgemm_kernel_L8_M8_32 + + KERNEL8x8_I // do one in the K + KERNEL8x8_M2 // do another in the K + + subs counterL, counterL, #2 + ble sgemm_kernel_L8_M8_22a + .align 5 + +sgemm_kernel_L8_M8_22: + + KERNEL8x8_M1 + KERNEL8x8_M2 + + subs counterL, counterL, #1 + bgt sgemm_kernel_L8_M8_22 + +sgemm_kernel_L8_M8_22a: + + KERNEL8x8_M1 + KERNEL8x8_E + + b sgemm_kernel_L8_M8_44 + +sgemm_kernel_L8_M8_32: + + tst counterL, #1 + ble sgemm_kernel_L8_M8_40 + + KERNEL8x8_I + KERNEL8x8_E + + b sgemm_kernel_L8_M8_44 + +sgemm_kernel_L8_M8_40: + + INIT8x8 + +sgemm_kernel_L8_M8_44: + + ands counterL , origK, #1 + ble sgemm_kernel_L8_M8_100 + +sgemm_kernel_L8_M8_46: + + KERNEL8x8_SUB + +sgemm_kernel_L8_M8_100: + + SAVE8x8 + +sgemm_kernel_L8_M8_END: + subs counterI, counterI, #1 + bne sgemm_kernel_L8_M8_20 + +/******************************************************************************/ + +sgemm_kernel_L8_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble sgemm_kernel_L8_END + + tst counterI, #4 + ble sgemm_kernel_L8_M2_BEGIN + +sgemm_kernel_L8_M4_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt sgemm_kernel_L8_M4_32 + + KERNEL4x8_I // do one in the K + KERNEL4x8_M2 // do another in the K + + subs counterL, counterL, #2 + ble sgemm_kernel_L8_M4_22a + .align 5 + +sgemm_kernel_L8_M4_22: + + KERNEL4x8_M1 + KERNEL4x8_M2 + + subs counterL, counterL, #1 + bgt sgemm_kernel_L8_M4_22 + +sgemm_kernel_L8_M4_22a: + + KERNEL4x8_M1 + KERNEL4x8_E + + b sgemm_kernel_L8_M4_44 + +sgemm_kernel_L8_M4_32: + + tst counterL, #1 + ble sgemm_kernel_L8_M4_40 + + KERNEL4x8_I + KERNEL4x8_E + + b sgemm_kernel_L8_M4_44 + +sgemm_kernel_L8_M4_40: + + INIT4x8 + +sgemm_kernel_L8_M4_44: + + ands counterL , origK, #1 + ble sgemm_kernel_L8_M4_100 + +sgemm_kernel_L8_M4_46: + + KERNEL4x8_SUB + +sgemm_kernel_L8_M4_100: + + SAVE4x8 + +sgemm_kernel_L8_M4_END: + +/******************************************************************************/ + +sgemm_kernel_L8_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble sgemm_kernel_L8_END + + tst counterI, #2 // counterI = counterI / 2 + ble sgemm_kernel_L8_M1_BEGIN + +sgemm_kernel_L8_M2_20: + + INIT2x8 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L8_M2_40 + +sgemm_kernel_L8_M2_22: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L8_M2_22 + + +sgemm_kernel_L8_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L8_M2_100 + +sgemm_kernel_L8_M2_42: + + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L8_M2_42 + +sgemm_kernel_L8_M2_100: + + SAVE2x8 + +sgemm_kernel_L8_M2_END: + +/******************************************************************************/ + +sgemm_kernel_L8_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble sgemm_kernel_L8_END + +sgemm_kernel_L8_M1_20: + + INIT1x8 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L8_M1_40 + +sgemm_kernel_L8_M1_22: + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L8_M1_22 + + +sgemm_kernel_L8_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L8_M1_100 + +sgemm_kernel_L8_M1_42: + + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L8_M1_42 + +sgemm_kernel_L8_M1_100: + + SAVE1x8 + +sgemm_kernel_L8_END: + lsl temp, origK, #5 // B = B + K * 4 * 8 + add origPB, origPB, temp + + subs counterJ, counterJ , #1 // j-- + bgt sgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +sgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #7 + ble sgemm_kernel_L999 + + tst counterJ , #4 + ble sgemm_kernel_L2_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #2 + + mov pA, origPA // pA = A + +/******************************************************************************/ + +sgemm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble sgemm_kernel_L4_M4_BEGIN + +sgemm_kernel_L4_M8_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt sgemm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble sgemm_kernel_L4_M8_22a + .align 5 + +sgemm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M8_22 + +sgemm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b sgemm_kernel_L4_M8_44 + +sgemm_kernel_L4_M8_32: + + tst counterL, #1 + ble sgemm_kernel_L4_M8_40 + + KERNEL8x4_I + KERNEL8x4_E + + b sgemm_kernel_L4_M8_44 + +sgemm_kernel_L4_M8_40: + + INIT8x4 + +sgemm_kernel_L4_M8_44: + + ands counterL , origK, #1 + ble sgemm_kernel_L4_M8_100 + +sgemm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +sgemm_kernel_L4_M8_100: + + SAVE8x4 + +sgemm_kernel_L4_M8_END: + subs counterI, counterI, #1 + bne sgemm_kernel_L4_M8_20 + +/******************************************************************************/ + +sgemm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble sgemm_kernel_L4_END + + tst counterI, #4 + ble sgemm_kernel_L4_M2_BEGIN + +sgemm_kernel_L4_M4_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt sgemm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble sgemm_kernel_L4_M4_22a + .align 5 + +sgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M4_22 + +sgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b sgemm_kernel_L4_M4_44 + +sgemm_kernel_L4_M4_32: + + tst counterL, #1 + ble sgemm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b sgemm_kernel_L4_M4_44 + +sgemm_kernel_L4_M4_40: + + INIT4x4 + +sgemm_kernel_L4_M4_44: + + ands counterL , origK, #1 + ble sgemm_kernel_L4_M4_100 + +sgemm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +sgemm_kernel_L4_M4_100: + + SAVE4x4 + +sgemm_kernel_L4_M4_END: + +/******************************************************************************/ + +sgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble sgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble sgemm_kernel_L4_M1_BEGIN + +sgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L4_M2_40 + +sgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M2_22 + + +sgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L4_M2_100 + +sgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M2_42 + +sgemm_kernel_L4_M2_100: + + SAVE2x4 + +sgemm_kernel_L4_M2_END: + +/******************************************************************************/ + +sgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble sgemm_kernel_L4_END + +sgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L4_M1_40 + +sgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M1_22 + + +sgemm_kernel_L4_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L4_M1_100 + +sgemm_kernel_L4_M1_42: + + 
KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M1_42 + +sgemm_kernel_L4_M1_100: + + SAVE1x4 + +sgemm_kernel_L4_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 + +/******************************************************************************/ +/******************************************************************************/ + +sgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble sgemm_kernel_L999 + + tst counterJ , #2 + ble sgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + +/******************************************************************************/ + +sgemm_kernel_L2_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI,#0 + ble sgemm_kernel_L2_M4_BEGIN + +sgemm_kernel_L2_M8_20: + + INIT8x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble sgemm_kernel_L2_M8_40 + .align 5 + +sgemm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M8_22 + + +sgemm_kernel_L2_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L2_M8_100 + +sgemm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M8_42 + +sgemm_kernel_L2_M8_100: + + SAVE8x2 + +sgemm_kernel_L2_M8_END: + + subs counterI, counterI, #1 + bgt sgemm_kernel_L2_M8_20 + +/******************************************************************************/ + +sgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble sgemm_kernel_L2_END + + tst counterI, #4 + ble sgemm_kernel_L2_M2_BEGIN + +sgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble sgemm_kernel_L2_M4_40 + .align 5 + +sgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M4_22 + + +sgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L2_M4_100 + +sgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M4_42 + +sgemm_kernel_L2_M4_100: + + SAVE4x2 + +sgemm_kernel_L2_M4_END: + +/******************************************************************************/ + +sgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble sgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble sgemm_kernel_L2_M1_BEGIN + +sgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble sgemm_kernel_L2_M2_40 + +sgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M2_22 + + +sgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L2_M2_100 + +sgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M2_42 + +sgemm_kernel_L2_M2_100: + + SAVE2x2 + +sgemm_kernel_L2_M2_END: + +/******************************************************************************/ + +sgemm_kernel_L2_M1_BEGIN: + + tst 
counterI, #1 // counterI = counterI % 2 + ble sgemm_kernel_L2_END + +sgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble sgemm_kernel_L2_M1_40 + +sgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M1_22 + + +sgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L2_M1_100 + +sgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M1_42 + +sgemm_kernel_L2_M1_100: + + SAVE1x2 + +sgemm_kernel_L2_END: + + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ +/******************************************************************************/ + +sgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble sgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + +/******************************************************************************/ + +sgemm_kernel_L1_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 + cmp counterI, #0 + ble sgemm_kernel_L1_M4_BEGIN + +sgemm_kernel_L1_M8_20: + + INIT8x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M8_40 + .align 5 + +sgemm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M8_22 + + +sgemm_kernel_L1_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M8_100 + +sgemm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M8_42 + +sgemm_kernel_L1_M8_100: + + SAVE8x1 + +sgemm_kernel_L1_M8_END: + + subs counterI, counterI, #1 + bgt sgemm_kernel_L1_M8_20 + +/******************************************************************************/ + +sgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble sgemm_kernel_L1_END + + tst counterI, #4 + ble sgemm_kernel_L1_M2_BEGIN + +sgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M4_40 + .align 5 + +sgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M4_22 + + +sgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M4_100 + +sgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M4_42 + +sgemm_kernel_L1_M4_100: + + SAVE4x1 + +sgemm_kernel_L1_M4_END: + +/******************************************************************************/ + +sgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble sgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble sgemm_kernel_L1_M1_BEGIN + +sgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M2_40 + +sgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + 
KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M2_22 + + +sgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M2_100 + +sgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M2_42 + +sgemm_kernel_L1_M2_100: + + SAVE2x1 + +sgemm_kernel_L1_M2_END: + +/******************************************************************************/ + +sgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble sgemm_kernel_L1_END + +sgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M1_40 + +sgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M1_22 + + +sgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M1_100 + +sgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M1_42 + +sgemm_kernel_L1_M1_100: + + SAVE1x1 + +sgemm_kernel_L1_END: + +/******************************************************************************/ + +sgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S new file mode 100755 index 000000000..b99760a03 --- /dev/null +++ b/kernel/arm64/strmm_kernel_16x4.S @@ -0,0 +1,2431 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 x7 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 +#define tempOffset x17 +#define tempK x18 + +#define alpha0 s10 +#define alphaV0 v10.s[0] +#define alpha1 s11 +#define alphaV1 v11.s[0] +#define alpha2 s14 +#define alphaV2 v14.s[0] +#define alpha3 s15 +#define alphaV3 v15.s[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 tempOffset +// 18 must save tempK +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_00, pA0_01, pA0_02, pA0_03 +//v01 pA0_04, pA0_05, pA0_06, pA0_07 +//v02 pA0_08, pA0_09, pA0_10, pA0_11 +//v03 pA0_12, pA0_13, pA0_14, pA0_15 +//v04 pA1_00, pA1_01, pA1_02, pA1_03 +//v05 pA1_04, pA1_05, pA1_06, pA1_07 +//v06 pA1_08, pA1_09, pA1_10, pA1_11 +//v07 pA1_12, pA1_13, pA1_14, pA1_15 +//v08 must save pB00, pB01 +//v09 must save pB02, pB03 +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save pB10, pB11 +//v13 must save pB12, pB13 +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01, C02, C03 +//v17 must save C04, C05, C06, C07 +//v18 C08, C09, C10, C11 +//v19 C12, C13, C14, C15 +//v20 C16, C17, C18, C19 +//v21 C20, C21, C22, C23 +//v22 C24, C25, C26, C27 +//v23 C28, C29, C30, C31 +//v24 C32, C33, C34, C35 +//v25 C36, C37, C38, C39 +//v26 C40, C41, C42, C43 +//v27 C44, C45, C46, C47 +//v28 C48, C49, C50, C51 +//v29 C52, C53, C54, C55 +//v30 C56, C57, C58, C59 +//v31 C60, C61, C62, C63 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT16x4 + fmov s16, wzr + fmov s17, wzr + fmov s18, s16 + fmov s19, s17 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 + fmov s24, wzr + fmov s25, s16 + fmov s26, s17 + fmov s27, s18 + fmov s28, wzr + fmov s29, s16 + fmov s30, s17 + fmov s31, s18 +.endm + +.macro KERNEL16x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 + + fmul v16.4s, v0.4s, v8.2s[0] + fmul v17.4s, v1.4s, v8.2s[0] + fmul 
v18.4s, v2.4s, v8.2s[0] + fmul v19.4s, v3.4s, v8.2s[0] + + fmul v20.4s, v0.4s, v8.2s[1] + fmul v21.4s, v1.4s, v8.2s[1] + fmul v22.4s, v2.4s, v8.2s[1] + fmul v23.4s, v3.4s, v8.2s[1] + + fmul v24.4s, v0.4s, v9.2s[0] + fmul v25.4s, v1.4s, v9.2s[0] + fmul v26.4s, v2.4s, v9.2s[0] + fmul v27.4s, v3.4s, v9.2s[0] + + fmul v28.4s, v0.4s, v9.2s[1] + fmul v29.4s, v1.4s, v9.2s[1] + fmul v30.4s, v2.4s, v9.2s[1] + fmul v31.4s, v3.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 + ld1 {v6.4s}, [pA] + add pA, pA, #16 + ld1 {v7.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL16x4_M1 + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v18.4s, v2.4s, v8.2s[0] + fmla v19.4s, v3.4s, v8.2s[0] + + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v22.4s, v2.4s, v8.2s[1] + fmla v23.4s, v3.4s, v8.2s[1] + + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v26.4s, v2.4s, v9.2s[0] + fmla v27.4s, v3.4s, v9.2s[0] + + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] + fmla v30.4s, v2.4s, v9.2s[1] + fmla v31.4s, v3.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 + ld1 {v6.4s}, [pA] + add pA, pA, #16 + ld1 {v7.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL16x4_M2 + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v18.4s, v6.4s, v12.2s[0] + fmla v19.4s, v7.4s, v12.2s[0] + + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v22.4s, v6.4s, v12.2s[1] + fmla v23.4s, v7.4s, v12.2s[1] + + fmla v24.4s, v4.4s, v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v26.4s, v6.4s, v13.2s[0] + fmla v27.4s, v7.4s, v13.2s[0] + + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] + fmla v30.4s, v6.4s, v13.2s[1] + fmla v31.4s, v7.4s, v13.2s[1] + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL16x4_E + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v18.4s, v6.4s, v12.2s[0] + fmla v19.4s, v7.4s, v12.2s[0] + + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v22.4s, v6.4s, v12.2s[1] + fmla v23.4s, v7.4s, v12.2s[1] + + fmla v24.4s, v4.4s, v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v26.4s, v6.4s, v13.2s[0] + fmla v27.4s, v7.4s, v13.2s[0] + + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] + fmla v30.4s, v6.4s, v13.2s[1] + fmla v31.4s, v7.4s, v13.2s[1] +.endm + +.macro KERNEL16x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v18.4s, v2.4s, v8.2s[0] + fmla v19.4s, v3.4s, v8.2s[0] + + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v22.4s, v2.4s, v8.2s[1] + fmla v23.4s, v3.4s, v8.2s[1] + + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v26.4s, v2.4s, v9.2s[0] + fmla v27.4s, v3.4s, v9.2s[0] + + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] + fmla v30.4s, v2.4s, v9.2s[1] + fmla v31.4s, v3.4s, v9.2s[1] +.endm + +.macro SAVE16x4 + add pCRow1, pCRow0, LDC + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + fmul v2.4s, v18.4s, alphaV2 + fmul v3.4s, 
v19.4s, alphaV3 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + fmul v6.4s, v22.4s, alphaV2 + fmul v7.4s, v23.4s, alphaV3 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v0.4s, v24.4s, alphaV0 + fmul v1.4s, v25.4s, alphaV1 + fmul v2.4s, v26.4s, alphaV2 + fmul v3.4s, v27.4s, alphaV3 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2] + + fmul v4.4s, v28.4s, alphaV0 + fmul v5.4s, v29.4s, alphaV1 + fmul v6.4s, v30.4s, alphaV2 + fmul v7.4s, v31.4s, alphaV3 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT8x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, wzr + fmov s21, s16 + fmov s24, wzr + fmov s25, s16 + fmov s28, wzr + fmov s29, s16 +.endm + +.macro KERNEL8x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmul v16.4s, v0.4s, v8.2s[0] + fmul v17.4s, v1.4s, v8.2s[0] + fmul v20.4s, v0.4s, v8.2s[1] + fmul v21.4s, v1.4s, v8.2s[1] + fmul v24.4s, v0.4s, v9.2s[0] + fmul v25.4s, v1.4s, v9.2s[0] + fmul v28.4s, v0.4s, v9.2s[1] + fmul v29.4s, v1.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_M1 + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_M2 + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v24.4s, v4.4s, v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_E + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v24.4s, v4.4s, v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] +.endm + +.macro KERNEL8x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] +.endm + +.macro SAVE8x4 + add pCRow1, pCRow0, LDC + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v0.4s, v24.4s, alphaV0 + fmul v1.4s, v25.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow2] + + fmul v4.4s, v28.4s, alphaV0 + fmul v5.4s, v29.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 
+ fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmul v16.2s, v0.2s, v8.2s[0] + fmul v29.2s, v1.2s, v9.2s[1] + + fmul v20.2s, v0.2s, v8.2s[1] + fmul v25.2s, v1.2s, v9.2s[0] + + fmul v24.2s, v0.2s, v9.2s[0] + fmul v21.2s, v1.2s, v8.2s[1] + + fmul v28.2s, v0.2s, v9.2s[1] + fmul v17.2s, v1.2s, v8.2s[0] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.2s, v5.2s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x4_M1 + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + + ld1 {v4.2s, v5.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + + prfm PLDL1KEEP, [pB, #512] + + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro KERNEL4x4_M2 + fmla v16.2s, v4.2s, v12.2s[0] + fmla v29.2s, v5.2s, v13.2s[1] + + ld1 {v8.2s, v9.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v4.2s, v12.2s[1] + fmla v25.2s, v5.2s, v13.2s[0] + + ld1 {v0.2s, v1.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v4.2s, v13.2s[0] + fmla v21.2s, v5.2s, v12.2s[1] + + prfm PLDL1KEEP, [pA, #512] + + fmla v28.2s, v4.2s, v13.2s[1] + fmla v17.2s, v5.2s, v12.2s[0] +.endm + +.macro KERNEL4x4_E + fmla v16.2s, v4.2s, v12.2s[0] + fmla v29.2s, v5.2s, v13.2s[1] + + fmla v20.2s, v4.2s, v12.2s[1] + fmla v25.2s, v5.2s, v13.2s[0] + + fmla v24.2s, v4.2s, v13.2s[0] + fmla v21.2s, v5.2s, v12.2s[1] + + fmla v28.2s, v4.2s, v13.2s[1] + fmla v17.2s, v5.2s, v12.2s[0] +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] + + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro SAVE4x4 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV2 + fmul v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2s, v24.2s, alphaV0 + fmul v9.2s, v25.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2s, v28.2s, alphaV2 + fmul v13.2s, v29.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s20, s16 + fmov s24, s20 + fmov s28, s16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.2s[0] + fmla v28.2s, v0.2s, v9.2s[1] +.endm + +.macro SAVE2x4 + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + fmul v8.2s, v24.2s, alphaV2 + st1 {v8.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + fmul v12.2s, v28.2s, alphaV3 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s20, 
s16 +.endm + +.macro KERNEL1x4_SUB + ldr s0, [pA] + add pA, pA, #4 + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + + fmla v16.2s, v8.2s, v0.s[0] + fmla v20.2s, v9.2s, v0.s[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT16x2 + fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 + fmov s20, wzr + fmov s21, s16 + fmov s22, wzr + fmov s23, s16 +.endm + +.macro KERNEL16x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v18.4s, v2.4s, v8.2s[0] + fmla v19.4s, v3.4s, v8.2s[0] + + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v22.4s, v2.4s, v8.2s[1] + fmla v23.4s, v3.4s, v8.2s[1] +.endm + +.macro SAVE16x2 + add pCRow1, pCRow0, LDC + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + fmul v2.4s, v18.4s, alphaV2 + fmul v3.4s, v19.4s, alphaV3 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + fmul v6.4s, v22.4s, alphaV2 + fmul v7.4s, v23.4s, alphaV3 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL8x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] +.endm + +.macro SAVE8x2 + add pCRow1, pCRow0, LDC + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v21.2s, v1.2s, v8.2s[1] +.endm + +.macro SAVE4x2 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV2 + fmul v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] +.endm + +.macro SAVE2x2 + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1 , pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + 
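+// Note: the macros in this file follow the usual INITmxn / KERNELmxn_SUB /
+// SAVEmxn pattern of the OpenBLAS micro-kernels -- INIT clears the accumulator
+// registers for an m x n tile, each KERNEL..._SUB adds one rank-1 update
+// acc[j][i] += A[i] * B[j] from the packed buffers, and SAVE scales the tile by
+// alpha and stores it, overwriting C (the TRMM kernels do not read back the
+// previous contents of C).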
+/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2s} , [pB] + add pB , pB, #8 + + ldr s0 , [pA] + add pA, pA, #4 + + fmla v16.2s, v8.2s, v0.2s[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT16x1 + fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 +.endm + +.macro KERNEL16x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v18.4s, v2.4s, v8.2s[0] + fmla v19.4s, v3.4s, v8.2s[0] +.endm + +.macro SAVE16x1 + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + fmul v2.4s, v18.4s, alphaV2 + fmul v3.4s, v19.4s, alphaV3 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL8x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] +.endm + +.macro SAVE8x1 + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s, v1.2s}, [pA] + add pA , pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro SAVE4x1 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr +.endm + +.macro KERNEL2x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s}, [pA] + add pA , pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] +.endm + +.macro SAVE2x1 + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr +.endm + +.macro KERNEL1x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ldr s0, [pA] + add pA , pA, #4 + + fmadd s16, s0, s8, s16 +.endm + +.macro SAVE1x1 + fmul s8, s16, alpha0 + str s8, [pCRow0] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + +strmm_kernel_begin: + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, s0 + fmov alpha1, s0 + fmov alpha2, s0 + fmov alpha3, s0 + + lsl LDC, LDC, #2 // ldc = 
ldc * 4 + +#if !defined(LEFT) + neg tempOffset, offset +#endif + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble strmm_kernel_L2_BEGIN + +/******************************************************************************/ + +strmm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = start of A array + +strmm_kernel_L4_M16_BEGIN: + + mov counterI, origM + asr counterI, counterI, #4 // counterI = counterI / 16 + cmp counterI, #0 + ble strmm_kernel_L4_M8_BEGIN + +strmm_kernel_L4_M16_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #6 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #16 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt strmm_kernel_L4_M16_32 + + KERNEL16x4_I // do one in the K + KERNEL16x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble strmm_kernel_L4_M16_22a + .align 5 + +strmm_kernel_L4_M16_22: + + KERNEL16x4_M1 + KERNEL16x4_M2 + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M16_22 + +strmm_kernel_L4_M16_22a: + + KERNEL16x4_M1 + KERNEL16x4_E + + b strmm_kernel_L4_M16_44 + +strmm_kernel_L4_M16_32: + + tst counterL, #1 + ble strmm_kernel_L4_M16_40 + + KERNEL16x4_I + KERNEL16x4_E + + b strmm_kernel_L4_M16_44 + +strmm_kernel_L4_M16_40: + + INIT16x4 + +strmm_kernel_L4_M16_44: + + ands counterL , tempK, #1 + ble strmm_kernel_L4_M16_100 + +strmm_kernel_L4_M16_46: + + KERNEL16x4_SUB + +strmm_kernel_L4_M16_100: + + SAVE16x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #16 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #16 +#endif + +strmm_kernel_L4_M16_END: + subs counterI, counterI, #1 + bne strmm_kernel_L4_M16_20 + +//------------------------------------------------------------------------------ + +strmm_kernel_L4_M8_BEGIN: + + mov counterI, origM + tst counterI , #15 + ble strmm_kernel_L4_END + + tst counterI, #8 + ble strmm_kernel_L4_M4_BEGIN + +strmm_kernel_L4_M8_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt strmm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble strmm_kernel_L4_M8_22a + .align 5 + +strmm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M8_22 + +strmm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b strmm_kernel_L4_M8_44 + +strmm_kernel_L4_M8_32: + + tst counterL, #1 + ble strmm_kernel_L4_M8_40 + + KERNEL8x4_I + KERNEL8x4_E + + b strmm_kernel_L4_M8_44 + +strmm_kernel_L4_M8_40: + + INIT8x4 + +strmm_kernel_L4_M8_44: + + ands counterL , tempK, #1 + ble strmm_kernel_L4_M8_100 + +strmm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +strmm_kernel_L4_M8_100: + + SAVE8x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +strmm_kernel_L4_M8_END: + +//------------------------------------------------------------------------------ + +strmm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble strmm_kernel_L4_END + + tst counterI, #4 + ble strmm_kernel_L4_M2_BEGIN + +strmm_kernel_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt strmm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble strmm_kernel_L4_M4_22a + .align 5 + +strmm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M4_22 + +strmm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b strmm_kernel_L4_M4_44 + +strmm_kernel_L4_M4_32: + + tst counterL, #1 + ble strmm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b strmm_kernel_L4_M4_44 + +strmm_kernel_L4_M4_40: + + INIT4x4 + +strmm_kernel_L4_M4_44: + + ands counterL , tempK, #1 + ble strmm_kernel_L4_M4_100 + +strmm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +strmm_kernel_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +strmm_kernel_L4_M4_END: + +//------------------------------------------------------------------------------ + +strmm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble strmm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble strmm_kernel_L4_M1_BEGIN + +strmm_kernel_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L4_M2_40 + +strmm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M2_22 + + +strmm_kernel_L4_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L4_M2_100 + +strmm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M2_42 + +strmm_kernel_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif +strmm_kernel_L4_M2_END: + + +strmm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble strmm_kernel_L4_END + +strmm_kernel_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L4_M1_40 + +strmm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + 
KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M1_22 + + +strmm_kernel_L4_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L4_M1_100 + +strmm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M1_42 + +strmm_kernel_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +strmm_kernel_L4_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt strmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +strmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble strmm_kernel_L999 + + tst counterJ , #2 + ble strmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +strmm_kernel_L2_M16_BEGIN: + + mov counterI, origM + asr counterI, counterI, #4 // counterI = counterI / 16 + cmp counterI,#0 + ble strmm_kernel_L2_M8_BEGIN + +strmm_kernel_L2_M16_20: + + INIT16x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #6 + add pA, pA, temp + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #16 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble strmm_kernel_L2_M16_40 + .align 5 + +strmm_kernel_L2_M16_22: + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M16_22 + + +strmm_kernel_L2_M16_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M16_100 + +strmm_kernel_L2_M16_42: + + KERNEL16x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M16_42 + +strmm_kernel_L2_M16_100: + + SAVE16x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #16 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #16 +#endif + +strmm_kernel_L2_M16_END: + + subs counterI, counterI, #1 + bgt strmm_kernel_L2_M16_20 + +//------------------------------------------------------------------------------ + +strmm_kernel_L2_M8_BEGIN: + mov counterI, origM + tst counterI , #15 + ble strmm_kernel_L2_END + + tst counterI, #8 + ble strmm_kernel_L2_M4_BEGIN + +strmm_kernel_L2_M8_20: + + INIT8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, 
tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble strmm_kernel_L2_M8_40 + .align 5 + +strmm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M8_22 + + +strmm_kernel_L2_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M8_100 + +strmm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M8_42 + +strmm_kernel_L2_M8_100: + + SAVE8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +strmm_kernel_L2_M8_END: + +//------------------------------------------------------------------------------ + +strmm_kernel_L2_M4_BEGIN: + mov counterI, origM + tst counterI , #7 + ble strmm_kernel_L2_END + + tst counterI, #4 + ble strmm_kernel_L2_M2_BEGIN + +strmm_kernel_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble strmm_kernel_L2_M4_40 + .align 5 + +strmm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M4_22 + + +strmm_kernel_L2_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M4_100 + +strmm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M4_42 + +strmm_kernel_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +strmm_kernel_L2_M4_END: + +//------------------------------------------------------------------------------ + + +strmm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble strmm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble strmm_kernel_L2_M1_BEGIN + +strmm_kernel_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble strmm_kernel_L2_M2_40 + 
+strmm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M2_22 + + +strmm_kernel_L2_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M2_100 + +strmm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M2_42 + +strmm_kernel_L2_M2_100: + + SAVE2x2 +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +strmm_kernel_L2_M2_END: + + +strmm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble strmm_kernel_L2_END + +strmm_kernel_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble strmm_kernel_L2_M1_40 + +strmm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M1_22 + + +strmm_kernel_L2_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M1_100 + +strmm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M1_42 + +strmm_kernel_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +strmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ + +strmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble strmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +strmm_kernel_L1_M16_BEGIN: + + mov counterI, origM + asr counterI, counterI, #4 // counterI = counterI / 16 + cmp counterI, #0 + ble strmm_kernel_L1_M8_BEGIN + +strmm_kernel_L1_M16_20: + + INIT16x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #6 + add pA, pA, temp + lsl temp, tempOffset, #2 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #16 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble 
strmm_kernel_L1_M16_40 + .align 5 + +strmm_kernel_L1_M16_22: + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M16_22 + + +strmm_kernel_L1_M16_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M16_100 + +strmm_kernel_L1_M16_42: + + KERNEL16x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M16_42 + +strmm_kernel_L1_M16_100: + + SAVE16x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #16 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #16 +#endif + +strmm_kernel_L1_M16_END: + + subs counterI, counterI, #1 + bgt strmm_kernel_L1_M16_20 + +//------------------------------------------------------------------------------ + +strmm_kernel_L1_M8_BEGIN: + + mov counterI, origM + tst counterI , #15 + ble strmm_kernel_L1_END + + tst counterI, #8 + ble strmm_kernel_L1_M4_BEGIN + +strmm_kernel_L1_M8_20: + + INIT8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #2 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M8_40 + .align 5 + +strmm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M8_22 + + +strmm_kernel_L1_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M8_100 + +strmm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M8_42 + +strmm_kernel_L1_M8_100: + + SAVE8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +strmm_kernel_L1_M8_END: + +//------------------------------------------------------------------------------ + +strmm_kernel_L1_M4_BEGIN: + mov counterI, origM + tst counterI , #7 + ble strmm_kernel_L1_END + + tst counterI, #4 + ble strmm_kernel_L1_M2_BEGIN + +strmm_kernel_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M4_40 + .align 5 + +strmm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB 
+ KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M4_22 + + +strmm_kernel_L1_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M4_100 + +strmm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M4_42 + +strmm_kernel_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +strmm_kernel_L1_M4_END: + +//------------------------------------------------------------------------------ + +strmm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble strmm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble strmm_kernel_L1_M1_BEGIN + +strmm_kernel_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M2_40 + +strmm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M2_22 + + +strmm_kernel_L1_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M2_100 + +strmm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M2_42 + +strmm_kernel_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif +strmm_kernel_L1_M2_END: + + +strmm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble strmm_kernel_L1_END + +strmm_kernel_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M1_40 + +strmm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M1_22 + + +strmm_kernel_L1_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M1_100 + +strmm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M1_42 + +strmm_kernel_L1_M1_100: + + SAVE1x1 + 
+strmm_kernel_L1_END: + +strmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/strmm_kernel_8x8.S b/kernel/arm64/strmm_kernel_8x8.S new file mode 100755 index 000000000..98b912934 --- /dev/null +++ b/kernel/arm64/strmm_kernel_8x8.S @@ -0,0 +1,2795 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 x7 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 +#define tempOffset x17 +#define tempK x18 + +#define alpha0 s10 +#define alphaV0 v10.s[0] +#define alpha1 s11 +#define alphaV1 v11.s[0] +#define alpha2 s14 +#define alphaV2 v14.s[0] +#define alpha3 s15 +#define alphaV3 v15.s[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 tempOffset +// 18 must save tempK +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0, pA0_1, pA0_2, pA0_3 +//v01 pA0_4, pA0_5, pA0_6, pA0_7 +//v02 pA1_0, pA1_1, pA1_2, pA1_3 +//v03 pA1_4, pA1_5, pA1_6, pA1_7 +//v04 pB0_0, pB0_1, pB0_2, pB0_3 +//v05 pB0_4, pB0_5, pB0_6, pB0_7 +//v06 pB1_0, pB1_1, pB1_2, pB1_3 +//v07 pB1_4, pB1_5, pB1_6, pB1_7 +//v08 must save +//v09 must save +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save +//v13 must save +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01, C02, C03 +//v17 must save C04, C05, C06, C07 +//v18 C08, C09, C10, C11 +//v19 C12, C13, C14, C15 +//v20 C16, C17, C18, C19 +//v21 C20, C21, C22, C23 +//v22 C24, C25, C26, C27 +//v23 C28, C29, C30, C31 +//v24 C32, C33, C34, C35 +//v25 C36, C37, C38, C39 +//v26 C40, C41, C42, C43 +//v27 C44, C45, C46, C47 +//v28 C48, C49, C50, C51 +//v29 C52, C53, C54, C55 +//v30 C56, C57, C58, C59 +//v31 C60, C61, C62, C63 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT8x8 + fmov s16, wzr + fmov s17, wzr + fmov s18, s16 + fmov s19, s17 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 + fmov s24, wzr + fmov s25, s16 + fmov s26, s17 + fmov s27, s18 + fmov s28, wzr + fmov s29, s16 + fmov s30, s17 + fmov s31, s18 +.endm + +.macro KERNEL8x8_I + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmul v16.4s, v0.4s, v4.4s[0] + fmul v17.4s, v1.4s, v4.4s[0] + fmul v18.4s, v0.4s, v4.4s[1] + fmul v19.4s, v1.4s, v4.4s[1] + fmul v20.4s, v0.4s, v4.4s[2] + fmul v21.4s, v1.4s, v4.4s[2] + fmul v22.4s, v0.4s, v4.4s[3] + fmul v23.4s, v1.4s, v4.4s[3] + fmul v24.4s, v0.4s, v5.4s[0] + fmul v25.4s, v1.4s, v5.4s[0] + fmul v26.4s, v0.4s, v5.4s[1] + fmul v27.4s, v1.4s, v5.4s[1] + fmul v28.4s, v0.4s, v5.4s[2] + fmul v29.4s, v1.4s, v5.4s[2] + fmul v30.4s, v0.4s, v5.4s[3] + fmul v31.4s, v1.4s, v5.4s[3] + + ld1 {v6.4s}, [pB] + add pB, pB, #16 + ld1 {v7.4s}, [pB] + add pB, pB, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x8_M1 + fmla v16.4s, v0.4s, v4.4s[0] + 
fmla v17.4s, v1.4s, v4.4s[0] + fmla v18.4s, v0.4s, v4.4s[1] + fmla v19.4s, v1.4s, v4.4s[1] + fmla v20.4s, v0.4s, v4.4s[2] + fmla v21.4s, v1.4s, v4.4s[2] + fmla v22.4s, v0.4s, v4.4s[3] + fmla v23.4s, v1.4s, v4.4s[3] + fmla v24.4s, v0.4s, v5.4s[0] + fmla v25.4s, v1.4s, v5.4s[0] + fmla v26.4s, v0.4s, v5.4s[1] + fmla v27.4s, v1.4s, v5.4s[1] + fmla v28.4s, v0.4s, v5.4s[2] + fmla v29.4s, v1.4s, v5.4s[2] + fmla v30.4s, v0.4s, v5.4s[3] + fmla v31.4s, v1.4s, v5.4s[3] + + ld1 {v6.4s}, [pB] + add pB, pB, #16 + ld1 {v7.4s}, [pB] + add pB, pB, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x8_M2 + fmla v16.4s, v2.4s, v6.4s[0] + fmla v17.4s, v3.4s, v6.4s[0] + fmla v18.4s, v2.4s, v6.4s[1] + fmla v19.4s, v3.4s, v6.4s[1] + fmla v20.4s, v2.4s, v6.4s[2] + fmla v21.4s, v3.4s, v6.4s[2] + fmla v22.4s, v2.4s, v6.4s[3] + fmla v23.4s, v3.4s, v6.4s[3] + fmla v24.4s, v2.4s, v7.4s[0] + fmla v25.4s, v3.4s, v7.4s[0] + fmla v26.4s, v2.4s, v7.4s[1] + fmla v27.4s, v3.4s, v7.4s[1] + fmla v28.4s, v2.4s, v7.4s[2] + fmla v29.4s, v3.4s, v7.4s[2] + fmla v30.4s, v2.4s, v7.4s[3] + fmla v31.4s, v3.4s, v7.4s[3] + + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x8_E + fmla v16.4s, v2.4s, v6.4s[0] + fmla v17.4s, v3.4s, v6.4s[0] + fmla v18.4s, v2.4s, v6.4s[1] + fmla v19.4s, v3.4s, v6.4s[1] + fmla v20.4s, v2.4s, v6.4s[2] + fmla v21.4s, v3.4s, v6.4s[2] + fmla v22.4s, v2.4s, v6.4s[3] + fmla v23.4s, v3.4s, v6.4s[3] + fmla v24.4s, v2.4s, v7.4s[0] + fmla v25.4s, v3.4s, v7.4s[0] + fmla v26.4s, v2.4s, v7.4s[1] + fmla v27.4s, v3.4s, v7.4s[1] + fmla v28.4s, v2.4s, v7.4s[2] + fmla v29.4s, v3.4s, v7.4s[2] + fmla v30.4s, v2.4s, v7.4s[3] + fmla v31.4s, v3.4s, v7.4s[3] +.endm + +.macro KERNEL8x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v4.4s[0] + fmla v17.4s, v1.4s, v4.4s[0] + fmla v18.4s, v0.4s, v4.4s[1] + fmla v19.4s, v1.4s, v4.4s[1] + fmla v20.4s, v0.4s, v4.4s[2] + fmla v21.4s, v1.4s, v4.4s[2] + fmla v22.4s, v0.4s, v4.4s[3] + fmla v23.4s, v1.4s, v4.4s[3] + fmla v24.4s, v0.4s, v5.4s[0] + fmla v25.4s, v1.4s, v5.4s[0] + fmla v26.4s, v0.4s, v5.4s[1] + fmla v27.4s, v1.4s, v5.4s[1] + fmla v28.4s, v0.4s, v5.4s[2] + fmla v29.4s, v1.4s, v5.4s[2] + fmla v30.4s, v0.4s, v5.4s[3] + fmla v31.4s, v1.4s, v5.4s[3] +.endm + +.macro SAVE8x8 + add pCRow1, pCRow0, LDC + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + fmul v2.4s, v18.4s, alphaV2 + fmul v3.4s, v19.4s, alphaV3 + st1 {v2.4s, v3.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + fmul v6.4s, v22.4s, alphaV2 + fmul v7.4s, v23.4s, alphaV3 + st1 {v6.4s, v7.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v0.4s, v24.4s, alphaV0 + fmul v1.4s, v25.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + fmul v2.4s, v26.4s, alphaV2 + fmul v3.4s, v27.4s, alphaV3 + st1 {v2.4s, v3.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v4.4s, v28.4s, alphaV0 + fmul v5.4s, v29.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow2] + + fmul v6.4s, v30.4s, alphaV2 + fmul v7.4s, v31.4s, alphaV3 + st1 {v6.4s, v7.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + 
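+// For reference, a minimal C-style sketch of what SAVE8x8 does (acc[][] stands
+// for the register block v16-v31, C is column-major with leading dimension ldc;
+// these names are illustrative only, not part of the kernel interface):
+//
+//   for (int j = 0; j < 8; j++)            /* 8 columns of the tile */
+//       for (int i = 0; i < 8; i++)        /* 8 rows of the tile    */
+//           C[j * ldc + i] = alpha * acc[j][i];   /* scale and overwrite */
+//
+// The KERNEL8x8_I/_M1/_M2/_E macros build acc[j][i] += A[i] * B[j] one k-step
+// at a time, alternating two register sets so the loads overlap the fmla work.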
+/******************************************************************************/ + + +.macro INIT4x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL4x8_I + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + + fmul v16.4s, v0.4s, v4.4s[0] + fmul v18.4s, v0.4s, v4.4s[1] + fmul v20.4s, v0.4s, v4.4s[2] + fmul v22.4s, v0.4s, v4.4s[3] + fmul v24.4s, v0.4s, v5.4s[0] + fmul v26.4s, v0.4s, v5.4s[1] + fmul v28.4s, v0.4s, v5.4s[2] + fmul v30.4s, v0.4s, v5.4s[3] + + ld1 {v6.4s}, [pB] + add pB, pB, #16 + ld1 {v7.4s}, [pB] + add pB, pB, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x8_M1 + fmla v16.4s, v0.4s, v4.4s[0] + fmla v18.4s, v0.4s, v4.4s[1] + fmla v20.4s, v0.4s, v4.4s[2] + fmla v22.4s, v0.4s, v4.4s[3] + fmla v24.4s, v0.4s, v5.4s[0] + fmla v26.4s, v0.4s, v5.4s[1] + fmla v28.4s, v0.4s, v5.4s[2] + fmla v30.4s, v0.4s, v5.4s[3] + + ld1 {v6.4s}, [pB] + add pB, pB, #16 + ld1 {v7.4s}, [pB] + add pB, pB, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x8_M2 + fmla v16.4s, v2.4s, v6.4s[0] + fmla v18.4s, v2.4s, v6.4s[1] + fmla v20.4s, v2.4s, v6.4s[2] + fmla v22.4s, v2.4s, v6.4s[3] + fmla v24.4s, v2.4s, v7.4s[0] + fmla v26.4s, v2.4s, v7.4s[1] + fmla v28.4s, v2.4s, v7.4s[2] + fmla v30.4s, v2.4s, v7.4s[3] + + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x8_E + fmla v16.4s, v2.4s, v6.4s[0] + fmla v18.4s, v2.4s, v6.4s[1] + fmla v20.4s, v2.4s, v6.4s[2] + fmla v22.4s, v2.4s, v6.4s[3] + fmla v24.4s, v2.4s, v7.4s[0] + fmla v26.4s, v2.4s, v7.4s[1] + fmla v28.4s, v2.4s, v7.4s[2] + fmla v30.4s, v2.4s, v7.4s[3] +.endm + +.macro KERNEL4x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v4.4s[0] + fmla v18.4s, v0.4s, v4.4s[1] + fmla v20.4s, v0.4s, v4.4s[2] + fmla v22.4s, v0.4s, v4.4s[3] + fmla v24.4s, v0.4s, v5.4s[0] + fmla v26.4s, v0.4s, v5.4s[1] + fmla v28.4s, v0.4s, v5.4s[2] + fmla v30.4s, v0.4s, v5.4s[3] +.endm + +.macro SAVE4x8 + add pCRow1, pCRow0, LDC + + + fmul v0.4s, v16.4s, alphaV0 + st1 {v0.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v2.4s, v18.4s, alphaV2 + st1 {v2.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.4s, v20.4s, alphaV0 + st1 {v4.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v6.4s, v22.4s, alphaV2 + st1 {v6.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v0.4s, v24.4s, alphaV0 + st1 {v0.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v2.4s, v26.4s, alphaV2 + st1 {v2.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.4s, v28.4s, alphaV0 + st1 {v4.4s}, [pCRow2] + + + fmul v6.4s, v30.4s, alphaV2 + st1 {v6.4s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL2x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v4.4s[0] + fmla v18.2s, v0.2s, v4.4s[1] + fmla v20.2s, v0.2s, v4.4s[2] + fmla v22.2s, v0.2s, v4.4s[3] + fmla v24.2s, v0.2s, v5.4s[0] + fmla v26.2s, v0.2s, v5.4s[1] + fmla v28.2s, v0.2s, v5.4s[2] + fmla v30.2s, v0.2s, v5.4s[3] +.endm + 
+.macro SAVE2x8 + add pCRow1, pCRow0, LDC + + + fmul v0.2s, v16.2s, alphaV0 + st1 {v0.2s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v2.2s, v18.2s, alphaV2 + st1 {v2.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.2s, v20.2s, alphaV0 + st1 {v4.2s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v6.2s, v22.2s, alphaV2 + st1 {v6.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v0.2s, v24.2s, alphaV0 + st1 {v0.2s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v2.2s, v26.2s, alphaV2 + st1 {v2.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.2s, v28.2s, alphaV0 + st1 {v4.2s}, [pCRow2] + + + fmul v6.2s, v30.2s, alphaV2 + st1 {v6.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL1x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ldr s0, [pA] + add pA, pA, #4 + + fmla s16, s0, v4.4s[0] + fmla s18, s0, v4.4s[1] + fmla s20, s0, v4.4s[2] + fmla s22, s0, v4.4s[3] + fmla s24, s0, v5.4s[0] + fmla s26, s0, v5.4s[1] + fmla s28, s0, v5.4s[2] + fmla s30, s0, v5.4s[3] +.endm + +.macro SAVE1x8 + add pCRow1, pCRow0, LDC + + + fmul s0, s16, alphaV0 + str s0, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul s2, s18, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul s4, s20, alphaV0 + str s4, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul s6, s22, alphaV2 + str s6, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul s0, s24, alphaV0 + str s0, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul s2, s26, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul s4, s28, alphaV0 + str s4, [pCRow2] + + + fmul s6, s30, alphaV2 + str s6, [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, wzr + fmov s21, s16 + fmov s24, wzr + fmov s25, s16 + fmov s28, wzr + fmov s29, s16 +.endm + +.macro KERNEL8x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmul v16.4s, v0.4s, v8.2s[0] + fmul v17.4s, v1.4s, v8.2s[0] + fmul v20.4s, v0.4s, v8.2s[1] + fmul v21.4s, v1.4s, v8.2s[1] + fmul v24.4s, v0.4s, v9.2s[0] + fmul v25.4s, v1.4s, v9.2s[0] + fmul v28.4s, v0.4s, v9.2s[1] + fmul v29.4s, v1.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_M1 + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_M2 + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v24.4s, v4.4s, v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_E + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, 
v5.4s, v12.2s[0] + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v24.4s, v4.4s, v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] +.endm + +.macro KERNEL8x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] +.endm + +.macro SAVE8x4 + add pCRow1, pCRow0, LDC + + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v0.4s, v24.4s, alphaV0 + fmul v1.4s, v25.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow2] + + + fmul v4.4s, v28.4s, alphaV0 + fmul v5.4s, v29.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmul v16.2s, v0.2s, v8.2s[0] + fmul v29.2s, v1.2s, v9.2s[1] + + fmul v20.2s, v0.2s, v8.2s[1] + fmul v25.2s, v1.2s, v9.2s[0] + + fmul v24.2s, v0.2s, v9.2s[0] + fmul v21.2s, v1.2s, v8.2s[1] + + fmul v28.2s, v0.2s, v9.2s[1] + fmul v17.2s, v1.2s, v8.2s[0] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.2s, v5.2s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x4_M1 + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + + ld1 {v4.2s, v5.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + + prfm PLDL1KEEP, [pB, #512] + + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro KERNEL4x4_M2 + fmla v16.2s, v4.2s, v12.2s[0] + fmla v29.2s, v5.2s, v13.2s[1] + + ld1 {v8.2s, v9.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v4.2s, v12.2s[1] + fmla v25.2s, v5.2s, v13.2s[0] + + ld1 {v0.2s, v1.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v4.2s, v13.2s[0] + fmla v21.2s, v5.2s, v12.2s[1] + + prfm PLDL1KEEP, [pA, #512] + + fmla v28.2s, v4.2s, v13.2s[1] + fmla v17.2s, v5.2s, v12.2s[0] +.endm + +.macro KERNEL4x4_E + fmla v16.2s, v4.2s, v12.2s[0] + fmla v29.2s, v5.2s, v13.2s[1] + + fmla v20.2s, v4.2s, v12.2s[1] + fmla v25.2s, v5.2s, v13.2s[0] + + fmla v24.2s, v4.2s, v13.2s[0] + fmla v21.2s, v5.2s, v12.2s[1] + + fmla v28.2s, v4.2s, v13.2s[1] + fmla v17.2s, v5.2s, v12.2s[0] +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] + + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro SAVE4x4 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + + 
fmul v12.2s, v20.2s, alphaV2 + fmul v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2s, v24.2s, alphaV0 + fmul v9.2s, v25.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2s, v28.2s, alphaV2 + fmul v13.2s, v29.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s20, s16 + fmov s24, s20 + fmov s28, s16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.2s[0] + fmla v28.2s, v0.2s, v9.2s[1] +.endm + +.macro SAVE2x4 + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2s, v24.2s, alphaV2 + st1 {v8.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2s, v28.2s, alphaV3 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL1x4_SUB + ldr s0, [pA] + add pA, pA, #4 + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + + fmla v16.2s, v8.2s, v0.s[0] + fmla v20.2s, v9.2s, v0.s[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL8x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] +.endm + +.macro SAVE8x2 + add pCRow1, pCRow0, LDC + + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v21.2s, v1.2s, v8.2s[1] +.endm + +.macro SAVE4x2 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV2 + fmul v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] +.endm + +.macro SAVE2x2 + + fmul v8.2s, v16.2s, 
alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1 , pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2s} , [pB] + add pB , pB, #8 + + ldr s0 , [pA] + add pA, pA, #4 + + fmla v16.2s, v8.2s, v0.2s[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL8x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] +.endm + +.macro SAVE8x1 + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s, v1.2s}, [pA] + add pA , pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro SAVE4x1 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr +.endm + +.macro KERNEL2x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s}, [pA] + add pA , pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] +.endm + +.macro SAVE2x1 + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr +.endm + +.macro KERNEL1x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ldr s0, [pA] + add pA , pA, #4 + + fmadd s16, s0, s8, s16 +.endm + +.macro SAVE1x1 + + fmul s8, s16, alpha0 + str s8, [pCRow0] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + +strmm_kernel_begin: + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, s0 + fmov alpha1, s0 + fmov alpha2, s0 + fmov alpha3, s0 + + lsl LDC, LDC, #2 // ldc = ldc * 4 + +#if !defined(LEFT) + neg tempOffset, offset +#endif + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble strmm_kernel_L4_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +strmm_kernel_L8_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #3 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + 
+/******************************************************************************/ + +strmm_kernel_L8_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble strmm_kernel_L8_M4_BEGIN + +strmm_kernel_L8_M8_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt strmm_kernel_L8_M8_32 + + KERNEL8x8_I // do one in the K + KERNEL8x8_M2 // do another in the K + + subs counterL, counterL, #2 + ble strmm_kernel_L8_M8_22a + .align 5 + +strmm_kernel_L8_M8_22: + + KERNEL8x8_M1 + KERNEL8x8_M2 + + subs counterL, counterL, #1 + bgt strmm_kernel_L8_M8_22 + +strmm_kernel_L8_M8_22a: + + KERNEL8x8_M1 + KERNEL8x8_E + + b strmm_kernel_L8_M8_44 + +strmm_kernel_L8_M8_32: + + tst counterL, #1 + ble strmm_kernel_L8_M8_40 + + KERNEL8x8_I + KERNEL8x8_E + + b strmm_kernel_L8_M8_44 + +strmm_kernel_L8_M8_40: + + INIT8x8 + +strmm_kernel_L8_M8_44: + + ands counterL , tempK, #1 + ble strmm_kernel_L8_M8_100 + +strmm_kernel_L8_M8_46: + + KERNEL8x8_SUB + +strmm_kernel_L8_M8_100: + + SAVE8x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +strmm_kernel_L8_M8_END: + subs counterI, counterI, #1 + bne strmm_kernel_L8_M8_20 + +/******************************************************************************/ + +strmm_kernel_L8_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble strmm_kernel_L8_END + + tst counterI, #4 + ble strmm_kernel_L8_M2_BEGIN + +strmm_kernel_L8_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt strmm_kernel_L8_M4_32 + + KERNEL4x8_I // do one in the K + KERNEL4x8_M2 // do another in the K + + subs counterL, counterL, #2 + ble strmm_kernel_L8_M4_22a + .align 5 + +strmm_kernel_L8_M4_22: + + KERNEL4x8_M1 + KERNEL4x8_M2 + + subs counterL, counterL, #1 + bgt strmm_kernel_L8_M4_22 + +strmm_kernel_L8_M4_22a: + + KERNEL4x8_M1 + KERNEL4x8_E + + b strmm_kernel_L8_M4_44 + +strmm_kernel_L8_M4_32: + + tst counterL, #1 + ble strmm_kernel_L8_M4_40 + + KERNEL4x8_I + KERNEL4x8_E + + b strmm_kernel_L8_M4_44 + +strmm_kernel_L8_M4_40: + + INIT4x8 + +strmm_kernel_L8_M4_44: + + ands counterL , tempK, #1 + ble strmm_kernel_L8_M4_100 + +strmm_kernel_L8_M4_46: + + KERNEL4x8_SUB + +strmm_kernel_L8_M4_100: + + SAVE4x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +strmm_kernel_L8_M4_END: + +/******************************************************************************/ + +strmm_kernel_L8_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble strmm_kernel_L8_END + + tst counterI, #2 // counterI = counterI / 2 + ble strmm_kernel_L8_M1_BEGIN + +strmm_kernel_L8_M2_20: + + INIT2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L8_M2_40 + +strmm_kernel_L8_M2_22: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L8_M2_22 + + +strmm_kernel_L8_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L8_M2_100 + +strmm_kernel_L8_M2_42: + + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L8_M2_42 + +strmm_kernel_L8_M2_100: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +strmm_kernel_L8_M2_END: + +/******************************************************************************/ + +strmm_kernel_L8_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble strmm_kernel_L8_END + +strmm_kernel_L8_M1_20: + + INIT1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L8_M1_40 + +strmm_kernel_L8_M1_22: + 
KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L8_M1_22 + + +strmm_kernel_L8_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L8_M1_100 + +strmm_kernel_L8_M1_42: + + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L8_M1_42 + +strmm_kernel_L8_M1_100: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +strmm_kernel_L8_END: + lsl temp, origK, #5 // B = B + K * 4 * 8 + add origPB, origPB, temp + +#if !defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt strmm_kernel_L8_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +strmm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #7 + ble strmm_kernel_L999 + + tst counterJ , #4 + ble strmm_kernel_L2_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #2 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +/******************************************************************************/ + +strmm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble strmm_kernel_L4_M4_BEGIN + +strmm_kernel_L4_M8_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt strmm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble strmm_kernel_L4_M8_22a + .align 5 + +strmm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M8_22 + +strmm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b strmm_kernel_L4_M8_44 + +strmm_kernel_L4_M8_32: + + tst counterL, #1 + ble strmm_kernel_L4_M8_40 + + KERNEL8x4_I + KERNEL8x4_E + + b strmm_kernel_L4_M8_44 + +strmm_kernel_L4_M8_40: + + INIT8x4 + +strmm_kernel_L4_M8_44: + + ands counterL , tempK, #1 + ble strmm_kernel_L4_M8_100 + +strmm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +strmm_kernel_L4_M8_100: + + SAVE8x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif +strmm_kernel_L4_M8_END: + subs counterI, counterI, #1 + bne strmm_kernel_L4_M8_20 + +/******************************************************************************/ + +strmm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble strmm_kernel_L4_END + + tst counterI, #4 + ble strmm_kernel_L4_M2_BEGIN + +strmm_kernel_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt strmm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble strmm_kernel_L4_M4_22a + .align 5 + +strmm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M4_22 + +strmm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b strmm_kernel_L4_M4_44 + +strmm_kernel_L4_M4_32: + + tst counterL, #1 + ble strmm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b strmm_kernel_L4_M4_44 + +strmm_kernel_L4_M4_40: + + INIT4x4 + +strmm_kernel_L4_M4_44: + + ands counterL , tempK, #1 + ble strmm_kernel_L4_M4_100 + +strmm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +strmm_kernel_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +strmm_kernel_L4_M4_END: + +/******************************************************************************/ + +strmm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble strmm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble strmm_kernel_L4_M1_BEGIN + +strmm_kernel_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L4_M2_40 + +strmm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M2_22 + + +strmm_kernel_L4_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L4_M2_100 + +strmm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M2_42 + +strmm_kernel_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif +strmm_kernel_L4_M2_END: + +/******************************************************************************/ + +strmm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble strmm_kernel_L4_END + +strmm_kernel_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L4_M1_40 + +strmm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + 
KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M1_22 + + +strmm_kernel_L4_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L4_M1_100 + +strmm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M1_42 + +strmm_kernel_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +strmm_kernel_L4_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +/******************************************************************************/ +/******************************************************************************/ + +strmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble strmm_kernel_L999 + + tst counterJ , #2 + ble strmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +/******************************************************************************/ + +strmm_kernel_L2_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI,#0 + ble strmm_kernel_L2_M4_BEGIN + +strmm_kernel_L2_M8_20: + + INIT8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble strmm_kernel_L2_M8_40 + .align 5 + +strmm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M8_22 + + +strmm_kernel_L2_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M8_100 + +strmm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M8_42 + +strmm_kernel_L2_M8_100: + + SAVE8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif +strmm_kernel_L2_M8_END: + + subs counterI, counterI, #1 + bgt strmm_kernel_L2_M8_20 + +/******************************************************************************/ + +strmm_kernel_L2_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble strmm_kernel_L2_END + + tst counterI, #4 + ble strmm_kernel_L2_M2_BEGIN + +strmm_kernel_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl 
temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble strmm_kernel_L2_M4_40 + .align 5 + +strmm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M4_22 + + +strmm_kernel_L2_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M4_100 + +strmm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M4_42 + +strmm_kernel_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +strmm_kernel_L2_M4_END: + +/******************************************************************************/ + +strmm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble strmm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble strmm_kernel_L2_M1_BEGIN + +strmm_kernel_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble strmm_kernel_L2_M2_40 + +strmm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M2_22 + + +strmm_kernel_L2_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M2_100 + +strmm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M2_42 + +strmm_kernel_L2_M2_100: + + SAVE2x2 +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +strmm_kernel_L2_M2_END: + +/******************************************************************************/ + +strmm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble strmm_kernel_L2_END + +strmm_kernel_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, 
#3 // counterL = counterL / 8 + cmp counterL, #0 + ble strmm_kernel_L2_M1_40 + +strmm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M1_22 + + +strmm_kernel_L2_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M1_100 + +strmm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M1_42 + +strmm_kernel_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +strmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ +/******************************************************************************/ + +strmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble strmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +/******************************************************************************/ + +strmm_kernel_L1_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 + cmp counterI, #0 + ble strmm_kernel_L1_M4_BEGIN + +strmm_kernel_L1_M8_20: + + INIT8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #2 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M8_40 + .align 5 + +strmm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M8_22 + + +strmm_kernel_L1_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M8_100 + +strmm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M8_42 + +strmm_kernel_L1_M8_100: + + SAVE8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif +strmm_kernel_L1_M8_END: + + subs counterI, counterI, #1 + bgt strmm_kernel_L1_M8_20 + +/******************************************************************************/ + +strmm_kernel_L1_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble strmm_kernel_L1_END + + tst counterI, #4 + ble strmm_kernel_L1_M2_BEGIN + +strmm_kernel_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, 
origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M4_40 + .align 5 + +strmm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M4_22 + + +strmm_kernel_L1_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M4_100 + +strmm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M4_42 + +strmm_kernel_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +strmm_kernel_L1_M4_END: + +/******************************************************************************/ + +strmm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble strmm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble strmm_kernel_L1_M1_BEGIN + +strmm_kernel_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M2_40 + +strmm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M2_22 + + +strmm_kernel_L1_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M2_100 + +strmm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M2_42 + +strmm_kernel_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif +strmm_kernel_L1_M2_END: + +/******************************************************************************/ + +strmm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble strmm_kernel_L1_END + +strmm_kernel_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 
+#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M1_40 + +strmm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M1_22 + + +strmm_kernel_L1_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M1_100 + +strmm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M1_42 + +strmm_kernel_L1_M1_100: + + SAVE1x1 + +strmm_kernel_L1_END: + +/******************************************************************************/ + +strmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + From 5ac02f6dc76d1e30d407b0c4ae6ae4efda41b6e3 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Mon, 14 Mar 2016 19:35:23 +0530 Subject: [PATCH 36/37] Optimize Dgemm 4x4 for Cortex A57 --- kernel/arm64/dgemm_kernel_4x4.S | 439 ++++++++++++++++++-------------- 1 file changed, 254 insertions(+), 185 deletions(-) diff --git a/kernel/arm64/dgemm_kernel_4x4.S b/kernel/arm64/dgemm_kernel_4x4.S index e88253af1..e2ad11492 100644 --- a/kernel/arm64/dgemm_kernel_4x4.S +++ b/kernel/arm64/dgemm_kernel_4x4.S @@ -46,21 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 -#define ppC x16 -#define ppCRow0 x17 -#define ppCRow1 x18 -#define ppCRow2 x19 -#define ppA x20 +#define pCRow3 x15 +#define pA x16 +#define ppC x17 +#define ppCRow0 x18 +#define ppCRow1 x19 +#define ppCRow2 x20 +#define ppCRow3 x21 +#define ppA x22 +#define alpha x23 #define alpha0 d10 #define alphaV0 v10.d[0] -#define alpha1 d11 -#define alphaV1 v11.d[0] -#define alpha2 d14 -#define alphaV2 v14.d[0] -#define alpha3 d15 -#define alphaV3 v15.d[0] + +#define A_PRE_SIZE 1024 +#define B_PRE_SIZE 1024 +#define C_PRE_SIZE 128 // 00 origM // 01 origN @@ -77,15 +78,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 -// 15 pA -// 16 ppC -// 17 ppCRow0 -// 18 must save ppCRow1 -// 19 must save ppCRow2 -// 20 must save ppA -// 21 must save -// 22 must save -// 23 must save +// 15 pCRow3 +// 16 pA +// 17 ppC +// 18 must save ppCRow0 +// 19 must save ppCRow1 +// 20 must save ppCRow2 +// 21 must save ppCRow3 +// 22 must save ppA +// 23 must save alpha // 24 must save // 25 must save // 26 must save @@ -106,11 +107,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v08 must save pB00, pB01 //v09 must save pB02, pB03 //v10 must save ALPHA0 -//v11 must save ALPHA1 +//v11 must save //v12 must save pB10, pB11 //v13 must save pB12, pB13 -//v14 must save ALPHA2 -//v15 must save ALPHA3 +//v14 must save +//v15 must save //v16 must save C00, C01 //v17 must save C02, C03 //v18 ppC00, ppC01 @@ -152,222 +153,254 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_I - ld1 {v8.2d, v9.2d}, [pB] - add pB, pB, #32 - ld1 {v0.2d, v1.2d}, [pA] + ldp d8, d9, [pB] + add pB, pB, #16 + ldp d10, d11, [pB] + add pB, pB, #16 + + ldp q0, q1, [pA] add pA, pA, #32 fmul v16.2d, v0.2d, v8.2d[0] - fmul v29.2d, v1.2d, v9.2d[1] + fmul v29.2d, v1.2d, v11.2d[0] - ld1 {v2.2d, v3.2d}, [ppA] + ldp q2, q3, [ppA] add ppA, ppA, #32 - fmul v20.2d, v0.2d, v8.2d[1] - fmul v25.2d, v1.2d, v9.2d[0] + fmul v20.2d, v0.2d, v9.2d[0] + fmul v25.2d, v1.2d, v10.2d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmul v18.2d, v2.2d, v8.2d[0] - fmul v31.2d, v3.2d, v9.2d[1] - fmul v22.2d, v2.2d, v8.2d[1] - fmul v27.2d, v3.2d, v9.2d[0] + fmul v31.2d, v3.2d, v11.2d[0] - ld1 {v12.2d, v13.2d}, [pB] // for next round - add pB, pB, #32 + prfm PLDL1KEEP, [ppA, #A_PRE_SIZE] - fmul v24.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v8.2d[1] + fmul v22.2d, v2.2d, v9.2d[0] + fmul v27.2d, v3.2d, v10.2d[0] - ld1 {v4.2d, v5.2d} , [pA] // for next round + ldp d12, d13, [pB] + add pB, pB, #16 + + fmul v24.2d, v0.2d, v10.2d[0] + fmul v21.2d, v1.2d, v9.2d[0] + + ldp q4, q5, [pA] // for next round add pA, pA, #32 - fmul v26.2d, v2.2d, v9.2d[0] - fmul v23.2d, v3.2d, v8.2d[1] + fmul v26.2d, v2.2d, v10.2d[0] + fmul v23.2d, v3.2d, v9.2d[0] - ld1 {v6.2d, v7.2d} , [ppA] // for next round + ldp q6, q7, [ppA] // for next round add ppA, ppA, #32 - fmul v28.2d, v0.2d, v9.2d[1] + fmul v28.2d, v0.2d, v11.2d[0] fmul v17.2d, v1.2d, v8.2d[0] - fmul v30.2d, v2.2d, v9.2d[1] + + ldp d14, d15, [pB] + add pB, pB, #16 + + fmul v30.2d, v2.2d, v11.2d[0] fmul v19.2d, v3.2d, v8.2d[0] .endm .macro KERNEL8x4_M2 fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v29.2d, v5.2d, v15.2d[0] - ld1 {v8.2d, v9.2d}, [pB] - add pB, pB, #32 + ldp d8, d9, [pB] + add pB, pB, #16 fmla v18.2d, v6.2d, v12.2d[0] - fmla v31.2d, v7.2d, v13.2d[1] - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v31.2d, v7.2d, v15.2d[0] - prfm PLDL1KEEP, [pB, #512] + ldp d10, d11, [pB] + add pB, pB, #16 - fmla v22.2d, v6.2d, v12.2d[1] - fmla v27.2d, v7.2d, v13.2d[0] - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v20.2d, v4.2d, v13.2d[0] + fmla v25.2d, v5.2d, v14.2d[0] - ld1 {v0.2d, v1.2d}, [pA] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + fmla v22.2d, v6.2d, v13.2d[0] + fmla v27.2d, v7.2d, v14.2d[0] + fmla v24.2d, v4.2d, v14.2d[0] + fmla v21.2d, v5.2d, v13.2d[0] + + ldp q0, q1, [pA] add pA, pA, #32 - fmla v26.2d, v6.2d, v13.2d[0] - fmla v23.2d, v7.2d, v12.2d[1] - fmla v28.2d, v4.2d, v13.2d[1] + fmla v26.2d, v6.2d, v14.2d[0] + fmla v23.2d, v7.2d, v13.2d[0] + fmla v28.2d, v4.2d, v15.2d[0] fmla v17.2d, v5.2d, v12.2d[0] - ld1 {v2.2d, v3.2d}, [ppA] + ldp q2, q3, [ppA] add ppA, ppA, #32 - fmla v30.2d, v6.2d, v13.2d[1] + fmla v30.2d, v6.2d, v15.2d[0] fmla v19.2d, v7.2d, v12.2d[0] .endm .macro KERNEL8x4_M1 fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v29.2d, v1.2d, v11.2d[0] - ld1 {v12.2d, v13.2d}, [pB] // for next round - add pB, pB, #32 + ldp d12, d13, [pB] + add pB, pB, #16 fmla v18.2d, v2.2d, v8.2d[0] - fmla v31.2d, v3.2d, v9.2d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v31.2d, v3.2d, v11.2d[0] - prfm PLDL1KEEP, [pA, #512] + ldp d14, d15, [pB] + add pB, pB, #16 - fmla v22.2d, v2.2d, v8.2d[1] - fmla v27.2d, v3.2d, v9.2d[0] + fmla v20.2d, v0.2d, v9.2d[0] + fmla v25.2d, v1.2d, v10.2d[0] - prfm PLDL1KEEP, [ppA, #512] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v22.2d, v2.2d, v9.2d[0] + fmla v27.2d, v3.2d, 
v10.2d[0] - ld1 {v4.2d, v5.2d} , [pA] // for next round + prfm PLDL1KEEP, [ppA, #A_PRE_SIZE] + + fmla v24.2d, v0.2d, v10.2d[0] + fmla v21.2d, v1.2d, v9.2d[0] + + ldp q4, q5, [pA] add pA, pA, #32 - fmla v26.2d, v2.2d, v9.2d[0] - fmla v23.2d, v3.2d, v8.2d[1] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v26.2d, v2.2d, v10.2d[0] + fmla v23.2d, v3.2d, v9.2d[0] + + fmla v28.2d, v0.2d, v11.2d[0] fmla v17.2d, v1.2d, v8.2d[0] - ld1 {v6.2d, v7.2d} , [ppA] // for next round + ldp q6, q7, [ppA] add ppA, ppA, #32 - fmla v30.2d, v2.2d, v9.2d[1] + fmla v30.2d, v2.2d, v11.2d[0] fmla v19.2d, v3.2d, v8.2d[0] .endm .macro KERNEL8x4_E fmla v16.2d, v4.2d, v12.2d[0] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v25.2d, v5.2d, v14.2d[0] fmla v18.2d, v6.2d, v12.2d[0] - fmla v27.2d, v7.2d, v13.2d[0] + fmla v27.2d, v7.2d, v14.2d[0] - fmla v20.2d, v4.2d, v12.2d[1] - fmla v29.2d, v5.2d, v13.2d[1] - fmla v22.2d, v6.2d, v12.2d[1] - fmla v31.2d, v7.2d, v13.2d[1] + fmla v20.2d, v4.2d, v13.2d[0] + fmla v29.2d, v5.2d, v15.2d[0] + fmla v22.2d, v6.2d, v13.2d[0] + fmla v31.2d, v7.2d, v15.2d[0] - fmla v24.2d, v4.2d, v13.2d[0] + fmla v24.2d, v4.2d, v14.2d[0] fmla v17.2d, v5.2d, v12.2d[0] - fmla v26.2d, v6.2d, v13.2d[0] + fmla v26.2d, v6.2d, v14.2d[0] fmla v19.2d, v7.2d, v12.2d[0] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v21.2d, v5.2d, v12.2d[1] - fmla v30.2d, v6.2d, v13.2d[1] - fmla v23.2d, v7.2d, v12.2d[1] + fmla v28.2d, v4.2d, v15.2d[0] + fmla v21.2d, v5.2d, v13.2d[0] + fmla v30.2d, v6.2d, v15.2d[0] + fmla v23.2d, v7.2d, v13.2d[0] .endm .macro KERNEL8x4_SUB - ld1 {v8.2d, v9.2d}, [pB] - add pB, pB, #32 - ld1 {v0.2d, v1.2d}, [pA] + ldp d8, d9, [pB] + add pB, pB, #16 + ldp d10, d11, [pB] + add pB, pB, #16 + ldp q0, q1, [pA] add pA, pA, #32 fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v29.2d, v1.2d, v11.2d[0] + fmla v20.2d, v0.2d, v9.2d[0] + fmla v25.2d, v1.2d, v10.2d[0] - ld1 {v2.2d, v3.2d}, [ppA] + ldp q2, q3, [ppA] add ppA, ppA, #32 - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v24.2d, v0.2d, v10.2d[0] + fmla v21.2d, v1.2d, v9.2d[0] + fmla v28.2d, v0.2d, v11.2d[0] fmla v17.2d, v1.2d, v8.2d[0] fmla v18.2d, v2.2d, v8.2d[0] - fmla v31.2d, v3.2d, v9.2d[1] - fmla v22.2d, v2.2d, v8.2d[1] - fmla v27.2d, v3.2d, v9.2d[0] + fmla v31.2d, v3.2d, v11.2d[0] + fmla v22.2d, v2.2d, v9.2d[0] + fmla v27.2d, v3.2d, v10.2d[0] - fmla v26.2d, v2.2d, v9.2d[0] - fmla v23.2d, v3.2d, v8.2d[1] - fmla v30.2d, v2.2d, v9.2d[1] + fmla v26.2d, v2.2d, v10.2d[0] + fmla v23.2d, v3.2d, v9.2d[0] + fmla v30.2d, v2.2d, v11.2d[0] fmla v19.2d, v3.2d, v8.2d[0] .endm .macro SAVE8x4 + fmov alpha0, alpha + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add ppCRow0, pCRow0, #32 - ld1 {v0.2d, v1.2d}, [pCRow0] + ldp q0, q1, [pCRow0] fmla v0.2d, v16.2d, alphaV0 - fmla v1.2d, v17.2d, alphaV1 - st1 {v0.2d, v1.2d}, [pCRow0] - - ld1 {v2.2d, v3.2d}, [ppCRow0] - fmla v2.2d, v18.2d, alphaV2 - fmla v3.2d, v19.2d, alphaV3 - st1 {v2.2d, v3.2d}, [ppCRow0] - - add pCRow1, pCRow0, LDC - add ppCRow1, ppCRow0, LDC - - ld1 {v4.2d, v5.2d}, [pCRow1] - fmla v4.2d, v20.2d, alphaV0 - fmla v5.2d, v21.2d, alphaV1 - st1 {v4.2d, v5.2d}, [pCRow1] - - ld1 {v6.2d, v7.2d}, [ppCRow1] - fmla v6.2d, v22.2d, alphaV2 - fmla v7.2d, v23.2d, alphaV3 - st1 {v6.2d, v7.2d}, [ppCRow1] - - add pCRow2, pCRow1, LDC - add ppCRow2, ppCRow1, LDC - - ld1 {v0.2d, v1.2d}, [pCRow2] - fmla v0.2d, v24.2d, alphaV0 - fmla v1.2d, v25.2d, alphaV1 - st1 {v0.2d, v1.2d}, [pCRow2] - - ld1 {v2.2d, v3.2d}, [ppCRow2] - fmla 
v2.2d, v26.2d, alphaV2 - fmla v3.2d, v27.2d, alphaV3 - st1 {v2.2d, v3.2d}, [ppCRow2] - - add pCRow1, pCRow2, LDC - add ppCRow1, ppCRow2, LDC - - ld1 {v4.2d, v5.2d}, [pCRow1] - fmla v4.2d, v28.2d, alphaV0 - fmla v5.2d, v29.2d, alphaV1 - st1 {v4.2d, v5.2d}, [pCRow1] - - ld1 {v6.2d, v7.2d}, [ppCRow1] - fmla v6.2d, v30.2d, alphaV2 - fmla v7.2d, v31.2d, alphaV3 - st1 {v6.2d, v7.2d}, [ppCRow1] + fmla v1.2d, v17.2d, alphaV0 + stp q0, q1, [pCRow0] add pCRow0, pCRow0, #64 + + ldp q2, q3, [ppCRow0] + fmla v2.2d, v18.2d, alphaV0 + fmla v3.2d, v19.2d, alphaV0 + stp q2, q3, [ppCRow0] + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add ppCRow1, pCRow1, #32 + + ldp q4, q5, [pCRow1] + fmla v4.2d, v20.2d, alphaV0 + fmla v5.2d, v21.2d, alphaV0 + stp q4, q5, [pCRow1] + + add pCRow1, pCRow1, #64 + + ldp q6, q7, [ppCRow1] + fmla v6.2d, v22.2d, alphaV0 + fmla v7.2d, v23.2d, alphaV0 + stp q6, q7, [ppCRow1] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + add ppCRow2, pCRow2, #32 + + ldp q0, q1, [pCRow2] + fmla v0.2d, v24.2d, alphaV0 + fmla v1.2d, v25.2d, alphaV0 + stp q0, q1, [pCRow2] + + add pCRow2, pCRow2, #64 + + ldp q2, q3, [ppCRow2] + fmla v2.2d, v26.2d, alphaV0 + fmla v3.2d, v27.2d, alphaV0 + stp q2, q3, [ppCRow2] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + add ppCRow3, pCRow3, #32 + + ldp q4, q5, [pCRow3] + fmla v4.2d, v28.2d, alphaV0 + fmla v5.2d, v29.2d, alphaV0 + stp q4, q5, [pCRow3] + + add pCRow3, pCRow3, #64 + + ldp q6, q7, [ppCRow3] + fmla v6.2d, v30.2d, alphaV0 + fmla v7.2d, v31.2d, alphaV0 + stp q6, q7, [ppCRow3] .endm /******************************************************************************/ @@ -403,30 +436,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x4 + fmov alpha0, alpha + ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 - fmla v9.2d, v17.2d, alphaV1 + fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2d, v13.2d}, [pCRow1] - fmla v12.2d, v20.2d, alphaV2 - fmla v13.2d, v21.2d, alphaV3 + fmla v12.2d, v20.2d, alphaV0 + fmla v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow2, pCRow1, LDC ld1 {v8.2d, v9.2d}, [pCRow2] fmla v8.2d, v24.2d, alphaV0 - fmla v9.2d, v25.2d, alphaV1 + fmla v9.2d, v25.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow2] add pCRow1, pCRow2, LDC ld1 {v12.2d, v13.2d}, [pCRow1] - fmla v12.2d, v28.2d, alphaV2 - fmla v13.2d, v29.2d, alphaV3 + fmla v12.2d, v28.2d, alphaV0 + fmla v13.2d, v29.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -454,6 +489,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x4 + fmov alpha0, alpha + ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] @@ -461,19 +498,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pCRow1, pCRow0, LDC ld1 {v12.2d}, [pCRow1] - fmla v12.2d, v20.2d, alphaV1 + fmla v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow2, pCRow1, LDC ld1 {v8.2d}, [pCRow2] - fmla v8.2d, v24.2d, alphaV2 + fmla v8.2d, v24.2d, alphaV0 st1 {v8.2d}, [pCRow2] add pCRow1, pCRow2, LDC ld1 {v12.2d}, [pCRow1] - fmla v12.2d, v28.2d, alphaV3 + fmla v12.2d, v28.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 @@ -498,6 +535,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 + fmov alpha0, alpha + add pCRow1, pCRow0, LDC ld1 {v8.d}[0], [pCRow0] @@ -511,7 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
     ld1 {v12.d}[0], [pCRow2]
     ld1 {v12.d}[1], [pCRow1]
-    fmla v12.2d, v20.2d, alphaV1
+    fmla v12.2d, v20.2d, alphaV0
     st1 {v12.d}[0], [pCRow2]
     st1 {v12.d}[1], [pCRow1]
@@ -540,16 +579,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 .macro SAVE4x2
+    fmov alpha0, alpha
+
     ld1 {v8.2d, v9.2d}, [pCRow0]
     fmla v8.2d, v16.2d, alphaV0
-    fmla v9.2d, v17.2d, alphaV1
+    fmla v9.2d, v17.2d, alphaV0
     st1 {v8.2d, v9.2d}, [pCRow0]
     add pCRow1, pCRow0, LDC
     ld1 {v12.2d, v13.2d}, [pCRow1]
-    fmla v12.2d, v20.2d, alphaV2
-    fmla v13.2d, v21.2d, alphaV3
+    fmla v12.2d, v20.2d, alphaV0
+    fmla v13.2d, v21.2d, alphaV0
     st1 {v12.2d, v13.2d}, [pCRow1]
     add pCRow0, pCRow0, #32
@@ -574,6 +615,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 .macro SAVE2x2
+    fmov alpha0, alpha
+
     ld1 {v8.2d}, [pCRow0]
     fmla v8.2d, v16.2d, alphaV0
     st1 {v8.2d}, [pCRow0]
@@ -581,7 +624,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     add pCRow1 , pCRow0, LDC
     ld1 {v12.2d}, [pCRow1]
-    fmla v12.2d, v20.2d, alphaV1
+    fmla v12.2d, v20.2d, alphaV0
     st1 {v12.2d}, [pCRow1]
     add pCRow0, pCRow0, #16
@@ -604,6 +647,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 .macro SAVE1x2
+    fmov alpha0, alpha
+
     add pCRow1 , pCRow0, LDC
     ld1 {v8.d}[0], [pCRow0]
@@ -634,9 +679,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 .macro SAVE4x1
+    fmov alpha0, alpha
+
     ld1 {v8.2d, v9.2d}, [pCRow0]
     fmla v8.2d, v16.2d, alphaV0
-    fmla v9.2d, v17.2d, alphaV1
+    fmla v9.2d, v17.2d, alphaV0
     st1 {v8.2d, v9.2d}, [pCRow0]
     add pCRow0, pCRow0, #32
@@ -662,6 +709,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 .macro SAVE2x1
+    fmov alpha0, alpha
+
     ld1 {v8.2d}, [pCRow0]
     fmla v8.2d, v16.2d, alphaV0
     st1 {v8.2d}, [pCRow0]
@@ -686,6 +735,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 .macro SAVE1x1
+    fmov alpha0, alpha
+
     ldr d8, [pCRow0]
     fmadd d8, d16, alpha0, d8
     str d8, [pCRow0]
@@ -713,10 +764,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     stp x26, x27, [sp, #(9 * 16)]
     str x28, [sp, #(10 * 16)]
-    fmov alpha0, d0
-    fmov alpha1, d0
-    fmov alpha2, d0
-    fmov alpha3, d0
+    fmov alpha, d0
+    prfm PLDL1KEEP, [origPA]
+    prfm PLDL1KEEP, [origPB]
     lsl LDC, LDC, #3  // ldc = ldc * 8
@@ -728,12 +778,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     ble dgemm_kernel_L2_BEGIN
 dgemm_kernel_L4_BEGIN:
-    mov pCRow0, pC  // pCRow0 = C
-    add pC, pC, LDC, lsl #2
+    mov pCRow0, pC
+    add pCRow1, pCRow0, LDC
+    add pCRow2, pCRow1, LDC
+    add pCRow3, pCRow2, LDC
+    add pC, pCRow3, LDC
     lsl temp, origK, #5  // k * 4 * 8
     mov pA, origPA  // pA = start of A array
     add ppA, temp, pA
+    prfm PLDL1KEEP, [ppA]
 //------------------------------------------------------------------------------
@@ -744,43 +798,51 @@ dgemm_kernel_L4_M8_BEGIN:
     cmp counterI, #0
     ble dgemm_kernel_L4_M4_BEGIN
+    .align 5
 dgemm_kernel_L4_M8_20:
     mov pB, origPB
-    asr counterL , origK, #1  // L = K / 2
-    cmp counterL , #2  // is there at least 4 to do?
+    asr counterL , origK, #2  // L = K / 4
+    cmp counterL , #2
     blt dgemm_kernel_L4_M8_32
-    KERNEL8x4_I  // do one in the K
-    KERNEL8x4_M2  // do another in the K
+    KERNEL8x4_I
+    KERNEL8x4_M2
+    KERNEL8x4_M1
+    KERNEL8x4_M2
     subs counterL, counterL, #2  // subtract 2
     ble dgemm_kernel_L4_M8_22a
+    .align 5
-
 dgemm_kernel_L4_M8_22:
-
+    KERNEL8x4_M1
+    KERNEL8x4_M2
     KERNEL8x4_M1
     KERNEL8x4_M2
     subs counterL, counterL, #1
     bgt dgemm_kernel_L4_M8_22
-
+    .align 5
 dgemm_kernel_L4_M8_22a:
+    KERNEL8x4_M1
+    KERNEL8x4_M2
     KERNEL8x4_M1
     KERNEL8x4_E
     b dgemm_kernel_L4_M8_44
+    .align 5
 dgemm_kernel_L4_M8_32:
     tst counterL, #1
     ble dgemm_kernel_L4_M8_40
     KERNEL8x4_I
-
+    KERNEL8x4_M2
+    KERNEL8x4_M1
     KERNEL8x4_E
     b dgemm_kernel_L4_M8_44
@@ -792,14 +854,22 @@ dgemm_kernel_L4_M8_40:
 dgemm_kernel_L4_M8_44:
-    ands counterL , origK, #1
+    ands counterL , origK, #3
     ble dgemm_kernel_L4_M8_100
+    .align 5
 dgemm_kernel_L4_M8_46:
     KERNEL8x4_SUB
+    subs counterL, counterL, #1
+    bne dgemm_kernel_L4_M8_46
+
 dgemm_kernel_L4_M8_100:
+    lsl temp, origK, #5
+    prfm PLDL1KEEP, [pA, temp]
+    prfm PLDL1KEEP, [ppA, temp]
+    prfm PLDL1KEEP, [origPB]
     SAVE8x4
@@ -810,7 +880,6 @@ dgemm_kernel_L4_M8_END:
     subs counterI, counterI, #1
     bne dgemm_kernel_L4_M8_20
-
 dgemm_kernel_L4_M4_BEGIN:
     mov counterI, origM
     tst counterI , #7

From cf8c7e28b34e97f4511710f87952397a53a5d9b7 Mon Sep 17 00:00:00 2001
From: Ashwin Sekhar T K
Date: Mon, 14 Mar 2016 19:59:41 +0530
Subject: [PATCH 37/37] Update CONTRIBUTORS.md

---
 CONTRIBUTORS.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index df92cf4ef..da56c0758 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -141,5 +141,11 @@ In chronological order:
 * Martin Koehler
   * [2015-09-07] Improved imatcopy
+* Ashwin Sekhar T K
+  * [2015-11-09] Assembly kernels for Cortex-A57 (ARMv8)
+  * [2015-11-20] lapack-test fixes for Cortex-A57
+  * [2016-03-14] Additional functional Assembly Kernels for Cortex-A57
+  * [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57
+
 * [Your name or handle] <[email or website]>
   * [Date] [Brief summary of your changes]