diff --git a/.travis.yml b/.travis.yml
index 990bed864..63b469716 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -25,6 +25,7 @@ before_install:
- if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi
script:
+ - set -e
- make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C test DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C ctest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index df92cf4ef..da56c0758 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -141,5 +141,11 @@ In chronological order:
* Martin Koehler
* [2015-09-07] Improved imatcopy
+* Ashwin Sekhar T K
+ * [2015-11-09] Assembly kernels for Cortex-A57 (ARMv8)
+ * [2015-11-20] lapack-test fixes for Cortex-A57
+ * [2016-03-14] Additional functional Assembly Kernels for Cortex-A57
+ * [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57
+
* [Your name or handle] <[email or website]>
* [Date] [Brief summary of your changes]
diff --git a/Makefile b/Makefile
index 5aa10b2c3..9ba2bffb3 100644
--- a/Makefile
+++ b/Makefile
@@ -83,20 +83,20 @@ shared :
ifndef NO_SHARED
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
@$(MAKE) -C exports so
- @-ln -fs $(LIBSONAME) $(LIBPREFIX).so
- @-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
+ @ln -fs $(LIBSONAME) $(LIBPREFIX).so
+ @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), FreeBSD)
@$(MAKE) -C exports so
- @-ln -fs $(LIBSONAME) $(LIBPREFIX).so
+ @ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), NetBSD)
@$(MAKE) -C exports so
- @-ln -fs $(LIBSONAME) $(LIBPREFIX).so
+ @ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), Darwin)
@$(MAKE) -C exports dyn
- @-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
+ @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
endif
ifeq ($(OSNAME), WINNT)
@$(MAKE) -C exports dll
diff --git a/USAGE.md b/USAGE.md
new file mode 100644
index 000000000..c76ceb324
--- /dev/null
+++ b/USAGE.md
@@ -0,0 +1,199 @@
+# Notes on OpenBLAS usage
+## Usage
+
+#### Program is Terminated. Because you tried to allocate too many memory regions
+
+In OpenBLAS, we manage a pool of memory buffers and allocate the number of
+buffers as follows.
+```
+#define NUM_BUFFERS (MAX_CPU_NUMBER * 2)
+```
+This error indicates that the program requested more buffers than are available.
+
+Please build OpenBLAS with a larger `NUM_THREADS`, for example `make
+NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`,
+`MAX_CPU_NUMBER` is set to `NUM_THREADS`.
+
+#### How can I use OpenBLAS in multi-threaded applications?
+
+If your application is already multi-threaded, it will conflict with OpenBLAS
+multi-threading. Thus, you must restrict OpenBLAS to a single thread in one of
+the following ways:
+
+* Set the environment variable `OPENBLAS_NUM_THREADS=1`, e.g. `export OPENBLAS_NUM_THREADS=1`.
+* Call `openblas_set_num_threads(1)` in the application at runtime (see the sketch below).
+* Build the single-threaded version of OpenBLAS, e.g. `make USE_THREAD=0`.
+
+If the application is parallelized with OpenMP, please use an OpenBLAS build
+with `USE_OPENMP=1`.
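+
+For illustration, a minimal sketch of the second option (the prototype is
+taken from the text above; it may also be provided by `cblas.h`):
+
+```
+/* Declared by OpenBLAS; shown here explicitly for a self-contained example. */
+extern void openblas_set_num_threads(int num_threads);
+
+int main(void)
+{
+    /* Restrict OpenBLAS to a single thread before creating our own threads. */
+    openblas_set_num_threads(1);
+
+    /* ... create application threads that call BLAS routines here ... */
+
+    return 0;
+}
+```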
+
+#### How to choose TARGET manually at runtime when compiled with DYNAMIC_ARCH
+
+The environment variable that controls kernel selection is
+`OPENBLAS_CORETYPE` (see `driver/others/dynamic.c`), e.g. `export
+OPENBLAS_CORETYPE=Haswell`. The function `char* openblas_get_corename()`
+returns the target that is actually in use.
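+
+A minimal sketch of querying the selected core at runtime (the prototype is
+taken from the text above; it may also be provided by `cblas.h`):
+
+```
+#include <stdio.h>
+
+/* Declared by OpenBLAS; shown here explicitly for a self-contained example. */
+extern char* openblas_get_corename(void);
+
+int main(void)
+{
+    /* Print the kernel target chosen at runtime (DYNAMIC_ARCH builds). */
+    printf("OpenBLAS core: %s\n", openblas_get_corename());
+    return 0;
+}
+```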
+
+#### How can I disable OpenBLAS thread affinity at runtime?
+
+You can define the `OPENBLAS_MAIN_FREE` or `GOTOBLAS_MAIN_FREE` environment
+variable to disable thread affinity at runtime. For example, before running
+your program:
+```
+export OPENBLAS_MAIN_FREE=1
+```
+
+Alternatively, you can disable the affinity feature at build time by enabling
+`NO_AFFINITY=1` in `Makefile.rule`.
+
+## Linking with the library
+
+* Link with the shared library
+
+`gcc -o test test.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas`
+
+If the library is multithreaded, please add `-lpthread`. If the library
+contains LAPACK functions, please add `-lgfortran` or the appropriate Fortran
+runtime library.
+
+* Link with the static library
+
+`gcc -o test test.c /your/path/libopenblas.a`
+
+You can download `test.c` from https://gist.github.com/xianyi/5780018
+
+On Linux, if OpenBLAS was compiled with threading support (`USE_THREAD=1` by
+default), custom programs statically linked against `libopenblas.a` should also
+link with the pthread library, e.g.:
+
+```
+gcc -static -I/opt/OpenBLAS/include -L/opt/OpenBLAS/lib -o my_program my_program.c -lopenblas -lpthread
+```
+
+Failing to add the `-lpthread` flag will cause errors such as:
+
+```
+/opt/OpenBLAS/libopenblas.a(memory.o): In function `_touch_memory':
+memory.c:(.text+0x15): undefined reference to `pthread_mutex_lock'
+memory.c:(.text+0x41): undefined reference to `pthread_mutex_unlock'
+...
+```
+
+## Code examples
+
+#### Call CBLAS interface
+This example shows how to call `cblas_dgemm` from C: https://gist.github.com/xianyi/6930656
+```
+#include <cblas.h>
+#include <stdio.h>
+
+int main()
+{
+ int i=0;
+ double A[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};
+ double B[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};
+ double C[9] = {.5,.5,.5,.5,.5,.5,.5,.5,.5};
+ cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, 3, 3, 2, 1, A, 3, B, 3, 2, C, 3);
+
+ for(i=0; i<9; i++)
+ printf("%lf ", C[i]);
+ printf("\n");
+}
+```
+`gcc -o test_cblas_open test_cblas_dgemm.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas -lpthread -lgfortran`
+
+#### Call BLAS Fortran interface
+
+This example shows how to call the Fortran `dgemm` interface from C: https://gist.github.com/xianyi/5780018
+
+```
+#include "stdio.h"
+#include "stdlib.h"
+#include "sys/time.h"
+#include "time.h"
+
+extern void dgemm_(char*, char*, int*, int*,int*, double*, double*, int*, double*, int*, double*, double*, int*);
+
+int main(int argc, char* argv[])
+{
+ int i;
+ printf("test!\n");
+ if(argc<4){
+ printf("Input Error\n");
+ return 1;
+ }
+
+ int m = atoi(argv[1]);
+ int n = atoi(argv[2]);
+ int k = atoi(argv[3]);
+ int sizeofa = m * k;
+ int sizeofb = k * n;
+ int sizeofc = m * n;
+ char ta = 'N';
+ char tb = 'N';
+ double alpha = 1.2;
+ double beta = 0.001;
+
+ struct timeval start,finish;
+ double duration;
+
+ double* A = (double*)malloc(sizeof(double) * sizeofa);
+ double* B = (double*)malloc(sizeof(double) * sizeofb);
+ double* C = (double*)malloc(sizeof(double) * sizeofc);
+
+ srand((unsigned)time(NULL));
+
+ for (i=0; i<sizeofa; i++) A[i] = ((double)rand()/(double)RAND_MAX) - 0.5;
+ for (i=0; i<sizeofb; i++) B[i] = ((double)rand()/(double)RAND_MAX) - 0.5;
+ for (i=0; i<sizeofc; i++) C[i] = ((double)rand()/(double)RAND_MAX) - 0.5;
+
+ gettimeofday(&start, NULL);
+ dgemm_(&ta, &tb, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m);
+ gettimeofday(&finish, NULL);
+
+ duration = ((double)(finish.tv_sec-start.tv_sec)*1000000 + (double)(finish.tv_usec-start.tv_usec)) / 1000000;
+ printf("dgemm: m=%d n=%d k=%d time=%lf s\n", m, n, k, duration);
+
+ free(A);
+ free(B);
+ free(C);
+ return 0;
+}
+```
+
+## Troubleshooting
+* Please read the [FAQ](https://github.com/xianyi/OpenBLAS/wiki/Faq) first.
+* Please use gcc version 4.6 or above to compile the Sandy Bridge AVX kernels on Linux/MinGW/BSD.
+* Please use Clang version 3.1 or above to compile the library on the Sandy Bridge microarchitecture. Clang 3.0 will generate wrong AVX binary code.
+* The number of CPUs/cores should be less than or equal to 256. On Linux x86_64 (amd64), there is experimental support for up to 1024 CPUs/cores and 128 NUMA nodes if you build the library with `BIGNUMA=1`.
+* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting out the line `NO_AFFINITY=1` in `Makefile.rule`. However, this may cause [a conflict with R's parallel package](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
+* On Loongson 3A, `make test` may fail with a `pthread_create` error (error code EAGAIN). However, the same test case passes when run from the shell.
+
+## BLAS reference manual
+If you want to understand every BLAS function and definition, please read the
+[Intel MKL reference manual](https://software.intel.com/sites/products/documentation/doclib/iss/2013/mkl/mklman/GUID-F7ED9FB8-6663-4F44-A62B-61B63C4F0491.htm)
+or the BLAS documentation at [netlib.org](http://netlib.org/blas/).
+
+Here are the [OpenBLAS extension functions](https://github.com/xianyi/OpenBLAS/wiki/OpenBLAS-Extensions).
+
+## How to reference OpenBLAS
+
+You can reference our [papers](https://github.com/xianyi/OpenBLAS/wiki/publications).
+
+Alternatively, you can cite the OpenBLAS homepage http://www.openblas.net directly.
+
diff --git a/benchmark/Makefile b/benchmark/Makefile
index bcf3da2cc..11d3c5bec 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -2134,7 +2134,7 @@ zgemm3m.$(SUFFIX) : gemm3m.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
smallscaling: smallscaling.c ../$(LIBNAME)
- $(CC) $(CFLAGS) -lpthread -fopenmp -lm -o $(@F) $^
+ $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm
clean ::
@rm -f *.goto *.mkl *.acml *.atlas *.veclib
diff --git a/benchmark/smallscaling.c b/benchmark/smallscaling.c
index daed8f3da..9068c61b1 100644
--- a/benchmark/smallscaling.c
+++ b/benchmark/smallscaling.c
@@ -23,28 +23,32 @@ typedef struct {
void * s_create_matrix(int size) {
float * r = malloc(size * sizeof(double));
- for(int i = 0; i < size; i++)
+ int i;
+ for(i = 0; i < size; i++)
r[i] = 1e3 * i / size;
return r;
}
void * c_create_matrix(int size) {
float * r = malloc(size * 2 * sizeof(double));
- for(int i = 0; i < 2 * size; i++)
+ int i;
+ for(i = 0; i < 2 * size; i++)
r[i] = 1e3 * i / size;
return r;
}
void * z_create_matrix(int size) {
double * r = malloc(size * 2 * sizeof(double));
- for(int i = 0; i < 2 * size; i++)
+ int i;
+ for(i = 0; i < 2 * size; i++)
r[i] = 1e3 * i / size;
return r;
}
void * d_create_matrix(int size) {
double * r = malloc(size * sizeof(double));
- for(int i = 0; i < size; i++)
+ int i;
+ for(i = 0; i < size; i++)
r[i] = 1e3 * i / size;
return r;
}
@@ -188,4 +192,5 @@ int main(int argc, char * argv[]) {
size *= inc_factor;
}
}
+ return(0);
}
diff --git a/common.h b/common.h
index 6b65c37d1..e045e42b2 100644
--- a/common.h
+++ b/common.h
@@ -332,12 +332,13 @@ typedef int blasint;
#endif
#endif
-
+/*
#ifdef PILEDRIVER
#ifndef YIELDING
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
#endif
#endif
+*/
/*
#ifdef STEAMROLLER
diff --git a/common_power.h b/common_power.h
index ab331b04a..052d38828 100644
--- a/common_power.h
+++ b/common_power.h
@@ -236,7 +236,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define HAVE_PREFETCH
#endif
-#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL)
+#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8)
#define DCBT_ARG 0
#else
#define DCBT_ARG 8
@@ -258,6 +258,13 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define L1_PREFETCH dcbtst
#endif
+#if defined(POWER8)
+#define L1_DUALFETCH
+#define L1_PREFETCHSIZE (16 + 128 * 100)
+#define L1_PREFETCH dcbtst
+#endif
+
+#
#ifndef L1_PREFETCH
#define L1_PREFETCH dcbt
#endif
@@ -790,6 +797,8 @@ Lmcount$lazy_ptr:
#define BUFFER_SIZE ( 2 << 20)
#elif defined(PPC440FP2)
#define BUFFER_SIZE ( 16 << 20)
+#elif defined(POWER8)
+#define BUFFER_SIZE ( 64 << 20)
#else
#define BUFFER_SIZE ( 16 << 20)
#endif
diff --git a/common_x86_64.h b/common_x86_64.h
index da9afc0e4..11937b415 100644
--- a/common_x86_64.h
+++ b/common_x86_64.h
@@ -396,7 +396,7 @@ REALNAME:
#define PROFCODE
-#define EPILOGUE .end REALNAME
+#define EPILOGUE .end
#endif
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI)
diff --git a/cpuid_power.c b/cpuid_power.c
index 366c6ed08..951204ae9 100644
--- a/cpuid_power.c
+++ b/cpuid_power.c
@@ -55,6 +55,7 @@
#define CPUTYPE_POWER6 5
#define CPUTYPE_CELL 6
#define CPUTYPE_PPCG4 7
+#define CPUTYPE_POWER8 8
char *cpuname[] = {
"UNKNOWN",
@@ -65,6 +66,7 @@ char *cpuname[] = {
"POWER6",
"CELL",
"PPCG4",
+ "POWER8"
};
char *lowercpuname[] = {
@@ -76,6 +78,7 @@ char *lowercpuname[] = {
"power6",
"cell",
"ppcg4",
+ "power8"
};
char *corename[] = {
@@ -87,6 +90,7 @@ char *corename[] = {
"POWER6",
"CELL",
"PPCG4",
+ "POWER8"
};
int detect(void){
@@ -115,7 +119,7 @@ int detect(void){
if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5;
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
- if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER6;
+ if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
diff --git a/ctest/cin2 b/ctest/cin2
index 032fcbb39..b2e1e4a0e 100644
--- a/ctest/cin2
+++ b/ctest/cin2
@@ -1,7 +1,7 @@
'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-F LOGICAL FLAG, T TO STOP ON FAILURES.
+T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED)
16.0 THRESHOLD VALUE OF TEST RATIO
diff --git a/ctest/cin3 b/ctest/cin3
index 223d165db..fbdb57857 100644
--- a/ctest/cin3
+++ b/ctest/cin3
@@ -1,7 +1,7 @@
'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-F LOGICAL FLAG, T TO STOP ON FAILURES.
+T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO
diff --git a/ctest/cin3_3m b/ctest/cin3_3m
index 34014143e..5a797291a 100644
--- a/ctest/cin3_3m
+++ b/ctest/cin3_3m
@@ -1,7 +1,7 @@
'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-F LOGICAL FLAG, T TO STOP ON FAILURES.
+T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO
diff --git a/ctest/din2 b/ctest/din2
index 6f42b2792..df8f7b6ae 100644
--- a/ctest/din2
+++ b/ctest/din2
@@ -1,7 +1,7 @@
'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-F LOGICAL FLAG, T TO STOP ON FAILURES.
+T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO
diff --git a/ctest/din3 b/ctest/din3
index cbbcc22ab..23fedfe32 100644
--- a/ctest/din3
+++ b/ctest/din3
@@ -1,7 +1,7 @@
'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-F LOGICAL FLAG, T TO STOP ON FAILURES.
+T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO
diff --git a/ctest/sin2 b/ctest/sin2
index 3eee5c2f9..0e1ecd9d6 100644
--- a/ctest/sin2
+++ b/ctest/sin2
@@ -1,7 +1,7 @@
'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-F LOGICAL FLAG, T TO STOP ON FAILURES.
+T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED)
16.0 THRESHOLD VALUE OF TEST RATIO
diff --git a/ctest/sin3 b/ctest/sin3
index 01e32d6ee..644083f22 100644
--- a/ctest/sin3
+++ b/ctest/sin3
@@ -1,7 +1,7 @@
'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-F LOGICAL FLAG, T TO STOP ON FAILURES.
+T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO
diff --git a/ctest/zin2 b/ctest/zin2
index 4c0affe92..217697191 100644
--- a/ctest/zin2
+++ b/ctest/zin2
@@ -1,7 +1,7 @@
'ZBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-F LOGICAL FLAG, T TO STOP ON FAILURES.
+T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED)
16.0 THRESHOLD VALUE OF TEST RATIO
diff --git a/ctest/zin3 b/ctest/zin3
index 70050b693..ee269e8d5 100644
--- a/ctest/zin3
+++ b/ctest/zin3
@@ -1,7 +1,7 @@
'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-F LOGICAL FLAG, T TO STOP ON FAILURES.
+T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO
diff --git a/ctest/zin3_3m b/ctest/zin3_3m
index 33bf08353..a0d4fde0a 100644
--- a/ctest/zin3_3m
+++ b/ctest/zin3_3m
@@ -1,7 +1,7 @@
'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-F LOGICAL FLAG, T TO STOP ON FAILURES.
+T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO
diff --git a/driver/level2/ztrmv_L.c b/driver/level2/ztrmv_L.c
index 92c86aec2..2d5fb7802 100644
--- a/driver/level2/ztrmv_L.c
+++ b/driver/level2/ztrmv_L.c
@@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
if (incb != 1) {
B = buffer;
- gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
+ gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
COPY_K(m, b, incb, buffer, 1);
}
diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt
index b2af55e36..b361f2a97 100644
--- a/driver/others/CMakeLists.txt
+++ b/driver/others/CMakeLists.txt
@@ -33,6 +33,7 @@ set(COMMON_SOURCES
xerbla.c
openblas_set_num_threads.c
openblas_error_handle.c
+ openblas_env.c
openblas_get_num_procs.c
openblas_get_num_threads.c
)
diff --git a/driver/others/Makefile b/driver/others/Makefile
index ed145cee8..e61ba7bc8 100644
--- a/driver/others/Makefile
+++ b/driver/others/Makefile
@@ -1,7 +1,7 @@
TOPDIR = ../..
include ../../Makefile.system
-COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX)
+COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) openblas_env.$(SUFFIX)
#COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
@@ -118,6 +118,9 @@ openblas_get_parallel.$(SUFFIX) : openblas_get_parallel.c
openblas_error_handle.$(SUFFIX) : openblas_error_handle.c
$(CC) $(CFLAGS) -c $< -o $(@F)
+openblas_env.$(SUFFIX) : openblas_env.c
+ $(CC) $(CFLAGS) -c $< -o $(@F)
+
blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
$(CC) $(CFLAGS) -c $< -o $(@F)
diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c
index c3bf80173..42cadf4b5 100644
--- a/driver/others/blas_server.c
+++ b/driver/others/blas_server.c
@@ -92,6 +92,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#endif
+extern unsigned int openblas_thread_timeout();
+
#ifdef SMP_SERVER
#undef MONITOR
@@ -524,6 +526,7 @@ static int blas_monitor(void *arg){
int blas_thread_init(void){
BLASLONG i;
int ret;
+ int thread_timeout_env;
#ifdef NEED_STACKATTR
pthread_attr_t attr;
#endif
@@ -540,22 +543,12 @@ int blas_thread_init(void){
if (!blas_server_avail){
- env_var_t p;
-
- if (readenv(p,"THREAD_TIMEOUT")) {
- thread_timeout = atoi(p);
- if (thread_timeout < 4) thread_timeout = 4;
- if (thread_timeout > 30) thread_timeout = 30;
- thread_timeout = (1 << thread_timeout);
- }else{
- if (readenv(p,"GOTO_THREAD_TIMEOUT")) {
- thread_timeout = atoi(p);
- if (thread_timeout < 4) thread_timeout = 4;
- if (thread_timeout > 30) thread_timeout = 30;
- thread_timeout = (1 << thread_timeout);
- }
- }
-
+ thread_timeout_env=openblas_thread_timeout();
+ if (thread_timeout_env>0) {
+ if (thread_timeout_env < 4) thread_timeout_env = 4;
+ if (thread_timeout_env > 30) thread_timeout_env = 30;
+ thread_timeout = (1 << thread_timeout_env);
+ }
for(i = 0; i < blas_num_threads - 1; i++){
diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index a2b7c7045..2fde07fcc 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -391,7 +391,7 @@ static char *corename[] = {
"Nehalem",
"Athlon",
"Opteron",
- "Opteron(SSE3)",
+ "Opteron_SSE3",
"Barcelona",
"Nano",
"Sandybridge",
diff --git a/driver/others/memory.c b/driver/others/memory.c
index e0761d784..e64781740 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -294,8 +294,11 @@ void openblas_fork_handler()
#endif
}
+extern int openblas_num_threads_env();
+extern int openblas_goto_num_threads_env();
+extern int openblas_omp_num_threads_env();
+
int blas_get_cpu_number(void){
- env_var_t p;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
int max_num;
#endif
@@ -310,18 +313,18 @@ int blas_get_cpu_number(void){
blas_goto_num = 0;
#ifndef USE_OPENMP
- if (readenv(p,"OPENBLAS_NUM_THREADS")) blas_goto_num = atoi(p);
+ blas_goto_num=openblas_num_threads_env();
if (blas_goto_num < 0) blas_goto_num = 0;
if (blas_goto_num == 0) {
- if (readenv(p,"GOTO_NUM_THREADS")) blas_goto_num = atoi(p);
- if (blas_goto_num < 0) blas_goto_num = 0;
+ blas_goto_num=openblas_goto_num_threads_env();
+ if (blas_goto_num < 0) blas_goto_num = 0;
}
#endif
blas_omp_num = 0;
- if (readenv(p,"OMP_NUM_THREADS")) blas_omp_num = atoi(p);
+ blas_omp_num=openblas_omp_num_threads_env();
if (blas_omp_num < 0) blas_omp_num = 0;
if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
@@ -1340,6 +1343,7 @@ static void gotoblas_memory_init(void) {
/* Initialization for all function; this function should be called before main */
static int gotoblas_initialized = 0;
+extern void openblas_read_env();
void CONSTRUCTOR gotoblas_init(void) {
@@ -1349,6 +1353,8 @@ void CONSTRUCTOR gotoblas_init(void) {
openblas_fork_handler();
#endif
+ openblas_read_env();
+
#ifdef PROFILE
moncontrol (0);
#endif
diff --git a/driver/others/openblas_env.c b/driver/others/openblas_env.c
new file mode 100644
index 000000000..64ece9515
--- /dev/null
+++ b/driver/others/openblas_env.c
@@ -0,0 +1,84 @@
+/***************************************************************************
+Copyright (c) 2011-2016, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ 3. Neither the name of the OpenBLAS project nor the names of
+ its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*****************************************************************************/
+
+#include "common.h"
+
+static int openblas_env_verbose=0;
+static unsigned int openblas_env_thread_timeout=0;
+static int openblas_env_block_factor=0;
+static int openblas_env_openblas_num_threads=0;
+static int openblas_env_goto_num_threads=0;
+static int openblas_env_omp_num_threads=0;
+
+int openblas_verbose() { return openblas_env_verbose;}
+unsigned int openblas_thread_timeout() { return openblas_env_thread_timeout;}
+int openblas_block_factor() { return openblas_env_block_factor;}
+int openblas_num_threads_env() { return openblas_env_openblas_num_threads;}
+int openblas_goto_num_threads_env() { return openblas_env_goto_num_threads;}
+int openblas_omp_num_threads_env() { return openblas_env_omp_num_threads;}
+
+void openblas_read_env() {
+ int ret=0;
+ env_var_t p;
+ if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p);
+ if(ret<0) ret=0;
+ openblas_env_verbose=ret;
+
+ ret=0;
+ if (readenv(p,"OPENBLAS_BLOCK_FACTOR")) ret = atoi(p);
+ if(ret<0) ret=0;
+ openblas_env_block_factor=ret;
+
+ ret=0;
+ if (readenv(p,"OPENBLAS_THREAD_TIMEOUT")) ret = atoi(p);
+ if(ret<0) ret=0;
+ openblas_env_thread_timeout=(unsigned int)ret;
+
+ ret=0;
+ if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p);
+ if(ret<0) ret=0;
+ openblas_env_openblas_num_threads=ret;
+
+ ret=0;
+ if (readenv(p,"GOTO_NUM_THREADS")) ret = atoi(p);
+ if(ret<0) ret=0;
+ openblas_env_goto_num_threads=ret;
+
+ ret=0;
+ if (readenv(p,"OMP_NUM_THREADS")) ret = atoi(p);
+ if(ret<0) ret=0;
+ openblas_env_omp_num_threads=ret;
+
+}
+
+
diff --git a/driver/others/openblas_error_handle.c b/driver/others/openblas_error_handle.c
index f32a54452..9ac72c15d 100644
--- a/driver/others/openblas_error_handle.c
+++ b/driver/others/openblas_error_handle.c
@@ -33,13 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
-int openblas_verbose() {
- int ret=0;
- env_var_t p;
- if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p);
- if(ret<0) ret=0;
- return ret;
-}
+extern int openblas_verbose();
void openblas_warning(int verbose, const char * msg) {
int current_verbose;
diff --git a/driver/others/parameter.c b/driver/others/parameter.c
index d741f2fb9..f4b1a80ad 100644
--- a/driver/others/parameter.c
+++ b/driver/others/parameter.c
@@ -40,6 +40,7 @@
#include
#include "common.h"
+extern int openblas_block_factor();
int get_L2_size(void);
#define DEFAULT_GEMM_P 128
@@ -249,7 +250,6 @@ int get_L2_size(void){
void blas_set_parameter(void){
- env_var_t p;
int factor;
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER)
int size = 16;
@@ -468,9 +468,8 @@ void blas_set_parameter(void){
#endif
#endif
-
- if (readenv(p,"GOTO_BLOCK_FACTOR")) {
- factor = atoi(p);
+ factor=openblas_block_factor();
+ if (factor>0) {
if (factor < 10) factor = 10;
if (factor > 200) factor = 200;
diff --git a/getarch.c b/getarch.c
index fb80a4c9b..f9c49e663 100644
--- a/getarch.c
+++ b/getarch.c
@@ -552,7 +552,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "POWER5"
#endif
-#if defined(FORCE_POWER6) || defined(FORCE_POWER7) || defined(FORCE_POWER8)
+#if defined(FORCE_POWER6) || defined(FORCE_POWER7)
#define FORCE
#define ARCHITECTURE "POWER"
#define SUBARCHITECTURE "POWER6"
@@ -565,6 +565,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "POWER6"
#endif
+#if defined(FORCE_POWER8)
+#define FORCE
+#define ARCHITECTURE "POWER"
+#define SUBARCHITECTURE "POWER8"
+#define SUBDIRNAME "power"
+#define ARCHCONFIG "-DPOWER8 " \
+ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=128 " \
+ "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \
+ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
+#define LIBNAME "power8"
+#define CORENAME "POWER8"
+#endif
+
+
#ifdef FORCE_PPCG4
#define FORCE
#define ARCHITECTURE "POWER"
diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3
index 63e675b8d..8e6827424 100644
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -36,6 +36,11 @@ ifeq ($(CORE), HASWELL)
USE_TRMM = 1
endif
+ifeq ($(CORE), POWER8)
+USE_TRMM = 1
+endif
+
+
SKERNELOBJS += \
diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57
index 7c8eeeea7..64666f05b 100644
--- a/kernel/arm64/KERNEL.CORTEXA57
+++ b/kernel/arm64/KERNEL.CORTEXA57
@@ -60,32 +60,55 @@ DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
-STRMMKERNEL = strmm_kernel_4x4.S
-DTRMMKERNEL = dtrmm_kernel_4x4.S
-CTRMMKERNEL = ctrmm_kernel_4x4.S
-ZTRMMKERNEL = ztrmm_kernel_4x4.S
-
-SGEMMKERNEL = sgemm_kernel_4x4.S
-SGEMMONCOPY = ../generic/gemm_ncopy_4.c
-SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
+STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
+ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
+SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+SGEMMINCOPYOBJ = sgemm_incopy.o
+SGEMMITCOPYOBJ = sgemm_itcopy.o
+endif
+SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
-DGEMMKERNEL = dgemm_kernel_4x4.S
-DGEMMONCOPY = ../generic/gemm_ncopy_4.c
-DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
+DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
+DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
+DGEMMINCOPYOBJ = dgemm_incopy.o
+DGEMMITCOPYOBJ = dgemm_itcopy.o
+endif
+DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
+DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
-CGEMMKERNEL = cgemm_kernel_4x4.S
-CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
-CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
+CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
+CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
+ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
+CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
+CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
+CGEMMINCOPYOBJ = cgemm_incopy.o
+CGEMMITCOPYOBJ = cgemm_itcopy.o
+endif
+CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
-ZGEMMKERNEL = zgemm_kernel_4x4.S
-ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
-ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
+ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
+ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
+ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
+ZGEMMINCOPYOBJ = zgemm_incopy.o
+ZGEMMITCOPYOBJ = zgemm_itcopy.o
+endif
+ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
diff --git a/kernel/arm64/cgemm_kernel_8x4.S b/kernel/arm64/cgemm_kernel_8x4.S
new file mode 100755
index 000000000..40b98cee2
--- /dev/null
+++ b/kernel/arm64/cgemm_kernel_8x4.S
@@ -0,0 +1,2044 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* X0 X1 X2 s0 X3 x4 x5 x6 */
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */
+
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define temp x7
+#define counterL x8
+#define counterI x9
+#define counterJ x10
+#define pB x11
+#define pCRow0 x12
+#define pCRow1 x13
+#define pCRow2 x14
+#define pA x15
+
+#define alpha0_R s10
+#define alphaV0_R v10.s[0]
+#define alpha0_I s11
+#define alphaV0_I v11.s[0]
+
+#define alpha1_R s14
+#define alphaV1_R v14.s[0]
+#define alpha1_I s15
+#define alphaV1_I v15.s[0]
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define OP_rr fmla
+#define OP_ii fmls
+#define OP_ri fmla
+#define OP_ir fmla
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define OP_rr fmla
+#define OP_ii fmla
+#define OP_ri fmls
+#define OP_ir fmla
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define OP_rr fmla
+#define OP_ii fmla
+#define OP_ri fmla
+#define OP_ir fmls
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define OP_rr fmla
+#define OP_ii fmls
+#define OP_ri fmls
+#define OP_ir fmls
+#endif
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 offset -> temp
+// 08 counterL
+// 09 counterI
+// 10 counterJ
+// 11 pB
+// 12 pCRow0
+// 13 pCRow1
+// 14 pCRow2
+// 15 pA
+// 16
+// 17
+// 18 must save
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 ALPHA_R -> pA0_00_R, pA0_01_R, pA0_02_R, pA0_03_R
+//v01 ALPHA_I -> pA0_00_I, pA0_01_I, pA0_02_I, pA0_03_I
+//v02 pA0_04_R, pA0_05_R, pA0_06_R, pA0_07_R
+//v03 pA0_04_I, pA0_05_I, pA0_06_I, pA0_07_I
+//v04 pA1_00_R, pA1_01_R, pA1_02_R, pA1_03_R
+//v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I
+//v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R
+//v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I
+//v08 must save pB0_00_R, pB0_01_R, pB0_02_R, pB0_03_R
+//v09 must save pB0_00_I, pB0_01_I, pB0_02_I, pB0_03_I
+//v10 must save ALPHA0_R
+//v11 must save ALPHA0_I
+//v12 must save pB1_00_R, pB1_01_R, pB1_02_R, pB1_03_R
+//v13 must save pB1_00_I, pB1_01_I, pB1_02_I, pB1_03_I
+//v14 must save ALPHA1_R
+//v15 must save ALPHA1_I
+//v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R
+//v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I
+//v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R
+//v19 pC_04_I, pC_05_I, pC_06_I, pC_07_I
+//v20 pC_08_R, pC_09_R, pC_10_R, pC_11_R
+//v21 pC_08_I, pC_09_I, pC_10_I, pC_11_I
+//v22 pC_12_R, pC_13_R, pC_14_R, pC_15_R
+//v23 pC_12_I, pC_13_I, pC_14_I, pC_15_I
+//v24 pC_16_R, pC_17_R, pC_18_R, pC_19_R
+//v25 pC_16_I, pC_17_I, pC_18_I, pC_19_I
+//v26 pC_20_R, pC_21_R, pC_22_R, pC_23_R
+//v27 pC_20_I, pC_21_I, pC_22_I, pC_23_I
+//v28 pC_24_R, pC_25_R, pC_26_R, pC_27_R
+//v29 pC_24_I, pC_25_I, pC_26_I, pC_27_I
+//v30 pC_28_R, pC_29_R, pC_30_R, pC_31_R
+//v31 pC_28_I, pC_29_I, pC_30_I, pC_31_I
+
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
+.macro INIT8x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, wzr
+ fmov s19, s16
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s22, s17
+ fmov s23, s18
+ fmov s24, wzr
+ fmov s25, s16
+ fmov s26, s17
+ fmov s27, s18
+ fmov s28, wzr
+ fmov s29, s16
+ fmov s30, s17
+ fmov s31, s18
+.endm
+
+.macro KERNEL8x4_I
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
+
+ fmul v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v17.16b, v17.16b, v17.16b
+ fmls v17.4s, v0.4s, v9.4s[0]
+#else
+ fmul v17.4s, v0.4s, v9.4s[0]
+#endif
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ fmul v18.4s, v2.4s, v8.4s[0]
+ OP_ii v18.4s, v3.4s, v9.4s[0]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v19.16b, v19.16b, v19.16b
+ fmls v19.4s, v2.4s, v9.4s[0]
+#else
+ fmul v19.4s, v2.4s, v9.4s[0]
+#endif
+ OP_ir v19.4s, v3.4s, v8.4s[0]
+
+ fmul v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v21.16b, v21.16b, v21.16b
+ fmls v21.4s, v0.4s, v9.4s[1]
+#else
+ fmul v21.4s, v0.4s, v9.4s[1]
+#endif
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ fmul v22.4s, v2.4s, v8.4s[1]
+ OP_ii v22.4s, v3.4s, v9.4s[1]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v23.16b, v23.16b, v23.16b
+ fmls v23.4s, v2.4s, v9.4s[1]
+#else
+ fmul v23.4s, v2.4s, v9.4s[1]
+#endif
+ OP_ir v23.4s, v3.4s, v8.4s[1]
+
+ fmul v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v25.16b, v25.16b, v25.16b
+ fmls v25.4s, v0.4s, v9.4s[2]
+#else
+ fmul v25.4s, v0.4s, v9.4s[2]
+#endif
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ fmul v26.4s, v2.4s, v8.4s[2]
+ OP_ii v26.4s, v3.4s, v9.4s[2]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v27.16b, v27.16b, v27.16b
+ fmls v27.4s, v2.4s, v9.4s[2]
+#else
+ fmul v27.4s, v2.4s, v9.4s[2]
+#endif
+ OP_ir v27.4s, v3.4s, v8.4s[2]
+
+ fmul v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v29.16b, v29.16b, v29.16b
+ fmls v29.4s, v0.4s, v9.4s[3]
+#else
+ fmul v29.4s, v0.4s, v9.4s[3]
+#endif
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+
+ fmul v30.4s, v2.4s, v8.4s[3]
+ OP_ii v30.4s, v3.4s, v9.4s[3]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v31.16b, v31.16b, v31.16b
+ fmls v31.4s, v2.4s, v9.4s[3]
+#else
+ fmul v31.4s, v2.4s, v9.4s[3]
+#endif
+ OP_ir v31.4s, v3.4s, v8.4s[3]
+
+ ld2 {v12.4s, v13.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v4.4s, v5.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v6.4s, v7.4s}, [pA]
+ add pA, pA, #32
+.endm
+
+.macro KERNEL8x4_M1
+ OP_rr v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+ OP_ri v17.4s, v0.4s, v9.4s[0]
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ OP_rr v18.4s, v2.4s, v8.4s[0]
+ OP_ii v18.4s, v3.4s, v9.4s[0]
+ OP_ri v19.4s, v2.4s, v9.4s[0]
+ OP_ir v19.4s, v3.4s, v8.4s[0]
+
+ OP_rr v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+ OP_ri v21.4s, v0.4s, v9.4s[1]
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ OP_rr v22.4s, v2.4s, v8.4s[1]
+ OP_ii v22.4s, v3.4s, v9.4s[1]
+ OP_ri v23.4s, v2.4s, v9.4s[1]
+ OP_ir v23.4s, v3.4s, v8.4s[1]
+
+ OP_rr v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+ OP_ri v25.4s, v0.4s, v9.4s[2]
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ OP_rr v26.4s, v2.4s, v8.4s[2]
+ OP_ii v26.4s, v3.4s, v9.4s[2]
+ OP_ri v27.4s, v2.4s, v9.4s[2]
+ OP_ir v27.4s, v3.4s, v8.4s[2]
+
+ OP_rr v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+ OP_ri v29.4s, v0.4s, v9.4s[3]
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+
+ OP_rr v30.4s, v2.4s, v8.4s[3]
+ OP_ii v30.4s, v3.4s, v9.4s[3]
+ OP_ri v31.4s, v2.4s, v9.4s[3]
+ OP_ir v31.4s, v3.4s, v8.4s[3]
+
+ ld2 {v12.4s, v13.4s}, [pB] // For next round
+ add pB, pB, #32
+ ld2 {v4.4s, v5.4s}, [pA] // For next round
+ add pA, pA, #32
+ ld2 {v6.4s, v7.4s}, [pA]
+ add pA, pA, #32
+.endm
+
+.macro KERNEL8x4_M2
+ OP_rr v16.4s, v4.4s, v12.4s[0]
+ OP_ii v16.4s, v5.4s, v13.4s[0]
+ OP_ri v17.4s, v4.4s, v13.4s[0]
+ OP_ir v17.4s, v5.4s, v12.4s[0]
+
+ OP_rr v18.4s, v6.4s, v12.4s[0]
+ OP_ii v18.4s, v7.4s, v13.4s[0]
+ OP_ri v19.4s, v6.4s, v13.4s[0]
+ OP_ir v19.4s, v7.4s, v12.4s[0]
+
+ OP_rr v20.4s, v4.4s, v12.4s[1]
+ OP_ii v20.4s, v5.4s, v13.4s[1]
+ OP_ri v21.4s, v4.4s, v13.4s[1]
+ OP_ir v21.4s, v5.4s, v12.4s[1]
+
+ OP_rr v22.4s, v6.4s, v12.4s[1]
+ OP_ii v22.4s, v7.4s, v13.4s[1]
+ OP_ri v23.4s, v6.4s, v13.4s[1]
+ OP_ir v23.4s, v7.4s, v12.4s[1]
+
+ OP_rr v24.4s, v4.4s, v12.4s[2]
+ OP_ii v24.4s, v5.4s, v13.4s[2]
+ OP_ri v25.4s, v4.4s, v13.4s[2]
+ OP_ir v25.4s, v5.4s, v12.4s[2]
+
+ OP_rr v26.4s, v6.4s, v12.4s[2]
+ OP_ii v26.4s, v7.4s, v13.4s[2]
+ OP_ri v27.4s, v6.4s, v13.4s[2]
+ OP_ir v27.4s, v7.4s, v12.4s[2]
+
+ OP_rr v28.4s, v4.4s, v12.4s[3]
+ OP_ii v28.4s, v5.4s, v13.4s[3]
+ OP_ri v29.4s, v4.4s, v13.4s[3]
+ OP_ir v29.4s, v5.4s, v12.4s[3]
+
+ OP_rr v30.4s, v6.4s, v12.4s[3]
+ OP_ii v30.4s, v7.4s, v13.4s[3]
+ OP_ri v31.4s, v6.4s, v13.4s[3]
+ OP_ir v31.4s, v7.4s, v12.4s[3]
+
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
+.endm
+
+.macro KERNEL8x4_E
+ OP_rr v16.4s, v4.4s, v12.4s[0]
+ OP_ii v16.4s, v5.4s, v13.4s[0]
+ OP_ri v17.4s, v4.4s, v13.4s[0]
+ OP_ir v17.4s, v5.4s, v12.4s[0]
+
+ OP_rr v18.4s, v6.4s, v12.4s[0]
+ OP_ii v18.4s, v7.4s, v13.4s[0]
+ OP_ri v19.4s, v6.4s, v13.4s[0]
+ OP_ir v19.4s, v7.4s, v12.4s[0]
+
+ OP_rr v20.4s, v4.4s, v12.4s[1]
+ OP_ii v20.4s, v5.4s, v13.4s[1]
+ OP_ri v21.4s, v4.4s, v13.4s[1]
+ OP_ir v21.4s, v5.4s, v12.4s[1]
+
+ OP_rr v22.4s, v6.4s, v12.4s[1]
+ OP_ii v22.4s, v7.4s, v13.4s[1]
+ OP_ri v23.4s, v6.4s, v13.4s[1]
+ OP_ir v23.4s, v7.4s, v12.4s[1]
+
+ OP_rr v24.4s, v4.4s, v12.4s[2]
+ OP_ii v24.4s, v5.4s, v13.4s[2]
+ OP_ri v25.4s, v4.4s, v13.4s[2]
+ OP_ir v25.4s, v5.4s, v12.4s[2]
+
+ OP_rr v26.4s, v6.4s, v12.4s[2]
+ OP_ii v26.4s, v7.4s, v13.4s[2]
+ OP_ri v27.4s, v6.4s, v13.4s[2]
+ OP_ir v27.4s, v7.4s, v12.4s[2]
+
+ OP_rr v28.4s, v4.4s, v12.4s[3]
+ OP_ii v28.4s, v5.4s, v13.4s[3]
+ OP_ri v29.4s, v4.4s, v13.4s[3]
+ OP_ir v29.4s, v5.4s, v12.4s[3]
+
+ OP_rr v30.4s, v6.4s, v12.4s[3]
+ OP_ii v30.4s, v7.4s, v13.4s[3]
+ OP_ri v31.4s, v6.4s, v13.4s[3]
+ OP_ir v31.4s, v7.4s, v12.4s[3]
+
+.endm
+
+.macro KERNEL8x4_SUB
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+ OP_ri v17.4s, v0.4s, v9.4s[0]
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ OP_rr v18.4s, v2.4s, v8.4s[0]
+ OP_ii v18.4s, v3.4s, v9.4s[0]
+ OP_ri v19.4s, v2.4s, v9.4s[0]
+ OP_ir v19.4s, v3.4s, v8.4s[0]
+
+ OP_rr v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+ OP_ri v21.4s, v0.4s, v9.4s[1]
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ OP_rr v22.4s, v2.4s, v8.4s[1]
+ OP_ii v22.4s, v3.4s, v9.4s[1]
+ OP_ri v23.4s, v2.4s, v9.4s[1]
+ OP_ir v23.4s, v3.4s, v8.4s[1]
+
+ OP_rr v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+ OP_ri v25.4s, v0.4s, v9.4s[2]
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ OP_rr v26.4s, v2.4s, v8.4s[2]
+ OP_ii v26.4s, v3.4s, v9.4s[2]
+ OP_ri v27.4s, v2.4s, v9.4s[2]
+ OP_ir v27.4s, v3.4s, v8.4s[2]
+
+ OP_rr v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+ OP_ri v29.4s, v0.4s, v9.4s[3]
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+
+ OP_rr v30.4s, v2.4s, v8.4s[3]
+ OP_ii v30.4s, v3.4s, v9.4s[3]
+ OP_ri v31.4s, v2.4s, v9.4s[3]
+ OP_ir v31.4s, v3.4s, v8.4s[3]
+
+.endm
+
+.macro SAVE8x4
+ mov pCRow1, pCRow0
+
+ ld2 {v0.4s, v1.4s}, [pCRow1]
+ fmla v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmla v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+ ld2 {v2.4s, v3.4s}, [pCRow2]
+ fmla v2.4s, v18.4s, alphaV0_R
+ fmls v2.4s, v19.4s, alphaV0_I
+ fmla v3.4s, v18.4s, alphaV1_I
+ fmla v3.4s, v19.4s, alphaV1_R
+ st2 {v2.4s, v3.4s}, [pCRow2]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v20.4s, alphaV0_R
+ fmls v4.4s, v21.4s, alphaV0_I
+ fmla v5.4s, v20.4s, alphaV1_I
+ fmla v5.4s, v21.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+ ld2 {v6.4s, v7.4s}, [pCRow2]
+ fmla v6.4s, v22.4s, alphaV0_R
+ fmls v6.4s, v23.4s, alphaV0_I
+ fmla v7.4s, v22.4s, alphaV1_I
+ fmla v7.4s, v23.4s, alphaV1_R
+ st2 {v6.4s, v7.4s}, [pCRow2]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v0.4s, v1.4s}, [pCRow1]
+ fmla v0.4s, v24.4s, alphaV0_R
+ fmls v0.4s, v25.4s, alphaV0_I
+ fmla v1.4s, v24.4s, alphaV1_I
+ fmla v1.4s, v25.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+ ld2 {v2.4s, v3.4s}, [pCRow2]
+ fmla v2.4s, v26.4s, alphaV0_R
+ fmls v2.4s, v27.4s, alphaV0_I
+ fmla v3.4s, v26.4s, alphaV1_I
+ fmla v3.4s, v27.4s, alphaV1_R
+ st2 {v2.4s, v3.4s}, [pCRow2]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v28.4s, alphaV0_R
+ fmls v4.4s, v29.4s, alphaV0_I
+ fmla v5.4s, v28.4s, alphaV1_I
+ fmla v5.4s, v29.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+ ld2 {v6.4s, v7.4s}, [pCRow2]
+ fmla v6.4s, v30.4s, alphaV0_R
+ fmls v6.4s, v31.4s, alphaV0_I
+ fmla v7.4s, v30.4s, alphaV1_I
+ fmla v7.4s, v31.4s, alphaV1_R
+ st2 {v6.4s, v7.4s}, [pCRow2]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x4
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+ fmov s24, s17
+ fmov s25, s16
+ fmov s28, s17
+ fmov s29, s16
+.endm
+
+.macro KERNEL4x4_I
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+
+ fmul v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v17.16b, v17.16b, v17.16b
+ fmls v17.4s, v0.4s, v9.4s[0]
+#else
+ fmul v17.4s, v0.4s, v9.4s[0]
+#endif
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ fmul v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v21.16b, v21.16b, v21.16b
+ fmls v21.4s, v0.4s, v9.4s[1]
+#else
+ fmul v21.4s, v0.4s, v9.4s[1]
+#endif
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ fmul v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v25.16b, v25.16b, v25.16b
+ fmls v25.4s, v0.4s, v9.4s[2]
+#else
+ fmul v25.4s, v0.4s, v9.4s[2]
+#endif
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ fmul v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v29.16b, v29.16b, v29.16b
+ fmls v29.4s, v0.4s, v9.4s[3]
+#else
+ fmul v29.4s, v0.4s, v9.4s[3]
+#endif
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+
+ ld2 {v12.4s, v13.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v4.4s, v5.4s}, [pA]
+ add pA, pA, #32
+.endm
+
+.macro KERNEL4x4_M1
+ OP_rr v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+ OP_ri v17.4s, v0.4s, v9.4s[0]
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ ld2 {v12.4s, v13.4s}, [pB] // For next round
+ add pB, pB, #32
+
+ OP_rr v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+ OP_ri v21.4s, v0.4s, v9.4s[1]
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ ld2 {v4.4s, v5.4s}, [pA] // For next round
+ add pA, pA, #32
+
+ OP_rr v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+ OP_ri v25.4s, v0.4s, v9.4s[2]
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ prfm PLDL1KEEP, [pA, #512]
+
+ OP_rr v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+ OP_ri v29.4s, v0.4s, v9.4s[3]
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+.endm
+
+.macro KERNEL4x4_M2
+ OP_rr v16.4s, v4.4s, v12.4s[0]
+ OP_ii v16.4s, v5.4s, v13.4s[0]
+ OP_ri v17.4s, v4.4s, v13.4s[0]
+ OP_ir v17.4s, v5.4s, v12.4s[0]
+
+ ld2 {v8.4s, v9.4s}, [pB] // For next round
+ add pB, pB, #32
+
+ OP_rr v20.4s, v4.4s, v12.4s[1]
+ OP_ii v20.4s, v5.4s, v13.4s[1]
+ OP_ri v21.4s, v4.4s, v13.4s[1]
+ OP_ir v21.4s, v5.4s, v12.4s[1]
+
+ ld2 {v0.4s, v1.4s}, [pA] // For next round
+ add pA, pA, #32
+
+ OP_rr v24.4s, v4.4s, v12.4s[2]
+ OP_ii v24.4s, v5.4s, v13.4s[2]
+ OP_ri v25.4s, v4.4s, v13.4s[2]
+ OP_ir v25.4s, v5.4s, v12.4s[2]
+
+ prfm PLDL1KEEP, [pB, #512]
+
+ OP_rr v28.4s, v4.4s, v12.4s[3]
+ OP_ii v28.4s, v5.4s, v13.4s[3]
+ OP_ri v29.4s, v4.4s, v13.4s[3]
+ OP_ir v29.4s, v5.4s, v12.4s[3]
+.endm
+
+.macro KERNEL4x4_E
+ OP_rr v16.4s, v4.4s, v12.4s[0]
+ OP_ii v16.4s, v5.4s, v13.4s[0]
+ OP_ri v17.4s, v4.4s, v13.4s[0]
+ OP_ir v17.4s, v5.4s, v12.4s[0]
+
+ OP_rr v20.4s, v4.4s, v12.4s[1]
+ OP_ii v20.4s, v5.4s, v13.4s[1]
+ OP_ri v21.4s, v4.4s, v13.4s[1]
+ OP_ir v21.4s, v5.4s, v12.4s[1]
+
+ OP_rr v24.4s, v4.4s, v12.4s[2]
+ OP_ii v24.4s, v5.4s, v13.4s[2]
+ OP_ri v25.4s, v4.4s, v13.4s[2]
+ OP_ir v25.4s, v5.4s, v12.4s[2]
+
+ OP_rr v28.4s, v4.4s, v12.4s[3]
+ OP_ii v28.4s, v5.4s, v13.4s[3]
+ OP_ri v29.4s, v4.4s, v13.4s[3]
+ OP_ir v29.4s, v5.4s, v12.4s[3]
+.endm
+
+.macro KERNEL4x4_SUB
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+ OP_ri v17.4s, v0.4s, v9.4s[0]
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ OP_rr v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+ OP_ri v21.4s, v0.4s, v9.4s[1]
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ OP_rr v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+ OP_ri v25.4s, v0.4s, v9.4s[2]
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ OP_rr v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+ OP_ri v29.4s, v0.4s, v9.4s[3]
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+.endm
+
+.macro SAVE4x4
+ mov pCRow1, pCRow0
+
+ ld2 {v0.4s, v1.4s}, [pCRow1]
+ fmla v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmla v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v20.4s, alphaV0_R
+ fmls v4.4s, v21.4s, alphaV0_I
+ fmla v5.4s, v20.4s, alphaV1_I
+ fmla v5.4s, v21.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v0.4s, v1.4s}, [pCRow1]
+ fmla v0.4s, v24.4s, alphaV0_R
+ fmls v0.4s, v25.4s, alphaV0_I
+ fmla v1.4s, v24.4s, alphaV1_I
+ fmla v1.4s, v25.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v28.4s, alphaV0_R
+ fmls v4.4s, v29.4s, alphaV0_I
+ fmla v5.4s, v28.4s, alphaV1_I
+ fmla v5.4s, v29.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, s16
+ fmov s21, s17
+ fmov s24, s16
+ fmov s25, s17
+ fmov s28, s16
+ fmov s29, s17
+.endm
+
+.macro KERNEL2x4_SUB
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ OP_rr v16.2s, v0.2s, v8.4s[0]
+ OP_ii v16.2s, v1.2s, v9.4s[0]
+ OP_ri v17.2s, v0.2s, v9.4s[0]
+ OP_ir v17.2s, v1.2s, v8.4s[0]
+
+ OP_rr v20.2s, v0.2s, v8.4s[1]
+ OP_ii v20.2s, v1.2s, v9.4s[1]
+ OP_ri v21.2s, v0.2s, v9.4s[1]
+ OP_ir v21.2s, v1.2s, v8.4s[1]
+
+ OP_rr v24.2s, v0.2s, v8.4s[2]
+ OP_ii v24.2s, v1.2s, v9.4s[2]
+ OP_ri v25.2s, v0.2s, v9.4s[2]
+ OP_ir v25.2s, v1.2s, v8.4s[2]
+
+ OP_rr v28.2s, v0.2s, v8.4s[3]
+ OP_ii v28.2s, v1.2s, v9.4s[3]
+ OP_ri v29.2s, v0.2s, v9.4s[3]
+ OP_ir v29.2s, v1.2s, v8.4s[3]
+.endm
+
+.macro SAVE2x4
+ mov pCRow1, pCRow0
+
+ ld2 {v0.2s, v1.2s}, [pCRow1]
+ fmla v0.2s, v16.2s, alphaV0_R
+ fmls v0.2s, v17.2s, alphaV0_I
+ fmla v1.2s, v16.2s, alphaV1_I
+ fmla v1.2s, v17.2s, alphaV1_R
+ st2 {v0.2s, v1.2s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.2s, v5.2s}, [pCRow1]
+ fmla v4.2s, v20.2s, alphaV0_R
+ fmls v4.2s, v21.2s, alphaV0_I
+ fmla v5.2s, v20.2s, alphaV1_I
+ fmla v5.2s, v21.2s, alphaV1_R
+ st2 {v4.2s, v5.2s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v0.2s, v1.2s}, [pCRow1]
+ fmla v0.2s, v24.2s, alphaV0_R
+ fmls v0.2s, v25.2s, alphaV0_I
+ fmla v1.2s, v24.2s, alphaV1_I
+ fmla v1.2s, v25.2s, alphaV1_R
+ st2 {v0.2s, v1.2s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.2s, v5.2s}, [pCRow1]
+ fmla v4.2s, v28.2s, alphaV0_R
+ fmls v4.2s, v29.2s, alphaV0_I
+ fmla v5.2s, v28.2s, alphaV1_I
+ fmla v5.2s, v29.2s, alphaV1_R
+ st2 {v4.2s, v5.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, s16
+ fmov s21, s17
+ fmov s24, s16
+ fmov s25, s17
+ fmov s28, s16
+ fmov s29, s17
+.endm
+
+.macro KERNEL1x4_SUB
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.s, v1.s}[0], [pA]
+ add pA, pA, #8
+
+ OP_rr s16, s0, v8.4s[0]
+ OP_ii s16, s1, v9.4s[0]
+ OP_ri s17, s0, v9.4s[0]
+ OP_ir s17, s1, v8.4s[0]
+
+ OP_rr s20, s0, v8.4s[1]
+ OP_ii s20, s1, v9.4s[1]
+ OP_ri s21, s0, v9.4s[1]
+ OP_ir s21, s1, v8.4s[1]
+
+ OP_rr s24, s0, v8.4s[2]
+ OP_ii s24, s1, v9.4s[2]
+ OP_ri s25, s0, v9.4s[2]
+ OP_ir s25, s1, v8.4s[2]
+
+ OP_rr s28, s0, v8.4s[3]
+ OP_ii s28, s1, v9.4s[3]
+ OP_ri s29, s0, v9.4s[3]
+ OP_ir s29, s1, v8.4s[3]
+.endm
+
+.macro SAVE1x4
+ mov pCRow1, pCRow0
+
+ ld2 {v0.s, v1.s}[0], [pCRow1]
+ fmla s0, s16, alphaV0_R
+ fmls s0, s17, alphaV0_I
+ fmla s1, s16, alphaV1_I
+ fmla s1, s17, alphaV1_R
+ st2 {v0.s, v1.s}[0], [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.s, v5.s}[0], [pCRow1]
+ fmla s4, s20, alphaV0_R
+ fmls s4, s21, alphaV0_I
+ fmla s5, s20, alphaV1_I
+ fmla s5, s21, alphaV1_R
+ st2 {v4.s, v5.s}[0], [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v0.s, v1.s}[0], [pCRow1]
+ fmla s0, s24, alphaV0_R
+ fmls s0, s25, alphaV0_I
+ fmla s1, s24, alphaV1_I
+ fmla s1, s25, alphaV1_R
+ st2 {v0.s, v1.s}[0], [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.s, v5.s}[0], [pCRow1]
+ fmla s4, s28, alphaV0_R
+ fmls s4, s29, alphaV0_I
+ fmla s5, s28, alphaV1_I
+ fmla s5, s29, alphaV1_R
+ st2 {v4.s, v5.s}[0], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x2
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, wzr
+ fmov s19, s16
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s22, s17
+ fmov s23, s18
+.endm
+
+.macro KERNEL8x2_SUB
+ ld2 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.2s[0]
+ OP_ii v16.4s, v1.4s, v9.2s[0]
+ OP_ri v17.4s, v0.4s, v9.2s[0]
+ OP_ir v17.4s, v1.4s, v8.2s[0]
+
+ OP_rr v18.4s, v2.4s, v8.2s[0]
+ OP_ii v18.4s, v3.4s, v9.2s[0]
+ OP_ri v19.4s, v2.4s, v9.2s[0]
+ OP_ir v19.4s, v3.4s, v8.2s[0]
+
+ OP_rr v20.4s, v0.4s, v8.2s[1]
+ OP_ii v20.4s, v1.4s, v9.2s[1]
+ OP_ri v21.4s, v0.4s, v9.2s[1]
+ OP_ir v21.4s, v1.4s, v8.2s[1]
+
+ OP_rr v22.4s, v2.4s, v8.2s[1]
+ OP_ii v22.4s, v3.4s, v9.2s[1]
+ OP_ri v23.4s, v2.4s, v9.2s[1]
+ OP_ir v23.4s, v3.4s, v8.2s[1]
+.endm
+
+.macro SAVE8x2
+ mov pCRow1, pCRow0
+
+ ld2 {v0.4s, v1.4s}, [pCRow1]
+ fmla v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmla v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+ ld2 {v2.4s, v3.4s}, [pCRow2]
+ fmla v2.4s, v18.4s, alphaV0_R
+ fmls v2.4s, v19.4s, alphaV0_I
+ fmla v3.4s, v18.4s, alphaV1_I
+ fmla v3.4s, v19.4s, alphaV1_R
+ st2 {v2.4s, v3.4s}, [pCRow2]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v20.4s, alphaV0_R
+ fmls v4.4s, v21.4s, alphaV0_I
+ fmla v5.4s, v20.4s, alphaV1_I
+ fmla v5.4s, v21.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+ ld2 {v6.4s, v7.4s}, [pCRow2]
+ fmla v6.4s, v22.4s, alphaV0_R
+ fmls v6.4s, v23.4s, alphaV0_I
+ fmla v7.4s, v22.4s, alphaV1_I
+ fmla v7.4s, v23.4s, alphaV1_R
+ st2 {v6.4s, v7.4s}, [pCRow2]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x2
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, s16
+ fmov s21, s17
+.endm
+
+.macro KERNEL4x2_SUB
+ ld2 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.2s[0]
+ OP_ii v16.4s, v1.4s, v9.2s[0]
+ OP_ri v17.4s, v0.4s, v9.2s[0]
+ OP_ir v17.4s, v1.4s, v8.2s[0]
+
+ OP_rr v20.4s, v0.4s, v8.2s[1]
+ OP_ii v20.4s, v1.4s, v9.2s[1]
+ OP_ri v21.4s, v0.4s, v9.2s[1]
+ OP_ir v21.4s, v1.4s, v8.2s[1]
+.endm
+
+.macro SAVE4x2
+ mov pCRow1, pCRow0
+
+ ld2 {v0.4s, v1.4s}, [pCRow1]
+ fmla v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmla v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v20.4s, alphaV0_R
+ fmls v4.4s, v21.4s, alphaV0_I
+ fmla v5.4s, v20.4s, alphaV1_I
+ fmla v5.4s, v21.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x2
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, s16
+ fmov s21, s17
+.endm
+
+.macro KERNEL2x2_SUB
+ ld2 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld2 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ OP_rr v16.2s, v0.2s, v8.2s[0]
+ OP_ii v16.2s, v1.2s, v9.2s[0]
+ OP_ri v17.2s, v0.2s, v9.2s[0]
+ OP_ir v17.2s, v1.2s, v8.2s[0]
+
+ OP_rr v20.2s, v0.2s, v8.2s[1]
+ OP_ii v20.2s, v1.2s, v9.2s[1]
+ OP_ri v21.2s, v0.2s, v9.2s[1]
+ OP_ir v21.2s, v1.2s, v8.2s[1]
+.endm
+
+.macro SAVE2x2
+ mov pCRow1, pCRow0
+
+ ld2 {v0.2s, v1.2s}, [pCRow1]
+ fmla v0.2s, v16.2s, alphaV0_R
+ fmls v0.2s, v17.2s, alphaV0_I
+ fmla v1.2s, v16.2s, alphaV1_I
+ fmla v1.2s, v17.2s, alphaV1_R
+ st2 {v0.2s, v1.2s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.2s, v5.2s}, [pCRow1]
+ fmla v4.2s, v20.2s, alphaV0_R
+ fmls v4.2s, v21.2s, alphaV0_I
+ fmla v5.2s, v20.2s, alphaV1_I
+ fmla v5.2s, v21.2s, alphaV1_R
+ st2 {v4.2s, v5.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, wzr
+ fmov s21, wzr
+.endm
+
+.macro KERNEL1x2_SUB
+ ld2 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld2 {v0.s, v1.s}[0], [pA]
+ add pA, pA, #8
+
+ OP_rr s16, s0, v8.2s[0]
+ OP_ii s16, s1, v9.2s[0]
+ OP_ri s17, s0, v9.2s[0]
+ OP_ir s17, s1, v8.2s[0]
+
+ OP_rr s20, s0, v8.2s[1]
+ OP_ii s20, s1, v9.2s[1]
+ OP_ri s21, s0, v9.2s[1]
+ OP_ir s21, s1, v8.2s[1]
+.endm
+
+.macro SAVE1x2
+ mov pCRow1, pCRow0
+
+ ld2 {v0.s, v1.s}[0], [pCRow1]
+ fmla s0, s16, alphaV0_R
+ fmls s0, s17, alphaV0_I
+ fmla s1, s16, alphaV1_I
+ fmla s1, s17, alphaV1_R
+ st2 {v0.s, v1.s}[0], [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.s, v5.s}[0], [pCRow1]
+ fmla s4, s20, alphaV0_R
+ fmls s4, s21, alphaV0_I
+ fmla s5, s20, alphaV1_I
+ fmla s5, s21, alphaV1_R
+ st2 {v4.s, v5.s}[0], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x1
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, wzr
+ fmov s19, s16
+.endm
+
+.macro KERNEL8x1_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v8.4s[1]
+ OP_ri v17.4s, v0.4s, v8.4s[1]
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ OP_rr v18.4s, v2.4s, v8.4s[0]
+ OP_ii v18.4s, v3.4s, v8.4s[1]
+ OP_ri v19.4s, v2.4s, v8.4s[1]
+ OP_ir v19.4s, v3.4s, v8.4s[0]
+.endm
+
+.macro SAVE8x1
+ mov pCRow1, pCRow0
+
+ ld2 {v0.4s, v1.4s}, [pCRow1]
+ fmla v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmla v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow1, pCRow1, #32
+
+ ld2 {v2.4s, v3.4s}, [pCRow1]
+ fmla v2.4s, v18.4s, alphaV0_R
+ fmls v2.4s, v19.4s, alphaV0_I
+ fmla v3.4s, v18.4s, alphaV1_I
+ fmla v3.4s, v19.4s, alphaV1_R
+ st2 {v2.4s, v3.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT4x1
+ fmov s16, wzr
+ fmov s17, s16
+.endm
+
+.macro KERNEL4x1_SUB
+ ld2 {v8.s, v9.s}[0], [pB]
+ add pB, pB, #8
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
+.endm
+
+.macro SAVE4x1
+ mov pCRow1, pCRow0
+
+ ld2 {v0.4s, v1.4s}, [pCRow1]
+ fmla v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmla v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x1
+ fmov s16, wzr
+ fmov s17, wzr
+.endm
+
+.macro KERNEL2x1_SUB
+ ld2 {v8.s, v9.s}[0], [pB]
+ add pB, pB, #8
+ ld2 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ OP_rr v16.2s, v0.2s, v8.s[0]
+ OP_ii v16.2s, v1.2s, v9.s[0]
+ OP_ri v17.2s, v0.2s, v9.s[0]
+ OP_ir v17.2s, v1.2s, v8.s[0]
+.endm
+
+.macro SAVE2x1
+ mov pCRow1, pCRow0
+
+ ld2 {v0.2s, v1.2s}, [pCRow1]
+ fmla v0.2s, v16.2s, alphaV0_R
+ fmls v0.2s, v17.2s, alphaV0_I
+ fmla v1.2s, v16.2s, alphaV1_I
+ fmla v1.2s, v17.2s, alphaV1_R
+ st2 {v0.2s, v1.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+ fmov s16, wzr
+ fmov s17, wzr
+.endm
+
+.macro KERNEL1x1_SUB
+ ld2 {v8.s, v9.s}[0], [pB]
+ add pB, pB, #8
+ ld2 {v0.s, v1.s}[0], [pA]
+ add pA, pA, #8
+
+ OP_rr s16, s0, v8.s[0]
+ OP_ii s16, s1, v9.s[0]
+ OP_ri s17, s0, v9.s[0]
+ OP_ir s17, s1, v8.s[0]
+.endm
+
+.macro SAVE1x1
+ mov pCRow1, pCRow0
+
+ ld2 {v0.s, v1.s}[0], [pCRow1]
+ fmla s0, s16, alphaV0_R
+ fmls s0, s17, alphaV0_I
+ fmla s1, s16, alphaV1_I
+ fmla s1, s17, alphaV1_R
+ st2 {v0.s, v1.s}[0], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
+ PROLOGUE
+
+ .align 5
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+
+ fmov alpha0_R, s0
+ fmov alpha0_I, s1
+ fmov alpha1_R, s0
+ fmov alpha1_I, s1
+
+ lsl LDC, LDC, #3 // ldc = ldc * 8
+
+ mov pB, origPB
+
+ mov counterJ, origN
+ asr counterJ, counterJ, #2 // J = J / 4
+ cmp counterJ, #0
+ ble cgemm_kernel_L2_BEGIN
+
+/******************************************************************************/
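+// Loop nest: counterJ steps through N in blocks of 4 columns (the L4 section);
+// the L2 and L1 sections below mop up a remaining 2 and 1 columns.  Inside each
+// column block, counterI steps through M in blocks of 8, 4, 2 and 1 rows, and
+// counterL steps through K (software-pipelined by 2 in the 8x4 and 4x4 paths of
+// the L4 section, unrolled by 8 elsewhere).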
+
+cgemm_kernel_L4_BEGIN:
+ mov pCRow0, pC // pCRow0 = C
+ add pC, pC, LDC, lsl #2
+
+ mov pA, origPA // pA = start of A array
+
+cgemm_kernel_L4_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble cgemm_kernel_L4_M4_BEGIN
+
+cgemm_kernel_L4_M8_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt cgemm_kernel_L4_M8_32
+
+ KERNEL8x4_I // do one in the K
+ KERNEL8x4_M2 // do another in the K
+
+ subs counterL, counterL, #2 // subtract 2
+ ble cgemm_kernel_L4_M8_22a
+ .align 5
+
+cgemm_kernel_L4_M8_22:
+
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L4_M8_22
+
+
+cgemm_kernel_L4_M8_22a:
+
+ KERNEL8x4_M1
+ KERNEL8x4_E
+
+ b cgemm_kernel_L4_M8_44
+
+cgemm_kernel_L4_M8_32:
+
+ tst counterL, #1
+ ble cgemm_kernel_L4_M8_40
+
+ KERNEL8x4_I
+
+ KERNEL8x4_E
+
+ b cgemm_kernel_L4_M8_44
+
+cgemm_kernel_L4_M8_40:
+
+ INIT8x4
+
+cgemm_kernel_L4_M8_44:
+
+ ands counterL , origK, #1
+ ble cgemm_kernel_L4_M8_100
+
+cgemm_kernel_L4_M8_46:
+
+ KERNEL8x4_SUB
+
+cgemm_kernel_L4_M8_100:
+
+ SAVE8x4
+
+cgemm_kernel_L4_M8_END:
+ subs counterI, counterI, #1
+ bne cgemm_kernel_L4_M8_20
+
+cgemm_kernel_L4_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble cgemm_kernel_L4_END
+
+ tst counterI, #4
+ ble cgemm_kernel_L4_M2_BEGIN
+
+
+cgemm_kernel_L4_M4_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt cgemm_kernel_L4_M4_32
+
+ KERNEL4x4_I // do one in the K
+ KERNEL4x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble cgemm_kernel_L4_M4_22a
+ .align 5
+
+
+cgemm_kernel_L4_M4_22:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L4_M4_22
+
+cgemm_kernel_L4_M4_22a:
+ KERNEL4x4_M1
+ KERNEL4x4_E
+ b cgemm_kernel_L4_M4_44
+cgemm_kernel_L4_M4_32:
+ tst counterL, #1
+ ble cgemm_kernel_L4_M4_40
+ KERNEL4x4_I
+ KERNEL4x4_E
+ b cgemm_kernel_L4_M4_44
+cgemm_kernel_L4_M4_40:
+
+ INIT4x4
+
+cgemm_kernel_L4_M4_44:
+ ands counterL , origK, #1
+ ble cgemm_kernel_L4_M4_100
+
+cgemm_kernel_L4_M4_46:
+ KERNEL4x4_SUB
+
+cgemm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+cgemm_kernel_L4_M4_END:
+
+cgemm_kernel_L4_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble cgemm_kernel_L4_END
+
+	tst	counterI, #2					// is there a 2-row block (M & 2) left?
+ ble cgemm_kernel_L4_M1_BEGIN
+
+cgemm_kernel_L4_M2_20:
+
+ INIT2x4
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble cgemm_kernel_L4_M2_40
+
+cgemm_kernel_L4_M2_22:
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L4_M2_22
+
+
+cgemm_kernel_L4_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble cgemm_kernel_L4_M2_100
+
+cgemm_kernel_L4_M2_42:
+
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L4_M2_42
+
+cgemm_kernel_L4_M2_100:
+
+ SAVE2x4
+
+cgemm_kernel_L4_M2_END:
+
+
+cgemm_kernel_L4_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble cgemm_kernel_L4_END
+
+cgemm_kernel_L4_M1_20:
+
+ INIT1x4
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble cgemm_kernel_L4_M1_40
+
+cgemm_kernel_L4_M1_22:
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L4_M1_22
+
+
+cgemm_kernel_L4_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble cgemm_kernel_L4_M1_100
+
+cgemm_kernel_L4_M1_42:
+
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L4_M1_42
+
+cgemm_kernel_L4_M1_100:
+
+ SAVE1x4
+
+
+cgemm_kernel_L4_END:
+
+ lsl temp, origK, #5
+ add origPB, origPB, temp // B = B + K * 4 * 8
+
+ subs counterJ, counterJ , #1 // j--
+ bgt cgemm_kernel_L4_BEGIN
+
+
+/******************************************************************************/
+
+cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
+
+ mov counterJ , origN
+ tst counterJ , #3
+	ble	cgemm_kernel_L999				// nothing left to do: N was a multiple of 4
+
+ tst counterJ , #2
+ ble cgemm_kernel_L1_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #1
+
+ mov pA, origPA // pA = A
+
+
+cgemm_kernel_L2_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble cgemm_kernel_L2_M4_BEGIN
+
+cgemm_kernel_L2_M8_20:
+
+ INIT8x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble cgemm_kernel_L2_M8_40
+ .align 5
+
+cgemm_kernel_L2_M8_22:
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L2_M8_22
+
+
+cgemm_kernel_L2_M8_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble cgemm_kernel_L2_M8_100
+
+cgemm_kernel_L2_M8_42:
+
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L2_M8_42
+
+cgemm_kernel_L2_M8_100:
+
+ SAVE8x2
+
+cgemm_kernel_L2_M8_END:
+
+ subs counterI, counterI, #1
+ bgt cgemm_kernel_L2_M8_20
+
+cgemm_kernel_L2_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble cgemm_kernel_L2_END
+
+	tst	counterI, #4					// is there a 4-row block (M & 4) left?
+ ble cgemm_kernel_L2_M2_BEGIN
+
+cgemm_kernel_L2_M4_20:
+
+ INIT4x2
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble cgemm_kernel_L2_M4_40
+ .align 5
+
+cgemm_kernel_L2_M4_22:
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L2_M4_22
+
+
+cgemm_kernel_L2_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble cgemm_kernel_L2_M4_100
+
+cgemm_kernel_L2_M4_42:
+
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L2_M4_42
+
+cgemm_kernel_L2_M4_100:
+
+ SAVE4x2
+
+cgemm_kernel_L2_M4_END:
+
+cgemm_kernel_L2_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble cgemm_kernel_L2_END
+
+	tst	counterI, #2					// is there a 2-row block (M & 2) left?
+ ble cgemm_kernel_L2_M1_BEGIN
+
+cgemm_kernel_L2_M2_20:
+
+ INIT2x2
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble cgemm_kernel_L2_M2_40
+
+cgemm_kernel_L2_M2_22:
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L2_M2_22
+
+
+cgemm_kernel_L2_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble cgemm_kernel_L2_M2_100
+
+cgemm_kernel_L2_M2_42:
+
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L2_M2_42
+
+cgemm_kernel_L2_M2_100:
+
+ SAVE2x2
+
+cgemm_kernel_L2_M2_END:
+
+
+cgemm_kernel_L2_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble cgemm_kernel_L2_END
+
+cgemm_kernel_L2_M1_20:
+
+ INIT1x2
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL, #0
+ ble cgemm_kernel_L2_M1_40
+
+cgemm_kernel_L2_M1_22:
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L2_M1_22
+
+
+cgemm_kernel_L2_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble cgemm_kernel_L2_M1_100
+
+cgemm_kernel_L2_M1_42:
+
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L2_M1_42
+
+cgemm_kernel_L2_M1_100:
+
+ SAVE1x2
+
+
+cgemm_kernel_L2_END:
+ add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
+
+/******************************************************************************/
+
+cgemm_kernel_L1_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #1
+ ble cgemm_kernel_L999 // done
+
+
+ mov pCRow0, pC // pCRow0 = C
+ add pC , pC , LDC // Update pC to point to next
+
+ mov pA, origPA // pA = A
+
+
+cgemm_kernel_L1_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble cgemm_kernel_L1_M4_BEGIN
+
+cgemm_kernel_L1_M8_20:
+
+ INIT8x1
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble cgemm_kernel_L1_M8_40
+ .align 5
+
+cgemm_kernel_L1_M8_22:
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L1_M8_22
+
+
+cgemm_kernel_L1_M8_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble cgemm_kernel_L1_M8_100
+
+cgemm_kernel_L1_M8_42:
+
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L1_M8_42
+
+cgemm_kernel_L1_M8_100:
+
+ SAVE8x1
+
+cgemm_kernel_L1_M8_END:
+
+ subs counterI, counterI, #1
+ bgt cgemm_kernel_L1_M8_20
+
+cgemm_kernel_L1_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble cgemm_kernel_L1_END
+
+	tst	counterI, #4					// is there a 4-row block (M & 4) left?
+ ble cgemm_kernel_L1_M2_BEGIN
+
+
+cgemm_kernel_L1_M4_20:
+
+ INIT4x1
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble cgemm_kernel_L1_M4_40
+ .align 5
+
+cgemm_kernel_L1_M4_22:
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L1_M4_22
+
+
+cgemm_kernel_L1_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble cgemm_kernel_L1_M4_100
+
+cgemm_kernel_L1_M4_42:
+
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L1_M4_42
+
+cgemm_kernel_L1_M4_100:
+
+ SAVE4x1
+
+cgemm_kernel_L1_M4_END:
+
+
+cgemm_kernel_L1_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble cgemm_kernel_L1_END
+
+	tst	counterI, #2					// is there a 2-row block (M & 2) left?
+ ble cgemm_kernel_L1_M1_BEGIN
+
+cgemm_kernel_L1_M2_20:
+
+ INIT2x1
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble cgemm_kernel_L1_M2_40
+
+cgemm_kernel_L1_M2_22:
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L1_M2_22
+
+
+cgemm_kernel_L1_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble cgemm_kernel_L1_M2_100
+
+cgemm_kernel_L1_M2_42:
+
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L1_M2_42
+
+cgemm_kernel_L1_M2_100:
+
+ SAVE2x1
+
+cgemm_kernel_L1_M2_END:
+
+
+cgemm_kernel_L1_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble cgemm_kernel_L1_END
+
+cgemm_kernel_L1_M1_20:
+
+ INIT1x1
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble cgemm_kernel_L1_M1_40
+
+cgemm_kernel_L1_M1_22:
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L1_M1_22
+
+
+cgemm_kernel_L1_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble cgemm_kernel_L1_M1_100
+
+cgemm_kernel_L1_M1_42:
+
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L1_M1_42
+
+cgemm_kernel_L1_M1_100:
+
+ SAVE1x1
+
+
+cgemm_kernel_L1_END:
+
+
+cgemm_kernel_L999:
+ mov x0, #0 // set return value
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+ ret
+
+ EPILOGUE
+
diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S
new file mode 100755
index 000000000..3131541d4
--- /dev/null
+++ b/kernel/arm64/ctrmm_kernel_8x4.S
@@ -0,0 +1,2425 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* Arguments:  X0 bm,  X1 bn,  X2 bk,  s0 alpha0,  s1 alpha1,  X3 ba,  X4 bb,  X5 C,  X6 ldc,  X7 offset */
+/* int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha0, FLOAT alpha1, FLOAT *ba, FLOAT *bb, FLOAT *C, BLASLONG ldc, BLASLONG offset) */
+
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define offset x7
+#define counterL x8
+#define counterI x9
+#define counterJ x10
+#define pB x11
+#define pCRow0 x12
+#define pCRow1 x13
+#define pCRow2 x14
+#define pA x15
+#define temp x16
+#define tempOffset x17
+#define tempK x18
+
+#define alpha0_R s10
+#define alphaV0_R v10.s[0]
+#define alpha0_I s11
+#define alphaV0_I v11.s[0]
+
+#define alpha1_R s14
+#define alphaV1_R v14.s[0]
+#define alpha1_I s15
+#define alphaV1_I v15.s[0]
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define OP_rr fmla
+#define OP_ii fmls
+#define OP_ri fmla
+#define OP_ir fmla
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define OP_rr fmla
+#define OP_ii fmla
+#define OP_ri fmls
+#define OP_ir fmla
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define OP_rr fmla
+#define OP_ii fmla
+#define OP_ri fmla
+#define OP_ir fmls
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define OP_rr fmla
+#define OP_ii fmls
+#define OP_ri fmls
+#define OP_ir fmls
+#endif
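+
+// The OP_* macros pick fmla/fmls so that each K step performs a complex
+// multiply-accumulate with A and/or B conjugated as required by the variant
+// selected above.  For the plain (NN/NT/TN/TT) case this is:
+//   acc_R += a_R*b_R - a_I*b_I
+//   acc_I += a_R*b_I + a_I*b_R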
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 offset
+// 08 counterL
+// 09 counterI
+// 10 counterJ
+// 11 pB
+// 12 pCRow0
+// 13 pCRow1
+// 14 pCRow2
+// 15 pA
+// 16 temp
+// 17 tempOffset
+// 18 must save tempK
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 ALPHA_R -> pA0_00_R, pA0_01_R, pA0_02_R, pA0_03_R
+//v01 ALPHA_I -> pA0_00_I, pA0_01_I, pA0_02_I, pA0_03_I
+//v02 pA0_04_R, pA0_05_R, pA0_06_R, pA0_07_R
+//v03 pA0_04_I, pA0_05_I, pA0_06_I, pA0_07_I
+//v04 pA1_00_R, pA1_01_R, pA1_02_R, pA1_03_R
+//v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I
+//v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R
+//v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I
+//v08 must save pB0_00_R, pB0_01_R, pB0_02_R, pB0_03_R
+//v09 must save pB0_00_I, pB0_01_I, pB0_02_I, pB0_03_I
+//v10 must save ALPHA0_R
+//v11 must save ALPHA0_I
+//v12 must save pB1_00_R, pB1_01_R, pB1_02_R, pB1_03_R
+//v13 must save pB1_00_I, pB1_01_I, pB1_02_I, pB1_03_I
+//v14 must save ALPHA1_R
+//v15 must save ALPHA1_I
+//v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R
+//v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I
+//v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R
+//v19 pC_04_I, pC_05_I, pC_06_I, pC_07_I
+//v20 pC_08_R, pC_09_R, pC_10_R, pC_11_R
+//v21 pC_08_I, pC_09_I, pC_10_I, pC_11_I
+//v22 pC_12_R, pC_13_R, pC_14_R, pC_15_R
+//v23 pC_12_I, pC_13_I, pC_14_I, pC_15_I
+//v24 pC_16_R, pC_17_R, pC_18_R, pC_19_R
+//v25 pC_16_I, pC_17_I, pC_18_I, pC_19_I
+//v26 pC_20_R, pC_21_R, pC_22_R, pC_23_R
+//v27 pC_20_I, pC_21_I, pC_22_I, pC_23_I
+//v28 pC_24_R, pC_25_R, pC_26_R, pC_27_R
+//v29 pC_24_I, pC_25_I, pC_26_I, pC_27_I
+//v30 pC_28_R, pC_29_R, pC_30_R, pC_31_R
+//v31 pC_28_I, pC_29_I, pC_30_I, pC_31_I
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
+.macro INIT8x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, wzr
+ fmov s19, s16
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s22, s17
+ fmov s23, s18
+ fmov s24, wzr
+ fmov s25, s16
+ fmov s26, s17
+ fmov s27, s18
+ fmov s28, wzr
+ fmov s29, s16
+ fmov s30, s17
+ fmov s31, s18
+.endm
+
+.macro KERNEL8x4_I
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
+
+ fmul v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v17.16b, v17.16b, v17.16b
+ fmls v17.4s, v0.4s, v9.4s[0]
+#else
+ fmul v17.4s, v0.4s, v9.4s[0]
+#endif
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ fmul v18.4s, v2.4s, v8.4s[0]
+ OP_ii v18.4s, v3.4s, v9.4s[0]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v19.16b, v19.16b, v19.16b
+ fmls v19.4s, v2.4s, v9.4s[0]
+#else
+ fmul v19.4s, v2.4s, v9.4s[0]
+#endif
+ OP_ir v19.4s, v3.4s, v8.4s[0]
+
+ fmul v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v21.16b, v21.16b, v21.16b
+ fmls v21.4s, v0.4s, v9.4s[1]
+#else
+ fmul v21.4s, v0.4s, v9.4s[1]
+#endif
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ fmul v22.4s, v2.4s, v8.4s[1]
+ OP_ii v22.4s, v3.4s, v9.4s[1]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v23.16b, v23.16b, v23.16b
+ fmls v23.4s, v2.4s, v9.4s[1]
+#else
+ fmul v23.4s, v2.4s, v9.4s[1]
+#endif
+ OP_ir v23.4s, v3.4s, v8.4s[1]
+
+ fmul v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v25.16b, v25.16b, v25.16b
+ fmls v25.4s, v0.4s, v9.4s[2]
+#else
+ fmul v25.4s, v0.4s, v9.4s[2]
+#endif
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ fmul v26.4s, v2.4s, v8.4s[2]
+ OP_ii v26.4s, v3.4s, v9.4s[2]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v27.16b, v27.16b, v27.16b
+ fmls v27.4s, v2.4s, v9.4s[2]
+#else
+ fmul v27.4s, v2.4s, v9.4s[2]
+#endif
+ OP_ir v27.4s, v3.4s, v8.4s[2]
+
+ fmul v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v29.16b, v29.16b, v29.16b
+ fmls v29.4s, v0.4s, v9.4s[3]
+#else
+ fmul v29.4s, v0.4s, v9.4s[3]
+#endif
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+
+ fmul v30.4s, v2.4s, v8.4s[3]
+ OP_ii v30.4s, v3.4s, v9.4s[3]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v31.16b, v31.16b, v31.16b
+ fmls v31.4s, v2.4s, v9.4s[3]
+#else
+ fmul v31.4s, v2.4s, v9.4s[3]
+#endif
+ OP_ir v31.4s, v3.4s, v8.4s[3]
+
+ ld2 {v12.4s, v13.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v4.4s, v5.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v6.4s, v7.4s}, [pA]
+ add pA, pA, #32
+.endm
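+
+// KERNEL8x4_I/_M1/_M2/_E form a software-pipelined sequence: _I computes the
+// first K step from v0-v3/v8-v9 and preloads the next step into v4-v7/v12-v13,
+// _M1 and _M2 then alternate between the two register sets, each loading the
+// operands for the following step, and _E drains the last preloaded step
+// without issuing further loads.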
+
+.macro KERNEL8x4_M1
+ OP_rr v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+ OP_ri v17.4s, v0.4s, v9.4s[0]
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ OP_rr v18.4s, v2.4s, v8.4s[0]
+ OP_ii v18.4s, v3.4s, v9.4s[0]
+ OP_ri v19.4s, v2.4s, v9.4s[0]
+ OP_ir v19.4s, v3.4s, v8.4s[0]
+
+ OP_rr v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+ OP_ri v21.4s, v0.4s, v9.4s[1]
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ OP_rr v22.4s, v2.4s, v8.4s[1]
+ OP_ii v22.4s, v3.4s, v9.4s[1]
+ OP_ri v23.4s, v2.4s, v9.4s[1]
+ OP_ir v23.4s, v3.4s, v8.4s[1]
+
+ OP_rr v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+ OP_ri v25.4s, v0.4s, v9.4s[2]
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ OP_rr v26.4s, v2.4s, v8.4s[2]
+ OP_ii v26.4s, v3.4s, v9.4s[2]
+ OP_ri v27.4s, v2.4s, v9.4s[2]
+ OP_ir v27.4s, v3.4s, v8.4s[2]
+
+ OP_rr v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+ OP_ri v29.4s, v0.4s, v9.4s[3]
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+
+ OP_rr v30.4s, v2.4s, v8.4s[3]
+ OP_ii v30.4s, v3.4s, v9.4s[3]
+ OP_ri v31.4s, v2.4s, v9.4s[3]
+ OP_ir v31.4s, v3.4s, v8.4s[3]
+
+ ld2 {v12.4s, v13.4s}, [pB] // For next round
+ add pB, pB, #32
+ ld2 {v4.4s, v5.4s}, [pA] // For next round
+ add pA, pA, #32
+ ld2 {v6.4s, v7.4s}, [pA]
+ add pA, pA, #32
+.endm
+
+.macro KERNEL8x4_M2
+ OP_rr v16.4s, v4.4s, v12.4s[0]
+ OP_ii v16.4s, v5.4s, v13.4s[0]
+ OP_ri v17.4s, v4.4s, v13.4s[0]
+ OP_ir v17.4s, v5.4s, v12.4s[0]
+
+ OP_rr v18.4s, v6.4s, v12.4s[0]
+ OP_ii v18.4s, v7.4s, v13.4s[0]
+ OP_ri v19.4s, v6.4s, v13.4s[0]
+ OP_ir v19.4s, v7.4s, v12.4s[0]
+
+ OP_rr v20.4s, v4.4s, v12.4s[1]
+ OP_ii v20.4s, v5.4s, v13.4s[1]
+ OP_ri v21.4s, v4.4s, v13.4s[1]
+ OP_ir v21.4s, v5.4s, v12.4s[1]
+
+ OP_rr v22.4s, v6.4s, v12.4s[1]
+ OP_ii v22.4s, v7.4s, v13.4s[1]
+ OP_ri v23.4s, v6.4s, v13.4s[1]
+ OP_ir v23.4s, v7.4s, v12.4s[1]
+
+ OP_rr v24.4s, v4.4s, v12.4s[2]
+ OP_ii v24.4s, v5.4s, v13.4s[2]
+ OP_ri v25.4s, v4.4s, v13.4s[2]
+ OP_ir v25.4s, v5.4s, v12.4s[2]
+
+ OP_rr v26.4s, v6.4s, v12.4s[2]
+ OP_ii v26.4s, v7.4s, v13.4s[2]
+ OP_ri v27.4s, v6.4s, v13.4s[2]
+ OP_ir v27.4s, v7.4s, v12.4s[2]
+
+ OP_rr v28.4s, v4.4s, v12.4s[3]
+ OP_ii v28.4s, v5.4s, v13.4s[3]
+ OP_ri v29.4s, v4.4s, v13.4s[3]
+ OP_ir v29.4s, v5.4s, v12.4s[3]
+
+ OP_rr v30.4s, v6.4s, v12.4s[3]
+ OP_ii v30.4s, v7.4s, v13.4s[3]
+ OP_ri v31.4s, v6.4s, v13.4s[3]
+ OP_ir v31.4s, v7.4s, v12.4s[3]
+
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
+.endm
+
+.macro KERNEL8x4_E
+ OP_rr v16.4s, v4.4s, v12.4s[0]
+ OP_ii v16.4s, v5.4s, v13.4s[0]
+ OP_ri v17.4s, v4.4s, v13.4s[0]
+ OP_ir v17.4s, v5.4s, v12.4s[0]
+
+ OP_rr v18.4s, v6.4s, v12.4s[0]
+ OP_ii v18.4s, v7.4s, v13.4s[0]
+ OP_ri v19.4s, v6.4s, v13.4s[0]
+ OP_ir v19.4s, v7.4s, v12.4s[0]
+
+ OP_rr v20.4s, v4.4s, v12.4s[1]
+ OP_ii v20.4s, v5.4s, v13.4s[1]
+ OP_ri v21.4s, v4.4s, v13.4s[1]
+ OP_ir v21.4s, v5.4s, v12.4s[1]
+
+ OP_rr v22.4s, v6.4s, v12.4s[1]
+ OP_ii v22.4s, v7.4s, v13.4s[1]
+ OP_ri v23.4s, v6.4s, v13.4s[1]
+ OP_ir v23.4s, v7.4s, v12.4s[1]
+
+ OP_rr v24.4s, v4.4s, v12.4s[2]
+ OP_ii v24.4s, v5.4s, v13.4s[2]
+ OP_ri v25.4s, v4.4s, v13.4s[2]
+ OP_ir v25.4s, v5.4s, v12.4s[2]
+
+ OP_rr v26.4s, v6.4s, v12.4s[2]
+ OP_ii v26.4s, v7.4s, v13.4s[2]
+ OP_ri v27.4s, v6.4s, v13.4s[2]
+ OP_ir v27.4s, v7.4s, v12.4s[2]
+
+ OP_rr v28.4s, v4.4s, v12.4s[3]
+ OP_ii v28.4s, v5.4s, v13.4s[3]
+ OP_ri v29.4s, v4.4s, v13.4s[3]
+ OP_ir v29.4s, v5.4s, v12.4s[3]
+
+ OP_rr v30.4s, v6.4s, v12.4s[3]
+ OP_ii v30.4s, v7.4s, v13.4s[3]
+ OP_ri v31.4s, v6.4s, v13.4s[3]
+ OP_ir v31.4s, v7.4s, v12.4s[3]
+
+.endm
+
+.macro KERNEL8x4_SUB
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+ OP_ri v17.4s, v0.4s, v9.4s[0]
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ OP_rr v18.4s, v2.4s, v8.4s[0]
+ OP_ii v18.4s, v3.4s, v9.4s[0]
+ OP_ri v19.4s, v2.4s, v9.4s[0]
+ OP_ir v19.4s, v3.4s, v8.4s[0]
+
+ OP_rr v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+ OP_ri v21.4s, v0.4s, v9.4s[1]
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ OP_rr v22.4s, v2.4s, v8.4s[1]
+ OP_ii v22.4s, v3.4s, v9.4s[1]
+ OP_ri v23.4s, v2.4s, v9.4s[1]
+ OP_ir v23.4s, v3.4s, v8.4s[1]
+
+ OP_rr v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+ OP_ri v25.4s, v0.4s, v9.4s[2]
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ OP_rr v26.4s, v2.4s, v8.4s[2]
+ OP_ii v26.4s, v3.4s, v9.4s[2]
+ OP_ri v27.4s, v2.4s, v9.4s[2]
+ OP_ir v27.4s, v3.4s, v8.4s[2]
+
+ OP_rr v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+ OP_ri v29.4s, v0.4s, v9.4s[3]
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+
+ OP_rr v30.4s, v2.4s, v8.4s[3]
+ OP_ii v30.4s, v3.4s, v9.4s[3]
+ OP_ri v31.4s, v2.4s, v9.4s[3]
+ OP_ir v31.4s, v3.4s, v8.4s[3]
+
+.endm
+
+.macro SAVE8x4
+ mov pCRow1, pCRow0
+
+
+ fmul v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmul v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+
+ fmul v2.4s, v18.4s, alphaV0_R
+ fmls v2.4s, v19.4s, alphaV0_I
+ fmul v3.4s, v18.4s, alphaV1_I
+ fmla v3.4s, v19.4s, alphaV1_R
+ st2 {v2.4s, v3.4s}, [pCRow2]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v4.4s, v20.4s, alphaV0_R
+ fmls v4.4s, v21.4s, alphaV0_I
+ fmul v5.4s, v20.4s, alphaV1_I
+ fmla v5.4s, v21.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+
+ fmul v6.4s, v22.4s, alphaV0_R
+ fmls v6.4s, v23.4s, alphaV0_I
+ fmul v7.4s, v22.4s, alphaV1_I
+ fmla v7.4s, v23.4s, alphaV1_R
+ st2 {v6.4s, v7.4s}, [pCRow2]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v0.4s, v24.4s, alphaV0_R
+ fmls v0.4s, v25.4s, alphaV0_I
+ fmul v1.4s, v24.4s, alphaV1_I
+ fmla v1.4s, v25.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+
+ fmul v2.4s, v26.4s, alphaV0_R
+ fmls v2.4s, v27.4s, alphaV0_I
+ fmul v3.4s, v26.4s, alphaV1_I
+ fmla v3.4s, v27.4s, alphaV1_R
+ st2 {v2.4s, v3.4s}, [pCRow2]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v4.4s, v28.4s, alphaV0_R
+ fmls v4.4s, v29.4s, alphaV0_I
+ fmul v5.4s, v28.4s, alphaV1_I
+ fmla v5.4s, v29.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+
+ fmul v6.4s, v30.4s, alphaV0_R
+ fmls v6.4s, v31.4s, alphaV0_I
+ fmul v7.4s, v30.4s, alphaV1_I
+ fmla v7.4s, v31.4s, alphaV1_R
+ st2 {v6.4s, v7.4s}, [pCRow2]
+
+ add pCRow0, pCRow0, #64
+.endm
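+
+// Unlike the cgemm kernel, the TRMM SAVE macros do not load C first: the tile
+// is simply overwritten with alpha * accumulator (fmul/fmls/fmla instead of
+// ld2 + fmla), matching TRMM overwriting its output rather than accumulating
+// into it.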
+
+/******************************************************************************/
+
+.macro INIT4x4
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+ fmov s24, s17
+ fmov s25, s16
+ fmov s28, s17
+ fmov s29, s16
+.endm
+
+.macro KERNEL4x4_I
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+
+ fmul v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v17.16b, v17.16b, v17.16b
+ fmls v17.4s, v0.4s, v9.4s[0]
+#else
+ fmul v17.4s, v0.4s, v9.4s[0]
+#endif
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ fmul v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v21.16b, v21.16b, v21.16b
+ fmls v21.4s, v0.4s, v9.4s[1]
+#else
+ fmul v21.4s, v0.4s, v9.4s[1]
+#endif
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ fmul v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v25.16b, v25.16b, v25.16b
+ fmls v25.4s, v0.4s, v9.4s[2]
+#else
+ fmul v25.4s, v0.4s, v9.4s[2]
+#endif
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ fmul v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v29.16b, v29.16b, v29.16b
+ fmls v29.4s, v0.4s, v9.4s[3]
+#else
+ fmul v29.4s, v0.4s, v9.4s[3]
+#endif
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+
+ ld2 {v12.4s, v13.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v4.4s, v5.4s}, [pA]
+ add pA, pA, #32
+.endm
+
+.macro KERNEL4x4_M1
+ OP_rr v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+ OP_ri v17.4s, v0.4s, v9.4s[0]
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ ld2 {v12.4s, v13.4s}, [pB] // For next round
+ add pB, pB, #32
+
+ OP_rr v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+ OP_ri v21.4s, v0.4s, v9.4s[1]
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ ld2 {v4.4s, v5.4s}, [pA] // For next round
+ add pA, pA, #32
+
+ OP_rr v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+ OP_ri v25.4s, v0.4s, v9.4s[2]
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ prfm PLDL1KEEP, [pA, #512]
+
+ OP_rr v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+ OP_ri v29.4s, v0.4s, v9.4s[3]
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+.endm
+
+.macro KERNEL4x4_M2
+ OP_rr v16.4s, v4.4s, v12.4s[0]
+ OP_ii v16.4s, v5.4s, v13.4s[0]
+ OP_ri v17.4s, v4.4s, v13.4s[0]
+ OP_ir v17.4s, v5.4s, v12.4s[0]
+
+ ld2 {v8.4s, v9.4s}, [pB] // For next round
+ add pB, pB, #32
+
+ OP_rr v20.4s, v4.4s, v12.4s[1]
+ OP_ii v20.4s, v5.4s, v13.4s[1]
+ OP_ri v21.4s, v4.4s, v13.4s[1]
+ OP_ir v21.4s, v5.4s, v12.4s[1]
+
+ ld2 {v0.4s, v1.4s}, [pA] // For next round
+ add pA, pA, #32
+
+ OP_rr v24.4s, v4.4s, v12.4s[2]
+ OP_ii v24.4s, v5.4s, v13.4s[2]
+ OP_ri v25.4s, v4.4s, v13.4s[2]
+ OP_ir v25.4s, v5.4s, v12.4s[2]
+
+ prfm PLDL1KEEP, [pB, #512]
+
+ OP_rr v28.4s, v4.4s, v12.4s[3]
+ OP_ii v28.4s, v5.4s, v13.4s[3]
+ OP_ri v29.4s, v4.4s, v13.4s[3]
+ OP_ir v29.4s, v5.4s, v12.4s[3]
+.endm
+
+.macro KERNEL4x4_E
+ OP_rr v16.4s, v4.4s, v12.4s[0]
+ OP_ii v16.4s, v5.4s, v13.4s[0]
+ OP_ri v17.4s, v4.4s, v13.4s[0]
+ OP_ir v17.4s, v5.4s, v12.4s[0]
+
+ OP_rr v20.4s, v4.4s, v12.4s[1]
+ OP_ii v20.4s, v5.4s, v13.4s[1]
+ OP_ri v21.4s, v4.4s, v13.4s[1]
+ OP_ir v21.4s, v5.4s, v12.4s[1]
+
+ OP_rr v24.4s, v4.4s, v12.4s[2]
+ OP_ii v24.4s, v5.4s, v13.4s[2]
+ OP_ri v25.4s, v4.4s, v13.4s[2]
+ OP_ir v25.4s, v5.4s, v12.4s[2]
+
+ OP_rr v28.4s, v4.4s, v12.4s[3]
+ OP_ii v28.4s, v5.4s, v13.4s[3]
+ OP_ri v29.4s, v4.4s, v13.4s[3]
+ OP_ir v29.4s, v5.4s, v12.4s[3]
+.endm
+
+.macro KERNEL4x4_SUB
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+ OP_ri v17.4s, v0.4s, v9.4s[0]
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ OP_rr v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+ OP_ri v21.4s, v0.4s, v9.4s[1]
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ OP_rr v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+ OP_ri v25.4s, v0.4s, v9.4s[2]
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ OP_rr v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+ OP_ri v29.4s, v0.4s, v9.4s[3]
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+.endm
+
+.macro SAVE4x4
+ mov pCRow1, pCRow0
+
+
+ fmul v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmul v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v4.4s, v20.4s, alphaV0_R
+ fmls v4.4s, v21.4s, alphaV0_I
+ fmul v5.4s, v20.4s, alphaV1_I
+ fmla v5.4s, v21.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v0.4s, v24.4s, alphaV0_R
+ fmls v0.4s, v25.4s, alphaV0_I
+ fmul v1.4s, v24.4s, alphaV1_I
+ fmla v1.4s, v25.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v4.4s, v28.4s, alphaV0_R
+ fmls v4.4s, v29.4s, alphaV0_I
+ fmul v5.4s, v28.4s, alphaV1_I
+ fmla v5.4s, v29.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, s16
+ fmov s21, s17
+ fmov s24, s16
+ fmov s25, s17
+ fmov s28, s16
+ fmov s29, s17
+.endm
+
+.macro KERNEL2x4_SUB
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ OP_rr v16.2s, v0.2s, v8.4s[0]
+ OP_ii v16.2s, v1.2s, v9.4s[0]
+ OP_ri v17.2s, v0.2s, v9.4s[0]
+ OP_ir v17.2s, v1.2s, v8.4s[0]
+
+ OP_rr v20.2s, v0.2s, v8.4s[1]
+ OP_ii v20.2s, v1.2s, v9.4s[1]
+ OP_ri v21.2s, v0.2s, v9.4s[1]
+ OP_ir v21.2s, v1.2s, v8.4s[1]
+
+ OP_rr v24.2s, v0.2s, v8.4s[2]
+ OP_ii v24.2s, v1.2s, v9.4s[2]
+ OP_ri v25.2s, v0.2s, v9.4s[2]
+ OP_ir v25.2s, v1.2s, v8.4s[2]
+
+ OP_rr v28.2s, v0.2s, v8.4s[3]
+ OP_ii v28.2s, v1.2s, v9.4s[3]
+ OP_ri v29.2s, v0.2s, v9.4s[3]
+ OP_ir v29.2s, v1.2s, v8.4s[3]
+.endm
+
+.macro SAVE2x4
+ mov pCRow1, pCRow0
+
+
+ fmul v0.2s, v16.2s, alphaV0_R
+ fmls v0.2s, v17.2s, alphaV0_I
+ fmul v1.2s, v16.2s, alphaV1_I
+ fmla v1.2s, v17.2s, alphaV1_R
+ st2 {v0.2s, v1.2s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v4.2s, v20.2s, alphaV0_R
+ fmls v4.2s, v21.2s, alphaV0_I
+ fmul v5.2s, v20.2s, alphaV1_I
+ fmla v5.2s, v21.2s, alphaV1_R
+ st2 {v4.2s, v5.2s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v0.2s, v24.2s, alphaV0_R
+ fmls v0.2s, v25.2s, alphaV0_I
+ fmul v1.2s, v24.2s, alphaV1_I
+ fmla v1.2s, v25.2s, alphaV1_R
+ st2 {v0.2s, v1.2s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v4.2s, v28.2s, alphaV0_R
+ fmls v4.2s, v29.2s, alphaV0_I
+ fmul v5.2s, v28.2s, alphaV1_I
+ fmla v5.2s, v29.2s, alphaV1_R
+ st2 {v4.2s, v5.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, s16
+ fmov s21, s17
+ fmov s24, s16
+ fmov s25, s17
+ fmov s28, s16
+ fmov s29, s17
+.endm
+
+.macro KERNEL1x4_SUB
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.s, v1.s}[0], [pA]
+ add pA, pA, #8
+
+ OP_rr s16, s0, v8.4s[0]
+ OP_ii s16, s1, v9.4s[0]
+ OP_ri s17, s0, v9.4s[0]
+ OP_ir s17, s1, v8.4s[0]
+
+ OP_rr s20, s0, v8.4s[1]
+ OP_ii s20, s1, v9.4s[1]
+ OP_ri s21, s0, v9.4s[1]
+ OP_ir s21, s1, v8.4s[1]
+
+ OP_rr s24, s0, v8.4s[2]
+ OP_ii s24, s1, v9.4s[2]
+ OP_ri s25, s0, v9.4s[2]
+ OP_ir s25, s1, v8.4s[2]
+
+ OP_rr s28, s0, v8.4s[3]
+ OP_ii s28, s1, v9.4s[3]
+ OP_ri s29, s0, v9.4s[3]
+ OP_ir s29, s1, v8.4s[3]
+.endm
+
+.macro SAVE1x4
+ mov pCRow1, pCRow0
+
+
+ fmul s0, s16, alphaV0_R
+ fmls s0, s17, alphaV0_I
+ fmul s1, s16, alphaV1_I
+ fmla s1, s17, alphaV1_R
+ st2 {v0.s, v1.s}[0], [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul s4, s20, alphaV0_R
+ fmls s4, s21, alphaV0_I
+ fmul s5, s20, alphaV1_I
+ fmla s5, s21, alphaV1_R
+ st2 {v4.s, v5.s}[0], [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul s0, s24, alphaV0_R
+ fmls s0, s25, alphaV0_I
+ fmul s1, s24, alphaV1_I
+ fmla s1, s25, alphaV1_R
+ st2 {v0.s, v1.s}[0], [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul s4, s28, alphaV0_R
+ fmls s4, s29, alphaV0_I
+ fmul s5, s28, alphaV1_I
+ fmla s5, s29, alphaV1_R
+ st2 {v4.s, v5.s}[0], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x2
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, wzr
+ fmov s19, s16
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s22, s17
+ fmov s23, s18
+.endm
+
+.macro KERNEL8x2_SUB
+ ld2 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.2s[0]
+ OP_ii v16.4s, v1.4s, v9.2s[0]
+ OP_ri v17.4s, v0.4s, v9.2s[0]
+ OP_ir v17.4s, v1.4s, v8.2s[0]
+
+ OP_rr v18.4s, v2.4s, v8.2s[0]
+ OP_ii v18.4s, v3.4s, v9.2s[0]
+ OP_ri v19.4s, v2.4s, v9.2s[0]
+ OP_ir v19.4s, v3.4s, v8.2s[0]
+
+ OP_rr v20.4s, v0.4s, v8.2s[1]
+ OP_ii v20.4s, v1.4s, v9.2s[1]
+ OP_ri v21.4s, v0.4s, v9.2s[1]
+ OP_ir v21.4s, v1.4s, v8.2s[1]
+
+ OP_rr v22.4s, v2.4s, v8.2s[1]
+ OP_ii v22.4s, v3.4s, v9.2s[1]
+ OP_ri v23.4s, v2.4s, v9.2s[1]
+ OP_ir v23.4s, v3.4s, v8.2s[1]
+.endm
+
+.macro SAVE8x2
+ mov pCRow1, pCRow0
+
+
+ fmul v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmul v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+
+ fmul v2.4s, v18.4s, alphaV0_R
+ fmls v2.4s, v19.4s, alphaV0_I
+ fmul v3.4s, v18.4s, alphaV1_I
+ fmla v3.4s, v19.4s, alphaV1_R
+ st2 {v2.4s, v3.4s}, [pCRow2]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v4.4s, v20.4s, alphaV0_R
+ fmls v4.4s, v21.4s, alphaV0_I
+ fmul v5.4s, v20.4s, alphaV1_I
+ fmla v5.4s, v21.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+
+ fmul v6.4s, v22.4s, alphaV0_R
+ fmls v6.4s, v23.4s, alphaV0_I
+ fmul v7.4s, v22.4s, alphaV1_I
+ fmla v7.4s, v23.4s, alphaV1_R
+ st2 {v6.4s, v7.4s}, [pCRow2]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x2
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, s16
+ fmov s21, s17
+.endm
+
+.macro KERNEL4x2_SUB
+ ld2 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.2s[0]
+ OP_ii v16.4s, v1.4s, v9.2s[0]
+ OP_ri v17.4s, v0.4s, v9.2s[0]
+ OP_ir v17.4s, v1.4s, v8.2s[0]
+
+ OP_rr v20.4s, v0.4s, v8.2s[1]
+ OP_ii v20.4s, v1.4s, v9.2s[1]
+ OP_ri v21.4s, v0.4s, v9.2s[1]
+ OP_ir v21.4s, v1.4s, v8.2s[1]
+.endm
+
+.macro SAVE4x2
+ mov pCRow1, pCRow0
+
+
+ fmul v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmul v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v4.4s, v20.4s, alphaV0_R
+ fmls v4.4s, v21.4s, alphaV0_I
+ fmul v5.4s, v20.4s, alphaV1_I
+ fmla v5.4s, v21.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x2
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, s16
+ fmov s21, s17
+.endm
+
+.macro KERNEL2x2_SUB
+ ld2 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld2 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ OP_rr v16.2s, v0.2s, v8.2s[0]
+ OP_ii v16.2s, v1.2s, v9.2s[0]
+ OP_ri v17.2s, v0.2s, v9.2s[0]
+ OP_ir v17.2s, v1.2s, v8.2s[0]
+
+ OP_rr v20.2s, v0.2s, v8.2s[1]
+ OP_ii v20.2s, v1.2s, v9.2s[1]
+ OP_ri v21.2s, v0.2s, v9.2s[1]
+ OP_ir v21.2s, v1.2s, v8.2s[1]
+.endm
+
+.macro SAVE2x2
+ mov pCRow1, pCRow0
+
+
+ fmul v0.2s, v16.2s, alphaV0_R
+ fmls v0.2s, v17.2s, alphaV0_I
+ fmul v1.2s, v16.2s, alphaV1_I
+ fmla v1.2s, v17.2s, alphaV1_R
+ st2 {v0.2s, v1.2s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v4.2s, v20.2s, alphaV0_R
+ fmls v4.2s, v21.2s, alphaV0_I
+ fmul v5.2s, v20.2s, alphaV1_I
+ fmla v5.2s, v21.2s, alphaV1_R
+ st2 {v4.2s, v5.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, wzr
+ fmov s21, wzr
+.endm
+
+.macro KERNEL1x2_SUB
+ ld2 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld2 {v0.s, v1.s}[0], [pA]
+ add pA, pA, #8
+
+ OP_rr s16, s0, v8.2s[0]
+ OP_ii s16, s1, v9.2s[0]
+ OP_ri s17, s0, v9.2s[0]
+ OP_ir s17, s1, v8.2s[0]
+
+ OP_rr s20, s0, v8.2s[1]
+ OP_ii s20, s1, v9.2s[1]
+ OP_ri s21, s0, v9.2s[1]
+ OP_ir s21, s1, v8.2s[1]
+.endm
+
+.macro SAVE1x2
+ mov pCRow1, pCRow0
+
+
+ fmul s0, s16, alphaV0_R
+ fmls s0, s17, alphaV0_I
+ fmul s1, s16, alphaV1_I
+ fmla s1, s17, alphaV1_R
+ st2 {v0.s, v1.s}[0], [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul s4, s20, alphaV0_R
+ fmls s4, s21, alphaV0_I
+ fmul s5, s20, alphaV1_I
+ fmla s5, s21, alphaV1_R
+ st2 {v4.s, v5.s}[0], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x1
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, wzr
+ fmov s19, s16
+.endm
+
+.macro KERNEL8x1_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v8.4s[1]
+ OP_ri v17.4s, v0.4s, v8.4s[1]
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ OP_rr v18.4s, v2.4s, v8.4s[0]
+ OP_ii v18.4s, v3.4s, v8.4s[1]
+ OP_ri v19.4s, v2.4s, v8.4s[1]
+ OP_ir v19.4s, v3.4s, v8.4s[0]
+.endm
+
+.macro SAVE8x1
+ mov pCRow1, pCRow0
+
+
+ fmul v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmul v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow1, pCRow1, #32
+
+
+ fmul v2.4s, v18.4s, alphaV0_R
+ fmls v2.4s, v19.4s, alphaV0_I
+ fmul v3.4s, v18.4s, alphaV1_I
+ fmla v3.4s, v19.4s, alphaV1_R
+ st2 {v2.4s, v3.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT4x1
+ fmov s16, wzr
+ fmov s17, s16
+.endm
+
+.macro KERNEL4x1_SUB
+ ld2 {v8.s, v9.s}[0], [pB]
+ add pB, pB, #8
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
+.endm
+
+.macro SAVE4x1
+ mov pCRow1, pCRow0
+
+
+ fmul v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmul v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x1
+ fmov s16, wzr
+ fmov s17, wzr
+.endm
+
+.macro KERNEL2x1_SUB
+ ld2 {v8.s, v9.s}[0], [pB]
+ add pB, pB, #8
+ ld2 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ OP_rr v16.2s, v0.2s, v8.s[0]
+ OP_ii v16.2s, v1.2s, v9.s[0]
+ OP_ri v17.2s, v0.2s, v9.s[0]
+ OP_ir v17.2s, v1.2s, v8.s[0]
+.endm
+
+.macro SAVE2x1
+ mov pCRow1, pCRow0
+
+
+ fmul v0.2s, v16.2s, alphaV0_R
+ fmls v0.2s, v17.2s, alphaV0_I
+ fmul v1.2s, v16.2s, alphaV1_I
+ fmla v1.2s, v17.2s, alphaV1_R
+ st2 {v0.2s, v1.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+ fmov s16, wzr
+ fmov s17, wzr
+.endm
+
+.macro KERNEL1x1_SUB
+ ld2 {v8.s, v9.s}[0], [pB]
+ add pB, pB, #8
+ ld2 {v0.s, v1.s}[0], [pA]
+ add pA, pA, #8
+
+ OP_rr s16, s0, v8.s[0]
+ OP_ii s16, s1, v9.s[0]
+ OP_ri s17, s0, v9.s[0]
+ OP_ir s17, s1, v8.s[0]
+.endm
+
+.macro SAVE1x1
+ mov pCRow1, pCRow0
+
+
+ fmul s0, s16, alphaV0_R
+ fmls s0, s17, alphaV0_I
+ fmul s1, s16, alphaV1_I
+ fmla s1, s17, alphaV1_R
+ st2 {v0.s, v1.s}[0], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
+ PROLOGUE
+
+ .align 5
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+
+ fmov alpha0_R, s0
+ fmov alpha0_I, s1
+ fmov alpha1_R, s0
+ fmov alpha1_I, s1
+
+ lsl LDC, LDC, #3 // ldc = ldc * 8
+
+#if !defined(LEFT)
+ neg tempOffset, offset
+#endif
+ mov pB, origPB
+
+ mov counterJ, origN
+ asr counterJ, counterJ, #2 // J = J / 4
+ cmp counterJ, #0
+ ble ctrmm_kernel_L2_BEGIN
+
+/******************************************************************************/
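+// The loop nest mirrors the cgemm kernel above (N in blocks of 4/2/1 columns,
+// M in blocks of 8/4/2/1 rows), with extra tempOffset/tempK bookkeeping for
+// the TRMM triangular offset handling.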
+
+ctrmm_kernel_L4_BEGIN:
+ mov pCRow0, pC // pCRow0 = C
+ add pC, pC, LDC, lsl #2
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = start of A array
+
+ctrmm_kernel_L4_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble ctrmm_kernel_L4_M4_BEGIN
+
+ctrmm_kernel_L4_M8_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #6
+ add pA, pA, temp
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #4
+#endif
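+
+// Standard OpenBLAS TRMM offset bookkeeping: depending on LEFT/TRANSA the
+// pA/pB pointers above are advanced past the part of the packed panels that
+// falls in the zero triangle, and tempK holds the effective inner-product
+// length actually performed for this tile.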
+
+ asr counterL , tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt ctrmm_kernel_L4_M8_32
+
+ KERNEL8x4_I // do one in the K
+ KERNEL8x4_M2 // do another in the K
+
+ subs counterL, counterL, #2 // subtract 2
+ ble ctrmm_kernel_L4_M8_22a
+ .align 5
+
+ctrmm_kernel_L4_M8_22:
+
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L4_M8_22
+
+
+ctrmm_kernel_L4_M8_22a:
+
+ KERNEL8x4_M1
+ KERNEL8x4_E
+
+ b ctrmm_kernel_L4_M8_44
+
+ctrmm_kernel_L4_M8_32:
+
+ tst counterL, #1
+ ble ctrmm_kernel_L4_M8_40
+
+ KERNEL8x4_I
+
+ KERNEL8x4_E
+
+ b ctrmm_kernel_L4_M8_44
+
+ctrmm_kernel_L4_M8_40:
+
+ INIT8x4
+
+ctrmm_kernel_L4_M8_44:
+
+ ands counterL , tempK, #1
+ ble ctrmm_kernel_L4_M8_100
+
+ctrmm_kernel_L4_M8_46:
+
+ KERNEL8x4_SUB
+
+ctrmm_kernel_L4_M8_100:
+
+ SAVE8x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #6
+ add pA, pA, temp
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+ctrmm_kernel_L4_M8_END:
+ subs counterI, counterI, #1
+ bne ctrmm_kernel_L4_M8_20
+
+ctrmm_kernel_L4_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble ctrmm_kernel_L4_END
+
+ tst counterI, #4
+ ble ctrmm_kernel_L4_M2_BEGIN
+
+ctrmm_kernel_L4_M4_20:
+
+ INIT4x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #4
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble ctrmm_kernel_L4_M4_40
+
+ctrmm_kernel_L4_M4_22:
+
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L4_M4_22
+
+
+ctrmm_kernel_L4_M4_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble ctrmm_kernel_L4_M4_100
+
+ctrmm_kernel_L4_M4_42:
+
+ KERNEL4x4_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L4_M4_42
+
+ctrmm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+ctrmm_kernel_L4_M4_END:
+
+
+ctrmm_kernel_L4_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble ctrmm_kernel_L4_END
+
+	tst	counterI, #2					// is there a 2-row block (M & 2) left?
+ ble ctrmm_kernel_L4_M1_BEGIN
+
+ctrmm_kernel_L4_M2_20:
+
+ INIT2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #4
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble ctrmm_kernel_L4_M2_40
+
+ctrmm_kernel_L4_M2_22:
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L4_M2_22
+
+
+ctrmm_kernel_L4_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble ctrmm_kernel_L4_M2_100
+
+ctrmm_kernel_L4_M2_42:
+
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L4_M2_42
+
+ctrmm_kernel_L4_M2_100:
+
+ SAVE2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+
+ctrmm_kernel_L4_M2_END:
+
+
+ctrmm_kernel_L4_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble ctrmm_kernel_L4_END
+
+ctrmm_kernel_L4_M1_20:
+
+ INIT1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #4
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble ctrmm_kernel_L4_M1_40
+
+ctrmm_kernel_L4_M1_22:
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L4_M1_22
+
+
+ctrmm_kernel_L4_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble ctrmm_kernel_L4_M1_100
+
+ctrmm_kernel_L4_M1_42:
+
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L4_M1_42
+
+ctrmm_kernel_L4_M1_100:
+
+ SAVE1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+
+ctrmm_kernel_L4_END:
+
+ lsl temp, origK, #5
+ add origPB, origPB, temp // B = B + K * 4 * 8
+
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+ subs counterJ, counterJ , #1 // j--
+ bgt ctrmm_kernel_L4_BEGIN
+
+
+/******************************************************************************/
+
+ctrmm_kernel_L2_BEGIN: // fewer than 4 columns left in N direction; do a block of 2
+
+ mov counterJ , origN
+ tst counterJ , #3
+	ble	ctrmm_kernel_L999				// no remaining columns (N mod 4 == 0)
+
+ tst counterJ , #2
+ ble ctrmm_kernel_L1_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #1
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = A
+
+ctrmm_kernel_L2_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble ctrmm_kernel_L2_M4_BEGIN
+
+ctrmm_kernel_L2_M8_20:
+
+ INIT8x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #6
+ add pA, pA, temp
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #2
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble ctrmm_kernel_L2_M8_40
+ .align 5
+
+ctrmm_kernel_L2_M8_22:
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L2_M8_22
+
+
+ctrmm_kernel_L2_M8_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble ctrmm_kernel_L2_M8_100
+
+ctrmm_kernel_L2_M8_42:
+
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L2_M8_42
+
+ctrmm_kernel_L2_M8_100:
+
+ SAVE8x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #6
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+ctrmm_kernel_L2_M8_END:
+
+ subs counterI, counterI, #1
+ bgt ctrmm_kernel_L2_M8_20
+
+ctrmm_kernel_L2_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble ctrmm_kernel_L2_END
+
+	tst	counterI, #4					// is there a 4-row remainder in M?
+ ble ctrmm_kernel_L2_M2_BEGIN
+
+ctrmm_kernel_L2_M4_20:
+
+ INIT4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #2
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble ctrmm_kernel_L2_M4_40
+ .align 5
+
+ctrmm_kernel_L2_M4_22:
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L2_M4_22
+
+
+ctrmm_kernel_L2_M4_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble ctrmm_kernel_L2_M4_100
+
+ctrmm_kernel_L2_M4_42:
+
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L2_M4_42
+
+ctrmm_kernel_L2_M4_100:
+
+ SAVE4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+ctrmm_kernel_L2_M4_END:
+
+
+ctrmm_kernel_L2_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble ctrmm_kernel_L2_END
+
+	tst	counterI, #2					// is there a 2-row remainder in M?
+ ble ctrmm_kernel_L2_M1_BEGIN
+
+ctrmm_kernel_L2_M2_20:
+
+ INIT2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #2
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble ctrmm_kernel_L2_M2_40
+
+ctrmm_kernel_L2_M2_22:
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L2_M2_22
+
+
+ctrmm_kernel_L2_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble ctrmm_kernel_L2_M2_100
+
+ctrmm_kernel_L2_M2_42:
+
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L2_M2_42
+
+ctrmm_kernel_L2_M2_100:
+
+ SAVE2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+
+ctrmm_kernel_L2_M2_END:
+
+
+ctrmm_kernel_L2_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble ctrmm_kernel_L2_END
+
+ctrmm_kernel_L2_M1_20:
+
+ INIT1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #2
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL, #0
+ ble ctrmm_kernel_L2_M1_40
+
+ctrmm_kernel_L2_M1_22:
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L2_M1_22
+
+
+ctrmm_kernel_L2_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble ctrmm_kernel_L2_M1_100
+
+ctrmm_kernel_L2_M1_42:
+
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L2_M1_42
+
+ctrmm_kernel_L2_M1_100:
+
+ SAVE1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+
+ctrmm_kernel_L2_END:
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+ add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
+
+/******************************************************************************/
+
+ctrmm_kernel_L1_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #1
+ ble ctrmm_kernel_L999 // done
+
+ mov pCRow0, pC // pCRow0 = C
+ add pC , pC , LDC // Update pC to point to next
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = A
+
+ctrmm_kernel_L1_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble ctrmm_kernel_L1_M4_BEGIN
+
+ctrmm_kernel_L1_M8_20:
+
+ INIT8x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #6
+ add pA, pA, temp
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #1
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble ctrmm_kernel_L1_M8_40
+ .align 5
+
+ctrmm_kernel_L1_M8_22:
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L1_M8_22
+
+
+ctrmm_kernel_L1_M8_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble ctrmm_kernel_L1_M8_100
+
+ctrmm_kernel_L1_M8_42:
+
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L1_M8_42
+
+ctrmm_kernel_L1_M8_100:
+
+ SAVE8x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #6
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+ctrmm_kernel_L1_M8_END:
+
+ subs counterI, counterI, #1
+ bgt ctrmm_kernel_L1_M8_20
+
+ctrmm_kernel_L1_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble ctrmm_kernel_L1_END
+
+	tst	counterI, #4					// is there a 4-row remainder in M?
+ ble ctrmm_kernel_L1_M2_BEGIN
+
+ctrmm_kernel_L1_M4_20:
+
+ INIT4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+#endif
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #1
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble ctrmm_kernel_L1_M4_40
+ .align 5
+
+ctrmm_kernel_L1_M4_22:
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L1_M4_22
+
+
+ctrmm_kernel_L1_M4_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble ctrmm_kernel_L1_M4_100
+
+ctrmm_kernel_L1_M4_42:
+
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L1_M4_42
+
+ctrmm_kernel_L1_M4_100:
+
+ SAVE4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+ctrmm_kernel_L1_M4_END:
+
+ctrmm_kernel_L1_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble ctrmm_kernel_L1_END
+
+	tst	counterI, #2					// is there a 2-row remainder in M?
+ ble ctrmm_kernel_L1_M1_BEGIN
+
+ctrmm_kernel_L1_M2_20:
+
+ INIT2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #1
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble ctrmm_kernel_L1_M2_40
+
+ctrmm_kernel_L1_M2_22:
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L1_M2_22
+
+
+ctrmm_kernel_L1_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble ctrmm_kernel_L1_M2_100
+
+ctrmm_kernel_L1_M2_42:
+
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L1_M2_42
+
+ctrmm_kernel_L1_M2_100:
+
+ SAVE2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+
+ctrmm_kernel_L1_M2_END:
+
+
+ctrmm_kernel_L1_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble ctrmm_kernel_L1_END
+
+ctrmm_kernel_L1_M1_20:
+
+ INIT1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #1
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble ctrmm_kernel_L1_M1_40
+
+ctrmm_kernel_L1_M1_22:
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L1_M1_22
+
+
+ctrmm_kernel_L1_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble ctrmm_kernel_L1_M1_100
+
+ctrmm_kernel_L1_M1_42:
+
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L1_M1_42
+
+ctrmm_kernel_L1_M1_100:
+
+ SAVE1x1
+
+
+ctrmm_kernel_L1_END:
+
+
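+// Epilogue: restore the callee-saved registers spilled in the prologue and
+// return 0.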
+ctrmm_kernel_L999:
+ mov x0, #0 // set return value
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+ ret
+
+ EPILOGUE
+
diff --git a/kernel/arm64/dgemm_kernel_4x4.S b/kernel/arm64/dgemm_kernel_4x4.S
index e88253af1..e2ad11492 100644
--- a/kernel/arm64/dgemm_kernel_4x4.S
+++ b/kernel/arm64/dgemm_kernel_4x4.S
@@ -46,21 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
-#define pA x15
-#define ppC x16
-#define ppCRow0 x17
-#define ppCRow1 x18
-#define ppCRow2 x19
-#define ppA x20
+#define pCRow3 x15
+#define pA x16
+#define ppC x17
+#define ppCRow0 x18
+#define ppCRow1 x19
+#define ppCRow2 x20
+#define ppCRow3 x21
+#define ppA x22
+#define alpha x23
#define alpha0 d10
#define alphaV0 v10.d[0]
-#define alpha1 d11
-#define alphaV1 v11.d[0]
-#define alpha2 d14
-#define alphaV2 v14.d[0]
-#define alpha3 d15
-#define alphaV3 v15.d[0]
+
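+// Prefetch distances, in bytes, used by the PRFM hints in the kernels below.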
+#define A_PRE_SIZE 1024
+#define B_PRE_SIZE 1024
+#define C_PRE_SIZE 128
// 00 origM
// 01 origN
@@ -77,15 +78,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
-// 15 pA
-// 16 ppC
-// 17 ppCRow0
-// 18 must save ppCRow1
-// 19 must save ppCRow2
-// 20 must save ppA
-// 21 must save
-// 22 must save
-// 23 must save
+// 15 pCRow3
+// 16 pA
+// 17 ppC
+// 18 must save ppCRow0
+// 19 must save ppCRow1
+// 20 must save ppCRow2
+// 21 must save ppCRow3
+// 22 must save ppA
+// 23 must save alpha
// 24 must save
// 25 must save
// 26 must save
@@ -106,11 +107,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//v08 must save pB00, pB01
//v09 must save pB02, pB03
//v10 must save ALPHA0
-//v11 must save ALPHA1
+//v11 must save
//v12 must save pB10, pB11
//v13 must save pB12, pB13
-//v14 must save ALPHA2
-//v15 must save ALPHA3
+//v14 must save
+//v15 must save
//v16 must save C00, C01
//v17 must save C02, C03
//v18 ppC00, ppC01
@@ -152,222 +153,254 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_I
- ld1 {v8.2d, v9.2d}, [pB]
- add pB, pB, #32
- ld1 {v0.2d, v1.2d}, [pA]
+ ldp d8, d9, [pB]
+ add pB, pB, #16
+ ldp d10, d11, [pB]
+ add pB, pB, #16
+
+ ldp q0, q1, [pA]
add pA, pA, #32
fmul v16.2d, v0.2d, v8.2d[0]
- fmul v29.2d, v1.2d, v9.2d[1]
+ fmul v29.2d, v1.2d, v11.2d[0]
- ld1 {v2.2d, v3.2d}, [ppA]
+ ldp q2, q3, [ppA]
add ppA, ppA, #32
- fmul v20.2d, v0.2d, v8.2d[1]
- fmul v25.2d, v1.2d, v9.2d[0]
+ fmul v20.2d, v0.2d, v9.2d[0]
+ fmul v25.2d, v1.2d, v10.2d[0]
+
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmul v18.2d, v2.2d, v8.2d[0]
- fmul v31.2d, v3.2d, v9.2d[1]
- fmul v22.2d, v2.2d, v8.2d[1]
- fmul v27.2d, v3.2d, v9.2d[0]
+ fmul v31.2d, v3.2d, v11.2d[0]
- ld1 {v12.2d, v13.2d}, [pB] // for next round
- add pB, pB, #32
+ prfm PLDL1KEEP, [ppA, #A_PRE_SIZE]
- fmul v24.2d, v0.2d, v9.2d[0]
- fmul v21.2d, v1.2d, v8.2d[1]
+ fmul v22.2d, v2.2d, v9.2d[0]
+ fmul v27.2d, v3.2d, v10.2d[0]
- ld1 {v4.2d, v5.2d} , [pA] // for next round
+ ldp d12, d13, [pB]
+ add pB, pB, #16
+
+ fmul v24.2d, v0.2d, v10.2d[0]
+ fmul v21.2d, v1.2d, v9.2d[0]
+
+ ldp q4, q5, [pA] // for next round
add pA, pA, #32
- fmul v26.2d, v2.2d, v9.2d[0]
- fmul v23.2d, v3.2d, v8.2d[1]
+ fmul v26.2d, v2.2d, v10.2d[0]
+ fmul v23.2d, v3.2d, v9.2d[0]
- ld1 {v6.2d, v7.2d} , [ppA] // for next round
+ ldp q6, q7, [ppA] // for next round
add ppA, ppA, #32
- fmul v28.2d, v0.2d, v9.2d[1]
+ fmul v28.2d, v0.2d, v11.2d[0]
fmul v17.2d, v1.2d, v8.2d[0]
- fmul v30.2d, v2.2d, v9.2d[1]
+
+ ldp d14, d15, [pB]
+ add pB, pB, #16
+
+ fmul v30.2d, v2.2d, v11.2d[0]
fmul v19.2d, v3.2d, v8.2d[0]
.endm
.macro KERNEL8x4_M2
fmla v16.2d, v4.2d, v12.2d[0]
- fmla v29.2d, v5.2d, v13.2d[1]
+ fmla v29.2d, v5.2d, v15.2d[0]
- ld1 {v8.2d, v9.2d}, [pB]
- add pB, pB, #32
+ ldp d8, d9, [pB]
+ add pB, pB, #16
fmla v18.2d, v6.2d, v12.2d[0]
- fmla v31.2d, v7.2d, v13.2d[1]
- fmla v20.2d, v4.2d, v12.2d[1]
- fmla v25.2d, v5.2d, v13.2d[0]
+ fmla v31.2d, v7.2d, v15.2d[0]
- prfm PLDL1KEEP, [pB, #512]
+ ldp d10, d11, [pB]
+ add pB, pB, #16
- fmla v22.2d, v6.2d, v12.2d[1]
- fmla v27.2d, v7.2d, v13.2d[0]
- fmla v24.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v12.2d[1]
+ fmla v20.2d, v4.2d, v13.2d[0]
+ fmla v25.2d, v5.2d, v14.2d[0]
- ld1 {v0.2d, v1.2d}, [pA]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+
+ fmla v22.2d, v6.2d, v13.2d[0]
+ fmla v27.2d, v7.2d, v14.2d[0]
+ fmla v24.2d, v4.2d, v14.2d[0]
+ fmla v21.2d, v5.2d, v13.2d[0]
+
+ ldp q0, q1, [pA]
add pA, pA, #32
- fmla v26.2d, v6.2d, v13.2d[0]
- fmla v23.2d, v7.2d, v12.2d[1]
- fmla v28.2d, v4.2d, v13.2d[1]
+ fmla v26.2d, v6.2d, v14.2d[0]
+ fmla v23.2d, v7.2d, v13.2d[0]
+ fmla v28.2d, v4.2d, v15.2d[0]
fmla v17.2d, v5.2d, v12.2d[0]
- ld1 {v2.2d, v3.2d}, [ppA]
+ ldp q2, q3, [ppA]
add ppA, ppA, #32
- fmla v30.2d, v6.2d, v13.2d[1]
+ fmla v30.2d, v6.2d, v15.2d[0]
fmla v19.2d, v7.2d, v12.2d[0]
.endm
.macro KERNEL8x4_M1
fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v29.2d, v1.2d, v11.2d[0]
- ld1 {v12.2d, v13.2d}, [pB] // for next round
- add pB, pB, #32
+ ldp d12, d13, [pB]
+ add pB, pB, #16
fmla v18.2d, v2.2d, v8.2d[0]
- fmla v31.2d, v3.2d, v9.2d[1]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v31.2d, v3.2d, v11.2d[0]
- prfm PLDL1KEEP, [pA, #512]
+ ldp d14, d15, [pB]
+ add pB, pB, #16
- fmla v22.2d, v2.2d, v8.2d[1]
- fmla v27.2d, v3.2d, v9.2d[0]
+ fmla v20.2d, v0.2d, v9.2d[0]
+ fmla v25.2d, v1.2d, v10.2d[0]
- prfm PLDL1KEEP, [ppA, #512]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v22.2d, v2.2d, v9.2d[0]
+ fmla v27.2d, v3.2d, v10.2d[0]
- ld1 {v4.2d, v5.2d} , [pA] // for next round
+ prfm PLDL1KEEP, [ppA, #A_PRE_SIZE]
+
+ fmla v24.2d, v0.2d, v10.2d[0]
+ fmla v21.2d, v1.2d, v9.2d[0]
+
+ ldp q4, q5, [pA]
add pA, pA, #32
- fmla v26.2d, v2.2d, v9.2d[0]
- fmla v23.2d, v3.2d, v8.2d[1]
- fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v26.2d, v2.2d, v10.2d[0]
+ fmla v23.2d, v3.2d, v9.2d[0]
+
+ fmla v28.2d, v0.2d, v11.2d[0]
fmla v17.2d, v1.2d, v8.2d[0]
- ld1 {v6.2d, v7.2d} , [ppA] // for next round
+ ldp q6, q7, [ppA]
add ppA, ppA, #32
- fmla v30.2d, v2.2d, v9.2d[1]
+ fmla v30.2d, v2.2d, v11.2d[0]
fmla v19.2d, v3.2d, v8.2d[0]
.endm
.macro KERNEL8x4_E
fmla v16.2d, v4.2d, v12.2d[0]
- fmla v25.2d, v5.2d, v13.2d[0]
+ fmla v25.2d, v5.2d, v14.2d[0]
fmla v18.2d, v6.2d, v12.2d[0]
- fmla v27.2d, v7.2d, v13.2d[0]
+ fmla v27.2d, v7.2d, v14.2d[0]
- fmla v20.2d, v4.2d, v12.2d[1]
- fmla v29.2d, v5.2d, v13.2d[1]
- fmla v22.2d, v6.2d, v12.2d[1]
- fmla v31.2d, v7.2d, v13.2d[1]
+ fmla v20.2d, v4.2d, v13.2d[0]
+ fmla v29.2d, v5.2d, v15.2d[0]
+ fmla v22.2d, v6.2d, v13.2d[0]
+ fmla v31.2d, v7.2d, v15.2d[0]
- fmla v24.2d, v4.2d, v13.2d[0]
+ fmla v24.2d, v4.2d, v14.2d[0]
fmla v17.2d, v5.2d, v12.2d[0]
- fmla v26.2d, v6.2d, v13.2d[0]
+ fmla v26.2d, v6.2d, v14.2d[0]
fmla v19.2d, v7.2d, v12.2d[0]
- fmla v28.2d, v4.2d, v13.2d[1]
- fmla v21.2d, v5.2d, v12.2d[1]
- fmla v30.2d, v6.2d, v13.2d[1]
- fmla v23.2d, v7.2d, v12.2d[1]
+ fmla v28.2d, v4.2d, v15.2d[0]
+ fmla v21.2d, v5.2d, v13.2d[0]
+ fmla v30.2d, v6.2d, v15.2d[0]
+ fmla v23.2d, v7.2d, v13.2d[0]
.endm
.macro KERNEL8x4_SUB
- ld1 {v8.2d, v9.2d}, [pB]
- add pB, pB, #32
- ld1 {v0.2d, v1.2d}, [pA]
+ ldp d8, d9, [pB]
+ add pB, pB, #16
+ ldp d10, d11, [pB]
+ add pB, pB, #16
+ ldp q0, q1, [pA]
add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v9.2d[1]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v29.2d, v1.2d, v11.2d[0]
+ fmla v20.2d, v0.2d, v9.2d[0]
+ fmla v25.2d, v1.2d, v10.2d[0]
- ld1 {v2.2d, v3.2d}, [ppA]
+ ldp q2, q3, [ppA]
add ppA, ppA, #32
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v8.2d[1]
- fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v24.2d, v0.2d, v10.2d[0]
+ fmla v21.2d, v1.2d, v9.2d[0]
+ fmla v28.2d, v0.2d, v11.2d[0]
fmla v17.2d, v1.2d, v8.2d[0]
fmla v18.2d, v2.2d, v8.2d[0]
- fmla v31.2d, v3.2d, v9.2d[1]
- fmla v22.2d, v2.2d, v8.2d[1]
- fmla v27.2d, v3.2d, v9.2d[0]
+ fmla v31.2d, v3.2d, v11.2d[0]
+ fmla v22.2d, v2.2d, v9.2d[0]
+ fmla v27.2d, v3.2d, v10.2d[0]
- fmla v26.2d, v2.2d, v9.2d[0]
- fmla v23.2d, v3.2d, v8.2d[1]
- fmla v30.2d, v2.2d, v9.2d[1]
+ fmla v26.2d, v2.2d, v10.2d[0]
+ fmla v23.2d, v3.2d, v9.2d[0]
+ fmla v30.2d, v2.2d, v11.2d[0]
fmla v19.2d, v3.2d, v8.2d[0]
.endm
.macro SAVE8x4
+ fmov alpha0, alpha
+
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add ppCRow0, pCRow0, #32
- ld1 {v0.2d, v1.2d}, [pCRow0]
+ ldp q0, q1, [pCRow0]
fmla v0.2d, v16.2d, alphaV0
- fmla v1.2d, v17.2d, alphaV1
- st1 {v0.2d, v1.2d}, [pCRow0]
-
- ld1 {v2.2d, v3.2d}, [ppCRow0]
- fmla v2.2d, v18.2d, alphaV2
- fmla v3.2d, v19.2d, alphaV3
- st1 {v2.2d, v3.2d}, [ppCRow0]
-
- add pCRow1, pCRow0, LDC
- add ppCRow1, ppCRow0, LDC
-
- ld1 {v4.2d, v5.2d}, [pCRow1]
- fmla v4.2d, v20.2d, alphaV0
- fmla v5.2d, v21.2d, alphaV1
- st1 {v4.2d, v5.2d}, [pCRow1]
-
- ld1 {v6.2d, v7.2d}, [ppCRow1]
- fmla v6.2d, v22.2d, alphaV2
- fmla v7.2d, v23.2d, alphaV3
- st1 {v6.2d, v7.2d}, [ppCRow1]
-
- add pCRow2, pCRow1, LDC
- add ppCRow2, ppCRow1, LDC
-
- ld1 {v0.2d, v1.2d}, [pCRow2]
- fmla v0.2d, v24.2d, alphaV0
- fmla v1.2d, v25.2d, alphaV1
- st1 {v0.2d, v1.2d}, [pCRow2]
-
- ld1 {v2.2d, v3.2d}, [ppCRow2]
- fmla v2.2d, v26.2d, alphaV2
- fmla v3.2d, v27.2d, alphaV3
- st1 {v2.2d, v3.2d}, [ppCRow2]
-
- add pCRow1, pCRow2, LDC
- add ppCRow1, ppCRow2, LDC
-
- ld1 {v4.2d, v5.2d}, [pCRow1]
- fmla v4.2d, v28.2d, alphaV0
- fmla v5.2d, v29.2d, alphaV1
- st1 {v4.2d, v5.2d}, [pCRow1]
-
- ld1 {v6.2d, v7.2d}, [ppCRow1]
- fmla v6.2d, v30.2d, alphaV2
- fmla v7.2d, v31.2d, alphaV3
- st1 {v6.2d, v7.2d}, [ppCRow1]
+ fmla v1.2d, v17.2d, alphaV0
+ stp q0, q1, [pCRow0]
add pCRow0, pCRow0, #64
+
+ ldp q2, q3, [ppCRow0]
+ fmla v2.2d, v18.2d, alphaV0
+ fmla v3.2d, v19.2d, alphaV0
+ stp q2, q3, [ppCRow0]
+
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+ add ppCRow1, pCRow1, #32
+
+ ldp q4, q5, [pCRow1]
+ fmla v4.2d, v20.2d, alphaV0
+ fmla v5.2d, v21.2d, alphaV0
+ stp q4, q5, [pCRow1]
+
+ add pCRow1, pCRow1, #64
+
+ ldp q6, q7, [ppCRow1]
+ fmla v6.2d, v22.2d, alphaV0
+ fmla v7.2d, v23.2d, alphaV0
+ stp q6, q7, [ppCRow1]
+
+ prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
+ add ppCRow2, pCRow2, #32
+
+ ldp q0, q1, [pCRow2]
+ fmla v0.2d, v24.2d, alphaV0
+ fmla v1.2d, v25.2d, alphaV0
+ stp q0, q1, [pCRow2]
+
+ add pCRow2, pCRow2, #64
+
+ ldp q2, q3, [ppCRow2]
+ fmla v2.2d, v26.2d, alphaV0
+ fmla v3.2d, v27.2d, alphaV0
+ stp q2, q3, [ppCRow2]
+
+ prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
+ add ppCRow3, pCRow3, #32
+
+ ldp q4, q5, [pCRow3]
+ fmla v4.2d, v28.2d, alphaV0
+ fmla v5.2d, v29.2d, alphaV0
+ stp q4, q5, [pCRow3]
+
+ add pCRow3, pCRow3, #64
+
+ ldp q6, q7, [ppCRow3]
+ fmla v6.2d, v30.2d, alphaV0
+ fmla v7.2d, v31.2d, alphaV0
+ stp q6, q7, [ppCRow3]
.endm
/******************************************************************************/
@@ -403,30 +436,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x4
+ fmov alpha0, alpha
+
ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
- fmla v9.2d, v17.2d, alphaV1
+ fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
ld1 {v12.2d, v13.2d}, [pCRow1]
- fmla v12.2d, v20.2d, alphaV2
- fmla v13.2d, v21.2d, alphaV3
+ fmla v12.2d, v20.2d, alphaV0
+ fmla v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow2, pCRow1, LDC
ld1 {v8.2d, v9.2d}, [pCRow2]
fmla v8.2d, v24.2d, alphaV0
- fmla v9.2d, v25.2d, alphaV1
+ fmla v9.2d, v25.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow2]
add pCRow1, pCRow2, LDC
ld1 {v12.2d, v13.2d}, [pCRow1]
- fmla v12.2d, v28.2d, alphaV2
- fmla v13.2d, v29.2d, alphaV3
+ fmla v12.2d, v28.2d, alphaV0
+ fmla v13.2d, v29.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -454,6 +489,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x4
+ fmov alpha0, alpha
+
ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
@@ -461,19 +498,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pCRow1, pCRow0, LDC
ld1 {v12.2d}, [pCRow1]
- fmla v12.2d, v20.2d, alphaV1
+ fmla v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow2, pCRow1, LDC
ld1 {v8.2d}, [pCRow2]
- fmla v8.2d, v24.2d, alphaV2
+ fmla v8.2d, v24.2d, alphaV0
st1 {v8.2d}, [pCRow2]
add pCRow1, pCRow2, LDC
ld1 {v12.2d}, [pCRow1]
- fmla v12.2d, v28.2d, alphaV3
+ fmla v12.2d, v28.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16
@@ -498,6 +535,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x4
+ fmov alpha0, alpha
+
add pCRow1, pCRow0, LDC
ld1 {v8.d}[0], [pCRow0]
@@ -511,7 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v12.d}[0], [pCRow2]
ld1 {v12.d}[1], [pCRow1]
- fmla v12.2d, v20.2d, alphaV1
+ fmla v12.2d, v20.2d, alphaV0
st1 {v12.d}[0], [pCRow2]
st1 {v12.d}[1], [pCRow1]
@@ -540,16 +579,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x2
+ fmov alpha0, alpha
+
ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
- fmla v9.2d, v17.2d, alphaV1
+ fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
ld1 {v12.2d, v13.2d}, [pCRow1]
- fmla v12.2d, v20.2d, alphaV2
- fmla v13.2d, v21.2d, alphaV3
+ fmla v12.2d, v20.2d, alphaV0
+ fmla v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -574,6 +615,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x2
+ fmov alpha0, alpha
+
ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
@@ -581,7 +624,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pCRow1 , pCRow0, LDC
ld1 {v12.2d}, [pCRow1]
- fmla v12.2d, v20.2d, alphaV1
+ fmla v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16
@@ -604,6 +647,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x2
+ fmov alpha0, alpha
+
add pCRow1 , pCRow0, LDC
ld1 {v8.d}[0], [pCRow0]
@@ -634,9 +679,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x1
+ fmov alpha0, alpha
+
ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
- fmla v9.2d, v17.2d, alphaV1
+ fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow0, pCRow0, #32
@@ -662,6 +709,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x1
+ fmov alpha0, alpha
+
ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
@@ -686,6 +735,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x1
+ fmov alpha0, alpha
+
ldr d8, [pCRow0]
fmadd d8, d16, alpha0, d8
str d8, [pCRow0]
@@ -713,10 +764,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
- fmov alpha0, d0
- fmov alpha1, d0
- fmov alpha2, d0
- fmov alpha3, d0
+ fmov alpha, d0
+ prfm PLDL1KEEP, [origPA]
+ prfm PLDL1KEEP, [origPB]
lsl LDC, LDC, #3 // ldc = ldc * 8
@@ -728,12 +778,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble dgemm_kernel_L2_BEGIN
dgemm_kernel_L4_BEGIN:
- mov pCRow0, pC // pCRow0 = C
- add pC, pC, LDC, lsl #2
+ mov pCRow0, pC
+ add pCRow1, pCRow0, LDC
+ add pCRow2, pCRow1, LDC
+ add pCRow3, pCRow2, LDC
+ add pC, pCRow3, LDC
lsl temp, origK, #5 // k * 4 * 8
mov pA, origPA // pA = start of A array
add ppA, temp, pA
+ prfm PLDL1KEEP, [ppA]
//------------------------------------------------------------------------------
@@ -744,43 +798,51 @@ dgemm_kernel_L4_M8_BEGIN:
cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN
+ .align 5
dgemm_kernel_L4_M8_20:
mov pB, origPB
- asr counterL , origK, #1 // L = K / 2
- cmp counterL , #2 // is there at least 4 to do?
+ asr counterL , origK, #2 // L = K / 4
+ cmp counterL , #2
blt dgemm_kernel_L4_M8_32
- KERNEL8x4_I // do one in the K
- KERNEL8x4_M2 // do another in the K
+ KERNEL8x4_I
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a
+
.align 5
-
dgemm_kernel_L4_M8_22:
-
+ KERNEL8x4_M1
+ KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22
-
+ .align 5
dgemm_kernel_L4_M8_22a:
+ KERNEL8x4_M1
+ KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
b dgemm_kernel_L4_M8_44
+ .align 5
dgemm_kernel_L4_M8_32:
tst counterL, #1
ble dgemm_kernel_L4_M8_40
KERNEL8x4_I
-
+ KERNEL8x4_M2
+ KERNEL8x4_M1
KERNEL8x4_E
b dgemm_kernel_L4_M8_44
@@ -792,14 +854,22 @@ dgemm_kernel_L4_M8_40:
dgemm_kernel_L4_M8_44:
- ands counterL , origK, #1
+ ands counterL , origK, #3
ble dgemm_kernel_L4_M8_100
+ .align 5
dgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
+ subs counterL, counterL, #1
+ bne dgemm_kernel_L4_M8_46
+
dgemm_kernel_L4_M8_100:
+ lsl temp, origK, #5
+ prfm PLDL1KEEP, [pA, temp]
+ prfm PLDL1KEEP, [ppA, temp]
+ prfm PLDL1KEEP, [origPB]
SAVE8x4
@@ -810,7 +880,6 @@ dgemm_kernel_L4_M8_END:
subs counterI, counterI, #1
bne dgemm_kernel_L4_M8_20
-
dgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
diff --git a/kernel/arm64/dgemm_kernel_4x8.S b/kernel/arm64/dgemm_kernel_4x8.S
new file mode 100755
index 000000000..88e9a773d
--- /dev/null
+++ b/kernel/arm64/dgemm_kernel_4x8.S
@@ -0,0 +1,1689 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/*                   X0          X1          X2          d0         X3        x4       x5           x6   */
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/
+
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define temp x7
+#define counterL x8
+#define counterI x9
+#define counterJ x10
+#define pB x11
+#define pCRow0 x12
+#define pCRow1 x13
+#define pCRow2 x14
+#define pA x15
+
+#define alpha0 d2
+#define alphaV0 v2.d[0]
+#define alpha1 d3
+#define alphaV1 v3.d[0]
+#define alpha2 d6
+#define alphaV2 v6.d[0]
+#define alpha3 d7
+#define alphaV3 v7.d[0]
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 temp
+// 08 counterL
+// 09 counterI
+// 10 counterJ
+// 11 pB
+// 12 pCRow0
+// 13 pCRow1
+// 14 pCRow2
+// 15 pA
+// 16
+// 17
+// 18 must save
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 ALPHA -> pA00, pA01
+//v01 pA02, pA03
+//v02 ALPHA0
+//v03 ALPHA1
+//v04 pA10, pA11
+//v05 pA12, pA13
+//v06 ALPHA2
+//v07 ALPHA3
+//v08 must save pB0_0, pB0_1
+//v09 must save pB0_2, pB0_3
+//v10 must save pB0_4, pB0_5
+//v11 must save pB0_6, pB0_7
+//v12 must save pB1_0, pB1_1
+//v13 must save pB1_2, pB1_3
+//v14 must save pB1_4, pB1_5
+//v15 must save pB1_6, pB1_7
+//v16 must save C00, C01
+//v17 must save C02, C03
+//v18 C04, C05
+//v19 C06, C07
+//v20 C10, C11
+//v21 C12, C13
+//v22 C14, C15
+//v23 C16, C17
+//v24 C20, C21
+//v25 C22, C23
+//v26 C24, C25
+//v27 C26, C27
+//v28 C30, C31
+//v29 C32, C33
+//v30 C34, C35
+//v31 C36, C37
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
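+// INIT4x8 clears the sixteen accumulator registers (v16-v31) that hold the
+// 4x8 block of C computed by this micro-kernel.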
+.macro INIT4x8
+ fmov d16, xzr
+ fmov d17, xzr
+ fmov d18, xzr
+ fmov d19, d16
+ fmov d20, xzr
+ fmov d21, d16
+ fmov d22, d17
+ fmov d23, d18
+ fmov d24, xzr
+ fmov d25, d16
+ fmov d26, d17
+ fmov d27, d18
+ fmov d28, xzr
+ fmov d29, d16
+ fmov d30, d17
+ fmov d31, d18
+.endm
+
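+// KERNEL4x8_I primes the software pipeline: it multiplies the first A/B
+// slices held in v0/v1 and v8-v11 while already loading the next set into
+// v4/v5 and v12-v15.  KERNEL4x8_M1 and _M2 then alternate between the two
+// register sets so the loads for the next iteration overlap the FMAs of the
+// current one, and KERNEL4x8_E drains the pipeline without further loads.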
+.macro KERNEL4x8_I
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
+ fmul v16.2d, v0.2d, v8.2d[0]
+ fmul v17.2d, v1.2d, v8.2d[0]
+ fmul v18.2d, v0.2d, v8.2d[1]
+ fmul v19.2d, v1.2d, v8.2d[1]
+
+ fmul v20.2d, v0.2d, v9.2d[0]
+ fmul v21.2d, v1.2d, v9.2d[0]
+ fmul v22.2d, v0.2d, v9.2d[1]
+ fmul v23.2d, v1.2d, v9.2d[1]
+
+ fmul v24.2d, v0.2d, v10.2d[0]
+ fmul v25.2d, v1.2d, v10.2d[0]
+ fmul v26.2d, v0.2d, v10.2d[1]
+ fmul v27.2d, v1.2d, v10.2d[1]
+
+ fmul v28.2d, v0.2d, v11.2d[0]
+ fmul v29.2d, v1.2d, v11.2d[0]
+ fmul v30.2d, v0.2d, v11.2d[1]
+ fmul v31.2d, v1.2d, v11.2d[1]
+
+ ld1 {v12.2d, v13.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v4.2d, v5.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v14.2d, v15.2d}, [pB]
+ add pB, pB, #32
+.endm
+
+.macro KERNEL4x8_M1
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v18.2d, v0.2d, v8.2d[1]
+ fmla v19.2d, v1.2d, v8.2d[1]
+
+ fmla v20.2d, v0.2d, v9.2d[0]
+ fmla v21.2d, v1.2d, v9.2d[0]
+ fmla v22.2d, v0.2d, v9.2d[1]
+ fmla v23.2d, v1.2d, v9.2d[1]
+
+ fmla v24.2d, v0.2d, v10.2d[0]
+ fmla v25.2d, v1.2d, v10.2d[0]
+ fmla v26.2d, v0.2d, v10.2d[1]
+ fmla v27.2d, v1.2d, v10.2d[1]
+
+ fmla v28.2d, v0.2d, v11.2d[0]
+ fmla v29.2d, v1.2d, v11.2d[0]
+ fmla v30.2d, v0.2d, v11.2d[1]
+ fmla v31.2d, v1.2d, v11.2d[1]
+
+ ld1 {v12.2d, v13.2d}, [pB] // For next round
+ add pB, pB, #32
+ ld1 {v4.2d, v5.2d}, [pA] // For next round
+ add pA, pA, #32
+ ld1 {v14.2d, v15.2d}, [pB]
+ add pB, pB, #32
+
+ prfm PLDL1KEEP, [pA, #512]
+.endm
+
+.macro KERNEL4x8_M2
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v18.2d, v4.2d, v12.2d[1]
+ fmla v19.2d, v5.2d, v12.2d[1]
+
+ fmla v20.2d, v4.2d, v13.2d[0]
+ fmla v21.2d, v5.2d, v13.2d[0]
+ fmla v22.2d, v4.2d, v13.2d[1]
+ fmla v23.2d, v5.2d, v13.2d[1]
+
+ fmla v24.2d, v4.2d, v14.2d[0]
+ fmla v25.2d, v5.2d, v14.2d[0]
+ fmla v26.2d, v4.2d, v14.2d[1]
+ fmla v27.2d, v5.2d, v14.2d[1]
+
+ fmla v28.2d, v4.2d, v15.2d[0]
+ fmla v29.2d, v5.2d, v15.2d[0]
+ fmla v30.2d, v4.2d, v15.2d[1]
+ fmla v31.2d, v5.2d, v15.2d[1]
+
+ ld1 {v8.2d, v9.2d}, [pB] // For next round
+ add pB, pB, #32
+ ld1 {v0.2d, v1.2d}, [pA] // For next round
+ add pA, pA, #32
+ ld1 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
+ prfm PLDL1KEEP, [pB, #512]
+.endm
+
+.macro KERNEL4x8_E
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v18.2d, v4.2d, v12.2d[1]
+ fmla v19.2d, v5.2d, v12.2d[1]
+
+ fmla v20.2d, v4.2d, v13.2d[0]
+ fmla v21.2d, v5.2d, v13.2d[0]
+ fmla v22.2d, v4.2d, v13.2d[1]
+ fmla v23.2d, v5.2d, v13.2d[1]
+
+ fmla v24.2d, v4.2d, v14.2d[0]
+ fmla v25.2d, v5.2d, v14.2d[0]
+ fmla v26.2d, v4.2d, v14.2d[1]
+ fmla v27.2d, v5.2d, v14.2d[1]
+
+ fmla v28.2d, v4.2d, v15.2d[0]
+ fmla v29.2d, v5.2d, v15.2d[0]
+ fmla v30.2d, v4.2d, v15.2d[1]
+ fmla v31.2d, v5.2d, v15.2d[1]
+.endm
+
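+// KERNEL4x8_SUB is the plain, non-pipelined step: it loads one k-step of the
+// packed A and B panels and issues all sixteen FMAs.  It handles the K
+// remainder and the cases where K is too small for the I/M1/M2/E sequence.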
+.macro KERNEL4x8_SUB
+ ld1 {v8.2d, v9.2d}, [pB] // For next round
+ add pB, pB, #32
+ ld1 {v0.2d, v1.2d}, [pA] // For next round
+ add pA, pA, #32
+ ld1 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v18.2d, v0.2d, v8.2d[1]
+ fmla v19.2d, v1.2d, v8.2d[1]
+
+ fmla v20.2d, v0.2d, v9.2d[0]
+ fmla v21.2d, v1.2d, v9.2d[0]
+ fmla v22.2d, v0.2d, v9.2d[1]
+ fmla v23.2d, v1.2d, v9.2d[1]
+
+ fmla v24.2d, v0.2d, v10.2d[0]
+ fmla v25.2d, v1.2d, v10.2d[0]
+ fmla v26.2d, v0.2d, v10.2d[1]
+ fmla v27.2d, v1.2d, v10.2d[1]
+
+ fmla v28.2d, v0.2d, v11.2d[0]
+ fmla v29.2d, v1.2d, v11.2d[0]
+ fmla v30.2d, v0.2d, v11.2d[1]
+ fmla v31.2d, v1.2d, v11.2d[1]
+.endm
+
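+// SAVE4x8 walks the eight columns of the C block (stepping the pCRow
+// pointers by LDC), scales the accumulators by alpha and adds them into C.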
+.macro SAVE4x8
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v8.2d, v9.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ fmla v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v10.2d, v11.2d}, [pCRow1]
+ fmla v10.2d, v18.2d, alphaV2
+ fmla v11.2d, v19.2d, alphaV3
+ st1 {v10.2d, v11.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v12.2d, v13.2d}, [pCRow2]
+ fmla v12.2d, v20.2d, alphaV0
+ fmla v13.2d, v21.2d, alphaV1
+ st1 {v12.2d, v13.2d}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v14.2d, v15.2d}, [pCRow1]
+ fmla v14.2d, v22.2d, alphaV2
+ fmla v15.2d, v23.2d, alphaV3
+ st1 {v14.2d, v15.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v8.2d, v9.2d}, [pCRow2]
+ fmla v8.2d, v24.2d, alphaV0
+ fmla v9.2d, v25.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v10.2d, v11.2d}, [pCRow1]
+ fmla v10.2d, v26.2d, alphaV2
+ fmla v11.2d, v27.2d, alphaV3
+ st1 {v10.2d, v11.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v12.2d, v13.2d}, [pCRow2]
+ fmla v12.2d, v28.2d, alphaV0
+ fmla v13.2d, v29.2d, alphaV1
+ st1 {v12.2d, v13.2d}, [pCRow2]
+
+ ld1 {v14.2d, v15.2d}, [pCRow1]
+ fmla v14.2d, v30.2d, alphaV2
+ fmla v15.2d, v31.2d, alphaV3
+ st1 {v14.2d, v15.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x8
+ fmov d16, xzr
+ fmov d18, xzr
+ fmov d20, xzr
+ fmov d22, d16
+ fmov d24, xzr
+ fmov d26, d16
+ fmov d28, xzr
+ fmov d30, d16
+.endm
+
+.macro KERNEL2x8_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d}, [pA]
+ add pA, pA, #16
+ ld1 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v18.2d, v0.2d, v8.2d[1]
+
+ fmla v20.2d, v0.2d, v9.2d[0]
+ fmla v22.2d, v0.2d, v9.2d[1]
+
+ fmla v24.2d, v0.2d, v10.2d[0]
+ fmla v26.2d, v0.2d, v10.2d[1]
+
+ fmla v28.2d, v0.2d, v11.2d[0]
+ fmla v30.2d, v0.2d, v11.2d[1]
+.endm
+
+.macro SAVE2x8
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v8.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v10.2d}, [pCRow1]
+ fmla v10.2d, v18.2d, alphaV2
+ st1 {v10.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v12.2d}, [pCRow2]
+ fmla v12.2d, v20.2d, alphaV0
+ st1 {v12.2d}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v14.2d}, [pCRow1]
+ fmla v14.2d, v22.2d, alphaV2
+ st1 {v14.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v8.2d}, [pCRow2]
+ fmla v8.2d, v24.2d, alphaV0
+ st1 {v8.2d}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v10.2d}, [pCRow1]
+ fmla v10.2d, v26.2d, alphaV2
+ st1 {v10.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v12.2d}, [pCRow2]
+ fmla v12.2d, v28.2d, alphaV0
+ st1 {v12.2d}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v14.2d}, [pCRow1]
+ fmla v14.2d, v30.2d, alphaV2
+ st1 {v14.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x8
+ fmov d16, xzr
+ fmov d20, xzr
+ fmov d24, xzr
+ fmov d28, xzr
+.endm
+
+.macro KERNEL1x8_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ldr d0, [pA]
+ add pA, pA, #8
+ ld1 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
+ fmla v16.2d, v8.2d, v0.d[0]
+ fmla v20.2d, v9.2d, v0.d[0]
+ fmla v24.2d, v10.2d, v0.d[0]
+ fmla v28.2d, v11.2d, v0.d[0]
+.endm
+
+.macro SAVE1x8
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v8.d}[0], [pCRow0]
+ ld1 {v8.d}[1], [pCRow1]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.d}[0], [pCRow0]
+ st1 {v8.d}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v10.d}[0], [pCRow2]
+ ld1 {v10.d}[1], [pCRow1]
+ fmla v10.2d, v20.2d, alphaV1
+ st1 {v10.d}[0], [pCRow2]
+ st1 {v10.d}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v12.d}[0], [pCRow2]
+ ld1 {v12.d}[1], [pCRow1]
+ fmla v12.2d, v24.2d, alphaV2
+ st1 {v12.d}[0], [pCRow2]
+ st1 {v12.d}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v14.d}[0], [pCRow2]
+ ld1 {v14.d}[1], [pCRow1]
+ fmla v14.2d, v28.2d, alphaV3
+ st1 {v14.d}[0], [pCRow2]
+ st1 {v14.d}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x4
+ fmov d16, xzr
+ fmov d17, d16
+ fmov d20, d17
+ fmov d21, d16
+ fmov d24, d17
+ fmov d25, d16
+ fmov d28, d17
+ fmov d29, d16
+.endm
+
+.macro KERNEL4x4_I
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+
+ fmul v16.2d, v0.2d, v8.2d[0]
+ fmul v29.2d, v1.2d, v9.2d[1]
+
+ fmul v20.2d, v0.2d, v8.2d[1]
+ fmul v25.2d, v1.2d, v9.2d[0]
+
+ fmul v24.2d, v0.2d, v9.2d[0]
+ fmul v21.2d, v1.2d, v8.2d[1]
+
+ fmul v28.2d, v0.2d, v9.2d[1]
+ fmul v17.2d, v1.2d, v8.2d[0]
+
+ ld1 {v12.2d, v13.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v4.2d, v5.2d}, [pA]
+ add pA, pA, #32
+.endm
+
+.macro KERNEL4x4_M1
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v29.2d, v1.2d, v9.2d[1]
+
+ ld1 {v12.2d, v13.2d}, [pB] // For next round
+ add pB, pB, #32
+
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v25.2d, v1.2d, v9.2d[0]
+
+ ld1 {v4.2d, v5.2d}, [pA] // For next round
+ add pA, pA, #32
+
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v21.2d, v1.2d, v8.2d[1]
+
+ prfm PLDL1KEEP, [pA, #512]
+
+ fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v17.2d, v1.2d, v8.2d[0]
+.endm
+
+.macro KERNEL4x4_M2
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v29.2d, v5.2d, v13.2d[1]
+
+ ld1 {v8.2d, v9.2d}, [pB] // For next round
+ add pB, pB, #32
+
+ fmla v20.2d, v4.2d, v12.2d[1]
+ fmla v25.2d, v5.2d, v13.2d[0]
+
+ ld1 {v0.2d, v1.2d}, [pA] // For next round
+ add pA, pA, #32
+
+ fmla v24.2d, v4.2d, v13.2d[0]
+ fmla v21.2d, v5.2d, v12.2d[1]
+
+ prfm PLDL1KEEP, [pB, #512]
+
+ fmla v28.2d, v4.2d, v13.2d[1]
+ fmla v17.2d, v5.2d, v12.2d[0]
+.endm
+
+.macro KERNEL4x4_E
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v29.2d, v5.2d, v13.2d[1]
+
+ fmla v20.2d, v4.2d, v12.2d[1]
+ fmla v25.2d, v5.2d, v13.2d[0]
+
+ fmla v24.2d, v4.2d, v13.2d[0]
+ fmla v21.2d, v5.2d, v12.2d[1]
+
+ fmla v28.2d, v4.2d, v13.2d[1]
+ fmla v17.2d, v5.2d, v12.2d[0]
+.endm
+
+.macro KERNEL4x4_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v29.2d, v1.2d, v9.2d[1]
+
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v25.2d, v1.2d, v9.2d[0]
+
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v21.2d, v1.2d, v8.2d[1]
+
+ fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v17.2d, v1.2d, v8.2d[0]
+.endm
+
+.macro SAVE4x4
+ ld1 {v8.2d, v9.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ fmla v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v12.2d, v13.2d}, [pCRow1]
+ fmla v12.2d, v20.2d, alphaV2
+ fmla v13.2d, v21.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v8.2d, v9.2d}, [pCRow2]
+ fmla v8.2d, v24.2d, alphaV0
+ fmla v9.2d, v25.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v12.2d, v13.2d}, [pCRow1]
+ fmla v12.2d, v28.2d, alphaV2
+ fmla v13.2d, v29.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x4
+ fmov d16, xzr
+ fmov d20, d16
+ fmov d24, d20
+ fmov d28, d16
+.endm
+
+.macro KERNEL2x4_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v28.2d, v0.2d, v9.2d[1]
+.endm
+
+.macro SAVE2x4
+ ld1 {v8.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v12.2d}, [pCRow1]
+ fmla v12.2d, v20.2d, alphaV1
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v8.2d}, [pCRow2]
+ fmla v8.2d, v24.2d, alphaV2
+ st1 {v8.2d}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v12.2d}, [pCRow1]
+ fmla v12.2d, v28.2d, alphaV3
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x4
+ fmov d16, xzr
+ fmov d20, d16
+.endm
+
+.macro KERNEL1x4_SUB
+ ldr d0, [pA]
+ add pA, pA, #8
+
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+
+ fmla v16.2d, v8.2d, v0.d[0]
+ fmla v20.2d, v9.2d, v0.d[0]
+.endm
+
+.macro SAVE1x4
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v8.d}[0], [pCRow0]
+ ld1 {v8.d}[1], [pCRow1]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.d}[0], [pCRow0]
+ st1 {v8.d}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v12.d}[0], [pCRow2]
+ ld1 {v12.d}[1], [pCRow1]
+ fmla v12.2d, v20.2d, alphaV1
+ st1 {v12.d}[0], [pCRow2]
+ st1 {v12.d}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x2
+ fmov d16, xzr
+ fmov d17, d16
+ fmov d20, d17
+ fmov d21, d16
+.endm
+
+.macro KERNEL4x2_SUB
+ ld1 {v8.2d}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v21.2d, v1.2d, v8.2d[1]
+.endm
+
+.macro SAVE4x2
+ ld1 {v8.2d, v9.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ fmla v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v12.2d, v13.2d}, [pCRow1]
+ fmla v12.2d, v20.2d, alphaV2
+ fmla v13.2d, v21.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x2
+ fmov d16, xzr
+ fmov d20, d16
+.endm
+
+.macro KERNEL2x2_SUB
+ ld1 {v8.2d}, [pB]
+ add pB, pB, #16
+
+ ld1 {v0.2d}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+.endm
+
+.macro SAVE2x2
+ ld1 {v8.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow1 , pCRow0, LDC
+
+ ld1 {v12.2d}, [pCRow1]
+ fmla v12.2d, v20.2d, alphaV1
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+ fmov d16, xzr
+.endm
+
+.macro KERNEL1x2_SUB
+ ld1 {v8.2d} , [pB]
+ add pB , pB, #16
+
+ ldr d0 , [pA]
+ add pA, pA, #8
+
+ fmla v16.2d, v8.2d, v0.2d[0]
+.endm
+
+.macro SAVE1x2
+ add pCRow1 , pCRow0, LDC
+
+ ld1 {v8.d}[0], [pCRow0]
+ ld1 {v8.d}[1], [pCRow1]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.d}[0], [pCRow0]
+ st1 {v8.d}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x1
+ fmov d16, xzr
+ fmov d17, d16
+.endm
+
+.macro KERNEL4x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA , pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+.endm
+
+.macro SAVE4x1
+ ld1 {v8.2d, v9.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ fmla v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+
+
+
+/******************************************************************************/
+
+.macro INIT2x1
+ fmov d16, xzr
+.endm
+
+.macro KERNEL2x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ld1 {v0.2d}, [pA]
+ add pA , pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+.endm
+
+.macro SAVE2x1
+ ld1 {v8.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+ fmov d16, xzr
+.endm
+
+.macro KERNEL1x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ldr d0, [pA]
+ add pA , pA, #8
+
+ fmadd d16, d0, d8, d16
+.endm
+
+.macro SAVE1x1
+ ldr d8, [pCRow0]
+ fmadd d8, d16, alpha0, d8
+ str d8, [pCRow0]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
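+// Entry: spill the callee-saved d8-d17 and x18-x28, copy alpha (passed in
+// d0) into the four alpha registers and convert ldc from elements to bytes,
+// then loop over N in blocks of 8, 4, 2 and 1 columns.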
+ PROLOGUE
+
+ .align 5
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+
+ fmov alpha0, d0
+ fmov alpha1, d0
+ fmov alpha2, d0
+ fmov alpha3, d0
+
+ lsl LDC, LDC, #3 // ldc = ldc * 8
+
+ mov pB, origPB
+
+ mov counterJ, origN
+ asr counterJ, counterJ, #3 // J = J / 8
+ cmp counterJ, #0
+ ble dgemm_kernel_L4_BEGIN
+
+/******************************************************************************/
+
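+// Outer loop over N in blocks of 8 columns: pCRow0 points at the current
+// block of C, pC is advanced by 8*LDC, and pA restarts at the top of A.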
+dgemm_kernel_L8_BEGIN:
+
+ mov pCRow0, pC // pCRow0 = C
+ add pC, pC, LDC, lsl #3
+
+ mov pA, origPA // pA = start of A array
+
+dgemm_kernel_L8_M4_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #2 // counterI = counterI / 4
+ cmp counterI, #0
+ ble dgemm_kernel_L8_M2_BEGIN
+
+dgemm_kernel_L8_M4_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt dgemm_kernel_L8_M4_32
+
+ KERNEL4x8_I // do one in the K
+ KERNEL4x8_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble dgemm_kernel_L8_M4_22a
+ .align 5
+
+dgemm_kernel_L8_M4_22:
+
+ KERNEL4x8_M1
+ KERNEL4x8_M2
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L8_M4_22
+
+
+dgemm_kernel_L8_M4_22a:
+
+ KERNEL4x8_M1
+ KERNEL4x8_E
+
+ b dgemm_kernel_L8_M4_44
+
+dgemm_kernel_L8_M4_32:
+
+ tst counterL, #1
+ ble dgemm_kernel_L8_M4_40
+
+ KERNEL4x8_I
+
+ KERNEL4x8_E
+
+ b dgemm_kernel_L8_M4_44
+
+
+dgemm_kernel_L8_M4_40:
+
+ INIT4x8
+
+dgemm_kernel_L8_M4_44:
+
+ ands counterL , origK, #1
+ ble dgemm_kernel_L8_M4_100
+
+dgemm_kernel_L8_M4_46:
+
+ KERNEL4x8_SUB
+
+dgemm_kernel_L8_M4_100:
+
+ SAVE4x8
+
+dgemm_kernel_L8_M4_END:
+ subs counterI, counterI, #1
+ bne dgemm_kernel_L8_M4_20
+
+dgemm_kernel_L8_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dgemm_kernel_L8_END
+
+	tst	counterI, #2					// is there a 2-row remainder in M?
+ ble dgemm_kernel_L8_M1_BEGIN
+
+dgemm_kernel_L8_M2_20:
+
+ INIT2x8
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L8_M2_40
+
+dgemm_kernel_L8_M2_22:
+
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L8_M2_22
+
+
+dgemm_kernel_L8_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L8_M2_100
+
+dgemm_kernel_L8_M2_42:
+
+ KERNEL2x8_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L8_M2_42
+
+dgemm_kernel_L8_M2_100:
+
+ SAVE2x8
+
+dgemm_kernel_L8_M2_END:
+
+
+dgemm_kernel_L8_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dgemm_kernel_L8_END
+
+dgemm_kernel_L8_M1_20:
+
+ INIT1x8
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L8_M1_40
+
+dgemm_kernel_L8_M1_22:
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L8_M1_22
+
+
+dgemm_kernel_L8_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L8_M1_100
+
+dgemm_kernel_L8_M1_42:
+
+ KERNEL1x8_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L8_M1_42
+
+dgemm_kernel_L8_M1_100:
+
+ SAVE1x8
+
+dgemm_kernel_L8_END:
+
+ lsl temp, origK, #6
+ add origPB, origPB, temp // B = B + K * 8 * 8
+
+ subs counterJ, counterJ , #1 // j--
+ bgt dgemm_kernel_L8_BEGIN
+
+
+/******************************************************************************/
+
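+// Handle a remaining block of 4 columns of N (N mod 8 >= 4) with the 4x4
+// micro-kernel.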
+dgemm_kernel_L4_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #7
+ ble dgemm_kernel_L999
+
+ tst counterJ , #4
+ ble dgemm_kernel_L2_BEGIN
+
+ mov pCRow0, pC // pCRow0 = C
+ add pC, pC, LDC, lsl #2
+
+ mov pA, origPA // pA = start of A array
+
+dgemm_kernel_L4_M4_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #2 // counterI = counterI / 4
+ cmp counterI, #0
+ ble dgemm_kernel_L4_M2_BEGIN
+
+dgemm_kernel_L4_M4_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt dgemm_kernel_L4_M4_32
+
+ KERNEL4x4_I // do one in the K
+ KERNEL4x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble dgemm_kernel_L4_M4_22a
+ .align 5
+
+dgemm_kernel_L4_M4_22:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M4_22
+
+
+dgemm_kernel_L4_M4_22a:
+
+ KERNEL4x4_M1
+ KERNEL4x4_E
+
+ b dgemm_kernel_L4_M4_44
+
+dgemm_kernel_L4_M4_32:
+
+ tst counterL, #1
+ ble dgemm_kernel_L4_M4_40
+
+ KERNEL4x4_I
+
+ KERNEL4x4_E
+
+ b dgemm_kernel_L4_M4_44
+
+
+dgemm_kernel_L4_M4_40:
+
+ INIT4x4
+
+dgemm_kernel_L4_M4_44:
+
+ ands counterL , origK, #1
+ ble dgemm_kernel_L4_M4_100
+
+dgemm_kernel_L4_M4_46:
+
+ KERNEL4x4_SUB
+
+dgemm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+dgemm_kernel_L4_M4_END:
+ subs counterI, counterI, #1
+ bne dgemm_kernel_L4_M4_20
+
+dgemm_kernel_L4_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dgemm_kernel_L4_END
+
+	tst	counterI, #2					// is there a 2-row remainder in M?
+ ble dgemm_kernel_L4_M1_BEGIN
+
+dgemm_kernel_L4_M2_20:
+
+ INIT2x4
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L4_M2_40
+
+dgemm_kernel_L4_M2_22:
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M2_22
+
+
+dgemm_kernel_L4_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L4_M2_100
+
+dgemm_kernel_L4_M2_42:
+
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M2_42
+
+dgemm_kernel_L4_M2_100:
+
+ SAVE2x4
+
+dgemm_kernel_L4_M2_END:
+
+
+dgemm_kernel_L4_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dgemm_kernel_L4_END
+
+dgemm_kernel_L4_M1_20:
+
+ INIT1x4
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L4_M1_40
+
+dgemm_kernel_L4_M1_22:
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M1_22
+
+
+dgemm_kernel_L4_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L4_M1_100
+
+dgemm_kernel_L4_M1_42:
+
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M1_42
+
+dgemm_kernel_L4_M1_100:
+
+ SAVE1x4
+
+dgemm_kernel_L4_END:
+
+ lsl temp, origK, #5
+ add origPB, origPB, temp // B = B + K * 4 * 8
+
+/******************************************************************************/
+
+dgemm_kernel_L2_BEGIN:						// less than 4 left in N direction
+
+ mov counterJ , origN
+ tst counterJ , #3
+	ble	dgemm_kernel_L999				// nothing left in N (N mod 4 == 0)
+
+ tst counterJ , #2
+ ble dgemm_kernel_L1_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #1
+
+ mov pA, origPA // pA = A
+
+
+dgemm_kernel_L2_M4_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #2 // counterI = counterI / 4
+ cmp counterI,#0
+ ble dgemm_kernel_L2_M2_BEGIN
+
+dgemm_kernel_L2_M4_20:
+
+ INIT4x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble dgemm_kernel_L2_M4_40
+ .align 5
+
+dgemm_kernel_L2_M4_22:
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M4_22
+
+
+dgemm_kernel_L2_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L2_M4_100
+
+dgemm_kernel_L2_M4_42:
+
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M4_42
+
+dgemm_kernel_L2_M4_100:
+
+ SAVE4x2
+
+dgemm_kernel_L2_M4_END:
+
+ subs counterI, counterI, #1
+ bgt dgemm_kernel_L2_M4_20
+
+
+dgemm_kernel_L2_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dgemm_kernel_L2_END
+
+	tst	counterI, #2					// at least 2 rows of M left?
+ ble dgemm_kernel_L2_M1_BEGIN
+
+dgemm_kernel_L2_M2_20:
+
+ INIT2x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble dgemm_kernel_L2_M2_40
+
+dgemm_kernel_L2_M2_22:
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M2_22
+
+
+dgemm_kernel_L2_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L2_M2_100
+
+dgemm_kernel_L2_M2_42:
+
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M2_42
+
+dgemm_kernel_L2_M2_100:
+
+ SAVE2x2
+
+dgemm_kernel_L2_M2_END:
+
+
+dgemm_kernel_L2_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dgemm_kernel_L2_END
+
+dgemm_kernel_L2_M1_20:
+
+ INIT1x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL, #0
+ ble dgemm_kernel_L2_M1_40
+
+dgemm_kernel_L2_M1_22:
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M1_22
+
+
+dgemm_kernel_L2_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L2_M1_100
+
+dgemm_kernel_L2_M1_42:
+
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M1_42
+
+dgemm_kernel_L2_M1_100:
+
+ SAVE1x2
+
+dgemm_kernel_L2_END:
+ add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
+
+/******************************************************************************/
+
+dgemm_kernel_L1_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #1
+ ble dgemm_kernel_L999 // done
+
+
+ mov pCRow0, pC // pCRow0 = C
+	add	pC , pC , LDC				// advance pC to the next column of C
+
+ mov pA, origPA // pA = A
+
+dgemm_kernel_L1_M4_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #2 // counterI = counterI / 4
+ cmp counterI, #0
+ ble dgemm_kernel_L1_M2_BEGIN
+
+dgemm_kernel_L1_M4_20:
+
+ INIT4x1
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L1_M4_40
+ .align 5
+
+dgemm_kernel_L1_M4_22:
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M4_22
+
+
+dgemm_kernel_L1_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L1_M4_100
+
+dgemm_kernel_L1_M4_42:
+
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M4_42
+
+dgemm_kernel_L1_M4_100:
+
+ SAVE4x1
+
+dgemm_kernel_L1_M4_END:
+
+ subs counterI, counterI, #1
+ bgt dgemm_kernel_L1_M4_20
+
+
+dgemm_kernel_L1_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dgemm_kernel_L1_END
+
+	tst	counterI, #2					// at least 2 rows of M left?
+ ble dgemm_kernel_L1_M1_BEGIN
+
+dgemm_kernel_L1_M2_20:
+
+ INIT2x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L1_M2_40
+
+dgemm_kernel_L1_M2_22:
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M2_22
+
+
+dgemm_kernel_L1_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L1_M2_100
+
+dgemm_kernel_L1_M2_42:
+
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M2_42
+
+dgemm_kernel_L1_M2_100:
+
+ SAVE2x1
+
+dgemm_kernel_L1_M2_END:
+
+
+dgemm_kernel_L1_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dgemm_kernel_L1_END
+
+dgemm_kernel_L1_M1_20:
+
+ INIT1x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L1_M1_40
+
+dgemm_kernel_L1_M1_22:
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M1_22
+
+
+dgemm_kernel_L1_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L1_M1_100
+
+dgemm_kernel_L1_M1_42:
+
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M1_42
+
+dgemm_kernel_L1_M1_100:
+
+ SAVE1x1
+
+
+dgemm_kernel_L1_END:
+
+
+dgemm_kernel_L999:
+ mov x0, #0 // set return value
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+ ret
+
+ EPILOGUE
+
diff --git a/kernel/arm64/dgemm_kernel_8x4.S b/kernel/arm64/dgemm_kernel_8x4.S
new file mode 100755
index 000000000..a607fecc4
--- /dev/null
+++ b/kernel/arm64/dgemm_kernel_8x4.S
@@ -0,0 +1,1570 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* X0 X1 X2 s0 X3 x4 x5 x6 */
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/
+
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define temp x7
+#define counterL x8
+#define counterI x9
+#define counterJ x10
+#define pB x11
+#define pCRow0 x12
+#define pCRow1 x13
+#define pCRow2 x14
+#define pCRow3 x15
+#define pA x16
+#define alpha x17
+
+#define alpha0 d10
+#define alphaV0 v10.d[0]
+#define alpha1 d11
+#define alphaV1 v11.d[0]
+#define alpha2 d14
+#define alphaV2 v14.d[0]
+#define alpha3 d15
+#define alphaV3 v15.d[0]
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 temp
+// 08 counterL
+// 09 counterI
+// 10 counterJ
+// 11 pB
+// 12 pCRow0
+// 13 pCRow1
+// 14 pCRow2
+// 15 pCRow3
+// 16 pA
+// 17 alpha
+// 18 must save
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 ALPHA -> pA0_0, pA0_1
+//v01 pA0_2, pA0_3
+//v02 pA0_4, pA0_5
+//v03 pA0_6, pA0_7
+//v04 pA1_0, pA1_1
+//v05 pA1_2, pA1_3
+//v06 pA1_4, pA1_5
+//v07 pA1_6, pA1_7
+//v08 must save pB0_0, pB0_1
+//v09 must save pB0_2, pB0_3
+//v10 must save ALPHA0
+//v11 must save ALPHA1
+//v12 must save pB1_0, pB1_1
+//v13 must save pB1_2, pB1_3
+//v14 must save ALPHA2
+//v15 must save ALPHA3
+//v16 must save C00, C01
+//v17 must save C02, C03
+//v18 C04, C05
+//v19 C06, C07
+//v20 C10, C11
+//v21 C12, C13
+//v22 C14, C15
+//v23 C16, C17
+//v24 C20, C21
+//v25 C22, C23
+//v26 C24, C25
+//v27 C26, C27
+//v28 C30, C31
+//v29 C32, C33
+//v30 C34, C35
+//v31 C36, C37
+
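+// The 8x4 micro-kernel keeps an 8x4 tile of C in v16-v31 (two doubles per
+// register). Each K step multiplies an 8-element column slice of A (held in
+// v0-v3 or v4-v7) by a 4-element row slice of B (d8-d11 or d12-d15) and
+// accumulates into that tile.
+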
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
+.macro INIT8x4
+ fmov d16, xzr
+ fmov d17, xzr
+ fmov d18, d16
+ fmov d19, xzr
+ fmov d20, xzr
+ fmov d21, d16
+ fmov d22, d17
+ fmov d23, d18
+ fmov d24, xzr
+ fmov d25, d16
+ fmov d26, d17
+ fmov d27, d18
+ fmov d28, xzr
+ fmov d29, d16
+ fmov d30, d17
+ fmov d31, d18
+.endm
+
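+// The main K loop is software pipelined: KERNEL8x4_I primes the pipeline
+// (plain fmul into the accumulators, then a load of the next A/B values into
+// v4-v7 and d12-d15), KERNEL8x4_M1 and KERNEL8x4_M2 alternate between the
+// two register sets (compute on one while loading the other), and
+// KERNEL8x4_E drains the pipeline. KERNEL8x4_SUB is the plain, non-pipelined
+// step used for the K remainder.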
+.macro KERNEL8x4_I
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v2.2d, v3.2d}, [pA]
+ add pA, pA, #32
+ ldp d8, d9, [pB]
+ add pB, pB, #16
+ ldp d10, d11, [pB]
+ add pB, pB, #16
+
+ fmul v16.2d, v0.2d, v8.2d[0]
+ fmul v17.2d, v1.2d, v8.2d[0]
+
+ fmul v18.2d, v2.2d, v8.2d[0]
+ fmul v19.2d, v3.2d, v8.2d[0]
+
+ fmul v20.2d, v0.2d, v9.2d[0]
+ fmul v21.2d, v1.2d, v9.2d[0]
+
+ fmul v22.2d, v2.2d, v9.2d[0]
+ fmul v23.2d, v3.2d, v9.2d[0]
+
+ fmul v24.2d, v0.2d, v10.2d[0]
+ fmul v25.2d, v1.2d, v10.2d[0]
+
+ fmul v26.2d, v2.2d, v10.2d[0]
+ fmul v27.2d, v3.2d, v10.2d[0]
+
+ fmul v28.2d, v0.2d, v11.2d[0]
+ fmul v29.2d, v1.2d, v11.2d[0]
+
+ fmul v30.2d, v2.2d, v11.2d[0]
+ fmul v31.2d, v3.2d, v11.2d[0]
+
+ ld1 {v4.2d, v5.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v6.2d, v7.2d}, [pA]
+ add pA, pA, #32
+ ldp d12, d13, [pB]
+ add pB, pB, #16
+ ldp d14, d15, [pB]
+ add pB, pB, #16
+.endm
+
+.macro KERNEL8x4_M1
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v21.2d, v1.2d, v9.2d[0]
+ fmla v26.2d, v2.2d, v10.2d[0]
+ fmla v31.2d, v3.2d, v11.2d[0]
+
+ ld1 {v4.2d}, [pA], #16
+
+ fmla v20.2d, v0.2d, v9.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+
+ ld1 {v5.2d}, [pA], #16
+
+ fmla v30.2d, v2.2d, v11.2d[0]
+ fmla v27.2d, v3.2d, v10.2d[0]
+
+ ldp d12, d13, [pB]
+ add pB, pB, #16
+
+ fmla v28.2d, v0.2d, v11.2d[0]
+ fmla v25.2d, v1.2d, v10.2d[0]
+
+ ldp d14, d15, [pB]
+ add pB, pB, #16
+
+ fmla v18.2d, v2.2d, v8.2d[0]
+ fmla v23.2d, v3.2d, v9.2d[0]
+
+ ld1 {v6.2d}, [pA], #16
+
+ fmla v24.2d, v0.2d, v10.2d[0]
+ fmla v29.2d, v1.2d, v11.2d[0]
+
+ ld1 {v7.2d}, [pA], #16
+
+ fmla v22.2d, v2.2d, v9.2d[0]
+ fmla v19.2d, v3.2d, v8.2d[0]
+
+ prfm PLDL1KEEP, [pA, #224]
+ prfm PLDL1KEEP, [pA, #224+64]
+.endm
+
+.macro KERNEL8x4_M2
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v21.2d, v5.2d, v13.2d[0]
+ fmla v26.2d, v6.2d, v14.2d[0]
+ fmla v31.2d, v7.2d, v15.2d[0]
+
+ ld1 {v0.2d}, [pA], #16
+
+ fmla v20.2d, v4.2d, v13.2d[0]
+ fmla v17.2d, v5.2d, v12.2d[0]
+
+ ld1 {v1.2d}, [pA], #16
+
+ fmla v30.2d, v6.2d, v15.2d[0]
+ fmla v27.2d, v7.2d, v14.2d[0]
+
+ ldp d8, d9, [pB]
+ add pB, pB, #16
+
+ fmla v28.2d, v4.2d, v15.2d[0]
+ fmla v25.2d, v5.2d, v14.2d[0]
+
+ ldp d10, d11, [pB]
+ add pB, pB, #16
+
+ fmla v22.2d, v6.2d, v13.2d[0]
+ fmla v19.2d, v7.2d, v12.2d[0]
+
+ ld1 {v2.2d}, [pA], #16
+
+ fmla v24.2d, v4.2d, v14.2d[0]
+ fmla v29.2d, v5.2d, v15.2d[0]
+
+ ld1 {v3.2d}, [pA], #16
+
+ fmla v18.2d, v6.2d, v12.2d[0]
+ fmla v23.2d, v7.2d, v13.2d[0]
+
+ prfm PLDL1KEEP, [pB, #640]
+.endm
+
+.macro KERNEL8x4_E
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v18.2d, v6.2d, v12.2d[0]
+ fmla v19.2d, v7.2d, v12.2d[0]
+ fmla v20.2d, v4.2d, v13.2d[0]
+ fmla v21.2d, v5.2d, v13.2d[0]
+ fmla v22.2d, v6.2d, v13.2d[0]
+ fmla v23.2d, v7.2d, v13.2d[0]
+ fmla v24.2d, v4.2d, v14.2d[0]
+ fmla v25.2d, v5.2d, v14.2d[0]
+ fmla v26.2d, v6.2d, v14.2d[0]
+ fmla v27.2d, v7.2d, v14.2d[0]
+ fmla v28.2d, v4.2d, v15.2d[0]
+ fmla v29.2d, v5.2d, v15.2d[0]
+ fmla v30.2d, v6.2d, v15.2d[0]
+ fmla v31.2d, v7.2d, v15.2d[0]
+.endm
+
+.macro KERNEL8x4_SUB
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v2.2d, v3.2d}, [pA]
+ add pA, pA, #32
+ ldp d8, d9, [pB]
+ add pB, pB, #16
+ ldp d10, d11, [pB]
+ add pB, pB, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v18.2d, v2.2d, v8.2d[0]
+ fmla v19.2d, v3.2d, v8.2d[0]
+
+ fmla v20.2d, v0.2d, v9.2d[0]
+ fmla v21.2d, v1.2d, v9.2d[0]
+ fmla v22.2d, v2.2d, v9.2d[0]
+ fmla v23.2d, v3.2d, v9.2d[0]
+
+ fmla v24.2d, v0.2d, v10.2d[0]
+ fmla v25.2d, v1.2d, v10.2d[0]
+ fmla v26.2d, v2.2d, v10.2d[0]
+ fmla v27.2d, v3.2d, v10.2d[0]
+
+ fmla v28.2d, v0.2d, v11.2d[0]
+ fmla v29.2d, v1.2d, v11.2d[0]
+ fmla v30.2d, v2.2d, v11.2d[0]
+ fmla v31.2d, v3.2d, v11.2d[0]
+.endm
+
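+// SAVE8x4 finishes a tile as C = C + alpha*(A*B): for each of the four C
+// columns (pCRow0..pCRow3) it loads 8 doubles of C, accumulates the scaled
+// results with fmla, stores them back, advances the column pointer by
+// 64 bytes, and prefetches the next stretch of C.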
+.macro SAVE8x4
+ fmov alpha0, alpha
+
+ ld1 {v0.2d, v1.2d}, [pCRow0]
+ fmla v0.2d, v16.2d, alphaV0
+ fmla v1.2d, v17.2d, alphaV0
+ st1 {v0.2d, v1.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+
+ ld1 {v2.2d, v3.2d}, [pCRow0]
+ fmla v2.2d, v18.2d, alphaV0
+ fmla v3.2d, v19.2d, alphaV0
+ st1 {v2.2d, v3.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+
+ ld1 {v4.2d, v5.2d}, [pCRow1]
+ fmla v4.2d, v20.2d, alphaV0
+ fmla v5.2d, v21.2d, alphaV0
+ st1 {v4.2d, v5.2d}, [pCRow1]
+
+ add pCRow1, pCRow1, #32
+
+ ld1 {v6.2d, v7.2d}, [pCRow1]
+ fmla v6.2d, v22.2d, alphaV0
+ fmla v7.2d, v23.2d, alphaV0
+ st1 {v6.2d, v7.2d}, [pCRow1]
+
+ add pCRow1, pCRow1, #32
+
+ ld1 {v0.2d, v1.2d}, [pCRow2]
+ fmla v0.2d, v24.2d, alphaV0
+ fmla v1.2d, v25.2d, alphaV0
+ st1 {v0.2d, v1.2d}, [pCRow2]
+
+ add pCRow2, pCRow2, #32
+ ld1 {v2.2d, v3.2d}, [pCRow2]
+ fmla v2.2d, v26.2d, alphaV0
+ fmla v3.2d, v27.2d, alphaV0
+ st1 {v2.2d, v3.2d}, [pCRow2]
+
+ add pCRow2, pCRow2, #32
+
+ ld1 {v4.2d, v5.2d}, [pCRow3]
+ fmla v4.2d, v28.2d, alphaV0
+ fmla v5.2d, v29.2d, alphaV0
+ st1 {v4.2d, v5.2d}, [pCRow3]
+
+ add pCRow3, pCRow3, #32
+
+ ld1 {v6.2d, v7.2d}, [pCRow3]
+ fmla v6.2d, v30.2d, alphaV0
+ fmla v7.2d, v31.2d, alphaV0
+ st1 {v6.2d, v7.2d}, [pCRow3]
+
+ add pCRow3, pCRow3, #32
+
+ prfm PLDL2KEEP, [pCRow0, #128]
+ prfm PLDL2KEEP, [pCRow1, #128]
+ prfm PLDL2KEEP, [pCRow2, #128]
+ prfm PLDL2KEEP, [pCRow3, #128]
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x4
+ fmov d16, xzr
+ fmov d17, d16
+ fmov d20, d17
+ fmov d21, d16
+ fmov d24, d17
+ fmov d25, d16
+ fmov d28, d17
+ fmov d29, d16
+.endm
+
+.macro KERNEL4x4_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v29.2d, v1.2d, v9.2d[1]
+
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v25.2d, v1.2d, v9.2d[0]
+
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v21.2d, v1.2d, v8.2d[1]
+
+ fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v17.2d, v1.2d, v8.2d[0]
+.endm
+
+.macro SAVE4x4
+ ld1 {v8.2d, v9.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ fmla v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v12.2d, v13.2d}, [pCRow1]
+ fmla v12.2d, v20.2d, alphaV2
+ fmla v13.2d, v21.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v8.2d, v9.2d}, [pCRow2]
+ fmla v8.2d, v24.2d, alphaV0
+ fmla v9.2d, v25.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v12.2d, v13.2d}, [pCRow1]
+ fmla v12.2d, v28.2d, alphaV2
+ fmla v13.2d, v29.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+
+.macro INIT2x4
+ fmov d16, xzr
+ fmov d20, d16
+ fmov d24, d20
+ fmov d28, d16
+.endm
+
+.macro KERNEL2x4_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v28.2d, v0.2d, v9.2d[1]
+.endm
+
+.macro SAVE2x4
+ ld1 {v8.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v12.2d}, [pCRow1]
+ fmla v12.2d, v20.2d, alphaV1
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v8.2d}, [pCRow2]
+ fmla v8.2d, v24.2d, alphaV2
+ st1 {v8.2d}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v12.2d}, [pCRow1]
+ fmla v12.2d, v28.2d, alphaV3
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x4
+ fmov d16, xzr
+ fmov d20, d16
+.endm
+
+.macro KERNEL1x4_SUB
+ ldr d0, [pA]
+ add pA, pA, #8
+
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+
+ fmla v16.2d, v8.2d, v0.d[0]
+ fmla v20.2d, v9.2d, v0.d[0]
+.endm
+
+.macro SAVE1x4
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v8.d}[0], [pCRow0]
+ ld1 {v8.d}[1], [pCRow1]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.d}[0], [pCRow0]
+ st1 {v8.d}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v12.d}[0], [pCRow2]
+ ld1 {v12.d}[1], [pCRow1]
+ fmla v12.2d, v20.2d, alphaV1
+ st1 {v12.d}[0], [pCRow2]
+ st1 {v12.d}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x2
+ fmov d16, xzr
+ fmov d17, xzr
+ fmov d18, d16
+ fmov d19, d17
+ fmov d20, xzr
+ fmov d21, d16
+ fmov d22, d17
+ fmov d23, d18
+.endm
+
+.macro KERNEL8x2_SUB
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v8.2d}, [pB]
+ add pB, pB, #16
+ ld1 {v2.2d, v3.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v18.2d, v2.2d, v8.2d[0]
+ fmla v19.2d, v3.2d, v8.2d[0]
+
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v22.2d, v2.2d, v8.2d[1]
+ fmla v23.2d, v3.2d, v8.2d[1]
+.endm
+
+.macro SAVE8x2
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
+ fmla v0.2d, v16.2d, alphaV0
+ fmla v1.2d, v17.2d, alphaV1
+ fmla v2.2d, v18.2d, alphaV2
+ fmla v3.2d, v19.2d, alphaV3
+ st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
+
+ ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
+ fmla v4.2d, v20.2d, alphaV0
+ fmla v5.2d, v21.2d, alphaV1
+ fmla v6.2d, v22.2d, alphaV2
+ fmla v7.2d, v23.2d, alphaV3
+ st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x2
+ fmov d16, xzr
+ fmov d17, d16
+ fmov d20, d17
+ fmov d21, d16
+.endm
+
+.macro KERNEL4x2_SUB
+ ld1 {v8.2d}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v21.2d, v1.2d, v8.2d[1]
+.endm
+
+.macro SAVE4x2
+ ld1 {v8.2d, v9.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ fmla v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v12.2d, v13.2d}, [pCRow1]
+ fmla v12.2d, v20.2d, alphaV2
+ fmla v13.2d, v21.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x2
+ fmov d16, xzr
+ fmov d20, d16
+.endm
+
+.macro KERNEL2x2_SUB
+ ld1 {v8.2d}, [pB]
+ add pB, pB, #16
+
+ ld1 {v0.2d}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+.endm
+
+.macro SAVE2x2
+ ld1 {v8.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow1 , pCRow0, LDC
+
+ ld1 {v12.2d}, [pCRow1]
+ fmla v12.2d, v20.2d, alphaV1
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+ fmov d16, xzr
+.endm
+
+.macro KERNEL1x2_SUB
+ ld1 {v8.2d} , [pB]
+ add pB , pB, #16
+
+ ldr d0 , [pA]
+ add pA, pA, #8
+
+ fmla v16.2d, v8.2d, v0.2d[0]
+.endm
+
+.macro SAVE1x2
+ add pCRow1 , pCRow0, LDC
+
+ ld1 {v8.d}[0], [pCRow0]
+ ld1 {v8.d}[1], [pCRow1]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.d}[0], [pCRow0]
+ st1 {v8.d}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x1
+ fmov d16, xzr
+ fmov d17, xzr
+ fmov d18, d16
+ fmov d19, d17
+.endm
+
+.macro KERNEL8x1_SUB
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA , pA, #32
+
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ld1 {v2.2d, v3.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v18.2d, v2.2d, v8.2d[0]
+ fmla v19.2d, v3.2d, v8.2d[0]
+.endm
+
+.macro SAVE8x1
+ ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
+ fmla v0.2d, v16.2d, alphaV0
+ fmla v1.2d, v17.2d, alphaV1
+ fmla v2.2d, v18.2d, alphaV2
+ fmla v3.2d, v19.2d, alphaV3
+ st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT4x1
+ fmov d16, xzr
+ fmov d17, d16
+.endm
+
+.macro KERNEL4x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA , pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+.endm
+
+.macro SAVE4x1
+ ld1 {v8.2d, v9.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ fmla v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT2x1
+ fmov d16, xzr
+.endm
+
+.macro KERNEL2x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ld1 {v0.2d}, [pA]
+ add pA , pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+.endm
+
+.macro SAVE2x1
+ ld1 {v8.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+ fmov d16, xzr
+.endm
+
+.macro KERNEL1x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ldr d0, [pA]
+ add pA , pA, #8
+
+ fmadd d16, d0, d8, d16
+.endm
+
+.macro SAVE1x1
+ ldr d8, [pCRow0]
+ fmadd d8, d16, alpha0, d8
+ str d8, [pCRow0]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
+ PROLOGUE
+
+ .align 5
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+
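+	// alpha arrives in d0; park it in the GP register x17 so d0-d7 remain
+	// free as scratch inside the kernels. SAVE8x4 moves it back into d10
+	// (alpha0) right before it is needed.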
+ fmov alpha, d0
+
+ lsl LDC, LDC, #3 // ldc = ldc * 8
+
+ mov pB, origPB
+
+ mov counterJ, origN
+ asr counterJ, counterJ, #2 // J = J / 4
+ cmp counterJ, #0
+ ble dgemm_kernel_L2_BEGIN
+
+/******************************************************************************/
+
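+// Label naming: dgemm_kernel_L<n>_M<m>_* handles a block of <n> columns of
+// B against <m> rows of A. The outer J loop walks N in steps of 4, the I
+// loop walks M in steps of 8, and the inner L loop walks K (unrolled by 8
+// in the pipelined path); M and N remainders fall through to the narrower
+// sections below.
+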
+dgemm_kernel_L4_BEGIN:
+ mov pCRow0, pC
+ add pCRow1, pCRow0, LDC
+ add pCRow2, pCRow1, LDC
+ add pCRow3, pCRow2, LDC
+ add pC, pCRow3, LDC
+
+ mov pA, origPA // pA = start of A array
+
+dgemm_kernel_L4_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble dgemm_kernel_L4_M4_BEGIN
+
+dgemm_kernel_L4_M8_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // L = K / 8
+	cmp	counterL , #2				// at least 2 blocks of 8 to do?
+ blt dgemm_kernel_L4_M8_32
+
+ KERNEL8x4_I
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+
+ subs counterL, counterL, #2 // subtract 2
+ ble dgemm_kernel_L4_M8_22a
+ .align 5
+
+dgemm_kernel_L4_M8_22:
+
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M8_22
+
+
+dgemm_kernel_L4_M8_22a:
+
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_E
+
+ b dgemm_kernel_L4_M8_44
+
+dgemm_kernel_L4_M8_32:
+
+ tst counterL, #1
+ ble dgemm_kernel_L4_M8_40
+
+ KERNEL8x4_I
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_E
+
+ b dgemm_kernel_L4_M8_44
+
+dgemm_kernel_L4_M8_40:
+
+ INIT8x4
+
+dgemm_kernel_L4_M8_44:
+
+ ands counterL , origK, #7
+ ble dgemm_kernel_L4_M8_100
+
+dgemm_kernel_L4_M8_46:
+
+ KERNEL8x4_SUB
+
+ subs counterL, counterL, #1
+ bne dgemm_kernel_L4_M8_46
+
+dgemm_kernel_L4_M8_100:
+
+ SAVE8x4
+
+dgemm_kernel_L4_M8_END:
+ subs counterI, counterI, #1
+ bne dgemm_kernel_L4_M8_20
+
+dgemm_kernel_L4_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble dgemm_kernel_L4_END
+
+ tst counterI, #4
+ ble dgemm_kernel_L4_M2_BEGIN
+
+dgemm_kernel_L4_M4_20:
+
+ INIT4x4
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L4_M4_40
+
+dgemm_kernel_L4_M4_22:
+
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M4_22
+
+dgemm_kernel_L4_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L4_M4_100
+
+dgemm_kernel_L4_M4_42:
+
+ KERNEL4x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M4_42
+
+dgemm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+dgemm_kernel_L4_M4_END:
+
+
+dgemm_kernel_L4_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dgemm_kernel_L4_END
+
+	tst	counterI, #2					// at least 2 rows of M left?
+ ble dgemm_kernel_L4_M1_BEGIN
+
+dgemm_kernel_L4_M2_20:
+
+ INIT2x4
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L4_M2_40
+
+dgemm_kernel_L4_M2_22:
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M2_22
+
+
+dgemm_kernel_L4_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L4_M2_100
+
+dgemm_kernel_L4_M2_42:
+
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M2_42
+
+dgemm_kernel_L4_M2_100:
+
+ SAVE2x4
+
+dgemm_kernel_L4_M2_END:
+
+
+dgemm_kernel_L4_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dgemm_kernel_L4_END
+
+dgemm_kernel_L4_M1_20:
+
+ INIT1x4
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L4_M1_40
+
+dgemm_kernel_L4_M1_22:
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M1_22
+
+
+dgemm_kernel_L4_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L4_M1_100
+
+dgemm_kernel_L4_M1_42:
+
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M1_42
+
+dgemm_kernel_L4_M1_100:
+
+ SAVE1x4
+
+dgemm_kernel_L4_END:
+
+ lsl temp, origK, #5
+ add origPB, origPB, temp // B = B + K * 4 * 8
+
+ subs counterJ, counterJ , #1 // j--
+ bgt dgemm_kernel_L4_BEGIN
+
+
+/******************************************************************************/
+
+dgemm_kernel_L2_BEGIN:						// less than 4 left in N direction
+
+ mov counterJ , origN
+ tst counterJ , #3
+	ble	dgemm_kernel_L999				// nothing left in N (N mod 4 == 0)
+
+ tst counterJ , #2
+ ble dgemm_kernel_L1_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #1
+
+ mov pA, origPA // pA = A
+
+dgemm_kernel_L2_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble dgemm_kernel_L2_M4_BEGIN
+
+dgemm_kernel_L2_M8_20:
+
+ INIT8x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble dgemm_kernel_L2_M8_40
+ .align 5
+
+dgemm_kernel_L2_M8_22:
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M8_22
+
+
+dgemm_kernel_L2_M8_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L2_M8_100
+
+dgemm_kernel_L2_M8_42:
+
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M8_42
+
+dgemm_kernel_L2_M8_100:
+
+ SAVE8x2
+
+dgemm_kernel_L2_M8_END:
+
+ subs counterI, counterI, #1
+ bgt dgemm_kernel_L2_M8_20
+
+dgemm_kernel_L2_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble dgemm_kernel_L2_END
+
+	tst	counterI, #4					// at least 4 rows of M left?
+ ble dgemm_kernel_L2_M2_BEGIN
+
+dgemm_kernel_L2_M4_20:
+
+ INIT4x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble dgemm_kernel_L2_M4_40
+ .align 5
+
+dgemm_kernel_L2_M4_22:
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M4_22
+
+
+dgemm_kernel_L2_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L2_M4_100
+
+dgemm_kernel_L2_M4_42:
+
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M4_42
+
+dgemm_kernel_L2_M4_100:
+
+ SAVE4x2
+
+dgemm_kernel_L2_M4_END:
+
+
+dgemm_kernel_L2_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dgemm_kernel_L2_END
+
+	tst	counterI, #2					// at least 2 rows of M left?
+ ble dgemm_kernel_L2_M1_BEGIN
+
+dgemm_kernel_L2_M2_20:
+
+ INIT2x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble dgemm_kernel_L2_M2_40
+
+dgemm_kernel_L2_M2_22:
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M2_22
+
+
+dgemm_kernel_L2_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L2_M2_100
+
+dgemm_kernel_L2_M2_42:
+
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M2_42
+
+dgemm_kernel_L2_M2_100:
+
+ SAVE2x2
+
+dgemm_kernel_L2_M2_END:
+
+
+dgemm_kernel_L2_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dgemm_kernel_L2_END
+
+dgemm_kernel_L2_M1_20:
+
+ INIT1x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL, #0
+ ble dgemm_kernel_L2_M1_40
+
+dgemm_kernel_L2_M1_22:
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M1_22
+
+
+dgemm_kernel_L2_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L2_M1_100
+
+dgemm_kernel_L2_M1_42:
+
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M1_42
+
+dgemm_kernel_L2_M1_100:
+
+ SAVE1x2
+
+dgemm_kernel_L2_END:
+ add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
+
+/******************************************************************************/
+
+dgemm_kernel_L1_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #1
+ ble dgemm_kernel_L999 // done
+
+ mov pCRow0, pC // pCRow0 = C
+	add	pC , pC , LDC				// advance pC to the next column of C
+
+ mov pA, origPA // pA = A
+
+dgemm_kernel_L1_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble dgemm_kernel_L1_M4_BEGIN
+
+dgemm_kernel_L1_M8_20:
+
+ INIT8x1
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L1_M8_40
+ .align 5
+
+dgemm_kernel_L1_M8_22:
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M8_22
+
+
+dgemm_kernel_L1_M8_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L1_M8_100
+
+dgemm_kernel_L1_M8_42:
+
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M8_42
+
+dgemm_kernel_L1_M8_100:
+
+ SAVE8x1
+
+dgemm_kernel_L1_M8_END:
+
+ subs counterI, counterI, #1
+ bgt dgemm_kernel_L1_M8_20
+
+dgemm_kernel_L1_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble dgemm_kernel_L1_END
+
+	tst	counterI, #4					// at least 4 rows of M left?
+ ble dgemm_kernel_L1_M2_BEGIN
+
+dgemm_kernel_L1_M4_20:
+
+ INIT4x1
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L1_M4_40
+ .align 5
+
+dgemm_kernel_L1_M4_22:
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M4_22
+
+
+dgemm_kernel_L1_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L1_M4_100
+
+dgemm_kernel_L1_M4_42:
+
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M4_42
+
+dgemm_kernel_L1_M4_100:
+
+ SAVE4x1
+
+dgemm_kernel_L1_M4_END:
+
+dgemm_kernel_L1_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dgemm_kernel_L1_END
+
+	tst	counterI, #2					// at least 2 rows of M left?
+ ble dgemm_kernel_L1_M1_BEGIN
+
+dgemm_kernel_L1_M2_20:
+
+ INIT2x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L1_M2_40
+
+dgemm_kernel_L1_M2_22:
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M2_22
+
+
+dgemm_kernel_L1_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L1_M2_100
+
+dgemm_kernel_L1_M2_42:
+
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M2_42
+
+dgemm_kernel_L1_M2_100:
+
+ SAVE2x1
+
+dgemm_kernel_L1_M2_END:
+
+
+dgemm_kernel_L1_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dgemm_kernel_L1_END
+
+dgemm_kernel_L1_M1_20:
+
+ INIT1x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L1_M1_40
+
+dgemm_kernel_L1_M1_22:
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M1_22
+
+
+dgemm_kernel_L1_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L1_M1_100
+
+dgemm_kernel_L1_M1_42:
+
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M1_42
+
+dgemm_kernel_L1_M1_100:
+
+ SAVE1x1
+
+
+dgemm_kernel_L1_END:
+
+
+dgemm_kernel_L999:
+ mov x0, #0 // set return value
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+ ret
+
+ EPILOGUE
+
diff --git a/kernel/arm64/dtrmm_kernel_4x8.S b/kernel/arm64/dtrmm_kernel_4x8.S
new file mode 100755
index 000000000..eb7397faa
--- /dev/null
+++ b/kernel/arm64/dtrmm_kernel_4x8.S
@@ -0,0 +1,2026 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* X0 X1 X2 s0 X3 x4 x5 x6 x7*/
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */
+
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define offset x7
+#define counterL x8
+#define counterI x9
+#define counterJ x10
+#define pB x11
+#define pCRow0 x12
+#define pCRow1 x13
+#define pCRow2 x14
+#define pA x15
+#define temp x16
+#define tempOffset x17
+#define tempK x18
+
+#define alpha0 d2
+#define alphaV0 v2.d[0]
+#define alpha1 d3
+#define alphaV1 v3.d[0]
+#define alpha2 d6
+#define alphaV2 v6.d[0]
+#define alpha3 d7
+#define alphaV3 v7.d[0]
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 offset
+// 08 counterL
+// 09 counterI
+// 10 counterJ
+// 11 pB
+// 12 pCRow0
+// 13 pCRow1
+// 14 pCRow2
+// 15 pA
+// 16 temp
+// 17 tempOffset
+// 18 must save tempK
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 ALPHA -> pA00, pA01
+//v01 pA02, pA03
+//v02 ALPHA0
+//v03 ALPHA1
+//v04 pA10, pA11
+//v05 pA12, pA13
+//v06 ALPHA2
+//v07 ALPHA3
+//v08 must save pB0_0, pB0_1
+//v09 must save pB0_2, pB0_3
+//v10 must save pB0_4, pB0_5
+//v11 must save pB0_6, pB0_7
+//v12 must save pB1_0, pB1_1
+//v13 must save pB1_2, pB1_3
+//v14 must save pB1_4, pB1_5
+//v15 must save pB1_6, pB1_7
+//v16 must save C00, C01
+//v17 must save C02, C03
+//v18 C04, C05
+//v19 C06, C07
+//v20 C10, C11
+//v21 C12, C13
+//v22 C14, C15
+//v23 C16, C17
+//v24 C20, C21
+//v25 C22, C23
+//v26 C24, C25
+//v27 C26, C27
+//v28 C30, C31
+//v29 C32, C33
+//v30 C34, C35
+//v31 C36, C37
+
+/*******************************************************************************
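+// TRMM counterpart of the 4x8 dgemm kernel: the same 4x8 blocking and
+// pipelined K loop, but C is overwritten (C = alpha*A*B, so the SAVE macros
+// do not read the old C), and the offset/tempOffset/tempK bookkeeping below
+// limits each tile to the K range implied by the triangular operand
+// (selected at compile time via LEFT/TRANSA).
+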
+* Macro definitions
+*******************************************************************************/
+
+.macro INIT4x8
+ fmov d16, xzr
+ fmov d17, xzr
+ fmov d18, xzr
+ fmov d19, d16
+ fmov d20, xzr
+ fmov d21, d16
+ fmov d22, d17
+ fmov d23, d18
+ fmov d24, xzr
+ fmov d25, d16
+ fmov d26, d17
+ fmov d27, d18
+ fmov d28, xzr
+ fmov d29, d16
+ fmov d30, d17
+ fmov d31, d18
+.endm
+
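+// As in the gemm kernel, the K loop is software pipelined: KERNEL4x8_I
+// primes the pipeline with fmul and loads the second register set
+// (v4/v5 and v12-v15), M1/M2 alternate compute and load between the two
+// sets, E drains the pipeline, and SUB is the plain single-step version
+// used for remainders.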
+.macro KERNEL4x8_I
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
+ fmul v16.2d, v0.2d, v8.2d[0]
+ fmul v17.2d, v1.2d, v8.2d[0]
+ fmul v18.2d, v0.2d, v8.2d[1]
+ fmul v19.2d, v1.2d, v8.2d[1]
+
+ fmul v20.2d, v0.2d, v9.2d[0]
+ fmul v21.2d, v1.2d, v9.2d[0]
+ fmul v22.2d, v0.2d, v9.2d[1]
+ fmul v23.2d, v1.2d, v9.2d[1]
+
+ fmul v24.2d, v0.2d, v10.2d[0]
+ fmul v25.2d, v1.2d, v10.2d[0]
+ fmul v26.2d, v0.2d, v10.2d[1]
+ fmul v27.2d, v1.2d, v10.2d[1]
+
+ fmul v28.2d, v0.2d, v11.2d[0]
+ fmul v29.2d, v1.2d, v11.2d[0]
+ fmul v30.2d, v0.2d, v11.2d[1]
+ fmul v31.2d, v1.2d, v11.2d[1]
+
+ ld1 {v12.2d, v13.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v4.2d, v5.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v14.2d, v15.2d}, [pB]
+ add pB, pB, #32
+.endm
+
+.macro KERNEL4x8_M1
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v18.2d, v0.2d, v8.2d[1]
+ fmla v19.2d, v1.2d, v8.2d[1]
+
+ fmla v20.2d, v0.2d, v9.2d[0]
+ fmla v21.2d, v1.2d, v9.2d[0]
+ fmla v22.2d, v0.2d, v9.2d[1]
+ fmla v23.2d, v1.2d, v9.2d[1]
+
+ fmla v24.2d, v0.2d, v10.2d[0]
+ fmla v25.2d, v1.2d, v10.2d[0]
+ fmla v26.2d, v0.2d, v10.2d[1]
+ fmla v27.2d, v1.2d, v10.2d[1]
+
+ fmla v28.2d, v0.2d, v11.2d[0]
+ fmla v29.2d, v1.2d, v11.2d[0]
+ fmla v30.2d, v0.2d, v11.2d[1]
+ fmla v31.2d, v1.2d, v11.2d[1]
+
+ ld1 {v12.2d, v13.2d}, [pB] // For next round
+ add pB, pB, #32
+ ld1 {v4.2d, v5.2d}, [pA] // For next round
+ add pA, pA, #32
+ ld1 {v14.2d, v15.2d}, [pB]
+ add pB, pB, #32
+
+ prfm PLDL1KEEP, [pA, #512]
+.endm
+
+.macro KERNEL4x8_M2
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v18.2d, v4.2d, v12.2d[1]
+ fmla v19.2d, v5.2d, v12.2d[1]
+
+ fmla v20.2d, v4.2d, v13.2d[0]
+ fmla v21.2d, v5.2d, v13.2d[0]
+ fmla v22.2d, v4.2d, v13.2d[1]
+ fmla v23.2d, v5.2d, v13.2d[1]
+
+ fmla v24.2d, v4.2d, v14.2d[0]
+ fmla v25.2d, v5.2d, v14.2d[0]
+ fmla v26.2d, v4.2d, v14.2d[1]
+ fmla v27.2d, v5.2d, v14.2d[1]
+
+ fmla v28.2d, v4.2d, v15.2d[0]
+ fmla v29.2d, v5.2d, v15.2d[0]
+ fmla v30.2d, v4.2d, v15.2d[1]
+ fmla v31.2d, v5.2d, v15.2d[1]
+
+ ld1 {v8.2d, v9.2d}, [pB] // For next round
+ add pB, pB, #32
+ ld1 {v0.2d, v1.2d}, [pA] // For next round
+ add pA, pA, #32
+ ld1 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
+ prfm PLDL1KEEP, [pB, #512]
+.endm
+
+.macro KERNEL4x8_E
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v18.2d, v4.2d, v12.2d[1]
+ fmla v19.2d, v5.2d, v12.2d[1]
+
+ fmla v20.2d, v4.2d, v13.2d[0]
+ fmla v21.2d, v5.2d, v13.2d[0]
+ fmla v22.2d, v4.2d, v13.2d[1]
+ fmla v23.2d, v5.2d, v13.2d[1]
+
+ fmla v24.2d, v4.2d, v14.2d[0]
+ fmla v25.2d, v5.2d, v14.2d[0]
+ fmla v26.2d, v4.2d, v14.2d[1]
+ fmla v27.2d, v5.2d, v14.2d[1]
+
+ fmla v28.2d, v4.2d, v15.2d[0]
+ fmla v29.2d, v5.2d, v15.2d[0]
+ fmla v30.2d, v4.2d, v15.2d[1]
+ fmla v31.2d, v5.2d, v15.2d[1]
+.endm
+
+.macro KERNEL4x8_SUB
+	ld1	{v8.2d, v9.2d}, [pB]		// B values for this K step
+ add pB, pB, #32
+	ld1	{v0.2d, v1.2d}, [pA]		// A values for this K step
+ add pA, pA, #32
+ ld1 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v18.2d, v0.2d, v8.2d[1]
+ fmla v19.2d, v1.2d, v8.2d[1]
+
+ fmla v20.2d, v0.2d, v9.2d[0]
+ fmla v21.2d, v1.2d, v9.2d[0]
+ fmla v22.2d, v0.2d, v9.2d[1]
+ fmla v23.2d, v1.2d, v9.2d[1]
+
+ fmla v24.2d, v0.2d, v10.2d[0]
+ fmla v25.2d, v1.2d, v10.2d[0]
+ fmla v26.2d, v0.2d, v10.2d[1]
+ fmla v27.2d, v1.2d, v10.2d[1]
+
+ fmla v28.2d, v0.2d, v11.2d[0]
+ fmla v29.2d, v1.2d, v11.2d[0]
+ fmla v30.2d, v0.2d, v11.2d[1]
+ fmla v31.2d, v1.2d, v11.2d[1]
+.endm
+
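+// Unlike the gemm SAVE macros, the trmm SAVE macros do not accumulate into
+// the existing C: the results are scaled by alpha with fmul and stored
+// directly, since TRMM overwrites C.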
+.macro SAVE4x8
+ add pCRow1, pCRow0, LDC
+
+ fmul v8.2d, v16.2d, alphaV0
+ fmul v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v10.2d, v18.2d, alphaV2
+ fmul v11.2d, v19.2d, alphaV3
+ st1 {v10.2d, v11.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2d, v20.2d, alphaV0
+ fmul v13.2d, v21.2d, alphaV1
+ st1 {v12.2d, v13.2d}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v14.2d, v22.2d, alphaV2
+ fmul v15.2d, v23.2d, alphaV3
+ st1 {v14.2d, v15.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v8.2d, v24.2d, alphaV0
+ fmul v9.2d, v25.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v10.2d, v26.2d, alphaV2
+ fmul v11.2d, v27.2d, alphaV3
+ st1 {v10.2d, v11.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2d, v28.2d, alphaV0
+ fmul v13.2d, v29.2d, alphaV1
+ st1 {v12.2d, v13.2d}, [pCRow2]
+
+ fmul v14.2d, v30.2d, alphaV2
+ fmul v15.2d, v31.2d, alphaV3
+ st1 {v14.2d, v15.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x8
+ fmov d16, xzr
+ fmov d18, xzr
+ fmov d20, xzr
+ fmov d22, d16
+ fmov d24, xzr
+ fmov d26, d16
+ fmov d28, xzr
+ fmov d30, d16
+.endm
+
+.macro KERNEL2x8_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d}, [pA]
+ add pA, pA, #16
+ ld1 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v18.2d, v0.2d, v8.2d[1]
+
+ fmla v20.2d, v0.2d, v9.2d[0]
+ fmla v22.2d, v0.2d, v9.2d[1]
+
+ fmla v24.2d, v0.2d, v10.2d[0]
+ fmla v26.2d, v0.2d, v10.2d[1]
+
+ fmla v28.2d, v0.2d, v11.2d[0]
+ fmla v30.2d, v0.2d, v11.2d[1]
+.endm
+
+.macro SAVE2x8
+ add pCRow1, pCRow0, LDC
+
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v10.2d, v18.2d, alphaV2
+ st1 {v10.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2d, v20.2d, alphaV0
+ st1 {v12.2d}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v14.2d, v22.2d, alphaV2
+ st1 {v14.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v8.2d, v24.2d, alphaV0
+ st1 {v8.2d}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v10.2d, v26.2d, alphaV2
+ st1 {v10.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2d, v28.2d, alphaV0
+ st1 {v12.2d}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v14.2d, v30.2d, alphaV2
+ st1 {v14.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x8
+ fmov d16, xzr
+ fmov d20, xzr
+ fmov d24, xzr
+ fmov d28, xzr
+.endm
+
+.macro KERNEL1x8_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ldr d0, [pA]
+ add pA, pA, #8
+ ld1 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
+ fmla v16.2d, v8.2d, v0.d[0]
+ fmla v20.2d, v9.2d, v0.d[0]
+ fmla v24.2d, v10.2d, v0.d[0]
+ fmla v28.2d, v11.2d, v0.d[0]
+.endm
+
+.macro SAVE1x8
+ add pCRow1, pCRow0, LDC
+
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.d}[0], [pCRow0]
+ st1 {v8.d}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+ fmul v10.2d, v20.2d, alphaV1
+ st1 {v10.d}[0], [pCRow2]
+ st1 {v10.d}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2d, v24.2d, alphaV2
+ st1 {v12.d}[0], [pCRow2]
+ st1 {v12.d}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+ fmul v14.2d, v28.2d, alphaV3
+ st1 {v14.d}[0], [pCRow2]
+ st1 {v14.d}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x4
+ fmov d16, xzr
+ fmov d17, d16
+ fmov d20, d17
+ fmov d21, d16
+ fmov d24, d17
+ fmov d25, d16
+ fmov d28, d17
+ fmov d29, d16
+.endm
+
+.macro KERNEL4x4_I
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+
+ fmul v16.2d, v0.2d, v8.2d[0]
+ fmul v29.2d, v1.2d, v9.2d[1]
+
+ fmul v20.2d, v0.2d, v8.2d[1]
+ fmul v25.2d, v1.2d, v9.2d[0]
+
+ fmul v24.2d, v0.2d, v9.2d[0]
+ fmul v21.2d, v1.2d, v8.2d[1]
+
+ fmul v28.2d, v0.2d, v9.2d[1]
+ fmul v17.2d, v1.2d, v8.2d[0]
+
+ ld1 {v12.2d, v13.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v4.2d, v5.2d}, [pA]
+ add pA, pA, #32
+.endm
+
+.macro KERNEL4x4_M1
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v29.2d, v1.2d, v9.2d[1]
+
+ ld1 {v12.2d, v13.2d}, [pB] // For next round
+ add pB, pB, #32
+
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v25.2d, v1.2d, v9.2d[0]
+
+ ld1 {v4.2d, v5.2d}, [pA] // For next round
+ add pA, pA, #32
+
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v21.2d, v1.2d, v8.2d[1]
+
+ prfm PLDL1KEEP, [pA, #512]
+
+ fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v17.2d, v1.2d, v8.2d[0]
+.endm
+
+.macro KERNEL4x4_M2
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v29.2d, v5.2d, v13.2d[1]
+
+ ld1 {v8.2d, v9.2d}, [pB] // For next round
+ add pB, pB, #32
+
+ fmla v20.2d, v4.2d, v12.2d[1]
+ fmla v25.2d, v5.2d, v13.2d[0]
+
+ ld1 {v0.2d, v1.2d}, [pA] // For next round
+ add pA, pA, #32
+
+ fmla v24.2d, v4.2d, v13.2d[0]
+ fmla v21.2d, v5.2d, v12.2d[1]
+
+ prfm PLDL1KEEP, [pB, #512]
+
+ fmla v28.2d, v4.2d, v13.2d[1]
+ fmla v17.2d, v5.2d, v12.2d[0]
+.endm
+
+.macro KERNEL4x4_E
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v29.2d, v5.2d, v13.2d[1]
+
+ fmla v20.2d, v4.2d, v12.2d[1]
+ fmla v25.2d, v5.2d, v13.2d[0]
+
+ fmla v24.2d, v4.2d, v13.2d[0]
+ fmla v21.2d, v5.2d, v12.2d[1]
+
+ fmla v28.2d, v4.2d, v13.2d[1]
+ fmla v17.2d, v5.2d, v12.2d[0]
+.endm
+
+.macro KERNEL4x4_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v29.2d, v1.2d, v9.2d[1]
+
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v25.2d, v1.2d, v9.2d[0]
+
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v21.2d, v1.2d, v8.2d[1]
+
+ fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v17.2d, v1.2d, v8.2d[0]
+.endm
+
+.macro SAVE4x4
+ fmul v8.2d, v16.2d, alphaV0
+ fmul v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ fmul v12.2d, v20.2d, alphaV2
+ fmul v13.2d, v21.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v8.2d, v24.2d, alphaV0
+ fmul v9.2d, v25.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2d, v28.2d, alphaV2
+ fmul v13.2d, v29.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x4
+ fmov d16, xzr
+ fmov d20, d16
+ fmov d24, d20
+ fmov d28, d16
+.endm
+
+.macro KERNEL2x4_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v28.2d, v0.2d, v9.2d[1]
+.endm
+
+.macro SAVE2x4
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ fmul v12.2d, v20.2d, alphaV1
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v8.2d, v24.2d, alphaV2
+ st1 {v8.2d}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2d, v28.2d, alphaV3
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x4
+ fmov d16, xzr
+ fmov d20, d16
+.endm
+
+.macro KERNEL1x4_SUB
+ ldr d0, [pA]
+ add pA, pA, #8
+
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+
+ fmla v16.2d, v8.2d, v0.d[0]
+ fmla v20.2d, v9.2d, v0.d[0]
+.endm
+
+.macro SAVE1x4
+ add pCRow1, pCRow0, LDC
+
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.d}[0], [pCRow0]
+ st1 {v8.d}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2d, v20.2d, alphaV1
+ st1 {v12.d}[0], [pCRow2]
+ st1 {v12.d}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x2
+ fmov d16, xzr
+ fmov d17, d16
+ fmov d20, d17
+ fmov d21, d16
+.endm
+
+.macro KERNEL4x2_SUB
+ ld1 {v8.2d}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v21.2d, v1.2d, v8.2d[1]
+.endm
+
+.macro SAVE4x2
+ fmul v8.2d, v16.2d, alphaV0
+ fmul v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ fmul v12.2d, v20.2d, alphaV2
+ fmul v13.2d, v21.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x2
+ fmov d16, xzr
+ fmov d20, d16
+.endm
+
+.macro KERNEL2x2_SUB
+ ld1 {v8.2d}, [pB]
+ add pB, pB, #16
+
+ ld1 {v0.2d}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+.endm
+
+.macro SAVE2x2
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow1 , pCRow0, LDC
+
+ fmul v12.2d, v20.2d, alphaV1
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+ fmov d16, xzr
+.endm
+
+.macro KERNEL1x2_SUB
+ ld1 {v8.2d} , [pB]
+ add pB , pB, #16
+
+ ldr d0 , [pA]
+ add pA, pA, #8
+
+ fmla v16.2d, v8.2d, v0.2d[0]
+.endm
+
+.macro SAVE1x2
+ add pCRow1 , pCRow0, LDC
+
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.d}[0], [pCRow0]
+ st1 {v8.d}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x1
+ fmov d16, xzr
+ fmov d17, d16
+.endm
+
+.macro KERNEL4x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA , pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+.endm
+
+.macro SAVE4x1
+ fmul v8.2d, v16.2d, alphaV0
+ fmul v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+
+
+
+/******************************************************************************/
+
+.macro INIT2x1
+ fmov d16, xzr
+.endm
+
+.macro KERNEL2x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ld1 {v0.2d}, [pA]
+ add pA , pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+.endm
+
+.macro SAVE2x1
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+ fmov d16, xzr
+.endm
+
+.macro KERNEL1x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ldr d0, [pA]
+ add pA , pA, #8
+
+ fmadd d16, d0, d8, d16
+.endm
+
+.macro SAVE1x1
+ fmul d8, d16, alpha0
+ str d8, [pCRow0]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
+ PROLOGUE
+
+ .align 5
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+
+ fmov alpha0, d0
+ fmov alpha1, d0
+ fmov alpha2, d0
+ fmov alpha3, d0
+
+ lsl LDC, LDC, #3 // ldc = ldc * 8
+
+#if !defined(LEFT)
+ neg tempOffset, offset
+#endif
+
+ mov pB, origPB
+
+ mov counterJ, origN
+ asr counterJ, counterJ, #3 // J = J / 8
+ cmp counterJ, #0
+ ble dtrmm_kernel_L4_BEGIN
+
+/******************************************************************************/
+
+dtrmm_kernel_L8_BEGIN:
+
+ mov pCRow0, pC // pCRow0 = C
+ add pC, pC, LDC, lsl #3
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+
+ mov pA, origPA // pA = start of A array
+
+dtrmm_kernel_L8_M4_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #2 // counterI = counterI / 4
+ cmp counterI, #0
+ ble dtrmm_kernel_L8_M2_BEGIN
+
+dtrmm_kernel_L8_M4_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+ lsl temp, tempOffset, #6
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #8
+#endif
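+
+// The two conditional blocks above are the usual TRMM bookkeeping: they pick
+// where this tile starts inside the packed A and B panels (tempOffset scaled
+// by the 4-row and 8-column tile strides) and how many k iterations (tempK)
+// actually contribute, so only the triangular part of the factor is touched.
+// The same pattern repeats, with adjusted shifts, for every tile size below.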
+
+ asr counterL, tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt dtrmm_kernel_L8_M4_32
+
+ KERNEL4x8_I // do one in the K
+ KERNEL4x8_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble dtrmm_kernel_L8_M4_22a
+ .align 5
+
+dtrmm_kernel_L8_M4_22:
+
+ KERNEL4x8_M1
+ KERNEL4x8_M2
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L8_M4_22
+
+
+dtrmm_kernel_L8_M4_22a:
+
+ KERNEL4x8_M1
+ KERNEL4x8_E
+
+ b dtrmm_kernel_L8_M4_44
+
+dtrmm_kernel_L8_M4_32:
+
+ tst counterL, #1
+ ble dtrmm_kernel_L8_M4_40
+
+ KERNEL4x8_I
+
+ KERNEL4x8_E
+
+ b dtrmm_kernel_L8_M4_44
+
+
+dtrmm_kernel_L8_M4_40:
+
+ INIT4x8
+
+dtrmm_kernel_L8_M4_44:
+
+ ands counterL, tempK, #1
+ ble dtrmm_kernel_L8_M4_100
+
+dtrmm_kernel_L8_M4_46:
+
+ KERNEL4x8_SUB
+
+dtrmm_kernel_L8_M4_100:
+
+ SAVE4x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #8
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #6
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+dtrmm_kernel_L8_M4_END:
+ subs counterI, counterI, #1
+ bne dtrmm_kernel_L8_M4_20
+
+dtrmm_kernel_L8_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dtrmm_kernel_L8_END
+
+	tst	counterI, #2			// process 2 rows if M & 2
+ ble dtrmm_kernel_L8_M1_BEGIN
+
+dtrmm_kernel_L8_M2_20:
+
+ INIT2x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+ lsl temp, tempOffset, #6
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #8
+#endif
+
+ asr counterL, tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L8_M2_40
+
+dtrmm_kernel_L8_M2_22:
+
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L8_M2_22
+
+
+dtrmm_kernel_L8_M2_40:
+
+ ands counterL, tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L8_M2_100
+
+dtrmm_kernel_L8_M2_42:
+
+ KERNEL2x8_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L8_M2_42
+
+dtrmm_kernel_L8_M2_100:
+
+ SAVE2x8
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #8
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #6
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+
+dtrmm_kernel_L8_M2_END:
+
+
+dtrmm_kernel_L8_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dtrmm_kernel_L8_END
+
+dtrmm_kernel_L8_M1_20:
+
+ INIT1x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+ lsl temp, tempOffset, #6
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #8
+#endif
+
+ asr counterL, tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L8_M1_40
+
+dtrmm_kernel_L8_M1_22:
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L8_M1_22
+
+
+dtrmm_kernel_L8_M1_40:
+
+ ands counterL, tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L8_M1_100
+
+dtrmm_kernel_L8_M1_42:
+
+ KERNEL1x8_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L8_M1_42
+
+dtrmm_kernel_L8_M1_100:
+
+ SAVE1x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #8
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #6
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+
+dtrmm_kernel_L8_END:
+
+ lsl temp, origK, #6
+ add origPB, origPB, temp // B = B + K * 8 * 8
+
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+ subs counterJ, counterJ , #1 // j--
+ bgt dtrmm_kernel_L8_BEGIN
+
+
+/******************************************************************************/
+
+dtrmm_kernel_L4_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #7
+ ble dtrmm_kernel_L999
+
+ tst counterJ , #4
+ ble dtrmm_kernel_L2_BEGIN
+
+ mov pCRow0, pC // pCRow0 = C
+ add pC, pC, LDC, lsl #2
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+
+ mov pA, origPA // pA = start of A array
+
+dtrmm_kernel_L4_M4_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #2 // counterI = counterI / 4
+ cmp counterI, #0
+ ble dtrmm_kernel_L4_M2_BEGIN
+
+dtrmm_kernel_L4_M4_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #4
+#endif
+
+ asr counterL, tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt dtrmm_kernel_L4_M4_32
+
+ KERNEL4x4_I // do one in the K
+ KERNEL4x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble dtrmm_kernel_L4_M4_22a
+ .align 5
+
+dtrmm_kernel_L4_M4_22:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M4_22
+
+
+dtrmm_kernel_L4_M4_22a:
+
+ KERNEL4x4_M1
+ KERNEL4x4_E
+
+ b dtrmm_kernel_L4_M4_44
+
+dtrmm_kernel_L4_M4_32:
+
+ tst counterL, #1
+ ble dtrmm_kernel_L4_M4_40
+
+ KERNEL4x4_I
+
+ KERNEL4x4_E
+
+ b dtrmm_kernel_L4_M4_44
+
+
+dtrmm_kernel_L4_M4_40:
+
+ INIT4x4
+
+dtrmm_kernel_L4_M4_44:
+
+ ands counterL , tempK, #1
+ ble dtrmm_kernel_L4_M4_100
+
+dtrmm_kernel_L4_M4_46:
+
+ KERNEL4x4_SUB
+
+dtrmm_kernel_L4_M4_100:
+
+ SAVE4x4
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+dtrmm_kernel_L4_M4_END:
+ subs counterI, counterI, #1
+ bne dtrmm_kernel_L4_M4_20
+
+dtrmm_kernel_L4_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dtrmm_kernel_L4_END
+
+	tst	counterI, #2			// process 2 rows if M & 2
+ ble dtrmm_kernel_L4_M1_BEGIN
+
+dtrmm_kernel_L4_M2_20:
+
+ INIT2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #4
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L4_M2_40
+
+dtrmm_kernel_L4_M2_22:
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M2_22
+
+
+dtrmm_kernel_L4_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L4_M2_100
+
+dtrmm_kernel_L4_M2_42:
+
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M2_42
+
+dtrmm_kernel_L4_M2_100:
+
+ SAVE2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+dtrmm_kernel_L4_M2_END:
+
+
+dtrmm_kernel_L4_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dtrmm_kernel_L4_END
+
+dtrmm_kernel_L4_M1_20:
+
+ INIT1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #4
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L4_M1_40
+
+dtrmm_kernel_L4_M1_22:
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M1_22
+
+
+dtrmm_kernel_L4_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L4_M1_100
+
+dtrmm_kernel_L4_M1_42:
+
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M1_42
+
+dtrmm_kernel_L4_M1_100:
+
+ SAVE1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+dtrmm_kernel_L4_END:
+
+ lsl temp, origK, #5
+ add origPB, origPB, temp // B = B + K * 4 * 8
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+/******************************************************************************/
+
+dtrmm_kernel_L2_BEGIN:				// less than 4 columns of N left
+
+ mov counterJ , origN
+ tst counterJ , #3
+	ble	dtrmm_kernel_L999		// N mod 4 == 0: nothing left to do
+
+ tst counterJ , #2
+ ble dtrmm_kernel_L1_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #1
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = A
+
+
+dtrmm_kernel_L2_M4_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #2 // counterI = counterI / 4
+ cmp counterI,#0
+ ble dtrmm_kernel_L2_M2_BEGIN
+
+dtrmm_kernel_L2_M4_20:
+
+ INIT4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #2
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble dtrmm_kernel_L2_M4_40
+ .align 5
+
+dtrmm_kernel_L2_M4_22:
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M4_22
+
+
+dtrmm_kernel_L2_M4_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L2_M4_100
+
+dtrmm_kernel_L2_M4_42:
+
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M4_42
+
+dtrmm_kernel_L2_M4_100:
+
+ SAVE4x2
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+dtrmm_kernel_L2_M4_END:
+
+ subs counterI, counterI, #1
+ bgt dtrmm_kernel_L2_M4_20
+
+
+dtrmm_kernel_L2_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dtrmm_kernel_L2_END
+
+	tst	counterI, #2			// process 2 rows if M & 2
+ ble dtrmm_kernel_L2_M1_BEGIN
+
+dtrmm_kernel_L2_M2_20:
+
+ INIT2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #2
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble dtrmm_kernel_L2_M2_40
+
+dtrmm_kernel_L2_M2_22:
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M2_22
+
+
+dtrmm_kernel_L2_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L2_M2_100
+
+dtrmm_kernel_L2_M2_42:
+
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M2_42
+
+dtrmm_kernel_L2_M2_100:
+
+ SAVE2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+dtrmm_kernel_L2_M2_END:
+
+
+dtrmm_kernel_L2_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dtrmm_kernel_L2_END
+
+dtrmm_kernel_L2_M1_20:
+
+ INIT1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #2
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL, #0
+ ble dtrmm_kernel_L2_M1_40
+
+dtrmm_kernel_L2_M1_22:
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M1_22
+
+
+dtrmm_kernel_L2_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L2_M1_100
+
+dtrmm_kernel_L2_M1_42:
+
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M1_42
+
+dtrmm_kernel_L2_M1_100:
+
+ SAVE1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+dtrmm_kernel_L2_END:
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+ add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
+
+/******************************************************************************/
+
+dtrmm_kernel_L1_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #1
+ ble dtrmm_kernel_L999 // done
+
+
+ mov pCRow0, pC // pCRow0 = C
+ add pC , pC , LDC // Update pC to point to next
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = A
+
+dtrmm_kernel_L1_M4_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #2 // counterI = counterI / 4
+ cmp counterI, #0
+ ble dtrmm_kernel_L1_M2_BEGIN
+
+dtrmm_kernel_L1_M4_20:
+
+ INIT4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+#endif
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #1
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L1_M4_40
+ .align 5
+
+dtrmm_kernel_L1_M4_22:
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M4_22
+
+
+dtrmm_kernel_L1_M4_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L1_M4_100
+
+dtrmm_kernel_L1_M4_42:
+
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M4_42
+
+dtrmm_kernel_L1_M4_100:
+
+ SAVE4x1
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+dtrmm_kernel_L1_M4_END:
+
+ subs counterI, counterI, #1
+ bgt dtrmm_kernel_L1_M4_20
+
+
+dtrmm_kernel_L1_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dtrmm_kernel_L1_END
+
+	tst	counterI, #2			// process 2 rows if M & 2
+ ble dtrmm_kernel_L1_M1_BEGIN
+
+dtrmm_kernel_L1_M2_20:
+
+ INIT2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #1
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L1_M2_40
+
+dtrmm_kernel_L1_M2_22:
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M2_22
+
+
+dtrmm_kernel_L1_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L1_M2_100
+
+dtrmm_kernel_L1_M2_42:
+
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M2_42
+
+dtrmm_kernel_L1_M2_100:
+
+ SAVE2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+dtrmm_kernel_L1_M2_END:
+
+
+dtrmm_kernel_L1_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dtrmm_kernel_L1_END
+
+dtrmm_kernel_L1_M1_20:
+
+ INIT1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #1
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L1_M1_40
+
+dtrmm_kernel_L1_M1_22:
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M1_22
+
+
+dtrmm_kernel_L1_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L1_M1_100
+
+dtrmm_kernel_L1_M1_42:
+
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M1_42
+
+dtrmm_kernel_L1_M1_100:
+
+ SAVE1x1
+
+
+dtrmm_kernel_L1_END:
+
+
+dtrmm_kernel_L999:
+ mov x0, #0 // set return value
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+ ret
+
+ EPILOGUE
+
diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S
new file mode 100755
index 000000000..6890505bd
--- /dev/null
+++ b/kernel/arm64/dtrmm_kernel_8x4.S
@@ -0,0 +1,1849 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* X0 X1 X2 d0 X3 x4 x5 x6 x7 */
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */
+
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define offset x7
+#define counterL x8
+#define counterI x9
+#define counterJ x10
+#define pB x11
+#define pCRow0 x12
+#define pCRow1 x13
+#define pCRow2 x14
+#define pA x15
+#define temp x16
+#define tempOffset x17
+#define tempK x18
+
+#define alpha0 d10
+#define alphaV0 v10.d[0]
+#define alpha1 d11
+#define alphaV1 v11.d[0]
+#define alpha2 d14
+#define alphaV2 v14.d[0]
+#define alpha3 d15
+#define alphaV3 v15.d[0]
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 offset
+// 08 counterL
+// 09 counterI
+// 10 counterJ
+// 11 pB
+// 12 pCRow0
+// 13 pCRow1
+// 14 pCRow2
+// 15 pA
+// 16 temp
+// 17 tempOffset
+// 18 must save tempK
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 ALPHA -> pA0_0, pA0_1
+//v01 pA0_2, pA0_3
+//v02 pA0_4, pA0_5
+//v03 pA0_6, pA0_7
+//v04 pA1_0, pA1_1
+//v05 pA1_2, pA1_3
+//v06 pA1_4, pA1_5
+//v07 pA1_6, pA1_7
+//v08 must save pB0_0, pB0_1
+//v09 must save pB0_2, pB0_3
+//v10 must save ALPHA0
+//v11 must save ALPHA1
+//v12 must save pB1_0, pB1_1
+//v13 must save pB1_2, pB1_3
+//v14 must save ALPHA2
+//v15 must save ALPHA3
+//v16 must save C00, C01
+//v17 must save C02, C03
+//v18 C04, C05
+//v19 C06, C07
+//v20 C10, C11
+//v21 C12, C13
+//v22 C14, C15
+//v23 C16, C17
+//v24 C20, C21
+//v25 C22, C23
+//v26 C24, C25
+//v27 C26, C27
+//v28 C30, C31
+//v29 C32, C33
+//v30 C34, C35
+//v31 C36, C37
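+
+/* Illustrative C equivalent of one 8x4 micro-tile (a sketch, not build code;
+   it assumes the packed layout implied by the loads below, i.e. pA advances
+   8 doubles and pB 4 doubles per k step, and ldc is counted in elements):
+
+       double acc[4][8] = {0};                 // v16..v31, two doubles each
+       for (k = 0; k < K; k++)                 // K = k count for this tile
+           for (j = 0; j < 4; j++)
+               for (i = 0; i < 8; i++)
+                   acc[j][i] += pA[8*k + i] * pB[4*k + j];
+       for (j = 0; j < 4; j++)                 // SAVE8x4: scale and store,
+           for (i = 0; i < 8; i++)             // C is overwritten (TRMM)
+               C[j*ldc + i] = alpha * acc[j][i];
+*/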
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
+.macro INIT8x4
+ fmov d16, xzr
+ fmov d17, xzr
+ fmov d18, d16
+ fmov d19, xzr
+ fmov d20, xzr
+ fmov d21, d16
+ fmov d22, d17
+ fmov d23, d18
+ fmov d24, xzr
+ fmov d25, d16
+ fmov d26, d17
+ fmov d27, d18
+ fmov d28, xzr
+ fmov d29, d16
+ fmov d30, d17
+ fmov d31, d18
+.endm
+
+.macro KERNEL8x4_I
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v2.2d, v3.2d}, [pA]
+ add pA, pA, #32
+
+ fmul v16.2d, v0.2d, v8.2d[0]
+ fmul v17.2d, v1.2d, v8.2d[0]
+ fmul v18.2d, v2.2d, v8.2d[0]
+ fmul v19.2d, v3.2d, v8.2d[0]
+
+ fmul v20.2d, v0.2d, v8.2d[1]
+ fmul v21.2d, v1.2d, v8.2d[1]
+ fmul v22.2d, v2.2d, v8.2d[1]
+ fmul v23.2d, v3.2d, v8.2d[1]
+
+ fmul v24.2d, v0.2d, v9.2d[0]
+ fmul v25.2d, v1.2d, v9.2d[0]
+ fmul v26.2d, v2.2d, v9.2d[0]
+ fmul v27.2d, v3.2d, v9.2d[0]
+
+ fmul v28.2d, v0.2d, v9.2d[1]
+ fmul v29.2d, v1.2d, v9.2d[1]
+ fmul v30.2d, v2.2d, v9.2d[1]
+ fmul v31.2d, v3.2d, v9.2d[1]
+
+ ld1 {v4.2d, v5.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v12.2d, v13.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v6.2d, v7.2d}, [pA]
+ add pA, pA, #32
+.endm
+
+.macro KERNEL8x4_M1
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v18.2d, v2.2d, v8.2d[0]
+ fmla v19.2d, v3.2d, v8.2d[0]
+
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v22.2d, v2.2d, v8.2d[1]
+ fmla v23.2d, v3.2d, v8.2d[1]
+
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v26.2d, v2.2d, v9.2d[0]
+ fmla v27.2d, v3.2d, v9.2d[0]
+
+ fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v30.2d, v2.2d, v9.2d[1]
+ fmla v31.2d, v3.2d, v9.2d[1]
+
+ ld1 {v4.2d, v5.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v12.2d, v13.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v6.2d, v7.2d}, [pA]
+ add pA, pA, #32
+
+ prfm PLDL1KEEP, [pA, #512]
+.endm
+
+.macro KERNEL8x4_M2
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v18.2d, v6.2d, v12.2d[0]
+ fmla v19.2d, v7.2d, v12.2d[0]
+
+ fmla v20.2d, v4.2d, v12.2d[1]
+ fmla v21.2d, v5.2d, v12.2d[1]
+ fmla v22.2d, v6.2d, v12.2d[1]
+ fmla v23.2d, v7.2d, v12.2d[1]
+
+ fmla v24.2d, v4.2d, v13.2d[0]
+ fmla v25.2d, v5.2d, v13.2d[0]
+ fmla v26.2d, v6.2d, v13.2d[0]
+ fmla v27.2d, v7.2d, v13.2d[0]
+
+ fmla v28.2d, v4.2d, v13.2d[1]
+ fmla v29.2d, v5.2d, v13.2d[1]
+ fmla v30.2d, v6.2d, v13.2d[1]
+ fmla v31.2d, v7.2d, v13.2d[1]
+
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v2.2d, v3.2d}, [pA]
+ add pA, pA, #32
+
+ prfm PLDL1KEEP, [pB, #512]
+.endm
+
+.macro KERNEL8x4_E
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v18.2d, v6.2d, v12.2d[0]
+ fmla v19.2d, v7.2d, v12.2d[0]
+
+ fmla v20.2d, v4.2d, v12.2d[1]
+ fmla v21.2d, v5.2d, v12.2d[1]
+ fmla v22.2d, v6.2d, v12.2d[1]
+ fmla v23.2d, v7.2d, v12.2d[1]
+
+ fmla v24.2d, v4.2d, v13.2d[0]
+ fmla v25.2d, v5.2d, v13.2d[0]
+ fmla v26.2d, v6.2d, v13.2d[0]
+ fmla v27.2d, v7.2d, v13.2d[0]
+
+ fmla v28.2d, v4.2d, v13.2d[1]
+ fmla v29.2d, v5.2d, v13.2d[1]
+ fmla v30.2d, v6.2d, v13.2d[1]
+ fmla v31.2d, v7.2d, v13.2d[1]
+.endm
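+
+// KERNEL8x4_I/_M1/_M2/_E form a two-stage software pipeline: _I starts the
+// tile with fmul (so INIT8x4 is not needed on this path) and preloads the
+// next A/B fragments into v4-v7/v12-v13, _M1 and _M2 then alternate between
+// the two register sets -- each consumes one set while reloading the other --
+// and _E drains the last preloaded set without issuing further loads.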
+
+.macro KERNEL8x4_SUB
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v2.2d, v3.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v18.2d, v2.2d, v8.2d[0]
+ fmla v19.2d, v3.2d, v8.2d[0]
+
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v22.2d, v2.2d, v8.2d[1]
+ fmla v23.2d, v3.2d, v8.2d[1]
+
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v26.2d, v2.2d, v9.2d[0]
+ fmla v27.2d, v3.2d, v9.2d[0]
+
+ fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v30.2d, v2.2d, v9.2d[1]
+ fmla v31.2d, v3.2d, v9.2d[1]
+.endm
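+
+// KERNEL8x4_SUB is the plain, non-pipelined single-k step; the loop code
+// below uses it to finish off an odd k count after the _I/_M1/_M2/_E chain.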
+
+.macro SAVE8x4
+ add pCRow1, pCRow0, LDC
+
+ fmul v0.2d, v16.2d, alphaV0
+ fmul v1.2d, v17.2d, alphaV1
+ fmul v2.2d, v18.2d, alphaV2
+ fmul v3.2d, v19.2d, alphaV3
+ st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v4.2d, v20.2d, alphaV0
+ fmul v5.2d, v21.2d, alphaV1
+ fmul v6.2d, v22.2d, alphaV2
+ fmul v7.2d, v23.2d, alphaV3
+ st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v0.2d, v24.2d, alphaV0
+ fmul v1.2d, v25.2d, alphaV1
+ fmul v2.2d, v26.2d, alphaV2
+ fmul v3.2d, v27.2d, alphaV3
+ st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow2]
+
+ fmul v4.2d, v28.2d, alphaV0
+ fmul v5.2d, v29.2d, alphaV1
+ fmul v6.2d, v30.2d, alphaV2
+ fmul v7.2d, v31.2d, alphaV3
+ st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #64
+.endm
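+
+// The SAVE* macros scale the accumulators by alpha and overwrite C; there is
+// no read-modify-write of C, as expected for TRMM. alphaV0..alphaV3 all hold
+// the same value (see the fmov sequence in the prologue), presumably spread
+// over four registers to give the scheduler more freedom in the fmul chain.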
+
+/******************************************************************************/
+
+.macro INIT4x4
+ fmov d16, xzr
+ fmov d17, d16
+ fmov d20, d17
+ fmov d21, d16
+ fmov d24, d17
+ fmov d25, d16
+ fmov d28, d17
+ fmov d29, d16
+.endm
+
+.macro KERNEL4x4_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v29.2d, v1.2d, v9.2d[1]
+
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v25.2d, v1.2d, v9.2d[0]
+
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v21.2d, v1.2d, v8.2d[1]
+
+ fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v17.2d, v1.2d, v8.2d[0]
+.endm
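+
+// The fmla order in KERNEL4x4_SUB is intentionally interleaved: consecutive
+// instructions alternate between the v0- and v1-based products and never
+// touch the accumulator just written, presumably to keep independent work in
+// flight on the Cortex-A57 FP pipes. The result is the same 4x4 rank-1
+// update a straight ordering would give.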
+
+.macro SAVE4x4
+ fmul v8.2d, v16.2d, alphaV0
+ fmul v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ fmul v12.2d, v20.2d, alphaV2
+ fmul v13.2d, v21.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v8.2d, v24.2d, alphaV0
+ fmul v9.2d, v25.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2d, v28.2d, alphaV2
+ fmul v13.2d, v29.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+
+.macro INIT2x4
+ fmov d16, xzr
+ fmov d20, d16
+ fmov d24, d20
+ fmov d28, d16
+.endm
+
+.macro KERNEL2x4_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v28.2d, v0.2d, v9.2d[1]
+.endm
+
+.macro SAVE2x4
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ fmul v12.2d, v20.2d, alphaV1
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v8.2d, v24.2d, alphaV2
+ st1 {v8.2d}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2d, v28.2d, alphaV3
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x4
+ fmov d16, xzr
+ fmov d20, d16
+.endm
+
+.macro KERNEL1x4_SUB
+ ldr d0, [pA]
+ add pA, pA, #8
+
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+
+ fmla v16.2d, v8.2d, v0.d[0]
+ fmla v20.2d, v9.2d, v0.d[0]
+.endm
+
+.macro SAVE1x4
+ add pCRow1, pCRow0, LDC
+
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.d}[0], [pCRow0]
+ st1 {v8.d}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2d, v20.2d, alphaV1
+ st1 {v12.d}[0], [pCRow2]
+ st1 {v12.d}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
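+
+// In the 1xN cases the roles flip: the single A element is broadcast across
+// a vector of B values, so each accumulator lane belongs to a different
+// column of C. SAVE1x4 therefore stores the two lanes of v8 and v12 through
+// four separate column pointers instead of one contiguous store.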
+
+/******************************************************************************/
+
+.macro INIT8x2
+ fmov d16, xzr
+ fmov d17, xzr
+ fmov d18, d16
+ fmov d19, d17
+ fmov d20, xzr
+ fmov d21, d16
+ fmov d22, d17
+ fmov d23, d18
+.endm
+
+.macro KERNEL8x2_SUB
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v8.2d}, [pB]
+ add pB, pB, #16
+ ld1 {v2.2d, v3.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v18.2d, v2.2d, v8.2d[0]
+ fmla v19.2d, v3.2d, v8.2d[0]
+
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v22.2d, v2.2d, v8.2d[1]
+ fmla v23.2d, v3.2d, v8.2d[1]
+.endm
+
+.macro SAVE8x2
+ add pCRow1, pCRow0, LDC
+
+ fmul v0.2d, v16.2d, alphaV0
+ fmul v1.2d, v17.2d, alphaV1
+ fmul v2.2d, v18.2d, alphaV2
+ fmul v3.2d, v19.2d, alphaV3
+ st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
+
+ fmul v4.2d, v20.2d, alphaV0
+ fmul v5.2d, v21.2d, alphaV1
+ fmul v6.2d, v22.2d, alphaV2
+ fmul v7.2d, v23.2d, alphaV3
+ st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x2
+ fmov d16, xzr
+ fmov d17, d16
+ fmov d20, d17
+ fmov d21, d16
+.endm
+
+.macro KERNEL4x2_SUB
+ ld1 {v8.2d}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v21.2d, v1.2d, v8.2d[1]
+.endm
+
+.macro SAVE4x2
+ fmul v8.2d, v16.2d, alphaV0
+ fmul v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ fmul v12.2d, v20.2d, alphaV2
+ fmul v13.2d, v21.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x2
+ fmov d16, xzr
+ fmov d20, d16
+.endm
+
+.macro KERNEL2x2_SUB
+ ld1 {v8.2d}, [pB]
+ add pB, pB, #16
+
+ ld1 {v0.2d}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+.endm
+
+.macro SAVE2x2
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow1 , pCRow0, LDC
+
+ fmul v12.2d, v20.2d, alphaV1
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+ fmov d16, xzr
+.endm
+
+.macro KERNEL1x2_SUB
+ ld1 {v8.2d} , [pB]
+ add pB , pB, #16
+
+ ldr d0 , [pA]
+ add pA, pA, #8
+
+ fmla v16.2d, v8.2d, v0.2d[0]
+.endm
+
+.macro SAVE1x2
+ add pCRow1 , pCRow0, LDC
+
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.d}[0], [pCRow0]
+ st1 {v8.d}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x1
+ fmov d16, xzr
+ fmov d17, xzr
+ fmov d18, d16
+ fmov d19, d17
+.endm
+
+.macro KERNEL8x1_SUB
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA , pA, #32
+
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ld1 {v2.2d, v3.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v18.2d, v2.2d, v8.2d[0]
+ fmla v19.2d, v3.2d, v8.2d[0]
+.endm
+
+.macro SAVE8x1
+ fmul v0.2d, v16.2d, alphaV0
+ fmul v1.2d, v17.2d, alphaV1
+ fmul v2.2d, v18.2d, alphaV2
+ fmul v3.2d, v19.2d, alphaV3
+ st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT4x1
+ fmov d16, xzr
+ fmov d17, d16
+.endm
+
+.macro KERNEL4x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA , pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+.endm
+
+.macro SAVE4x1
+ fmul v8.2d, v16.2d, alphaV0
+ fmul v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT2x1
+ fmov d16, xzr
+.endm
+
+.macro KERNEL2x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ld1 {v0.2d}, [pA]
+ add pA , pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+.endm
+
+.macro SAVE2x1
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+ fmov d16, xzr
+.endm
+
+.macro KERNEL1x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ldr d0, [pA]
+ add pA , pA, #8
+
+ fmadd d16, d0, d8, d16
+.endm
+
+.macro SAVE1x1
+ fmul d8, d16, alpha0
+ str d8, [pCRow0]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
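+
+/* The body below mirrors the previous dtrmm kernel with the blocking
+   transposed: N is walked 4 columns at a time (dtrmm_kernel_L4_BEGIN), M is
+   walked 8 rows at a time with 4/2/1-row tails, and the leftover 2 and 1
+   columns of N fall through to dtrmm_kernel_L2/L1_BEGIN. Only the 8x4 path
+   uses the pipelined _I/_M1/_M2/_E macros; the tail cases run the simple
+   *_SUB step in loops unrolled by eight. */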
+
+ PROLOGUE
+
+ .align 5
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+
+ fmov alpha0, d0
+ fmov alpha1, d0
+ fmov alpha2, d0
+ fmov alpha3, d0
+
+ lsl LDC, LDC, #3 // ldc = ldc * 8
+
+#if !defined(LEFT)
+ neg tempOffset, offset
+#endif
+ mov pB, origPB
+
+ mov counterJ, origN
+ asr counterJ, counterJ, #2 // J = J / 4
+ cmp counterJ, #0
+ ble dtrmm_kernel_L2_BEGIN
+
+/******************************************************************************/
+
+dtrmm_kernel_L4_BEGIN:
+ mov pCRow0, pC // pCRow0 = C
+ add pC, pC, LDC, lsl #2
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = start of A array
+
+dtrmm_kernel_L4_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble dtrmm_kernel_L4_M4_BEGIN
+
+dtrmm_kernel_L4_M8_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #6
+ add pA, pA, temp
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #4
+#endif
+
+ asr counterL , tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt dtrmm_kernel_L4_M8_32
+
+ KERNEL8x4_I // do one in the K
+ KERNEL8x4_M2 // do another in the K
+
+ subs counterL, counterL, #2 // subtract 2
+ ble dtrmm_kernel_L4_M8_22a
+ .align 5
+
+dtrmm_kernel_L4_M8_22:
+
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M8_22
+
+
+dtrmm_kernel_L4_M8_22a:
+
+ KERNEL8x4_M1
+ KERNEL8x4_E
+
+ b dtrmm_kernel_L4_M8_44
+
+dtrmm_kernel_L4_M8_32:
+
+ tst counterL, #1
+ ble dtrmm_kernel_L4_M8_40
+
+ KERNEL8x4_I
+
+ KERNEL8x4_E
+
+ b dtrmm_kernel_L4_M8_44
+
+dtrmm_kernel_L4_M8_40:
+
+ INIT8x4
+
+dtrmm_kernel_L4_M8_44:
+
+ ands counterL , tempK, #1
+ ble dtrmm_kernel_L4_M8_100
+
+dtrmm_kernel_L4_M8_46:
+
+ KERNEL8x4_SUB
+
+dtrmm_kernel_L4_M8_100:
+
+ SAVE8x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #6
+ add pA, pA, temp
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+dtrmm_kernel_L4_M8_END:
+ subs counterI, counterI, #1
+ bne dtrmm_kernel_L4_M8_20
+
+dtrmm_kernel_L4_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble dtrmm_kernel_L4_END
+
+ tst counterI, #4
+ ble dtrmm_kernel_L4_M2_BEGIN
+
+dtrmm_kernel_L4_M4_20:
+
+ INIT4x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #4
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L4_M4_40
+
+dtrmm_kernel_L4_M4_22:
+
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M4_22
+
+
+dtrmm_kernel_L4_M4_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L4_M4_100
+
+dtrmm_kernel_L4_M4_42:
+
+ KERNEL4x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M4_42
+
+dtrmm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+dtrmm_kernel_L4_M4_END:
+
+
+dtrmm_kernel_L4_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dtrmm_kernel_L4_END
+
+	tst	counterI, #2			// process 2 rows if M & 2
+ ble dtrmm_kernel_L4_M1_BEGIN
+
+dtrmm_kernel_L4_M2_20:
+
+ INIT2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #4
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L4_M2_40
+
+dtrmm_kernel_L4_M2_22:
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M2_22
+
+
+dtrmm_kernel_L4_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L4_M2_100
+
+dtrmm_kernel_L4_M2_42:
+
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M2_42
+
+dtrmm_kernel_L4_M2_100:
+
+ SAVE2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+
+dtrmm_kernel_L4_M2_END:
+
+
+dtrmm_kernel_L4_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dtrmm_kernel_L4_END
+
+dtrmm_kernel_L4_M1_20:
+
+ INIT1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #4
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L4_M1_40
+
+dtrmm_kernel_L4_M1_22:
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M1_22
+
+
+dtrmm_kernel_L4_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L4_M1_100
+
+dtrmm_kernel_L4_M1_42:
+
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M1_42
+
+dtrmm_kernel_L4_M1_100:
+
+ SAVE1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+
+dtrmm_kernel_L4_END:
+
+ lsl temp, origK, #5
+ add origPB, origPB, temp // B = B + K * 4 * 8
+
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+ subs counterJ, counterJ , #1 // j--
+ bgt dtrmm_kernel_L4_BEGIN
+
+
+/******************************************************************************/
+
+dtrmm_kernel_L2_BEGIN:				// less than 4 columns of N left
+
+ mov counterJ , origN
+ tst counterJ , #3
+	ble	dtrmm_kernel_L999		// N mod 4 == 0: nothing left to do
+
+ tst counterJ , #2
+ ble dtrmm_kernel_L1_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #1
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = A
+
+dtrmm_kernel_L2_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble dtrmm_kernel_L2_M4_BEGIN
+
+dtrmm_kernel_L2_M8_20:
+
+ INIT8x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #6
+ add pA, pA, temp
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #2
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble dtrmm_kernel_L2_M8_40
+ .align 5
+
+dtrmm_kernel_L2_M8_22:
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M8_22
+
+
+dtrmm_kernel_L2_M8_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L2_M8_100
+
+dtrmm_kernel_L2_M8_42:
+
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M8_42
+
+dtrmm_kernel_L2_M8_100:
+
+ SAVE8x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #6
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+dtrmm_kernel_L2_M8_END:
+
+ subs counterI, counterI, #1
+ bgt dtrmm_kernel_L2_M8_20
+
+dtrmm_kernel_L2_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble dtrmm_kernel_L2_END
+
+	tst	counterI, #4			// process 4 rows if M & 4
+ ble dtrmm_kernel_L2_M2_BEGIN
+
+dtrmm_kernel_L2_M4_20:
+
+ INIT4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #2
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble dtrmm_kernel_L2_M4_40
+ .align 5
+
+dtrmm_kernel_L2_M4_22:
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M4_22
+
+
+dtrmm_kernel_L2_M4_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L2_M4_100
+
+dtrmm_kernel_L2_M4_42:
+
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M4_42
+
+dtrmm_kernel_L2_M4_100:
+
+ SAVE4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+dtrmm_kernel_L2_M4_END:
+
+
+dtrmm_kernel_L2_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dtrmm_kernel_L2_END
+
+	tst	counterI, #2			// process 2 rows if M & 2
+ ble dtrmm_kernel_L2_M1_BEGIN
+
+dtrmm_kernel_L2_M2_20:
+
+ INIT2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #2
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble dtrmm_kernel_L2_M2_40
+
+dtrmm_kernel_L2_M2_22:
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M2_22
+
+
+dtrmm_kernel_L2_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L2_M2_100
+
+dtrmm_kernel_L2_M2_42:
+
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M2_42
+
+dtrmm_kernel_L2_M2_100:
+
+ SAVE2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+
+dtrmm_kernel_L2_M2_END:
+
+
+dtrmm_kernel_L2_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dtrmm_kernel_L2_END
+
+dtrmm_kernel_L2_M1_20:
+
+ INIT1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #2
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL, #0
+ ble dtrmm_kernel_L2_M1_40
+
+dtrmm_kernel_L2_M1_22:
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M1_22
+
+
+dtrmm_kernel_L2_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L2_M1_100
+
+dtrmm_kernel_L2_M1_42:
+
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M1_42
+
+dtrmm_kernel_L2_M1_100:
+
+ SAVE1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+
+dtrmm_kernel_L2_END:
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+ add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
+
+/******************************************************************************/
+
+dtrmm_kernel_L1_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #1
+ ble dtrmm_kernel_L999 // done
+
+ mov pCRow0, pC // pCRow0 = C
+ add pC , pC , LDC // Update pC to point to next
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = A
+
+dtrmm_kernel_L1_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble dtrmm_kernel_L1_M4_BEGIN
+
+dtrmm_kernel_L1_M8_20:
+
+ INIT8x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #6
+ add pA, pA, temp
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #1
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L1_M8_40
+ .align 5
+
+dtrmm_kernel_L1_M8_22:
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M8_22
+
+
+dtrmm_kernel_L1_M8_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L1_M8_100
+
+dtrmm_kernel_L1_M8_42:
+
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M8_42
+
+dtrmm_kernel_L1_M8_100:
+
+ SAVE8x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #6
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+dtrmm_kernel_L1_M8_END:
+
+ subs counterI, counterI, #1
+ bgt dtrmm_kernel_L1_M8_20
+
+dtrmm_kernel_L1_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble dtrmm_kernel_L1_END
+
+	tst	counterI, #4			// process 4 rows if M & 4
+ ble dtrmm_kernel_L1_M2_BEGIN
+
+dtrmm_kernel_L1_M4_20:
+
+ INIT4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+#endif
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #1
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L1_M4_40
+ .align 5
+
+dtrmm_kernel_L1_M4_22:
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M4_22
+
+
+dtrmm_kernel_L1_M4_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L1_M4_100
+
+dtrmm_kernel_L1_M4_42:
+
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M4_42
+
+dtrmm_kernel_L1_M4_100:
+
+ SAVE4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+dtrmm_kernel_L1_M4_END:
+
+dtrmm_kernel_L1_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dtrmm_kernel_L1_END
+
+	tst	counterI, #2			// process 2 rows if M & 2
+ ble dtrmm_kernel_L1_M1_BEGIN
+
+dtrmm_kernel_L1_M2_20:
+
+ INIT2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #1
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L1_M2_40
+
+dtrmm_kernel_L1_M2_22:
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M2_22
+
+
+dtrmm_kernel_L1_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L1_M2_100
+
+dtrmm_kernel_L1_M2_42:
+
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M2_42
+
+dtrmm_kernel_L1_M2_100:
+
+ SAVE2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+
+dtrmm_kernel_L1_M2_END:
+
+
+dtrmm_kernel_L1_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dtrmm_kernel_L1_END
+
+dtrmm_kernel_L1_M1_20:
+
+ INIT1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #1
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L1_M1_40
+
+dtrmm_kernel_L1_M1_22:
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M1_22
+
+
+dtrmm_kernel_L1_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L1_M1_100
+
+dtrmm_kernel_L1_M1_42:
+
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M1_42
+
+dtrmm_kernel_L1_M1_100:
+
+ SAVE1x1
+
+
+dtrmm_kernel_L1_END:
+
+
+dtrmm_kernel_L999:
+ mov x0, #0 // set return value
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+ ret
+
+ EPILOGUE
+
diff --git a/kernel/arm64/sgemm_kernel_16x4.S b/kernel/arm64/sgemm_kernel_16x4.S
new file mode 100644
index 000000000..22b55b01c
--- /dev/null
+++ b/kernel/arm64/sgemm_kernel_16x4.S
@@ -0,0 +1,1987 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* X0 X1 X2 s0 X3 x4 x5 x6 */
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */
+
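+// For reference only: a rough C sketch (an assumption inferred from the code
+// below, not taken from this file) of what one 16x4 micro-tile update does;
+// `acc` is an illustrative name, `ba`, `bb` and `ldc` come from the prototype
+// above, and the packed layouts of ba/bb are assumed:
+//
+//   for (k = 0; k < bk; k++)
+//     for (j = 0; j < 4; j++)
+//       for (i = 0; i < 16; i++)
+//         acc[j][i] += ba[16 * k + i] * bb[4 * k + j];
+//   // the SAVE macros then apply C[j * ldc + i] += alpha * acc[j][i]
+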
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define temp x7
+#define counterL x8
+#define counterI x9
+#define counterJ x10
+#define pB x11
+#define pCRow0 x12
+#define pCRow1 x13
+#define pCRow2 x14
+#define pA x15
+
+#define alpha0 s10
+#define alphaV0 v10.s[0]
+#define alpha1 s11
+#define alphaV1 v11.s[0]
+#define alpha2 s14
+#define alphaV2 v14.s[0]
+#define alpha3 s15
+#define alphaV3 v15.s[0]
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 temp
+// 08 counterL
+// 09 counterI
+// 10 counterJ
+// 11 pB
+// 12 pCRow0
+// 13 pCRow1
+// 14 pCRow2
+// 15 pA
+// 16
+// 17
+// 18 must save
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 ALPHA -> pA0_00, pA0_01, pA0_02, pA0_03
+//v01 pA0_04, pA0_05, pA0_06, pA0_07
+//v02 pA0_08, pA0_09, pA0_10, pA0_11
+//v03 pA0_12, pA0_13, pA0_14, pA0_15
+//v04 pA1_00, pA1_01, pA1_02, pA1_03
+//v05 pA1_04, pA1_05, pA1_06, pA1_07
+//v06 pA1_08, pA1_09, pA1_10, pA1_11
+//v07 pA1_12, pA1_13, pA1_14, pA1_15
+//v08 must save pB00, pB01
+//v09 must save pB02, pB03
+//v10 must save ALPHA0
+//v11 must save ALPHA1
+//v12 must save pB10, pB11
+//v13 must save pB12, pB13
+//v14 must save ALPHA2
+//v15 must save ALPHA3
+//v16 must save C00, C01, C02, C03
+//v17 must save C04, C05, C06, C07
+//v18 C08, C09, C10, C11
+//v19 C12, C13, C14, C15
+//v20 C16, C17, C18, C19
+//v21 C20, C21, C22, C23
+//v22 C24, C25, C26, C27
+//v23 C28, C29, C30, C31
+//v24 C32, C33, C34, C35
+//v25 C36, C37, C38, C39
+//v26 C40, C41, C42, C43
+//v27 C44, C45, C46, C47
+//v28 C48, C49, C50, C51
+//v29 C52, C53, C54, C55
+//v30 C56, C57, C58, C59
+//v31 C60, C61, C62, C63
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
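+// Note (inferred from the code below, stated here as an assumption): each tile
+// size provides an INIT macro that zeroes its accumulators and a KERNEL*_SUB
+// macro for a single load+fma step. The larger tiles additionally use a
+// software-pipelined sequence _I / _M1 / _M2 / _E: _I performs the first K step
+// with plain fmul (so it also initializes the accumulators) and preloads the
+// next operands, _M1 and _M2 alternate between the two register sets, and _E
+// drains the last preloaded set without issuing further loads.
+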
+.macro INIT16x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, s16
+ fmov s19, s17
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s22, s17
+ fmov s23, s18
+ fmov s24, wzr
+ fmov s25, s16
+ fmov s26, s17
+ fmov s27, s18
+ fmov s28, wzr
+ fmov s29, s16
+ fmov s30, s17
+ fmov s31, s18
+.endm
+
+.macro KERNEL16x4_I
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.4s, v0.4s, v8.2s[0]
+ fmul v17.4s, v1.4s, v8.2s[0]
+ fmul v18.4s, v2.4s, v8.2s[0]
+ fmul v19.4s, v3.4s, v8.2s[0]
+
+ fmul v20.4s, v0.4s, v8.2s[1]
+ fmul v21.4s, v1.4s, v8.2s[1]
+ fmul v22.4s, v2.4s, v8.2s[1]
+ fmul v23.4s, v3.4s, v8.2s[1]
+
+ fmul v24.4s, v0.4s, v9.2s[0]
+ fmul v25.4s, v1.4s, v9.2s[0]
+ fmul v26.4s, v2.4s, v9.2s[0]
+ fmul v27.4s, v3.4s, v9.2s[0]
+
+ fmul v28.4s, v0.4s, v9.2s[1]
+ fmul v29.4s, v1.4s, v9.2s[1]
+ fmul v30.4s, v2.4s, v9.2s[1]
+ fmul v31.4s, v3.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v6.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v7.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL16x4_M1
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v18.4s, v2.4s, v8.2s[0]
+ fmla v19.4s, v3.4s, v8.2s[0]
+
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v22.4s, v2.4s, v8.2s[1]
+ fmla v23.4s, v3.4s, v8.2s[1]
+
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v26.4s, v2.4s, v9.2s[0]
+ fmla v27.4s, v3.4s, v9.2s[0]
+
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v30.4s, v2.4s, v9.2s[1]
+ fmla v31.4s, v3.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v6.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v7.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL16x4_M2
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v18.4s, v6.4s, v12.2s[0]
+ fmla v19.4s, v7.4s, v12.2s[0]
+
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v22.4s, v6.4s, v12.2s[1]
+ fmla v23.4s, v7.4s, v12.2s[1]
+
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v26.4s, v6.4s, v13.2s[0]
+ fmla v27.4s, v7.4s, v13.2s[0]
+
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v30.4s, v6.4s, v13.2s[1]
+ fmla v31.4s, v7.4s, v13.2s[1]
+
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL16x4_E
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v18.4s, v6.4s, v12.2s[0]
+ fmla v19.4s, v7.4s, v12.2s[0]
+
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v22.4s, v6.4s, v12.2s[1]
+ fmla v23.4s, v7.4s, v12.2s[1]
+
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v26.4s, v6.4s, v13.2s[0]
+ fmla v27.4s, v7.4s, v13.2s[0]
+
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v30.4s, v6.4s, v13.2s[1]
+ fmla v31.4s, v7.4s, v13.2s[1]
+.endm
+
+.macro KERNEL16x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v18.4s, v2.4s, v8.2s[0]
+ fmla v19.4s, v3.4s, v8.2s[0]
+
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v22.4s, v2.4s, v8.2s[1]
+ fmla v23.4s, v3.4s, v8.2s[1]
+
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v26.4s, v2.4s, v9.2s[0]
+ fmla v27.4s, v3.4s, v9.2s[0]
+
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v30.4s, v2.4s, v9.2s[1]
+ fmla v31.4s, v3.4s, v9.2s[1]
+.endm
+
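+// The SAVE macros below appear to implement the final C update: each C column
+// pointer (pCRow0, advanced by LDC) is loaded, combined as C += alpha * acc via
+// fmla against alphaV0..alphaV3, and stored back.
+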
+.macro SAVE16x4
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ fmla v1.4s, v17.4s, alphaV1
+ fmla v2.4s, v18.4s, alphaV2
+ fmla v3.4s, v19.4s, alphaV3
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+ fmla v4.4s, v20.4s, alphaV0
+ fmla v5.4s, v21.4s, alphaV1
+ fmla v6.4s, v22.4s, alphaV2
+ fmla v7.4s, v23.4s, alphaV3
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2]
+ fmla v0.4s, v24.4s, alphaV0
+ fmla v1.4s, v25.4s, alphaV1
+ fmla v2.4s, v26.4s, alphaV2
+ fmla v3.4s, v27.4s, alphaV3
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2]
+
+ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+ fmla v4.4s, v28.4s, alphaV0
+ fmla v5.4s, v29.4s, alphaV1
+ fmla v6.4s, v30.4s, alphaV2
+ fmla v7.4s, v31.4s, alphaV3
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s24, wzr
+ fmov s25, s16
+ fmov s28, wzr
+ fmov s29, s16
+.endm
+
+.macro KERNEL8x4_I
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.4s, v0.4s, v8.2s[0]
+ fmul v17.4s, v1.4s, v8.2s[0]
+ fmul v20.4s, v0.4s, v8.2s[1]
+ fmul v21.4s, v1.4s, v8.2s[1]
+ fmul v24.4s, v0.4s, v9.2s[0]
+ fmul v25.4s, v1.4s, v9.2s[0]
+ fmul v28.4s, v0.4s, v9.2s[1]
+ fmul v29.4s, v1.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_M1
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_M2
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_E
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+.endm
+
+.macro KERNEL8x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+.endm
+
+.macro SAVE8x4
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v0.4s, v1.4s}, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ fmla v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v20.4s, alphaV0
+ fmla v5.4s, v21.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v0.4s, v1.4s}, [pCRow2]
+ fmla v0.4s, v24.4s, alphaV0
+ fmla v1.4s, v25.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow2]
+
+ ld1 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v28.4s, alphaV0
+ fmla v5.4s, v29.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x4
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+ fmov s24, s17
+ fmov s25, s16
+ fmov s28, s17
+ fmov s29, s16
+.endm
+
+.macro KERNEL4x4_I
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.2s, v0.2s, v8.2s[0]
+ fmul v29.2s, v1.2s, v9.2s[1]
+
+ fmul v20.2s, v0.2s, v8.2s[1]
+ fmul v25.2s, v1.2s, v9.2s[0]
+
+ fmul v24.2s, v0.2s, v9.2s[0]
+ fmul v21.2s, v1.2s, v8.2s[1]
+
+ fmul v28.2s, v0.2s, v9.2s[1]
+ fmul v17.2s, v1.2s, v8.2s[0]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.2s, v5.2s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL4x4_M1
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v29.2s, v1.2s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB] // For next round
+ add pB, pB, #16
+
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v25.2s, v1.2s, v9.2s[0]
+
+ ld1 {v4.2s, v5.2s}, [pA] // For next round
+ add pA, pA, #16
+
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v21.2s, v1.2s, v8.2s[1]
+
+ prfm PLDL1KEEP, [pB, #512]
+
+ fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro KERNEL4x4_M2
+ fmla v16.2s, v4.2s, v12.2s[0]
+ fmla v29.2s, v5.2s, v13.2s[1]
+
+ ld1 {v8.2s, v9.2s}, [pB] // For next round
+ add pB, pB, #16
+
+ fmla v20.2s, v4.2s, v12.2s[1]
+ fmla v25.2s, v5.2s, v13.2s[0]
+
+ ld1 {v0.2s, v1.2s}, [pA] // For next round
+ add pA, pA, #16
+
+ fmla v24.2s, v4.2s, v13.2s[0]
+ fmla v21.2s, v5.2s, v12.2s[1]
+
+ prfm PLDL1KEEP, [pA, #512]
+
+ fmla v28.2s, v4.2s, v13.2s[1]
+ fmla v17.2s, v5.2s, v12.2s[0]
+.endm
+
+.macro KERNEL4x4_E
+ fmla v16.2s, v4.2s, v12.2s[0]
+ fmla v29.2s, v5.2s, v13.2s[1]
+
+ fmla v20.2s, v4.2s, v12.2s[1]
+ fmla v25.2s, v5.2s, v13.2s[0]
+
+ fmla v24.2s, v4.2s, v13.2s[0]
+ fmla v21.2s, v5.2s, v12.2s[1]
+
+ fmla v28.2s, v4.2s, v13.2s[1]
+ fmla v17.2s, v5.2s, v12.2s[0]
+.endm
+
+.macro KERNEL4x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v29.2s, v1.2s, v9.2s[1]
+
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v25.2s, v1.2s, v9.2s[0]
+
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v21.2s, v1.2s, v8.2s[1]
+
+ fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro SAVE4x4
+ ld1 {v8.2s, v9.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ fmla v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+ ld1 {v12.2s, v13.2s}, [pCRow1]
+ fmla v12.2s, v20.2s, alphaV2
+ fmla v13.2s, v21.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ ld1 {v8.2s, v9.2s}, [pCRow2]
+ fmla v8.2s, v24.2s, alphaV0
+ fmla v9.2s, v25.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+ ld1 {v12.2s, v13.2s}, [pCRow1]
+ fmla v12.2s, v28.2s, alphaV2
+ fmla v13.2s, v29.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x4
+ fmov s16, wzr
+ fmov s20, s16
+ fmov s24, s20
+ fmov s28, s16
+.endm
+
+.macro KERNEL2x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s}, [pA]
+ add pA, pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v28.2s, v0.2s, v9.2s[1]
+.endm
+
+.macro SAVE2x4
+ ld1 {v8.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+ ld1 {v12.2s}, [pCRow1]
+ fmla v12.2s, v20.2s, alphaV1
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ ld1 {v8.2s}, [pCRow2]
+ fmla v8.2s, v24.2s, alphaV2
+ st1 {v8.2s}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+ ld1 {v12.2s}, [pCRow1]
+ fmla v12.2s, v28.2s, alphaV3
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x4
+ fmov s16, wzr
+ fmov s20, s16
+.endm
+
+.macro KERNEL1x4_SUB
+ ldr s0, [pA]
+ add pA, pA, #4
+
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+
+ fmla v16.2s, v8.2s, v0.s[0]
+ fmla v20.2s, v9.2s, v0.s[0]
+.endm
+
+.macro SAVE1x4
+ add pCRow1, pCRow0, LDC
+ ld1 {v8.s}[0], [pCRow0]
+ ld1 {v8.s}[1], [pCRow1]
+ fmla v8.2s, v16.2s, alphaV0
+ st1 {v8.s}[0], [pCRow0]
+ st1 {v8.s}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+ ld1 {v12.s}[0], [pCRow2]
+ ld1 {v12.s}[1], [pCRow1]
+ fmla v12.2s, v20.2s, alphaV1
+ st1 {v12.s}[0], [pCRow2]
+ st1 {v12.s}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/******************************************************************************/
+
+.macro INIT16x2
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, wzr
+ fmov s19, s16
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s22, wzr
+ fmov s23, s16
+.endm
+
+.macro KERNEL16x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v18.4s, v2.4s, v8.2s[0]
+ fmla v19.4s, v3.4s, v8.2s[0]
+
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v22.4s, v2.4s, v8.2s[1]
+ fmla v23.4s, v3.4s, v8.2s[1]
+.endm
+
+.macro SAVE16x2
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ fmla v1.4s, v17.4s, alphaV1
+ fmla v2.4s, v18.4s, alphaV2
+ fmla v3.4s, v19.4s, alphaV3
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
+
+ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+ fmla v4.4s, v20.4s, alphaV0
+ fmla v5.4s, v21.4s, alphaV1
+ fmla v6.4s, v22.4s, alphaV2
+ fmla v7.4s, v23.4s, alphaV3
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x2
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+.endm
+
+.macro KERNEL8x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+.endm
+
+.macro SAVE8x2
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v0.4s, v1.4s}, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ fmla v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v20.4s, alphaV0
+ fmla v5.4s, v21.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x2
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+.endm
+
+.macro KERNEL4x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v21.2s, v1.2s, v8.2s[1]
+.endm
+
+.macro SAVE4x2
+ ld1 {v8.2s, v9.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ fmla v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+ ld1 {v12.2s, v13.2s}, [pCRow1]
+ fmla v12.2s, v20.2s, alphaV2
+ fmla v13.2s, v21.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x2
+ fmov s16, wzr
+ fmov s20, s16
+.endm
+
+.macro KERNEL2x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+
+ ld1 {v0.2s}, [pA]
+ add pA, pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+.endm
+
+.macro SAVE2x2
+ ld1 {v8.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow1 , pCRow0, LDC
+ ld1 {v12.2s}, [pCRow1]
+ fmla v12.2s, v20.2s, alphaV1
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+ fmov s16, wzr
+.endm
+
+.macro KERNEL1x2_SUB
+ ld1 {v8.2s} , [pB]
+ add pB , pB, #8
+
+ ldr s0 , [pA]
+ add pA, pA, #4
+
+ fmla v16.2s, v8.2s, v0.2s[0]
+.endm
+
+.macro SAVE1x2
+ add pCRow1 , pCRow0, LDC
+ ld1 {v8.s}[0], [pCRow0]
+ ld1 {v8.s}[1], [pCRow1]
+ fmla v8.2s, v16.2s, alphaV0
+ st1 {v8.s}[0], [pCRow0]
+ st1 {v8.s}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/******************************************************************************/
+
+.macro INIT16x1
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, wzr
+ fmov s19, s16
+.endm
+
+.macro KERNEL16x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v18.4s, v2.4s, v8.2s[0]
+ fmla v19.4s, v3.4s, v8.2s[0]
+.endm
+
+.macro SAVE16x1
+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ fmla v1.4s, v17.4s, alphaV1
+ fmla v2.4s, v18.4s, alphaV2
+ fmla v3.4s, v19.4s, alphaV3
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x1
+ fmov s16, wzr
+ fmov s17, wzr
+.endm
+
+.macro KERNEL8x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+.endm
+
+.macro SAVE8x1
+ ld1 {v0.4s, v1.4s}, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ fmla v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x1
+ fmov s16, wzr
+ fmov s17, s16
+.endm
+
+.macro KERNEL4x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA , pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro SAVE4x1
+ ld1 {v8.2s, v9.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ fmla v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x1
+ fmov s16, wzr
+.endm
+
+.macro KERNEL2x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.2s}, [pA]
+ add pA , pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+.endm
+
+.macro SAVE2x1
+ ld1 {v8.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+ fmov s16, wzr
+.endm
+
+.macro KERNEL1x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ldr s0, [pA]
+ add pA , pA, #4
+
+ fmadd s16, s0, s8, s16
+.endm
+
+.macro SAVE1x1
+ ldr s8, [pCRow0]
+ fmla s8, s16, alphaV0
+ str s8, [pCRow0]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
+ PROLOGUE
+
+sgemm_kernel_begin:
+
+ .align 5
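+	// reserve 11 * 16 = 176 bytes and spill the registers noted in the
+	// register-usage comments above (d8-d17 and x18-x28)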
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+
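+	// replicate alpha (passed in s0) into the four alpha registers used by
+	// the SAVE macros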
+ fmov alpha0, s0
+ fmov alpha1, s0
+ fmov alpha2, s0
+ fmov alpha3, s0
+
+ lsl LDC, LDC, #2 // ldc = ldc * 4
+
+ mov pB, origPB
+
+ mov counterJ, origN
+ asr counterJ, counterJ, #2 // J = J / 4
+ cmp counterJ, #0
+ ble sgemm_kernel_L2_BEGIN
+
+/******************************************************************************/
+
+sgemm_kernel_L4_BEGIN:
+ mov pCRow0, pC // pCRow0 = C
+ add pC, pC, LDC, lsl #2
+
+ mov pA, origPA // pA = start of A array
+
+sgemm_kernel_L4_M16_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #4 // counterI = counterI / 16
+ cmp counterI, #0
+ ble sgemm_kernel_L4_M8_BEGIN
+
+sgemm_kernel_L4_M16_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt sgemm_kernel_L4_M16_32
+
+ KERNEL16x4_I // do one in the K
+ KERNEL16x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble sgemm_kernel_L4_M16_22a
+ .align 5
+
+sgemm_kernel_L4_M16_22:
+
+ KERNEL16x4_M1
+ KERNEL16x4_M2
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M16_22
+
+sgemm_kernel_L4_M16_22a:
+
+ KERNEL16x4_M1
+ KERNEL16x4_E
+
+ b sgemm_kernel_L4_M16_44
+
+sgemm_kernel_L4_M16_32:
+
+ tst counterL, #1
+ ble sgemm_kernel_L4_M16_40
+
+ KERNEL16x4_I
+ KERNEL16x4_E
+
+ b sgemm_kernel_L4_M16_44
+
+sgemm_kernel_L4_M16_40:
+
+ INIT16x4
+
+sgemm_kernel_L4_M16_44:
+
+ ands counterL , origK, #1
+ ble sgemm_kernel_L4_M16_100
+
+sgemm_kernel_L4_M16_46:
+
+ KERNEL16x4_SUB
+
+sgemm_kernel_L4_M16_100:
+
+ SAVE16x4
+
+sgemm_kernel_L4_M16_END:
+ subs counterI, counterI, #1
+ bne sgemm_kernel_L4_M16_20
+
+//------------------------------------------------------------------------------
+
+sgemm_kernel_L4_M8_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #15
+ ble sgemm_kernel_L4_END
+
+ tst counterI, #8
+ ble sgemm_kernel_L4_M4_BEGIN
+
+sgemm_kernel_L4_M8_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt sgemm_kernel_L4_M8_32
+
+ KERNEL8x4_I // do one in the K
+ KERNEL8x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble sgemm_kernel_L4_M8_22a
+ .align 5
+
+sgemm_kernel_L4_M8_22:
+
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M8_22
+
+sgemm_kernel_L4_M8_22a:
+
+ KERNEL8x4_M1
+ KERNEL8x4_E
+
+ b sgemm_kernel_L4_M8_44
+
+sgemm_kernel_L4_M8_32:
+
+ tst counterL, #1
+ ble sgemm_kernel_L4_M8_40
+
+ KERNEL8x4_I
+ KERNEL8x4_E
+
+ b sgemm_kernel_L4_M8_44
+
+sgemm_kernel_L4_M8_40:
+
+ INIT8x4
+
+sgemm_kernel_L4_M8_44:
+
+ ands counterL , origK, #1
+ ble sgemm_kernel_L4_M8_100
+
+sgemm_kernel_L4_M8_46:
+
+ KERNEL8x4_SUB
+
+sgemm_kernel_L4_M8_100:
+
+ SAVE8x4
+
+sgemm_kernel_L4_M8_END:
+
+//------------------------------------------------------------------------------
+
+sgemm_kernel_L4_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble sgemm_kernel_L4_END
+
+ tst counterI, #4
+ ble sgemm_kernel_L4_M2_BEGIN
+
+sgemm_kernel_L4_M4_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt sgemm_kernel_L4_M4_32
+
+ KERNEL4x4_I // do one in the K
+ KERNEL4x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble sgemm_kernel_L4_M4_22a
+ .align 5
+
+sgemm_kernel_L4_M4_22:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M4_22
+
+sgemm_kernel_L4_M4_22a:
+
+ KERNEL4x4_M1
+ KERNEL4x4_E
+
+ b sgemm_kernel_L4_M4_44
+
+sgemm_kernel_L4_M4_32:
+
+ tst counterL, #1
+ ble sgemm_kernel_L4_M4_40
+
+ KERNEL4x4_I
+ KERNEL4x4_E
+
+ b sgemm_kernel_L4_M4_44
+
+sgemm_kernel_L4_M4_40:
+
+ INIT4x4
+
+sgemm_kernel_L4_M4_44:
+
+ ands counterL , origK, #1
+ ble sgemm_kernel_L4_M4_100
+
+sgemm_kernel_L4_M4_46:
+
+ KERNEL4x4_SUB
+
+sgemm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+sgemm_kernel_L4_M4_END:
+
+//------------------------------------------------------------------------------
+
+sgemm_kernel_L4_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble sgemm_kernel_L4_END
+
+	tst counterI, #2					// test for a remaining 2-row block (M & 2)
+ ble sgemm_kernel_L4_M1_BEGIN
+
+sgemm_kernel_L4_M2_20:
+
+ INIT2x4
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L4_M2_40
+
+sgemm_kernel_L4_M2_22:
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M2_22
+
+
+sgemm_kernel_L4_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L4_M2_100
+
+sgemm_kernel_L4_M2_42:
+
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M2_42
+
+sgemm_kernel_L4_M2_100:
+
+ SAVE2x4
+
+sgemm_kernel_L4_M2_END:
+
+
+sgemm_kernel_L4_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble sgemm_kernel_L4_END
+
+sgemm_kernel_L4_M1_20:
+
+ INIT1x4
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L4_M1_40
+
+sgemm_kernel_L4_M1_22:
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M1_22
+
+
+sgemm_kernel_L4_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L4_M1_100
+
+sgemm_kernel_L4_M1_42:
+
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M1_42
+
+sgemm_kernel_L4_M1_100:
+
+ SAVE1x4
+
+sgemm_kernel_L4_END:
+ add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
+
+ subs counterJ, counterJ , #1 // j--
+ bgt sgemm_kernel_L4_BEGIN
+
+
+/******************************************************************************/
+
+sgemm_kernel_L2_BEGIN: // less than 2 left in N direction
+
+ mov counterJ , origN
+ tst counterJ , #3
+ ble sgemm_kernel_L999
+
+ tst counterJ , #2
+ ble sgemm_kernel_L1_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #1
+
+ mov pA, origPA // pA = A
+
+sgemm_kernel_L2_M16_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #4 // counterI = counterI / 16
+ cmp counterI,#0
+ ble sgemm_kernel_L2_M8_BEGIN
+
+sgemm_kernel_L2_M16_20:
+
+ INIT16x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble sgemm_kernel_L2_M16_40
+ .align 5
+
+sgemm_kernel_L2_M16_22:
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M16_22
+
+
+sgemm_kernel_L2_M16_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L2_M16_100
+
+sgemm_kernel_L2_M16_42:
+
+ KERNEL16x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M16_42
+
+sgemm_kernel_L2_M16_100:
+
+ SAVE16x2
+
+sgemm_kernel_L2_M16_END:
+
+ subs counterI, counterI, #1
+ bgt sgemm_kernel_L2_M16_20
+
+//------------------------------------------------------------------------------
+
+sgemm_kernel_L2_M8_BEGIN:
+ mov counterI, origM
+ tst counterI , #15
+ ble sgemm_kernel_L2_END
+
+ tst counterI, #8
+ ble sgemm_kernel_L2_M4_BEGIN
+
+sgemm_kernel_L2_M8_20:
+
+ INIT8x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble sgemm_kernel_L2_M8_40
+ .align 5
+
+sgemm_kernel_L2_M8_22:
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M8_22
+
+
+sgemm_kernel_L2_M8_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L2_M8_100
+
+sgemm_kernel_L2_M8_42:
+
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M8_42
+
+sgemm_kernel_L2_M8_100:
+
+ SAVE8x2
+
+sgemm_kernel_L2_M8_END:
+
+//------------------------------------------------------------------------------
+
+sgemm_kernel_L2_M4_BEGIN:
+ mov counterI, origM
+ tst counterI , #7
+ ble sgemm_kernel_L2_END
+
+ tst counterI, #4
+ ble sgemm_kernel_L2_M2_BEGIN
+
+sgemm_kernel_L2_M4_20:
+
+ INIT4x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble sgemm_kernel_L2_M4_40
+ .align 5
+
+sgemm_kernel_L2_M4_22:
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M4_22
+
+
+sgemm_kernel_L2_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L2_M4_100
+
+sgemm_kernel_L2_M4_42:
+
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M4_42
+
+sgemm_kernel_L2_M4_100:
+
+ SAVE4x2
+
+sgemm_kernel_L2_M4_END:
+
+//------------------------------------------------------------------------------
+
+
+sgemm_kernel_L2_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble sgemm_kernel_L2_END
+
+	tst counterI, #2					// test for a remaining 2-row block (M & 2)
+ ble sgemm_kernel_L2_M1_BEGIN
+
+sgemm_kernel_L2_M2_20:
+
+ INIT2x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble sgemm_kernel_L2_M2_40
+
+sgemm_kernel_L2_M2_22:
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M2_22
+
+
+sgemm_kernel_L2_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L2_M2_100
+
+sgemm_kernel_L2_M2_42:
+
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M2_42
+
+sgemm_kernel_L2_M2_100:
+
+ SAVE2x2
+
+sgemm_kernel_L2_M2_END:
+
+
+sgemm_kernel_L2_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble sgemm_kernel_L2_END
+
+sgemm_kernel_L2_M1_20:
+
+ INIT1x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL, #0
+ ble sgemm_kernel_L2_M1_40
+
+sgemm_kernel_L2_M1_22:
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M1_22
+
+
+sgemm_kernel_L2_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L2_M1_100
+
+sgemm_kernel_L2_M1_42:
+
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M1_42
+
+sgemm_kernel_L2_M1_100:
+
+ SAVE1x2
+
+sgemm_kernel_L2_END:
+
+ add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
+
+/******************************************************************************/
+
+sgemm_kernel_L1_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #1
+ ble sgemm_kernel_L999 // done
+
+
+ mov pCRow0, pC // pCRow0 = C
+ add pC , pC , LDC // Update pC to point to next
+
+ mov pA, origPA // pA = A
+
+sgemm_kernel_L1_M16_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #4 // counterI = counterI / 16
+ cmp counterI, #0
+ ble sgemm_kernel_L1_M8_BEGIN
+
+sgemm_kernel_L1_M16_20:
+
+ INIT16x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L1_M16_40
+ .align 5
+
+sgemm_kernel_L1_M16_22:
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M16_22
+
+
+sgemm_kernel_L1_M16_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L1_M16_100
+
+sgemm_kernel_L1_M16_42:
+
+ KERNEL16x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M16_42
+
+sgemm_kernel_L1_M16_100:
+
+ SAVE16x1
+
+sgemm_kernel_L1_M16_END:
+
+ subs counterI, counterI, #1
+ bgt sgemm_kernel_L1_M16_20
+
+//------------------------------------------------------------------------------
+
+sgemm_kernel_L1_M8_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #15
+ ble sgemm_kernel_L1_END
+
+ tst counterI, #8
+ ble sgemm_kernel_L1_M4_BEGIN
+
+sgemm_kernel_L1_M8_20:
+
+ INIT8x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L1_M8_40
+ .align 5
+
+sgemm_kernel_L1_M8_22:
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M8_22
+
+
+sgemm_kernel_L1_M8_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L1_M8_100
+
+sgemm_kernel_L1_M8_42:
+
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M8_42
+
+sgemm_kernel_L1_M8_100:
+
+ SAVE8x1
+
+sgemm_kernel_L1_M8_END:
+
+//------------------------------------------------------------------------------
+
+sgemm_kernel_L1_M4_BEGIN:
+ mov counterI, origM
+ tst counterI , #7
+ ble sgemm_kernel_L1_END
+
+ tst counterI, #4
+ ble sgemm_kernel_L1_M2_BEGIN
+
+sgemm_kernel_L1_M4_20:
+
+ INIT4x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L1_M4_40
+ .align 5
+
+sgemm_kernel_L1_M4_22:
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M4_22
+
+
+sgemm_kernel_L1_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L1_M4_100
+
+sgemm_kernel_L1_M4_42:
+
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M4_42
+
+sgemm_kernel_L1_M4_100:
+
+ SAVE4x1
+
+sgemm_kernel_L1_M4_END:
+
+//------------------------------------------------------------------------------
+
+sgemm_kernel_L1_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble sgemm_kernel_L1_END
+
+	tst counterI, #2					// test for a remaining 2-row block (M & 2)
+ ble sgemm_kernel_L1_M1_BEGIN
+
+sgemm_kernel_L1_M2_20:
+
+ INIT2x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L1_M2_40
+
+sgemm_kernel_L1_M2_22:
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M2_22
+
+
+sgemm_kernel_L1_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L1_M2_100
+
+sgemm_kernel_L1_M2_42:
+
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M2_42
+
+sgemm_kernel_L1_M2_100:
+
+ SAVE2x1
+
+sgemm_kernel_L1_M2_END:
+
+
+sgemm_kernel_L1_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble sgemm_kernel_L1_END
+
+sgemm_kernel_L1_M1_20:
+
+ INIT1x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L1_M1_40
+
+sgemm_kernel_L1_M1_22:
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M1_22
+
+
+sgemm_kernel_L1_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L1_M1_100
+
+sgemm_kernel_L1_M1_42:
+
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M1_42
+
+sgemm_kernel_L1_M1_100:
+
+ SAVE1x1
+
+sgemm_kernel_L1_END:
+
+sgemm_kernel_L999:
+ mov x0, #0 // set return value
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+ ret
+
+ EPILOGUE
+
diff --git a/kernel/arm64/sgemm_kernel_8x8.S b/kernel/arm64/sgemm_kernel_8x8.S
new file mode 100644
index 000000000..ac690e4d4
--- /dev/null
+++ b/kernel/arm64/sgemm_kernel_8x8.S
@@ -0,0 +1,2305 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* X0 X1 X2 s0 X3 x4 x5 x6 */
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */
+
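+// For reference only: by analogy with sgemm_kernel_16x4.S, this kernel is
+// assumed to update C := C + alpha * A * B over 8x8 micro-tiles of packed
+// operands; roughly (illustrative `acc`, packed layouts assumed):
+//
+//   for (k = 0; k < bk; k++)
+//     for (j = 0; j < 8; j++)
+//       for (i = 0; i < 8; i++)
+//         acc[j][i] += ba[8 * k + i] * bb[8 * k + j];
+//
+// with the SAVE macros applying C[j * ldc + i] += alpha * acc[j][i].
+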
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define offset x7
+#define counterL x8
+#define counterI x9
+#define counterJ x10
+#define pB x11
+#define pCRow0 x12
+#define pCRow1 x13
+#define pCRow2 x14
+#define pA x15
+#define temp x16
+
+#define alpha0 s10
+#define alphaV0 v10.s[0]
+#define alpha1 s11
+#define alphaV1 v11.s[0]
+#define alpha2 s14
+#define alphaV2 v14.s[0]
+#define alpha3 s15
+#define alphaV3 v15.s[0]
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 offset
+// 08 counterL
+// 09 counterI
+// 10 counterJ
+// 11 pB
+// 12 pCRow0
+// 13 pCRow1
+// 14 pCRow2
+// 15 pA
+// 16 temp
+// 17
+// 18 must save
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 ALPHA -> pA0_0, pA0_1, pA0_2, pA0_3
+//v01 pA0_4, pA0_5, pA0_6, pA0_7
+//v02 pA1_0, pA1_1, pA1_2, pA1_3
+//v03 pA1_4, pA1_5, pA1_6, pA1_7
+//v04 pB0_0, pB0_1, pB0_2, pB0_3
+//v05 pB0_4, pB0_5, pB0_6, pB0_7
+//v06 pB1_0, pB1_1, pB1_2, pB1_3
+//v07 pB1_4, pB1_5, pB1_6, pB1_7
+//v08 must save
+//v09 must save
+//v10 must save ALPHA0
+//v11 must save ALPHA1
+//v12 must save
+//v13 must save
+//v14 must save ALPHA2
+//v15 must save ALPHA3
+//v16 must save C00, C01, C02, C03
+//v17 must save C04, C05, C06, C07
+//v18 C08, C09, C10, C11
+//v19 C12, C13, C14, C15
+//v20 C16, C17, C18, C19
+//v21 C20, C21, C22, C23
+//v22 C24, C25, C26, C27
+//v23 C28, C29, C30, C31
+//v24 C32, C33, C34, C35
+//v25 C36, C37, C38, C39
+//v26 C40, C41, C42, C43
+//v27 C44, C45, C46, C47
+//v28 C48, C49, C50, C51
+//v29 C52, C53, C54, C55
+//v30 C56, C57, C58, C59
+//v31 C60, C61, C62, C63
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
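+// Note: the INIT / KERNEL*_SUB / _I / _M1 / _M2 / _E macro pattern here is
+// assumed to follow the same two-stage software-pipeline scheme described in
+// sgemm_kernel_16x4.S.
+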
+.macro INIT8x8
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, s16
+ fmov s19, s17
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s22, s17
+ fmov s23, s18
+ fmov s24, wzr
+ fmov s25, s16
+ fmov s26, s17
+ fmov s27, s18
+ fmov s28, wzr
+ fmov s29, s16
+ fmov s30, s17
+ fmov s31, s18
+.endm
+
+.macro KERNEL8x8_I
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.4s, v0.4s, v4.4s[0]
+ fmul v17.4s, v1.4s, v4.4s[0]
+ fmul v18.4s, v0.4s, v4.4s[1]
+ fmul v19.4s, v1.4s, v4.4s[1]
+ fmul v20.4s, v0.4s, v4.4s[2]
+ fmul v21.4s, v1.4s, v4.4s[2]
+ fmul v22.4s, v0.4s, v4.4s[3]
+ fmul v23.4s, v1.4s, v4.4s[3]
+ fmul v24.4s, v0.4s, v5.4s[0]
+ fmul v25.4s, v1.4s, v5.4s[0]
+ fmul v26.4s, v0.4s, v5.4s[1]
+ fmul v27.4s, v1.4s, v5.4s[1]
+ fmul v28.4s, v0.4s, v5.4s[2]
+ fmul v29.4s, v1.4s, v5.4s[2]
+ fmul v30.4s, v0.4s, v5.4s[3]
+ fmul v31.4s, v1.4s, v5.4s[3]
+
+ ld1 {v6.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v7.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x8_M1
+ fmla v16.4s, v0.4s, v4.4s[0]
+ fmla v17.4s, v1.4s, v4.4s[0]
+ fmla v18.4s, v0.4s, v4.4s[1]
+ fmla v19.4s, v1.4s, v4.4s[1]
+ fmla v20.4s, v0.4s, v4.4s[2]
+ fmla v21.4s, v1.4s, v4.4s[2]
+ fmla v22.4s, v0.4s, v4.4s[3]
+ fmla v23.4s, v1.4s, v4.4s[3]
+ fmla v24.4s, v0.4s, v5.4s[0]
+ fmla v25.4s, v1.4s, v5.4s[0]
+ fmla v26.4s, v0.4s, v5.4s[1]
+ fmla v27.4s, v1.4s, v5.4s[1]
+ fmla v28.4s, v0.4s, v5.4s[2]
+ fmla v29.4s, v1.4s, v5.4s[2]
+ fmla v30.4s, v0.4s, v5.4s[3]
+ fmla v31.4s, v1.4s, v5.4s[3]
+
+ ld1 {v6.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v7.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x8_M2
+ fmla v16.4s, v2.4s, v6.4s[0]
+ fmla v17.4s, v3.4s, v6.4s[0]
+ fmla v18.4s, v2.4s, v6.4s[1]
+ fmla v19.4s, v3.4s, v6.4s[1]
+ fmla v20.4s, v2.4s, v6.4s[2]
+ fmla v21.4s, v3.4s, v6.4s[2]
+ fmla v22.4s, v2.4s, v6.4s[3]
+ fmla v23.4s, v3.4s, v6.4s[3]
+ fmla v24.4s, v2.4s, v7.4s[0]
+ fmla v25.4s, v3.4s, v7.4s[0]
+ fmla v26.4s, v2.4s, v7.4s[1]
+ fmla v27.4s, v3.4s, v7.4s[1]
+ fmla v28.4s, v2.4s, v7.4s[2]
+ fmla v29.4s, v3.4s, v7.4s[2]
+ fmla v30.4s, v2.4s, v7.4s[3]
+ fmla v31.4s, v3.4s, v7.4s[3]
+
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x8_E
+ fmla v16.4s, v2.4s, v6.4s[0]
+ fmla v17.4s, v3.4s, v6.4s[0]
+ fmla v18.4s, v2.4s, v6.4s[1]
+ fmla v19.4s, v3.4s, v6.4s[1]
+ fmla v20.4s, v2.4s, v6.4s[2]
+ fmla v21.4s, v3.4s, v6.4s[2]
+ fmla v22.4s, v2.4s, v6.4s[3]
+ fmla v23.4s, v3.4s, v6.4s[3]
+ fmla v24.4s, v2.4s, v7.4s[0]
+ fmla v25.4s, v3.4s, v7.4s[0]
+ fmla v26.4s, v2.4s, v7.4s[1]
+ fmla v27.4s, v3.4s, v7.4s[1]
+ fmla v28.4s, v2.4s, v7.4s[2]
+ fmla v29.4s, v3.4s, v7.4s[2]
+ fmla v30.4s, v2.4s, v7.4s[3]
+ fmla v31.4s, v3.4s, v7.4s[3]
+.endm
+
+.macro KERNEL8x8_SUB
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v4.4s[0]
+ fmla v17.4s, v1.4s, v4.4s[0]
+ fmla v18.4s, v0.4s, v4.4s[1]
+ fmla v19.4s, v1.4s, v4.4s[1]
+ fmla v20.4s, v0.4s, v4.4s[2]
+ fmla v21.4s, v1.4s, v4.4s[2]
+ fmla v22.4s, v0.4s, v4.4s[3]
+ fmla v23.4s, v1.4s, v4.4s[3]
+ fmla v24.4s, v0.4s, v5.4s[0]
+ fmla v25.4s, v1.4s, v5.4s[0]
+ fmla v26.4s, v0.4s, v5.4s[1]
+ fmla v27.4s, v1.4s, v5.4s[1]
+ fmla v28.4s, v0.4s, v5.4s[2]
+ fmla v29.4s, v1.4s, v5.4s[2]
+ fmla v30.4s, v0.4s, v5.4s[3]
+ fmla v31.4s, v1.4s, v5.4s[3]
+.endm
+
+.macro SAVE8x8
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v0.4s, v1.4s}, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ fmla v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v2.4s, v3.4s}, [pCRow1]
+ fmla v2.4s, v18.4s, alphaV2
+ fmla v3.4s, v19.4s, alphaV3
+ st1 {v2.4s, v3.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v4.4s, v5.4s}, [pCRow2]
+ fmla v4.4s, v20.4s, alphaV0
+ fmla v5.4s, v21.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v6.4s, v7.4s}, [pCRow1]
+ fmla v6.4s, v22.4s, alphaV2
+ fmla v7.4s, v23.4s, alphaV3
+ st1 {v6.4s, v7.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v0.4s, v1.4s}, [pCRow2]
+ fmla v0.4s, v24.4s, alphaV0
+ fmla v1.4s, v25.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v2.4s, v3.4s}, [pCRow1]
+ fmla v2.4s, v26.4s, alphaV2
+ fmla v3.4s, v27.4s, alphaV3
+ st1 {v2.4s, v3.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v4.4s, v5.4s}, [pCRow2]
+ fmla v4.4s, v28.4s, alphaV0
+ fmla v5.4s, v29.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow2]
+
+ ld1 {v6.4s, v7.4s}, [pCRow1]
+ fmla v6.4s, v30.4s, alphaV2
+ fmla v7.4s, v31.4s, alphaV3
+ st1 {v6.4s, v7.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+
+.macro INIT4x8
+ fmov s16, wzr
+ fmov s18, wzr
+ fmov s20, wzr
+ fmov s22, s16
+ fmov s24, wzr
+ fmov s26, s16
+ fmov s28, s18
+ fmov s30, s20
+.endm
+
+.macro KERNEL4x8_I
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.4s, v0.4s, v4.4s[0]
+ fmul v18.4s, v0.4s, v4.4s[1]
+ fmul v20.4s, v0.4s, v4.4s[2]
+ fmul v22.4s, v0.4s, v4.4s[3]
+ fmul v24.4s, v0.4s, v5.4s[0]
+ fmul v26.4s, v0.4s, v5.4s[1]
+ fmul v28.4s, v0.4s, v5.4s[2]
+ fmul v30.4s, v0.4s, v5.4s[3]
+
+ ld1 {v6.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v7.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL4x8_M1
+ fmla v16.4s, v0.4s, v4.4s[0]
+ fmla v18.4s, v0.4s, v4.4s[1]
+ fmla v20.4s, v0.4s, v4.4s[2]
+ fmla v22.4s, v0.4s, v4.4s[3]
+ fmla v24.4s, v0.4s, v5.4s[0]
+ fmla v26.4s, v0.4s, v5.4s[1]
+ fmla v28.4s, v0.4s, v5.4s[2]
+ fmla v30.4s, v0.4s, v5.4s[3]
+
+ ld1 {v6.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v7.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL4x8_M2
+ fmla v16.4s, v2.4s, v6.4s[0]
+ fmla v18.4s, v2.4s, v6.4s[1]
+ fmla v20.4s, v2.4s, v6.4s[2]
+ fmla v22.4s, v2.4s, v6.4s[3]
+ fmla v24.4s, v2.4s, v7.4s[0]
+ fmla v26.4s, v2.4s, v7.4s[1]
+ fmla v28.4s, v2.4s, v7.4s[2]
+ fmla v30.4s, v2.4s, v7.4s[3]
+
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL4x8_E
+ fmla v16.4s, v2.4s, v6.4s[0]
+ fmla v18.4s, v2.4s, v6.4s[1]
+ fmla v20.4s, v2.4s, v6.4s[2]
+ fmla v22.4s, v2.4s, v6.4s[3]
+ fmla v24.4s, v2.4s, v7.4s[0]
+ fmla v26.4s, v2.4s, v7.4s[1]
+ fmla v28.4s, v2.4s, v7.4s[2]
+ fmla v30.4s, v2.4s, v7.4s[3]
+.endm
+
+.macro KERNEL4x8_SUB
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v4.4s[0]
+ fmla v18.4s, v0.4s, v4.4s[1]
+ fmla v20.4s, v0.4s, v4.4s[2]
+ fmla v22.4s, v0.4s, v4.4s[3]
+ fmla v24.4s, v0.4s, v5.4s[0]
+ fmla v26.4s, v0.4s, v5.4s[1]
+ fmla v28.4s, v0.4s, v5.4s[2]
+ fmla v30.4s, v0.4s, v5.4s[3]
+.endm
+
+.macro SAVE4x8
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v0.4s}, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ st1 {v0.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v2.4s}, [pCRow1]
+ fmla v2.4s, v18.4s, alphaV2
+ st1 {v2.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v4.4s}, [pCRow2]
+ fmla v4.4s, v20.4s, alphaV0
+ st1 {v4.4s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v6.4s}, [pCRow1]
+ fmla v6.4s, v22.4s, alphaV2
+ st1 {v6.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v0.4s}, [pCRow2]
+ fmla v0.4s, v24.4s, alphaV0
+ st1 {v0.4s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v2.4s}, [pCRow1]
+ fmla v2.4s, v26.4s, alphaV2
+ st1 {v2.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v4.4s}, [pCRow2]
+ fmla v4.4s, v28.4s, alphaV0
+ st1 {v4.4s}, [pCRow2]
+
+ ld1 {v6.4s}, [pCRow1]
+ fmla v6.4s, v30.4s, alphaV2
+ st1 {v6.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x8
+ fmov s16, wzr
+ fmov s18, wzr
+ fmov s20, wzr
+ fmov s22, s16
+ fmov s24, wzr
+ fmov s26, s16
+ fmov s28, s18
+ fmov s30, s20
+.endm
+
+.macro KERNEL2x8_SUB
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s}, [pA]
+ add pA, pA, #8
+
+ fmla v16.2s, v0.2s, v4.4s[0]
+ fmla v18.2s, v0.2s, v4.4s[1]
+ fmla v20.2s, v0.2s, v4.4s[2]
+ fmla v22.2s, v0.2s, v4.4s[3]
+ fmla v24.2s, v0.2s, v5.4s[0]
+ fmla v26.2s, v0.2s, v5.4s[1]
+ fmla v28.2s, v0.2s, v5.4s[2]
+ fmla v30.2s, v0.2s, v5.4s[3]
+.endm
+
+.macro SAVE2x8
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v0.2s}, [pCRow0]
+ fmla v0.2s, v16.2s, alphaV0
+ st1 {v0.2s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v2.2s}, [pCRow1]
+ fmla v2.2s, v18.2s, alphaV2
+ st1 {v2.2s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v4.2s}, [pCRow2]
+ fmla v4.2s, v20.2s, alphaV0
+ st1 {v4.2s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v6.2s}, [pCRow1]
+ fmla v6.2s, v22.2s, alphaV2
+ st1 {v6.2s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v0.2s}, [pCRow2]
+ fmla v0.2s, v24.2s, alphaV0
+ st1 {v0.2s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v2.2s}, [pCRow1]
+ fmla v2.2s, v26.2s, alphaV2
+ st1 {v2.2s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v4.2s}, [pCRow2]
+ fmla v4.2s, v28.2s, alphaV0
+ st1 {v4.2s}, [pCRow2]
+
+ ld1 {v6.2s}, [pCRow1]
+ fmla v6.2s, v30.2s, alphaV2
+ st1 {v6.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x8
+ fmov s16, wzr
+ fmov s18, wzr
+ fmov s20, wzr
+ fmov s22, s16
+ fmov s24, wzr
+ fmov s26, s16
+ fmov s28, s18
+ fmov s30, s20
+.endm
+
+.macro KERNEL1x8_SUB
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ldr s0, [pA]
+ add pA, pA, #4
+
+ fmla s16, s0, v4.4s[0]
+ fmla s18, s0, v4.4s[1]
+ fmla s20, s0, v4.4s[2]
+ fmla s22, s0, v4.4s[3]
+ fmla s24, s0, v5.4s[0]
+ fmla s26, s0, v5.4s[1]
+ fmla s28, s0, v5.4s[2]
+ fmla s30, s0, v5.4s[3]
+.endm
+
+.macro SAVE1x8
+ add pCRow1, pCRow0, LDC
+
+ ldr s0, [pCRow0]
+ fmla s0, s16, alphaV0
+ str s0, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ ldr s2, [pCRow1]
+ fmla s2, s18, alphaV2
+ str s2, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ldr s4, [pCRow2]
+ fmla s4, s20, alphaV0
+ str s4, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ldr s6, [pCRow1]
+ fmla s6, s22, alphaV2
+ str s6, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ldr s0, [pCRow2]
+ fmla s0, s24, alphaV0
+ str s0, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ldr s2, [pCRow1]
+ fmla s2, s26, alphaV2
+ str s2, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ldr s4, [pCRow2]
+ fmla s4, s28, alphaV0
+ str s4, [pCRow2]
+
+ ldr s6, [pCRow1]
+ fmla s6, s30, alphaV2
+ str s6, [pCRow1]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s24, wzr
+ fmov s25, s16
+ fmov s28, wzr
+ fmov s29, s16
+.endm
+
+.macro KERNEL8x4_I
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.4s, v0.4s, v8.2s[0]
+ fmul v17.4s, v1.4s, v8.2s[0]
+ fmul v20.4s, v0.4s, v8.2s[1]
+ fmul v21.4s, v1.4s, v8.2s[1]
+ fmul v24.4s, v0.4s, v9.2s[0]
+ fmul v25.4s, v1.4s, v9.2s[0]
+ fmul v28.4s, v0.4s, v9.2s[1]
+ fmul v29.4s, v1.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_M1
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_M2
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_E
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+.endm
+
+.macro KERNEL8x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+.endm
+
+.macro SAVE8x4
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v0.4s, v1.4s}, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ fmla v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v20.4s, alphaV0
+ fmla v5.4s, v21.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v0.4s, v1.4s}, [pCRow2]
+ fmla v0.4s, v24.4s, alphaV0
+ fmla v1.4s, v25.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow2]
+
+ ld1 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v28.4s, alphaV0
+ fmla v5.4s, v29.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+
+.macro INIT4x4
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+ fmov s24, s17
+ fmov s25, s16
+ fmov s28, s17
+ fmov s29, s16
+.endm
+
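+// In the 4x4 kernels the accumulators are 2s pairs; the _M1/_M2 bodies
+// interleave the next-round loads and a prefetch between the fmla groups
+// to hide load latency.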
+.macro KERNEL4x4_I
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.2s, v0.2s, v8.2s[0]
+ fmul v29.2s, v1.2s, v9.2s[1]
+
+ fmul v20.2s, v0.2s, v8.2s[1]
+ fmul v25.2s, v1.2s, v9.2s[0]
+
+ fmul v24.2s, v0.2s, v9.2s[0]
+ fmul v21.2s, v1.2s, v8.2s[1]
+
+ fmul v28.2s, v0.2s, v9.2s[1]
+ fmul v17.2s, v1.2s, v8.2s[0]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.2s, v5.2s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL4x4_M1
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v29.2s, v1.2s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB] // For next round
+ add pB, pB, #16
+
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v25.2s, v1.2s, v9.2s[0]
+
+ ld1 {v4.2s, v5.2s}, [pA] // For next round
+ add pA, pA, #16
+
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v21.2s, v1.2s, v8.2s[1]
+
+ prfm PLDL1KEEP, [pB, #512]
+
+ fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro KERNEL4x4_M2
+ fmla v16.2s, v4.2s, v12.2s[0]
+ fmla v29.2s, v5.2s, v13.2s[1]
+
+ ld1 {v8.2s, v9.2s}, [pB] // For next round
+ add pB, pB, #16
+
+ fmla v20.2s, v4.2s, v12.2s[1]
+ fmla v25.2s, v5.2s, v13.2s[0]
+
+ ld1 {v0.2s, v1.2s}, [pA] // For next round
+ add pA, pA, #16
+
+ fmla v24.2s, v4.2s, v13.2s[0]
+ fmla v21.2s, v5.2s, v12.2s[1]
+
+ prfm PLDL1KEEP, [pA, #512]
+
+ fmla v28.2s, v4.2s, v13.2s[1]
+ fmla v17.2s, v5.2s, v12.2s[0]
+.endm
+
+.macro KERNEL4x4_E
+ fmla v16.2s, v4.2s, v12.2s[0]
+ fmla v29.2s, v5.2s, v13.2s[1]
+
+ fmla v20.2s, v4.2s, v12.2s[1]
+ fmla v25.2s, v5.2s, v13.2s[0]
+
+ fmla v24.2s, v4.2s, v13.2s[0]
+ fmla v21.2s, v5.2s, v12.2s[1]
+
+ fmla v28.2s, v4.2s, v13.2s[1]
+ fmla v17.2s, v5.2s, v12.2s[0]
+.endm
+
+.macro KERNEL4x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v29.2s, v1.2s, v9.2s[1]
+
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v25.2s, v1.2s, v9.2s[0]
+
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v21.2s, v1.2s, v8.2s[1]
+
+ fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro SAVE4x4
+ ld1 {v8.2s, v9.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ fmla v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+ ld1 {v12.2s, v13.2s}, [pCRow1]
+ fmla v12.2s, v20.2s, alphaV2
+ fmla v13.2s, v21.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ ld1 {v8.2s, v9.2s}, [pCRow2]
+ fmla v8.2s, v24.2s, alphaV0
+ fmla v9.2s, v25.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+ ld1 {v12.2s, v13.2s}, [pCRow1]
+ fmla v12.2s, v28.2s, alphaV2
+ fmla v13.2s, v29.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x4
+ fmov s16, wzr
+ fmov s20, s16
+ fmov s24, s20
+ fmov s28, s16
+.endm
+
+.macro KERNEL2x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s}, [pA]
+ add pA, pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v28.2s, v0.2s, v9.2s[1]
+.endm
+
+.macro SAVE2x4
+ ld1 {v8.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+ ld1 {v12.2s}, [pCRow1]
+ fmla v12.2s, v20.2s, alphaV1
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ ld1 {v8.2s}, [pCRow2]
+ fmla v8.2s, v24.2s, alphaV2
+ st1 {v8.2s}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+ ld1 {v12.2s}, [pCRow1]
+ fmla v12.2s, v28.2s, alphaV3
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x4
+ fmov s16, wzr
+ fmov s20, s16
+.endm
+
+.macro KERNEL1x4_SUB
+ ldr s0, [pA]
+ add pA, pA, #4
+
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+
+ fmla v16.2s, v8.2s, v0.s[0]
+ fmla v20.2s, v9.2s, v0.s[0]
+.endm
+
+.macro SAVE1x4
+ add pCRow1, pCRow0, LDC
+ ld1 {v8.s}[0], [pCRow0]
+ ld1 {v8.s}[1], [pCRow1]
+ fmla v8.2s, v16.2s, alphaV0
+ st1 {v8.s}[0], [pCRow0]
+ st1 {v8.s}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+ ld1 {v12.s}[0], [pCRow2]
+ ld1 {v12.s}[1], [pCRow1]
+ fmla v12.2s, v20.2s, alphaV1
+ st1 {v12.s}[0], [pCRow2]
+ st1 {v12.s}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/******************************************************************************/
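+// Mx2 tiles (8x2 down to 1x2): two remaining columns of C. Only the plain
+// _SUB form is needed here, without software pipelining.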
+
+.macro INIT8x2
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+.endm
+
+.macro KERNEL8x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+.endm
+
+.macro SAVE8x2
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v0.4s, v1.4s}, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ fmla v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v20.4s, alphaV0
+ fmla v5.4s, v21.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x2
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+.endm
+
+.macro KERNEL4x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v21.2s, v1.2s, v8.2s[1]
+.endm
+
+.macro SAVE4x2
+ ld1 {v8.2s, v9.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ fmla v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+ ld1 {v12.2s, v13.2s}, [pCRow1]
+ fmla v12.2s, v20.2s, alphaV2
+ fmla v13.2s, v21.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x2
+ fmov s16, wzr
+ fmov s20, s16
+.endm
+
+.macro KERNEL2x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+
+ ld1 {v0.2s}, [pA]
+ add pA, pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+.endm
+
+.macro SAVE2x2
+ ld1 {v8.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow1 , pCRow0, LDC
+ ld1 {v12.2s}, [pCRow1]
+ fmla v12.2s, v20.2s, alphaV1
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+ fmov s16, wzr
+.endm
+
+.macro KERNEL1x2_SUB
+ ld1 {v8.2s} , [pB]
+ add pB , pB, #8
+
+ ldr s0 , [pA]
+ add pA, pA, #4
+
+ fmla v16.2s, v8.2s, v0.2s[0]
+.endm
+
+.macro SAVE1x2
+ add pCRow1 , pCRow0, LDC
+ ld1 {v8.s}[0], [pCRow0]
+ ld1 {v8.s}[1], [pCRow1]
+ fmla v8.2s, v16.2s, alphaV0
+ st1 {v8.s}[0], [pCRow0]
+ st1 {v8.s}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/******************************************************************************/
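+// Mx1 tiles: one remaining column of C. A single B value per K step is
+// broadcast against 8/4/2/1 values of A.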
+
+.macro INIT8x1
+ fmov s16, wzr
+ fmov s17, wzr
+.endm
+
+.macro KERNEL8x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+.endm
+
+.macro SAVE8x1
+ ld1 {v0.4s, v1.4s}, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ fmla v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x1
+ fmov s16, wzr
+ fmov s17, s16
+.endm
+
+.macro KERNEL4x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA , pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro SAVE4x1
+ ld1 {v8.2s, v9.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ fmla v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x1
+ fmov s16, wzr
+.endm
+
+.macro KERNEL2x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.2s}, [pA]
+ add pA , pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+.endm
+
+.macro SAVE2x1
+ ld1 {v8.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+ fmov s16, wzr
+.endm
+
+.macro KERNEL1x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ldr s0, [pA]
+ add pA , pA, #4
+
+ fmadd s16, s0, s8, s16
+.endm
+
+.macro SAVE1x1
+ ldr s8, [pCRow0]
+ fmla s8, s16, alphaV0
+ str s8, [pCRow0]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
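+// Driver: the outer loop walks N in blocks of 8, 4, 2, 1 (labels L8/L4/L2/L1);
+// for each N block the inner loop walks M in blocks of 8, 4, 2, 1 (_M8.._M1),
+// and K is consumed by the kernel macros above.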
+
+ PROLOGUE
+
+sgemm_kernel_begin:
+
+ .align 5
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+
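+	// keep four copies of alpha (passed in s0) for the alphaV0..alphaV3
+	// operands used by the SAVE macros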
+ fmov alpha0, s0
+ fmov alpha1, s0
+ fmov alpha2, s0
+ fmov alpha3, s0
+
+ lsl LDC, LDC, #2 // ldc = ldc * 4
+
+ mov pB, origPB
+
+ mov counterJ, origN
+ asr counterJ, counterJ, #3 // J = J / 8
+ cmp counterJ, #0
+ ble sgemm_kernel_L4_BEGIN
+
+/******************************************************************************/
+/******************************************************************************/
+
+sgemm_kernel_L8_BEGIN:
+ mov pCRow0, pC // pCRow0 = C
+ add pC, pC, LDC, lsl #3
+
+ mov pA, origPA // pA = start of A array
+
+/******************************************************************************/
+
+sgemm_kernel_L8_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble sgemm_kernel_L8_M4_BEGIN
+
+sgemm_kernel_L8_M8_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt sgemm_kernel_L8_M8_32
+
+ KERNEL8x8_I // do one in the K
+ KERNEL8x8_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble sgemm_kernel_L8_M8_22a
+ .align 5
+
+sgemm_kernel_L8_M8_22:
+
+ KERNEL8x8_M1
+ KERNEL8x8_M2
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L8_M8_22
+
+sgemm_kernel_L8_M8_22a:
+
+ KERNEL8x8_M1
+ KERNEL8x8_E
+
+ b sgemm_kernel_L8_M8_44
+
+sgemm_kernel_L8_M8_32:
+
+ tst counterL, #1
+ ble sgemm_kernel_L8_M8_40
+
+ KERNEL8x8_I
+ KERNEL8x8_E
+
+ b sgemm_kernel_L8_M8_44
+
+sgemm_kernel_L8_M8_40:
+
+ INIT8x8
+
+sgemm_kernel_L8_M8_44:
+
+ ands counterL , origK, #1
+ ble sgemm_kernel_L8_M8_100
+
+sgemm_kernel_L8_M8_46:
+
+ KERNEL8x8_SUB
+
+sgemm_kernel_L8_M8_100:
+
+ SAVE8x8
+
+sgemm_kernel_L8_M8_END:
+ subs counterI, counterI, #1
+ bne sgemm_kernel_L8_M8_20
+
+/******************************************************************************/
+
+sgemm_kernel_L8_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble sgemm_kernel_L8_END
+
+ tst counterI, #4
+ ble sgemm_kernel_L8_M2_BEGIN
+
+sgemm_kernel_L8_M4_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt sgemm_kernel_L8_M4_32
+
+ KERNEL4x8_I // do one in the K
+ KERNEL4x8_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble sgemm_kernel_L8_M4_22a
+ .align 5
+
+sgemm_kernel_L8_M4_22:
+
+ KERNEL4x8_M1
+ KERNEL4x8_M2
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L8_M4_22
+
+sgemm_kernel_L8_M4_22a:
+
+ KERNEL4x8_M1
+ KERNEL4x8_E
+
+ b sgemm_kernel_L8_M4_44
+
+sgemm_kernel_L8_M4_32:
+
+ tst counterL, #1
+ ble sgemm_kernel_L8_M4_40
+
+ KERNEL4x8_I
+ KERNEL4x8_E
+
+ b sgemm_kernel_L8_M4_44
+
+sgemm_kernel_L8_M4_40:
+
+ INIT4x8
+
+sgemm_kernel_L8_M4_44:
+
+ ands counterL , origK, #1
+ ble sgemm_kernel_L8_M4_100
+
+sgemm_kernel_L8_M4_46:
+
+ KERNEL4x8_SUB
+
+sgemm_kernel_L8_M4_100:
+
+ SAVE4x8
+
+sgemm_kernel_L8_M4_END:
+
+/******************************************************************************/
+
+sgemm_kernel_L8_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble sgemm_kernel_L8_END
+
+	tst	counterI, #2					// is there a 2-row block left in M?
+ ble sgemm_kernel_L8_M1_BEGIN
+
+sgemm_kernel_L8_M2_20:
+
+ INIT2x8
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L8_M2_40
+
+sgemm_kernel_L8_M2_22:
+
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L8_M2_22
+
+
+sgemm_kernel_L8_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L8_M2_100
+
+sgemm_kernel_L8_M2_42:
+
+ KERNEL2x8_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L8_M2_42
+
+sgemm_kernel_L8_M2_100:
+
+ SAVE2x8
+
+sgemm_kernel_L8_M2_END:
+
+/******************************************************************************/
+
+sgemm_kernel_L8_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble sgemm_kernel_L8_END
+
+sgemm_kernel_L8_M1_20:
+
+ INIT1x8
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L8_M1_40
+
+sgemm_kernel_L8_M1_22:
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L8_M1_22
+
+
+sgemm_kernel_L8_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L8_M1_100
+
+sgemm_kernel_L8_M1_42:
+
+ KERNEL1x8_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L8_M1_42
+
+sgemm_kernel_L8_M1_100:
+
+ SAVE1x8
+
+sgemm_kernel_L8_END:
+ lsl temp, origK, #5 // B = B + K * 4 * 8
+ add origPB, origPB, temp
+
+ subs counterJ, counterJ , #1 // j--
+ bgt sgemm_kernel_L8_BEGIN
+
+/******************************************************************************/
+/******************************************************************************/
+
+sgemm_kernel_L4_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #7
+ ble sgemm_kernel_L999
+
+ tst counterJ , #4
+ ble sgemm_kernel_L2_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #2
+
+ mov pA, origPA // pA = A
+
+/******************************************************************************/
+
+sgemm_kernel_L4_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble sgemm_kernel_L4_M4_BEGIN
+
+sgemm_kernel_L4_M8_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt sgemm_kernel_L4_M8_32
+
+ KERNEL8x4_I // do one in the K
+ KERNEL8x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble sgemm_kernel_L4_M8_22a
+ .align 5
+
+sgemm_kernel_L4_M8_22:
+
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M8_22
+
+sgemm_kernel_L4_M8_22a:
+
+ KERNEL8x4_M1
+ KERNEL8x4_E
+
+ b sgemm_kernel_L4_M8_44
+
+sgemm_kernel_L4_M8_32:
+
+ tst counterL, #1
+ ble sgemm_kernel_L4_M8_40
+
+ KERNEL8x4_I
+ KERNEL8x4_E
+
+ b sgemm_kernel_L4_M8_44
+
+sgemm_kernel_L4_M8_40:
+
+ INIT8x4
+
+sgemm_kernel_L4_M8_44:
+
+ ands counterL , origK, #1
+ ble sgemm_kernel_L4_M8_100
+
+sgemm_kernel_L4_M8_46:
+
+ KERNEL8x4_SUB
+
+sgemm_kernel_L4_M8_100:
+
+ SAVE8x4
+
+sgemm_kernel_L4_M8_END:
+ subs counterI, counterI, #1
+ bne sgemm_kernel_L4_M8_20
+
+/******************************************************************************/
+
+sgemm_kernel_L4_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble sgemm_kernel_L4_END
+
+ tst counterI, #4
+ ble sgemm_kernel_L4_M2_BEGIN
+
+sgemm_kernel_L4_M4_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt sgemm_kernel_L4_M4_32
+
+ KERNEL4x4_I // do one in the K
+ KERNEL4x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble sgemm_kernel_L4_M4_22a
+ .align 5
+
+sgemm_kernel_L4_M4_22:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M4_22
+
+sgemm_kernel_L4_M4_22a:
+
+ KERNEL4x4_M1
+ KERNEL4x4_E
+
+ b sgemm_kernel_L4_M4_44
+
+sgemm_kernel_L4_M4_32:
+
+ tst counterL, #1
+ ble sgemm_kernel_L4_M4_40
+
+ KERNEL4x4_I
+ KERNEL4x4_E
+
+ b sgemm_kernel_L4_M4_44
+
+sgemm_kernel_L4_M4_40:
+
+ INIT4x4
+
+sgemm_kernel_L4_M4_44:
+
+ ands counterL , origK, #1
+ ble sgemm_kernel_L4_M4_100
+
+sgemm_kernel_L4_M4_46:
+
+ KERNEL4x4_SUB
+
+sgemm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+sgemm_kernel_L4_M4_END:
+
+/******************************************************************************/
+
+sgemm_kernel_L4_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble sgemm_kernel_L4_END
+
+	tst	counterI, #2					// is there a 2-row block left in M?
+ ble sgemm_kernel_L4_M1_BEGIN
+
+sgemm_kernel_L4_M2_20:
+
+ INIT2x4
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L4_M2_40
+
+sgemm_kernel_L4_M2_22:
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M2_22
+
+
+sgemm_kernel_L4_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L4_M2_100
+
+sgemm_kernel_L4_M2_42:
+
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M2_42
+
+sgemm_kernel_L4_M2_100:
+
+ SAVE2x4
+
+sgemm_kernel_L4_M2_END:
+
+/******************************************************************************/
+
+sgemm_kernel_L4_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble sgemm_kernel_L4_END
+
+sgemm_kernel_L4_M1_20:
+
+ INIT1x4
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L4_M1_40
+
+sgemm_kernel_L4_M1_22:
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M1_22
+
+
+sgemm_kernel_L4_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L4_M1_100
+
+sgemm_kernel_L4_M1_42:
+
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M1_42
+
+sgemm_kernel_L4_M1_100:
+
+ SAVE1x4
+
+sgemm_kernel_L4_END:
+ add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
+
+/******************************************************************************/
+/******************************************************************************/
+
+sgemm_kernel_L2_BEGIN:   // less than 4 left in N direction
+
+ mov counterJ , origN
+ tst counterJ , #3
+ ble sgemm_kernel_L999
+
+ tst counterJ , #2
+ ble sgemm_kernel_L1_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #1
+
+ mov pA, origPA // pA = A
+
+/******************************************************************************/
+
+sgemm_kernel_L2_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI,#0
+ ble sgemm_kernel_L2_M4_BEGIN
+
+sgemm_kernel_L2_M8_20:
+
+ INIT8x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble sgemm_kernel_L2_M8_40
+ .align 5
+
+sgemm_kernel_L2_M8_22:
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M8_22
+
+
+sgemm_kernel_L2_M8_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L2_M8_100
+
+sgemm_kernel_L2_M8_42:
+
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M8_42
+
+sgemm_kernel_L2_M8_100:
+
+ SAVE8x2
+
+sgemm_kernel_L2_M8_END:
+
+ subs counterI, counterI, #1
+ bgt sgemm_kernel_L2_M8_20
+
+/******************************************************************************/
+
+sgemm_kernel_L2_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble sgemm_kernel_L2_END
+
+ tst counterI, #4
+ ble sgemm_kernel_L2_M2_BEGIN
+
+sgemm_kernel_L2_M4_20:
+
+ INIT4x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble sgemm_kernel_L2_M4_40
+ .align 5
+
+sgemm_kernel_L2_M4_22:
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M4_22
+
+
+sgemm_kernel_L2_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L2_M4_100
+
+sgemm_kernel_L2_M4_42:
+
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M4_42
+
+sgemm_kernel_L2_M4_100:
+
+ SAVE4x2
+
+sgemm_kernel_L2_M4_END:
+
+/******************************************************************************/
+
+sgemm_kernel_L2_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble sgemm_kernel_L2_END
+
+	tst	counterI, #2					// is there a 2-row block left in M?
+ ble sgemm_kernel_L2_M1_BEGIN
+
+sgemm_kernel_L2_M2_20:
+
+ INIT2x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble sgemm_kernel_L2_M2_40
+
+sgemm_kernel_L2_M2_22:
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M2_22
+
+
+sgemm_kernel_L2_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L2_M2_100
+
+sgemm_kernel_L2_M2_42:
+
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M2_42
+
+sgemm_kernel_L2_M2_100:
+
+ SAVE2x2
+
+sgemm_kernel_L2_M2_END:
+
+/******************************************************************************/
+
+sgemm_kernel_L2_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble sgemm_kernel_L2_END
+
+sgemm_kernel_L2_M1_20:
+
+ INIT1x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL, #0
+ ble sgemm_kernel_L2_M1_40
+
+sgemm_kernel_L2_M1_22:
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M1_22
+
+
+sgemm_kernel_L2_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L2_M1_100
+
+sgemm_kernel_L2_M1_42:
+
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M1_42
+
+sgemm_kernel_L2_M1_100:
+
+ SAVE1x2
+
+sgemm_kernel_L2_END:
+
+ add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
+
+/******************************************************************************/
+/******************************************************************************/
+
+sgemm_kernel_L1_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #1
+ ble sgemm_kernel_L999 // done
+
+
+ mov pCRow0, pC // pCRow0 = C
+	add	pC , pC , LDC					// advance pC to the next column of C
+
+ mov pA, origPA // pA = A
+
+/******************************************************************************/
+
+sgemm_kernel_L1_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3
+ cmp counterI, #0
+ ble sgemm_kernel_L1_M4_BEGIN
+
+sgemm_kernel_L1_M8_20:
+
+ INIT8x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L1_M8_40
+ .align 5
+
+sgemm_kernel_L1_M8_22:
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M8_22
+
+
+sgemm_kernel_L1_M8_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L1_M8_100
+
+sgemm_kernel_L1_M8_42:
+
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M8_42
+
+sgemm_kernel_L1_M8_100:
+
+ SAVE8x1
+
+sgemm_kernel_L1_M8_END:
+
+ subs counterI, counterI, #1
+ bgt sgemm_kernel_L1_M8_20
+
+/******************************************************************************/
+
+sgemm_kernel_L1_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble sgemm_kernel_L1_END
+
+ tst counterI, #4
+ ble sgemm_kernel_L1_M2_BEGIN
+
+sgemm_kernel_L1_M4_20:
+
+ INIT4x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L1_M4_40
+ .align 5
+
+sgemm_kernel_L1_M4_22:
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M4_22
+
+
+sgemm_kernel_L1_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L1_M4_100
+
+sgemm_kernel_L1_M4_42:
+
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M4_42
+
+sgemm_kernel_L1_M4_100:
+
+ SAVE4x1
+
+sgemm_kernel_L1_M4_END:
+
+/******************************************************************************/
+
+sgemm_kernel_L1_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble sgemm_kernel_L1_END
+
+	tst	counterI, #2					// is there a 2-row block left in M?
+ ble sgemm_kernel_L1_M1_BEGIN
+
+sgemm_kernel_L1_M2_20:
+
+ INIT2x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L1_M2_40
+
+sgemm_kernel_L1_M2_22:
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M2_22
+
+
+sgemm_kernel_L1_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L1_M2_100
+
+sgemm_kernel_L1_M2_42:
+
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M2_42
+
+sgemm_kernel_L1_M2_100:
+
+ SAVE2x1
+
+sgemm_kernel_L1_M2_END:
+
+/******************************************************************************/
+
+sgemm_kernel_L1_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble sgemm_kernel_L1_END
+
+sgemm_kernel_L1_M1_20:
+
+ INIT1x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L1_M1_40
+
+sgemm_kernel_L1_M1_22:
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M1_22
+
+
+sgemm_kernel_L1_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L1_M1_100
+
+sgemm_kernel_L1_M1_42:
+
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M1_42
+
+sgemm_kernel_L1_M1_100:
+
+ SAVE1x1
+
+sgemm_kernel_L1_END:
+
+/******************************************************************************/
+
+sgemm_kernel_L999:
+ mov x0, #0 // set return value
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+ ret
+
+ EPILOGUE
+
diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S
new file mode 100755
index 000000000..b99760a03
--- /dev/null
+++ b/kernel/arm64/strmm_kernel_16x4.S
@@ -0,0 +1,2431 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* X0 X1 X2 s0 X3 x4 x5 x6 x7 */
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */
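+
+// TRMM kernel: unlike the GEMM kernels, the SAVE macros below overwrite C
+// with alpha * A * B instead of accumulating into it; the extra offset
+// argument drives the tempOffset/tempK bookkeeping in the loops.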
+
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define offset x7
+#define counterL x8
+#define counterI x9
+#define counterJ x10
+#define pB x11
+#define pCRow0 x12
+#define pCRow1 x13
+#define pCRow2 x14
+#define pA x15
+#define temp x16
+#define tempOffset x17
+#define tempK x18
+
+#define alpha0 s10
+#define alphaV0 v10.s[0]
+#define alpha1 s11
+#define alphaV1 v11.s[0]
+#define alpha2 s14
+#define alphaV2 v14.s[0]
+#define alpha3 s15
+#define alphaV3 v15.s[0]
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 offset
+// 08 counterL
+// 09 counterI
+// 10 counterJ
+// 11 pB
+// 12 pCRow0
+// 13 pCRow1
+// 14 pCRow2
+// 15 pA
+// 16 temp
+// 17 tempOffset
+// 18 must save tempK
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 ALPHA -> pA0_00, pA0_01, pA0_02, pA0_03
+//v01 pA0_04, pA0_05, pA0_06, pA0_07
+//v02 pA0_08, pA0_09, pA0_10, pA0_11
+//v03 pA0_12, pA0_13, pA0_14, pA0_15
+//v04 pA1_00, pA1_01, pA1_02, pA1_03
+//v05 pA1_04, pA1_05, pA1_06, pA1_07
+//v06 pA1_08, pA1_09, pA1_10, pA1_11
+//v07 pA1_12, pA1_13, pA1_14, pA1_15
+//v08 must save pB00, pB01
+//v09 must save pB02, pB03
+//v10 must save ALPHA0
+//v11 must save ALPHA1
+//v12 must save pB10, pB11
+//v13 must save pB12, pB13
+//v14 must save ALPHA2
+//v15 must save ALPHA3
+//v16 must save C00, C01, C02, C03
+//v17 must save C04, C05, C06, C07
+//v18 C08, C09, C10, C11
+//v19 C12, C13, C14, C15
+//v20 C16, C17, C18, C19
+//v21 C20, C21, C22, C23
+//v22 C24, C25, C26, C27
+//v23 C28, C29, C30, C31
+//v24 C32, C33, C34, C35
+//v25 C36, C37, C38, C39
+//v26 C40, C41, C42, C43
+//v27 C44, C45, C46, C47
+//v28 C48, C49, C50, C51
+//v29 C52, C53, C54, C55
+//v30 C56, C57, C58, C59
+//v31 C60, C61, C62, C63
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
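+// 16x4 is the main tile of this kernel: sixteen rows of C held in v16..v31
+// and four columns addressed through LDC. The _I/_M1/_M2/_E pipelining
+// follows the same scheme as the sgemm kernels.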
+
+.macro INIT16x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, s16
+ fmov s19, s17
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s22, s17
+ fmov s23, s18
+ fmov s24, wzr
+ fmov s25, s16
+ fmov s26, s17
+ fmov s27, s18
+ fmov s28, wzr
+ fmov s29, s16
+ fmov s30, s17
+ fmov s31, s18
+.endm
+
+.macro KERNEL16x4_I
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.4s, v0.4s, v8.2s[0]
+ fmul v17.4s, v1.4s, v8.2s[0]
+ fmul v18.4s, v2.4s, v8.2s[0]
+ fmul v19.4s, v3.4s, v8.2s[0]
+
+ fmul v20.4s, v0.4s, v8.2s[1]
+ fmul v21.4s, v1.4s, v8.2s[1]
+ fmul v22.4s, v2.4s, v8.2s[1]
+ fmul v23.4s, v3.4s, v8.2s[1]
+
+ fmul v24.4s, v0.4s, v9.2s[0]
+ fmul v25.4s, v1.4s, v9.2s[0]
+ fmul v26.4s, v2.4s, v9.2s[0]
+ fmul v27.4s, v3.4s, v9.2s[0]
+
+ fmul v28.4s, v0.4s, v9.2s[1]
+ fmul v29.4s, v1.4s, v9.2s[1]
+ fmul v30.4s, v2.4s, v9.2s[1]
+ fmul v31.4s, v3.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v6.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v7.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL16x4_M1
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v18.4s, v2.4s, v8.2s[0]
+ fmla v19.4s, v3.4s, v8.2s[0]
+
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v22.4s, v2.4s, v8.2s[1]
+ fmla v23.4s, v3.4s, v8.2s[1]
+
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v26.4s, v2.4s, v9.2s[0]
+ fmla v27.4s, v3.4s, v9.2s[0]
+
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v30.4s, v2.4s, v9.2s[1]
+ fmla v31.4s, v3.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v6.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v7.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL16x4_M2
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v18.4s, v6.4s, v12.2s[0]
+ fmla v19.4s, v7.4s, v12.2s[0]
+
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v22.4s, v6.4s, v12.2s[1]
+ fmla v23.4s, v7.4s, v12.2s[1]
+
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v26.4s, v6.4s, v13.2s[0]
+ fmla v27.4s, v7.4s, v13.2s[0]
+
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v30.4s, v6.4s, v13.2s[1]
+ fmla v31.4s, v7.4s, v13.2s[1]
+
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL16x4_E
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v18.4s, v6.4s, v12.2s[0]
+ fmla v19.4s, v7.4s, v12.2s[0]
+
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v22.4s, v6.4s, v12.2s[1]
+ fmla v23.4s, v7.4s, v12.2s[1]
+
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v26.4s, v6.4s, v13.2s[0]
+ fmla v27.4s, v7.4s, v13.2s[0]
+
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v30.4s, v6.4s, v13.2s[1]
+ fmla v31.4s, v7.4s, v13.2s[1]
+.endm
+
+.macro KERNEL16x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v18.4s, v2.4s, v8.2s[0]
+ fmla v19.4s, v3.4s, v8.2s[0]
+
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v22.4s, v2.4s, v8.2s[1]
+ fmla v23.4s, v3.4s, v8.2s[1]
+
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v26.4s, v2.4s, v9.2s[0]
+ fmla v27.4s, v3.4s, v9.2s[0]
+
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v30.4s, v2.4s, v9.2s[1]
+ fmla v31.4s, v3.4s, v9.2s[1]
+.endm
+
+.macro SAVE16x4
+ add pCRow1, pCRow0, LDC
+
+ fmul v0.4s, v16.4s, alphaV0
+ fmul v1.4s, v17.4s, alphaV1
+ fmul v2.4s, v18.4s, alphaV2
+ fmul v3.4s, v19.4s, alphaV3
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v4.4s, v20.4s, alphaV0
+ fmul v5.4s, v21.4s, alphaV1
+ fmul v6.4s, v22.4s, alphaV2
+ fmul v7.4s, v23.4s, alphaV3
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v0.4s, v24.4s, alphaV0
+ fmul v1.4s, v25.4s, alphaV1
+ fmul v2.4s, v26.4s, alphaV2
+ fmul v3.4s, v27.4s, alphaV3
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2]
+
+ fmul v4.4s, v28.4s, alphaV0
+ fmul v5.4s, v29.4s, alphaV1
+ fmul v6.4s, v30.4s, alphaV2
+ fmul v7.4s, v31.4s, alphaV3
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s24, wzr
+ fmov s25, s16
+ fmov s28, wzr
+ fmov s29, s16
+.endm
+
+.macro KERNEL8x4_I
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.4s, v0.4s, v8.2s[0]
+ fmul v17.4s, v1.4s, v8.2s[0]
+ fmul v20.4s, v0.4s, v8.2s[1]
+ fmul v21.4s, v1.4s, v8.2s[1]
+ fmul v24.4s, v0.4s, v9.2s[0]
+ fmul v25.4s, v1.4s, v9.2s[0]
+ fmul v28.4s, v0.4s, v9.2s[1]
+ fmul v29.4s, v1.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_M1
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_M2
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_E
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+.endm
+
+.macro KERNEL8x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+.endm
+
+.macro SAVE8x4
+ add pCRow1, pCRow0, LDC
+
+ fmul v0.4s, v16.4s, alphaV0
+ fmul v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v4.4s, v20.4s, alphaV0
+ fmul v5.4s, v21.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v0.4s, v24.4s, alphaV0
+ fmul v1.4s, v25.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow2]
+
+ fmul v4.4s, v28.4s, alphaV0
+ fmul v5.4s, v29.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x4
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+ fmov s24, s17
+ fmov s25, s16
+ fmov s28, s17
+ fmov s29, s16
+.endm
+
+.macro KERNEL4x4_I
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.2s, v0.2s, v8.2s[0]
+ fmul v29.2s, v1.2s, v9.2s[1]
+
+ fmul v20.2s, v0.2s, v8.2s[1]
+ fmul v25.2s, v1.2s, v9.2s[0]
+
+ fmul v24.2s, v0.2s, v9.2s[0]
+ fmul v21.2s, v1.2s, v8.2s[1]
+
+ fmul v28.2s, v0.2s, v9.2s[1]
+ fmul v17.2s, v1.2s, v8.2s[0]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.2s, v5.2s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL4x4_M1
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v29.2s, v1.2s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB] // For next round
+ add pB, pB, #16
+
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v25.2s, v1.2s, v9.2s[0]
+
+ ld1 {v4.2s, v5.2s}, [pA] // For next round
+ add pA, pA, #16
+
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v21.2s, v1.2s, v8.2s[1]
+
+ prfm PLDL1KEEP, [pB, #512]
+
+ fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro KERNEL4x4_M2
+ fmla v16.2s, v4.2s, v12.2s[0]
+ fmla v29.2s, v5.2s, v13.2s[1]
+
+ ld1 {v8.2s, v9.2s}, [pB] // For next round
+ add pB, pB, #16
+
+ fmla v20.2s, v4.2s, v12.2s[1]
+ fmla v25.2s, v5.2s, v13.2s[0]
+
+ ld1 {v0.2s, v1.2s}, [pA] // For next round
+ add pA, pA, #16
+
+ fmla v24.2s, v4.2s, v13.2s[0]
+ fmla v21.2s, v5.2s, v12.2s[1]
+
+ prfm PLDL1KEEP, [pA, #512]
+
+ fmla v28.2s, v4.2s, v13.2s[1]
+ fmla v17.2s, v5.2s, v12.2s[0]
+.endm
+
+.macro KERNEL4x4_E
+ fmla v16.2s, v4.2s, v12.2s[0]
+ fmla v29.2s, v5.2s, v13.2s[1]
+
+ fmla v20.2s, v4.2s, v12.2s[1]
+ fmla v25.2s, v5.2s, v13.2s[0]
+
+ fmla v24.2s, v4.2s, v13.2s[0]
+ fmla v21.2s, v5.2s, v12.2s[1]
+
+ fmla v28.2s, v4.2s, v13.2s[1]
+ fmla v17.2s, v5.2s, v12.2s[0]
+.endm
+
+.macro KERNEL4x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v29.2s, v1.2s, v9.2s[1]
+
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v25.2s, v1.2s, v9.2s[0]
+
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v21.2s, v1.2s, v8.2s[1]
+
+ fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro SAVE4x4
+
+ fmul v8.2s, v16.2s, alphaV0
+ fmul v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ fmul v12.2s, v20.2s, alphaV2
+ fmul v13.2s, v21.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v8.2s, v24.2s, alphaV0
+ fmul v9.2s, v25.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2s, v28.2s, alphaV2
+ fmul v13.2s, v29.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x4
+ fmov s16, wzr
+ fmov s20, s16
+ fmov s24, s20
+ fmov s28, s16
+.endm
+
+.macro KERNEL2x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s}, [pA]
+ add pA, pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v28.2s, v0.2s, v9.2s[1]
+.endm
+
+.macro SAVE2x4
+ fmul v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+ fmul v12.2s, v20.2s, alphaV1
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ fmul v8.2s, v24.2s, alphaV2
+ st1 {v8.2s}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+ fmul v12.2s, v28.2s, alphaV3
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x4
+ fmov s16, wzr
+ fmov s20, s16
+.endm
+
+.macro KERNEL1x4_SUB
+ ldr s0, [pA]
+ add pA, pA, #4
+
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+
+ fmla v16.2s, v8.2s, v0.s[0]
+ fmla v20.2s, v9.2s, v0.s[0]
+.endm
+
+.macro SAVE1x4
+ add pCRow1, pCRow0, LDC
+
+ fmul v8.2s, v16.2s, alphaV0
+ st1 {v8.s}[0], [pCRow0]
+ st1 {v8.s}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2s, v20.2s, alphaV1
+ st1 {v12.s}[0], [pCRow2]
+ st1 {v12.s}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/******************************************************************************/
+
+.macro INIT16x2
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, wzr
+ fmov s19, s16
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s22, wzr
+ fmov s23, s16
+.endm
+
+.macro KERNEL16x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v18.4s, v2.4s, v8.2s[0]
+ fmla v19.4s, v3.4s, v8.2s[0]
+
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v22.4s, v2.4s, v8.2s[1]
+ fmla v23.4s, v3.4s, v8.2s[1]
+.endm
+
+.macro SAVE16x2
+ add pCRow1, pCRow0, LDC
+
+ fmul v0.4s, v16.4s, alphaV0
+ fmul v1.4s, v17.4s, alphaV1
+ fmul v2.4s, v18.4s, alphaV2
+ fmul v3.4s, v19.4s, alphaV3
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
+
+ fmul v4.4s, v20.4s, alphaV0
+ fmul v5.4s, v21.4s, alphaV1
+ fmul v6.4s, v22.4s, alphaV2
+ fmul v7.4s, v23.4s, alphaV3
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x2
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+.endm
+
+.macro KERNEL8x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+.endm
+
+.macro SAVE8x2
+ add pCRow1, pCRow0, LDC
+
+ fmul v0.4s, v16.4s, alphaV0
+ fmul v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v4.4s, v20.4s, alphaV0
+ fmul v5.4s, v21.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x2
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+.endm
+
+.macro KERNEL4x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v21.2s, v1.2s, v8.2s[1]
+.endm
+
+.macro SAVE4x2
+
+ fmul v8.2s, v16.2s, alphaV0
+ fmul v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ fmul v12.2s, v20.2s, alphaV2
+ fmul v13.2s, v21.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x2
+ fmov s16, wzr
+ fmov s20, s16
+.endm
+
+.macro KERNEL2x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+
+ ld1 {v0.2s}, [pA]
+ add pA, pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+.endm
+
+.macro SAVE2x2
+ fmul v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow1 , pCRow0, LDC
+
+ fmul v12.2s, v20.2s, alphaV1
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+ fmov s16, wzr
+.endm
+
+.macro KERNEL1x2_SUB
+ ld1 {v8.2s} , [pB]
+ add pB , pB, #8
+
+ ldr s0 , [pA]
+ add pA, pA, #4
+
+ fmla v16.2s, v8.2s, v0.2s[0]
+.endm
+
+.macro SAVE1x2
+ add pCRow1 , pCRow0, LDC
+
+ fmul v8.2s, v16.2s, alphaV0
+ st1 {v8.s}[0], [pCRow0]
+ st1 {v8.s}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/******************************************************************************/
+
+.macro INIT16x1
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, wzr
+ fmov s19, s16
+.endm
+
+.macro KERNEL16x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v18.4s, v2.4s, v8.2s[0]
+ fmla v19.4s, v3.4s, v8.2s[0]
+.endm
+
+.macro SAVE16x1
+
+ fmul v0.4s, v16.4s, alphaV0
+ fmul v1.4s, v17.4s, alphaV1
+ fmul v2.4s, v18.4s, alphaV2
+ fmul v3.4s, v19.4s, alphaV3
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x1
+ fmov s16, wzr
+ fmov s17, wzr
+.endm
+
+.macro KERNEL8x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+.endm
+
+.macro SAVE8x1
+
+ fmul v0.4s, v16.4s, alphaV0
+ fmul v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x1
+ fmov s16, wzr
+ fmov s17, s16
+.endm
+
+.macro KERNEL4x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA , pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro SAVE4x1
+
+ fmul v8.2s, v16.2s, alphaV0
+ fmul v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x1
+ fmov s16, wzr
+.endm
+
+.macro KERNEL2x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.2s}, [pA]
+ add pA , pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+.endm
+
+.macro SAVE2x1
+
+ fmul v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+ fmov s16, wzr
+.endm
+
+.macro KERNEL1x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ldr s0, [pA]
+ add pA , pA, #4
+
+ fmadd s16, s0, s8, s16
+.endm
+
+.macro SAVE1x1
+ fmul s8, s16, alpha0
+ str s8, [pCRow0]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
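+// Driver: N is walked in blocks of 4 (then its remainder) and M in blocks of
+// 16, 8, 4, 2, 1; each tile recomputes tempK from the TRMM offset before
+// running its K loop.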
+
+ PROLOGUE
+
+strmm_kernel_begin:
+
+ .align 5
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+
+ fmov alpha0, s0
+ fmov alpha1, s0
+ fmov alpha2, s0
+ fmov alpha3, s0
+
+ lsl LDC, LDC, #2 // ldc = ldc * 4
+
+#if !defined(LEFT)
+ neg tempOffset, offset
+#endif
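+
+	// tempOffset carries the TRMM offset for the current tile; the per-tile
+	// code below uses it to size tempK and to advance pA/pB past the part of
+	// the packed panels belonging to the zero half of the triangular operand.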
+ mov pB, origPB
+
+ mov counterJ, origN
+ asr counterJ, counterJ, #2 // J = J / 4
+ cmp counterJ, #0
+ ble strmm_kernel_L2_BEGIN
+
+/******************************************************************************/
+
+strmm_kernel_L4_BEGIN:
+ mov pCRow0, pC // pCRow0 = C
+ add pC, pC, LDC, lsl #2
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = start of A array
+
+strmm_kernel_L4_M16_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #4 // counterI = counterI / 16
+ cmp counterI, #0
+ ble strmm_kernel_L4_M8_BEGIN
+
+strmm_kernel_L4_M16_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #6
+ add pA, pA, temp
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #16
+#else
+ add tempK, tempOffset, #4
+#endif
+
+ asr counterL , tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt strmm_kernel_L4_M16_32
+
+ KERNEL16x4_I // do one in the K
+ KERNEL16x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble strmm_kernel_L4_M16_22a
+ .align 5
+
+strmm_kernel_L4_M16_22:
+
+ KERNEL16x4_M1
+ KERNEL16x4_M2
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M16_22
+
+strmm_kernel_L4_M16_22a:
+
+ KERNEL16x4_M1
+ KERNEL16x4_E
+
+ b strmm_kernel_L4_M16_44
+
+strmm_kernel_L4_M16_32:
+
+ tst counterL, #1
+ ble strmm_kernel_L4_M16_40
+
+ KERNEL16x4_I
+ KERNEL16x4_E
+
+ b strmm_kernel_L4_M16_44
+
+strmm_kernel_L4_M16_40:
+
+ INIT16x4
+
+strmm_kernel_L4_M16_44:
+
+ ands counterL , tempK, #1
+ ble strmm_kernel_L4_M16_100
+
+strmm_kernel_L4_M16_46:
+
+ KERNEL16x4_SUB
+
+strmm_kernel_L4_M16_100:
+
+ SAVE16x4
+
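+	// post-store fixup: step pA/pB past the K entries this tile did not
+	// consume and, for the LEFT case, grow tempOffset by the 16 rows just
+	// finished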
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #16
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #6
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #16
+#endif
+
+strmm_kernel_L4_M16_END:
+ subs counterI, counterI, #1
+ bne strmm_kernel_L4_M16_20
+
+//------------------------------------------------------------------------------
+
+strmm_kernel_L4_M8_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #15
+ ble strmm_kernel_L4_END
+
+ tst counterI, #8
+ ble strmm_kernel_L4_M4_BEGIN
+
+strmm_kernel_L4_M8_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #4
+#endif
+
+ asr counterL , tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt strmm_kernel_L4_M8_32
+
+ KERNEL8x4_I // do one in the K
+ KERNEL8x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble strmm_kernel_L4_M8_22a
+ .align 5
+
+strmm_kernel_L4_M8_22:
+
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M8_22
+
+strmm_kernel_L4_M8_22a:
+
+ KERNEL8x4_M1
+ KERNEL8x4_E
+
+ b strmm_kernel_L4_M8_44
+
+strmm_kernel_L4_M8_32:
+
+ tst counterL, #1
+ ble strmm_kernel_L4_M8_40
+
+ KERNEL8x4_I
+ KERNEL8x4_E
+
+ b strmm_kernel_L4_M8_44
+
+strmm_kernel_L4_M8_40:
+
+ INIT8x4
+
+strmm_kernel_L4_M8_44:
+
+ ands counterL , tempK, #1
+ ble strmm_kernel_L4_M8_100
+
+strmm_kernel_L4_M8_46:
+
+ KERNEL8x4_SUB
+
+strmm_kernel_L4_M8_100:
+
+ SAVE8x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+strmm_kernel_L4_M8_END:
+
+//------------------------------------------------------------------------------
+
+strmm_kernel_L4_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble strmm_kernel_L4_END
+
+ tst counterI, #4
+ ble strmm_kernel_L4_M2_BEGIN
+
+strmm_kernel_L4_M4_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #4
+#endif
+ asr counterL , tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt strmm_kernel_L4_M4_32
+
+ KERNEL4x4_I // do one in the K
+ KERNEL4x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble strmm_kernel_L4_M4_22a
+ .align 5
+
+strmm_kernel_L4_M4_22:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M4_22
+
+strmm_kernel_L4_M4_22a:
+
+ KERNEL4x4_M1
+ KERNEL4x4_E
+
+ b strmm_kernel_L4_M4_44
+
+strmm_kernel_L4_M4_32:
+
+ tst counterL, #1
+ ble strmm_kernel_L4_M4_40
+
+ KERNEL4x4_I
+ KERNEL4x4_E
+
+ b strmm_kernel_L4_M4_44
+
+strmm_kernel_L4_M4_40:
+
+ INIT4x4
+
+strmm_kernel_L4_M4_44:
+
+ ands counterL , tempK, #1
+ ble strmm_kernel_L4_M4_100
+
+strmm_kernel_L4_M4_46:
+
+ KERNEL4x4_SUB
+
+strmm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+strmm_kernel_L4_M4_END:
+
+//------------------------------------------------------------------------------
+
+strmm_kernel_L4_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble strmm_kernel_L4_END
+
+	tst	counterI, #2				// is a block of 2 rows left? (M & 2)
+ ble strmm_kernel_L4_M1_BEGIN
+
+strmm_kernel_L4_M2_20:
+
+ INIT2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #4
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble strmm_kernel_L4_M2_40
+
+strmm_kernel_L4_M2_22:
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M2_22
+
+
+strmm_kernel_L4_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L4_M2_100
+
+strmm_kernel_L4_M2_42:
+
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M2_42
+
+strmm_kernel_L4_M2_100:
+
+ SAVE2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+strmm_kernel_L4_M2_END:
+
+
+strmm_kernel_L4_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble strmm_kernel_L4_END
+
+strmm_kernel_L4_M1_20:
+
+ INIT1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ lsl temp, tempOffset, #2
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #4
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble strmm_kernel_L4_M1_40
+
+strmm_kernel_L4_M1_22:
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M1_22
+
+
+strmm_kernel_L4_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L4_M1_100
+
+strmm_kernel_L4_M1_42:
+
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M1_42
+
+strmm_kernel_L4_M1_100:
+
+ SAVE1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #2
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+strmm_kernel_L4_END:
+ add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+ subs counterJ, counterJ , #1 // j--
+ bgt strmm_kernel_L4_BEGIN
+
+
+/******************************************************************************/
+
+strmm_kernel_L2_BEGIN: // less than 2 left in N direction
+
+ mov counterJ , origN
+ tst counterJ , #3
+ ble strmm_kernel_L999
+
+ tst counterJ , #2
+ ble strmm_kernel_L1_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #1
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = A
+
+strmm_kernel_L2_M16_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #4 // counterI = counterI / 16
+ cmp counterI,#0
+ ble strmm_kernel_L2_M8_BEGIN
+
+strmm_kernel_L2_M16_20:
+
+ INIT16x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #6
+ add pA, pA, temp
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #16
+#else
+ add tempK, tempOffset, #2
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble strmm_kernel_L2_M16_40
+ .align 5
+
+strmm_kernel_L2_M16_22:
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M16_22
+
+
+strmm_kernel_L2_M16_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L2_M16_100
+
+strmm_kernel_L2_M16_42:
+
+ KERNEL16x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M16_42
+
+strmm_kernel_L2_M16_100:
+
+ SAVE16x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #16
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #6
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #16
+#endif
+
+strmm_kernel_L2_M16_END:
+
+ subs counterI, counterI, #1
+ bgt strmm_kernel_L2_M16_20
+
+//------------------------------------------------------------------------------
+
+strmm_kernel_L2_M8_BEGIN:
+ mov counterI, origM
+ tst counterI , #15
+ ble strmm_kernel_L2_END
+
+ tst counterI, #8
+ ble strmm_kernel_L2_M4_BEGIN
+
+strmm_kernel_L2_M8_20:
+
+ INIT8x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #2
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble strmm_kernel_L2_M8_40
+ .align 5
+
+strmm_kernel_L2_M8_22:
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M8_22
+
+
+strmm_kernel_L2_M8_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L2_M8_100
+
+strmm_kernel_L2_M8_42:
+
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M8_42
+
+strmm_kernel_L2_M8_100:
+
+ SAVE8x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+strmm_kernel_L2_M8_END:
+
+//------------------------------------------------------------------------------
+
+strmm_kernel_L2_M4_BEGIN:
+ mov counterI, origM
+ tst counterI , #7
+ ble strmm_kernel_L2_END
+
+ tst counterI, #4
+ ble strmm_kernel_L2_M2_BEGIN
+
+strmm_kernel_L2_M4_20:
+
+ INIT4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #2
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble strmm_kernel_L2_M4_40
+ .align 5
+
+strmm_kernel_L2_M4_22:
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M4_22
+
+
+strmm_kernel_L2_M4_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L2_M4_100
+
+strmm_kernel_L2_M4_42:
+
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M4_42
+
+strmm_kernel_L2_M4_100:
+
+ SAVE4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+strmm_kernel_L2_M4_END:
+
+//------------------------------------------------------------------------------
+
+
+strmm_kernel_L2_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble strmm_kernel_L2_END
+
+	tst	counterI, #2				// is a block of 2 rows left? (M & 2)
+ ble strmm_kernel_L2_M1_BEGIN
+
+strmm_kernel_L2_M2_20:
+
+ INIT2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #2
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble strmm_kernel_L2_M2_40
+
+strmm_kernel_L2_M2_22:
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M2_22
+
+
+strmm_kernel_L2_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L2_M2_100
+
+strmm_kernel_L2_M2_42:
+
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M2_42
+
+strmm_kernel_L2_M2_100:
+
+ SAVE2x2
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+
+strmm_kernel_L2_M2_END:
+
+
+strmm_kernel_L2_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble strmm_kernel_L2_END
+
+strmm_kernel_L2_M1_20:
+
+ INIT1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #2
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #2
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL, #0
+ ble strmm_kernel_L2_M1_40
+
+strmm_kernel_L2_M1_22:
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M1_22
+
+
+strmm_kernel_L2_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L2_M1_100
+
+strmm_kernel_L2_M1_42:
+
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M1_42
+
+strmm_kernel_L2_M1_100:
+
+ SAVE1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #2
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+strmm_kernel_L2_END:
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+ add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
+
+/******************************************************************************/
+
+strmm_kernel_L1_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #1
+ ble strmm_kernel_L999 // done
+
+
+ mov pCRow0, pC // pCRow0 = C
+	add	pC , pC , LDC				// update pC to point to the next column of C
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = A
+
+strmm_kernel_L1_M16_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #4 // counterI = counterI / 16
+ cmp counterI, #0
+ ble strmm_kernel_L1_M8_BEGIN
+
+strmm_kernel_L1_M16_20:
+
+ INIT16x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #6
+ add pA, pA, temp
+ lsl temp, tempOffset, #2
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #16
+#else
+ add tempK, tempOffset, #1
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble strmm_kernel_L1_M16_40
+ .align 5
+
+strmm_kernel_L1_M16_22:
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M16_22
+
+
+strmm_kernel_L1_M16_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L1_M16_100
+
+strmm_kernel_L1_M16_42:
+
+ KERNEL16x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M16_42
+
+strmm_kernel_L1_M16_100:
+
+ SAVE16x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #16
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #6
+ add pA, pA, temp
+ lsl temp, tempK, #2
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #16
+#endif
+
+strmm_kernel_L1_M16_END:
+
+ subs counterI, counterI, #1
+ bgt strmm_kernel_L1_M16_20
+
+//------------------------------------------------------------------------------
+
+strmm_kernel_L1_M8_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #15
+ ble strmm_kernel_L1_END
+
+ tst counterI, #8
+ ble strmm_kernel_L1_M4_BEGIN
+
+strmm_kernel_L1_M8_20:
+
+ INIT8x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+ lsl temp, tempOffset, #2
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #1
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble strmm_kernel_L1_M8_40
+ .align 5
+
+strmm_kernel_L1_M8_22:
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M8_22
+
+
+strmm_kernel_L1_M8_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L1_M8_100
+
+strmm_kernel_L1_M8_42:
+
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M8_42
+
+strmm_kernel_L1_M8_100:
+
+ SAVE8x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #2
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+strmm_kernel_L1_M8_END:
+
+//------------------------------------------------------------------------------
+
+strmm_kernel_L1_M4_BEGIN:
+ mov counterI, origM
+ tst counterI , #7
+ ble strmm_kernel_L1_END
+
+ tst counterI, #4
+ ble strmm_kernel_L1_M2_BEGIN
+
+strmm_kernel_L1_M4_20:
+
+ INIT4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #2
+ add pB, pB, temp
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #1
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble strmm_kernel_L1_M4_40
+ .align 5
+
+strmm_kernel_L1_M4_22:
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M4_22
+
+
+strmm_kernel_L1_M4_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L1_M4_100
+
+strmm_kernel_L1_M4_42:
+
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M4_42
+
+strmm_kernel_L1_M4_100:
+
+ SAVE4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #2
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+strmm_kernel_L1_M4_END:
+
+//------------------------------------------------------------------------------
+
+strmm_kernel_L1_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble strmm_kernel_L1_END
+
+	tst	counterI, #2				// is a block of 2 rows left? (M & 2)
+ ble strmm_kernel_L1_M1_BEGIN
+
+strmm_kernel_L1_M2_20:
+
+ INIT2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #2
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #1
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble strmm_kernel_L1_M2_40
+
+strmm_kernel_L1_M2_22:
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M2_22
+
+
+strmm_kernel_L1_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L1_M2_100
+
+strmm_kernel_L1_M2_42:
+
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M2_42
+
+strmm_kernel_L1_M2_100:
+
+ SAVE2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #2
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+strmm_kernel_L1_M2_END:
+
+
+strmm_kernel_L1_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble strmm_kernel_L1_END
+
+strmm_kernel_L1_M1_20:
+
+ INIT1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #2
+ add pB, pB, temp
+ lsl temp, tempOffset, #2
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #1
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble strmm_kernel_L1_M1_40
+
+strmm_kernel_L1_M1_22:
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M1_22
+
+
+strmm_kernel_L1_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L1_M1_100
+
+strmm_kernel_L1_M1_42:
+
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M1_42
+
+strmm_kernel_L1_M1_100:
+
+ SAVE1x1
+
+strmm_kernel_L1_END:
+
+strmm_kernel_L999:
+ mov x0, #0 // set return value
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+ ret
+
+ EPILOGUE
+
diff --git a/kernel/arm64/strmm_kernel_8x8.S b/kernel/arm64/strmm_kernel_8x8.S
new file mode 100755
index 000000000..98b912934
--- /dev/null
+++ b/kernel/arm64/strmm_kernel_8x8.S
@@ -0,0 +1,2795 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* X0 X1 X2 s0 X3 x4 x5 x6 x7 */
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */
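
The prototype above is the C-level entry point for this kernel (the exported symbol name is produced by the `CNAME` macro from `common.h`). As a reading aid only, here is a minimal, hypothetical C sketch of a call site, assuming the prototype in the comment; the real call site lives in the level-3 TRMM driver and is not part of this patch, and the names `strmm_kernel_8x8` and `example_call` are placeholders.

```c
/* Hypothetical sketch only -- not part of the patch. Types and the symbol
 * name are assumptions; the real symbol is generated by the CNAME macro. */
typedef long  BLASLONG;
typedef float FLOAT;

int strmm_kernel_8x8(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
                     FLOAT *ba, FLOAT *bb, FLOAT *C, BLASLONG ldc,
                     BLASLONG offset);

static void example_call(FLOAT *packedA, FLOAT *packedB, FLOAT *C, BLASLONG ldc)
{
    /* Placeholder tile sizes: an 8x8 block of C with K = 256, alpha = 1, offset = 0. */
    strmm_kernel_8x8(8, 8, 256, 1.0f, packedA, packedB, C, ldc, 0);
}
```
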
+
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define offset x7
+#define counterL x8
+#define counterI x9
+#define counterJ x10
+#define pB x11
+#define pCRow0 x12
+#define pCRow1 x13
+#define pCRow2 x14
+#define pA x15
+#define temp x16
+#define tempOffset x17
+#define tempK x18
+
+#define alpha0 s10
+#define alphaV0 v10.s[0]
+#define alpha1 s11
+#define alphaV1 v11.s[0]
+#define alpha2 s14
+#define alphaV2 v14.s[0]
+#define alpha3 s15
+#define alphaV3 v15.s[0]
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 offset
+// 08 counterL
+// 09 counterI
+// 10 counterJ
+// 11 pB
+// 12 pCRow0
+// 13 pCRow1
+// 14 pCRow2
+// 15 pA
+// 16 temp
+// 17 tempOffset
+// 18 must save tempK
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 ALPHA -> pA0_0, pA0_1, pA0_2, pA0_3
+//v01 pA0_4, pA0_5, pA0_6, pA0_7
+//v02 pA1_0, pA1_1, pA1_2, pA1_3
+//v03 pA1_4, pA1_5, pA1_6, pA1_7
+//v04 pB0_0, pB0_1, pB0_2, pB0_3
+//v05 pB0_4, pB0_5, pB0_6, pB0_7
+//v06 pB1_0, pB1_1, pB1_2, pB1_3
+//v07 pB1_4, pB1_5, pB1_6, pB1_7
+//v08 must save
+//v09 must save
+//v10 must save ALPHA0
+//v11 must save ALPHA1
+//v12 must save
+//v13 must save
+//v14 must save ALPHA2
+//v15 must save ALPHA3
+//v16 must save C00, C01, C02, C03
+//v17 must save C04, C05, C06, C07
+//v18 C08, C09, C10, C11
+//v19 C12, C13, C14, C15
+//v20 C16, C17, C18, C19
+//v21 C20, C21, C22, C23
+//v22 C24, C25, C26, C27
+//v23 C28, C29, C30, C31
+//v24 C32, C33, C34, C35
+//v25 C36, C37, C38, C39
+//v26 C40, C41, C42, C43
+//v27 C44, C45, C46, C47
+//v28 C48, C49, C50, C51
+//v29 C52, C53, C54, C55
+//v30 C56, C57, C58, C59
+//v31 C60, C61, C62, C63
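
The register map above keeps the entire 8x8 accumulator block of C in v16..v31 (two 4-lane vectors per column of the block), and each K step of the `KERNEL8x8_*` macros performs one rank-1 update of that block from eight packed A values and eight packed B values; `SAVE8x8` then scales by alpha and stores the block (these TRMM kernels store rather than accumulate into C). The following scalar reference model is a hedged sketch of what those macros compute, not OpenBLAS API.

```c
/* Hedged reference model of the 8x8 micro-kernel: one rank-1 update of an
 * 8x8 accumulator per K step, then an alpha-scaled store into C.
 * Function and parameter names are illustrative, not OpenBLAS API. */
void strmm_8x8_reference(long K, float alpha,
                         const float *A,   /* packed A: 8 floats per K step */
                         const float *B,   /* packed B: 8 floats per K step */
                         float *C, long ldc)
{
    float acc[8][8] = {{0.0f}};             /* acc[j][i] ~ lanes of v16..v31 */

    for (long k = 0; k < K; k++)
        for (long j = 0; j < 8; j++)        /* B element selects the column of C */
            for (long i = 0; i < 8; i++)    /* A element selects the row of C    */
                acc[j][i] += A[8 * k + i] * B[8 * k + j];

    for (long j = 0; j < 8; j++)            /* SAVE8x8: scale by alpha, store */
        for (long i = 0; i < 8; i++)
            C[j * ldc + i] = alpha * acc[j][i];
}
```
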
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
+.macro INIT8x8
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, s16
+ fmov s19, s17
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s22, s17
+ fmov s23, s18
+ fmov s24, wzr
+ fmov s25, s16
+ fmov s26, s17
+ fmov s27, s18
+ fmov s28, wzr
+ fmov s29, s16
+ fmov s30, s17
+ fmov s31, s18
+.endm
+
+.macro KERNEL8x8_I
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.4s, v0.4s, v4.4s[0]
+ fmul v17.4s, v1.4s, v4.4s[0]
+ fmul v18.4s, v0.4s, v4.4s[1]
+ fmul v19.4s, v1.4s, v4.4s[1]
+ fmul v20.4s, v0.4s, v4.4s[2]
+ fmul v21.4s, v1.4s, v4.4s[2]
+ fmul v22.4s, v0.4s, v4.4s[3]
+ fmul v23.4s, v1.4s, v4.4s[3]
+ fmul v24.4s, v0.4s, v5.4s[0]
+ fmul v25.4s, v1.4s, v5.4s[0]
+ fmul v26.4s, v0.4s, v5.4s[1]
+ fmul v27.4s, v1.4s, v5.4s[1]
+ fmul v28.4s, v0.4s, v5.4s[2]
+ fmul v29.4s, v1.4s, v5.4s[2]
+ fmul v30.4s, v0.4s, v5.4s[3]
+ fmul v31.4s, v1.4s, v5.4s[3]
+
+ ld1 {v6.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v7.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x8_M1
+ fmla v16.4s, v0.4s, v4.4s[0]
+ fmla v17.4s, v1.4s, v4.4s[0]
+ fmla v18.4s, v0.4s, v4.4s[1]
+ fmla v19.4s, v1.4s, v4.4s[1]
+ fmla v20.4s, v0.4s, v4.4s[2]
+ fmla v21.4s, v1.4s, v4.4s[2]
+ fmla v22.4s, v0.4s, v4.4s[3]
+ fmla v23.4s, v1.4s, v4.4s[3]
+ fmla v24.4s, v0.4s, v5.4s[0]
+ fmla v25.4s, v1.4s, v5.4s[0]
+ fmla v26.4s, v0.4s, v5.4s[1]
+ fmla v27.4s, v1.4s, v5.4s[1]
+ fmla v28.4s, v0.4s, v5.4s[2]
+ fmla v29.4s, v1.4s, v5.4s[2]
+ fmla v30.4s, v0.4s, v5.4s[3]
+ fmla v31.4s, v1.4s, v5.4s[3]
+
+ ld1 {v6.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v7.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x8_M2
+ fmla v16.4s, v2.4s, v6.4s[0]
+ fmla v17.4s, v3.4s, v6.4s[0]
+ fmla v18.4s, v2.4s, v6.4s[1]
+ fmla v19.4s, v3.4s, v6.4s[1]
+ fmla v20.4s, v2.4s, v6.4s[2]
+ fmla v21.4s, v3.4s, v6.4s[2]
+ fmla v22.4s, v2.4s, v6.4s[3]
+ fmla v23.4s, v3.4s, v6.4s[3]
+ fmla v24.4s, v2.4s, v7.4s[0]
+ fmla v25.4s, v3.4s, v7.4s[0]
+ fmla v26.4s, v2.4s, v7.4s[1]
+ fmla v27.4s, v3.4s, v7.4s[1]
+ fmla v28.4s, v2.4s, v7.4s[2]
+ fmla v29.4s, v3.4s, v7.4s[2]
+ fmla v30.4s, v2.4s, v7.4s[3]
+ fmla v31.4s, v3.4s, v7.4s[3]
+
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x8_E
+ fmla v16.4s, v2.4s, v6.4s[0]
+ fmla v17.4s, v3.4s, v6.4s[0]
+ fmla v18.4s, v2.4s, v6.4s[1]
+ fmla v19.4s, v3.4s, v6.4s[1]
+ fmla v20.4s, v2.4s, v6.4s[2]
+ fmla v21.4s, v3.4s, v6.4s[2]
+ fmla v22.4s, v2.4s, v6.4s[3]
+ fmla v23.4s, v3.4s, v6.4s[3]
+ fmla v24.4s, v2.4s, v7.4s[0]
+ fmla v25.4s, v3.4s, v7.4s[0]
+ fmla v26.4s, v2.4s, v7.4s[1]
+ fmla v27.4s, v3.4s, v7.4s[1]
+ fmla v28.4s, v2.4s, v7.4s[2]
+ fmla v29.4s, v3.4s, v7.4s[2]
+ fmla v30.4s, v2.4s, v7.4s[3]
+ fmla v31.4s, v3.4s, v7.4s[3]
+.endm
+
+.macro KERNEL8x8_SUB
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v4.4s[0]
+ fmla v17.4s, v1.4s, v4.4s[0]
+ fmla v18.4s, v0.4s, v4.4s[1]
+ fmla v19.4s, v1.4s, v4.4s[1]
+ fmla v20.4s, v0.4s, v4.4s[2]
+ fmla v21.4s, v1.4s, v4.4s[2]
+ fmla v22.4s, v0.4s, v4.4s[3]
+ fmla v23.4s, v1.4s, v4.4s[3]
+ fmla v24.4s, v0.4s, v5.4s[0]
+ fmla v25.4s, v1.4s, v5.4s[0]
+ fmla v26.4s, v0.4s, v5.4s[1]
+ fmla v27.4s, v1.4s, v5.4s[1]
+ fmla v28.4s, v0.4s, v5.4s[2]
+ fmla v29.4s, v1.4s, v5.4s[2]
+ fmla v30.4s, v0.4s, v5.4s[3]
+ fmla v31.4s, v1.4s, v5.4s[3]
+.endm
+
+.macro SAVE8x8
+ add pCRow1, pCRow0, LDC
+
+ fmul v0.4s, v16.4s, alphaV0
+ fmul v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v2.4s, v18.4s, alphaV2
+ fmul v3.4s, v19.4s, alphaV3
+ st1 {v2.4s, v3.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v4.4s, v20.4s, alphaV0
+ fmul v5.4s, v21.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v6.4s, v22.4s, alphaV2
+ fmul v7.4s, v23.4s, alphaV3
+ st1 {v6.4s, v7.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v0.4s, v24.4s, alphaV0
+ fmul v1.4s, v25.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v2.4s, v26.4s, alphaV2
+ fmul v3.4s, v27.4s, alphaV3
+ st1 {v2.4s, v3.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v4.4s, v28.4s, alphaV0
+ fmul v5.4s, v29.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow2]
+
+ fmul v6.4s, v30.4s, alphaV2
+ fmul v7.4s, v31.4s, alphaV3
+ st1 {v6.4s, v7.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+
+.macro INIT4x8
+ fmov s16, wzr
+ fmov s18, wzr
+ fmov s20, wzr
+ fmov s22, s16
+ fmov s24, wzr
+ fmov s26, s16
+ fmov s28, s18
+ fmov s30, s20
+.endm
+
+.macro KERNEL4x8_I
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.4s, v0.4s, v4.4s[0]
+ fmul v18.4s, v0.4s, v4.4s[1]
+ fmul v20.4s, v0.4s, v4.4s[2]
+ fmul v22.4s, v0.4s, v4.4s[3]
+ fmul v24.4s, v0.4s, v5.4s[0]
+ fmul v26.4s, v0.4s, v5.4s[1]
+ fmul v28.4s, v0.4s, v5.4s[2]
+ fmul v30.4s, v0.4s, v5.4s[3]
+
+ ld1 {v6.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v7.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL4x8_M1
+ fmla v16.4s, v0.4s, v4.4s[0]
+ fmla v18.4s, v0.4s, v4.4s[1]
+ fmla v20.4s, v0.4s, v4.4s[2]
+ fmla v22.4s, v0.4s, v4.4s[3]
+ fmla v24.4s, v0.4s, v5.4s[0]
+ fmla v26.4s, v0.4s, v5.4s[1]
+ fmla v28.4s, v0.4s, v5.4s[2]
+ fmla v30.4s, v0.4s, v5.4s[3]
+
+ ld1 {v6.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v7.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL4x8_M2
+ fmla v16.4s, v2.4s, v6.4s[0]
+ fmla v18.4s, v2.4s, v6.4s[1]
+ fmla v20.4s, v2.4s, v6.4s[2]
+ fmla v22.4s, v2.4s, v6.4s[3]
+ fmla v24.4s, v2.4s, v7.4s[0]
+ fmla v26.4s, v2.4s, v7.4s[1]
+ fmla v28.4s, v2.4s, v7.4s[2]
+ fmla v30.4s, v2.4s, v7.4s[3]
+
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL4x8_E
+ fmla v16.4s, v2.4s, v6.4s[0]
+ fmla v18.4s, v2.4s, v6.4s[1]
+ fmla v20.4s, v2.4s, v6.4s[2]
+ fmla v22.4s, v2.4s, v6.4s[3]
+ fmla v24.4s, v2.4s, v7.4s[0]
+ fmla v26.4s, v2.4s, v7.4s[1]
+ fmla v28.4s, v2.4s, v7.4s[2]
+ fmla v30.4s, v2.4s, v7.4s[3]
+.endm
+
+.macro KERNEL4x8_SUB
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v4.4s[0]
+ fmla v18.4s, v0.4s, v4.4s[1]
+ fmla v20.4s, v0.4s, v4.4s[2]
+ fmla v22.4s, v0.4s, v4.4s[3]
+ fmla v24.4s, v0.4s, v5.4s[0]
+ fmla v26.4s, v0.4s, v5.4s[1]
+ fmla v28.4s, v0.4s, v5.4s[2]
+ fmla v30.4s, v0.4s, v5.4s[3]
+.endm
+
+.macro SAVE4x8
+ add pCRow1, pCRow0, LDC
+
+
+ fmul v0.4s, v16.4s, alphaV0
+ st1 {v0.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+
+ fmul v2.4s, v18.4s, alphaV2
+ st1 {v2.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+
+ fmul v4.4s, v20.4s, alphaV0
+ st1 {v4.4s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+
+ fmul v6.4s, v22.4s, alphaV2
+ st1 {v6.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+
+ fmul v0.4s, v24.4s, alphaV0
+ st1 {v0.4s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+
+ fmul v2.4s, v26.4s, alphaV2
+ st1 {v2.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+
+ fmul v4.4s, v28.4s, alphaV0
+ st1 {v4.4s}, [pCRow2]
+
+
+ fmul v6.4s, v30.4s, alphaV2
+ st1 {v6.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x8
+ fmov s16, wzr
+ fmov s18, wzr
+ fmov s20, wzr
+ fmov s22, s16
+ fmov s24, wzr
+ fmov s26, s16
+ fmov s28, s18
+ fmov s30, s20
+.endm
+
+.macro KERNEL2x8_SUB
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s}, [pA]
+ add pA, pA, #8
+
+ fmla v16.2s, v0.2s, v4.4s[0]
+ fmla v18.2s, v0.2s, v4.4s[1]
+ fmla v20.2s, v0.2s, v4.4s[2]
+ fmla v22.2s, v0.2s, v4.4s[3]
+ fmla v24.2s, v0.2s, v5.4s[0]
+ fmla v26.2s, v0.2s, v5.4s[1]
+ fmla v28.2s, v0.2s, v5.4s[2]
+ fmla v30.2s, v0.2s, v5.4s[3]
+.endm
+
+.macro SAVE2x8
+ add pCRow1, pCRow0, LDC
+
+
+ fmul v0.2s, v16.2s, alphaV0
+ st1 {v0.2s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+
+ fmul v2.2s, v18.2s, alphaV2
+ st1 {v2.2s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+
+ fmul v4.2s, v20.2s, alphaV0
+ st1 {v4.2s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+
+ fmul v6.2s, v22.2s, alphaV2
+ st1 {v6.2s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+
+ fmul v0.2s, v24.2s, alphaV0
+ st1 {v0.2s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+
+ fmul v2.2s, v26.2s, alphaV2
+ st1 {v2.2s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+
+ fmul v4.2s, v28.2s, alphaV0
+ st1 {v4.2s}, [pCRow2]
+
+
+ fmul v6.2s, v30.2s, alphaV2
+ st1 {v6.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x8
+ fmov s16, wzr
+ fmov s18, wzr
+ fmov s20, wzr
+ fmov s22, s16
+ fmov s24, wzr
+ fmov s26, s16
+ fmov s28, s18
+ fmov s30, s20
+.endm
+
+.macro KERNEL1x8_SUB
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ldr s0, [pA]
+ add pA, pA, #4
+
+ fmla s16, s0, v4.4s[0]
+ fmla s18, s0, v4.4s[1]
+ fmla s20, s0, v4.4s[2]
+ fmla s22, s0, v4.4s[3]
+ fmla s24, s0, v5.4s[0]
+ fmla s26, s0, v5.4s[1]
+ fmla s28, s0, v5.4s[2]
+ fmla s30, s0, v5.4s[3]
+.endm
+
+.macro SAVE1x8
+ add pCRow1, pCRow0, LDC
+
+
+ fmul s0, s16, alphaV0
+ str s0, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+
+ fmul s2, s18, alphaV2
+ str s2, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+
+ fmul s4, s20, alphaV0
+ str s4, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+
+ fmul s6, s22, alphaV2
+ str s6, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+
+ fmul s0, s24, alphaV0
+ str s0, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+
+ fmul s2, s26, alphaV2
+ str s2, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+
+ fmul s4, s28, alphaV0
+ str s4, [pCRow2]
+
+
+ fmul s6, s30, alphaV2
+ str s6, [pCRow1]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s24, wzr
+ fmov s25, s16
+ fmov s28, wzr
+ fmov s29, s16
+.endm
+
+.macro KERNEL8x4_I
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.4s, v0.4s, v8.2s[0]
+ fmul v17.4s, v1.4s, v8.2s[0]
+ fmul v20.4s, v0.4s, v8.2s[1]
+ fmul v21.4s, v1.4s, v8.2s[1]
+ fmul v24.4s, v0.4s, v9.2s[0]
+ fmul v25.4s, v1.4s, v9.2s[0]
+ fmul v28.4s, v0.4s, v9.2s[1]
+ fmul v29.4s, v1.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_M1
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_M2
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_E
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+.endm
+
+.macro KERNEL8x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+.endm
+
+.macro SAVE8x4
+ add pCRow1, pCRow0, LDC
+
+
+ fmul v0.4s, v16.4s, alphaV0
+ fmul v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+
+ fmul v4.4s, v20.4s, alphaV0
+ fmul v5.4s, v21.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+
+ fmul v0.4s, v24.4s, alphaV0
+ fmul v1.4s, v25.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow2]
+
+
+ fmul v4.4s, v28.4s, alphaV0
+ fmul v5.4s, v29.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+
+.macro INIT4x4
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+ fmov s24, s17
+ fmov s25, s16
+ fmov s28, s17
+ fmov s29, s16
+.endm
+
+.macro KERNEL4x4_I
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.2s, v0.2s, v8.2s[0]
+ fmul v29.2s, v1.2s, v9.2s[1]
+
+ fmul v20.2s, v0.2s, v8.2s[1]
+ fmul v25.2s, v1.2s, v9.2s[0]
+
+ fmul v24.2s, v0.2s, v9.2s[0]
+ fmul v21.2s, v1.2s, v8.2s[1]
+
+ fmul v28.2s, v0.2s, v9.2s[1]
+ fmul v17.2s, v1.2s, v8.2s[0]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.2s, v5.2s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL4x4_M1
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v29.2s, v1.2s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB] // For next round
+ add pB, pB, #16
+
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v25.2s, v1.2s, v9.2s[0]
+
+ ld1 {v4.2s, v5.2s}, [pA] // For next round
+ add pA, pA, #16
+
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v21.2s, v1.2s, v8.2s[1]
+
+ prfm PLDL1KEEP, [pB, #512]
+
+ fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro KERNEL4x4_M2
+ fmla v16.2s, v4.2s, v12.2s[0]
+ fmla v29.2s, v5.2s, v13.2s[1]
+
+ ld1 {v8.2s, v9.2s}, [pB] // For next round
+ add pB, pB, #16
+
+ fmla v20.2s, v4.2s, v12.2s[1]
+ fmla v25.2s, v5.2s, v13.2s[0]
+
+ ld1 {v0.2s, v1.2s}, [pA] // For next round
+ add pA, pA, #16
+
+ fmla v24.2s, v4.2s, v13.2s[0]
+ fmla v21.2s, v5.2s, v12.2s[1]
+
+ prfm PLDL1KEEP, [pA, #512]
+
+ fmla v28.2s, v4.2s, v13.2s[1]
+ fmla v17.2s, v5.2s, v12.2s[0]
+.endm
+
+.macro KERNEL4x4_E
+ fmla v16.2s, v4.2s, v12.2s[0]
+ fmla v29.2s, v5.2s, v13.2s[1]
+
+ fmla v20.2s, v4.2s, v12.2s[1]
+ fmla v25.2s, v5.2s, v13.2s[0]
+
+ fmla v24.2s, v4.2s, v13.2s[0]
+ fmla v21.2s, v5.2s, v12.2s[1]
+
+ fmla v28.2s, v4.2s, v13.2s[1]
+ fmla v17.2s, v5.2s, v12.2s[0]
+.endm
+
+.macro KERNEL4x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v29.2s, v1.2s, v9.2s[1]
+
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v25.2s, v1.2s, v9.2s[0]
+
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v21.2s, v1.2s, v8.2s[1]
+
+ fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro SAVE4x4
+
+ fmul v8.2s, v16.2s, alphaV0
+ fmul v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ fmul v12.2s, v20.2s, alphaV2
+ fmul v13.2s, v21.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v8.2s, v24.2s, alphaV0
+ fmul v9.2s, v25.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2s, v28.2s, alphaV2
+ fmul v13.2s, v29.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x4
+ fmov s16, wzr
+ fmov s20, s16
+ fmov s24, s20
+ fmov s28, s16
+.endm
+
+.macro KERNEL2x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s}, [pA]
+ add pA, pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v28.2s, v0.2s, v9.2s[1]
+.endm
+
+.macro SAVE2x4
+
+ fmul v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ fmul v12.2s, v20.2s, alphaV1
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v8.2s, v24.2s, alphaV2
+ st1 {v8.2s}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2s, v28.2s, alphaV3
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x4
+ fmov s16, wzr
+ fmov s20, s16
+.endm
+
+.macro KERNEL1x4_SUB
+ ldr s0, [pA]
+ add pA, pA, #4
+
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+
+ fmla v16.2s, v8.2s, v0.s[0]
+ fmla v20.2s, v9.2s, v0.s[0]
+.endm
+
+.macro SAVE1x4
+ add pCRow1, pCRow0, LDC
+
+
+ fmul v8.2s, v16.2s, alphaV0
+ st1 {v8.s}[0], [pCRow0]
+ st1 {v8.s}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+
+ fmul v12.2s, v20.2s, alphaV1
+ st1 {v12.s}[0], [pCRow2]
+ st1 {v12.s}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x2
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+.endm
+
+.macro KERNEL8x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+.endm
+
+.macro SAVE8x2
+ add pCRow1, pCRow0, LDC
+
+
+ fmul v0.4s, v16.4s, alphaV0
+ fmul v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+
+ fmul v4.4s, v20.4s, alphaV0
+ fmul v5.4s, v21.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x2
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+.endm
+
+.macro KERNEL4x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v21.2s, v1.2s, v8.2s[1]
+.endm
+
+.macro SAVE4x2
+
+ fmul v8.2s, v16.2s, alphaV0
+ fmul v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ fmul v12.2s, v20.2s, alphaV2
+ fmul v13.2s, v21.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x2
+ fmov s16, wzr
+ fmov s20, s16
+.endm
+
+.macro KERNEL2x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+
+ ld1 {v0.2s}, [pA]
+ add pA, pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+.endm
+
+.macro SAVE2x2
+
+ fmul v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow1 , pCRow0, LDC
+
+ fmul v12.2s, v20.2s, alphaV1
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+ fmov s16, wzr
+.endm
+
+.macro KERNEL1x2_SUB
+ ld1 {v8.2s} , [pB]
+ add pB , pB, #8
+
+ ldr s0 , [pA]
+ add pA, pA, #4
+
+ fmla v16.2s, v8.2s, v0.2s[0]
+.endm
+
+.macro SAVE1x2
+ add pCRow1 , pCRow0, LDC
+
+
+ fmul v8.2s, v16.2s, alphaV0
+ st1 {v8.s}[0], [pCRow0]
+ st1 {v8.s}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x1
+ fmov s16, wzr
+ fmov s17, wzr
+.endm
+
+.macro KERNEL8x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+.endm
+
+.macro SAVE8x1
+
+ fmul v0.4s, v16.4s, alphaV0
+ fmul v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x1
+ fmov s16, wzr
+ fmov s17, s16
+.endm
+
+.macro KERNEL4x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA , pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro SAVE4x1
+
+ fmul v8.2s, v16.2s, alphaV0
+ fmul v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x1
+ fmov s16, wzr
+.endm
+
+.macro KERNEL2x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.2s}, [pA]
+ add pA , pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+.endm
+
+.macro SAVE2x1
+
+ fmul v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+ fmov s16, wzr
+.endm
+
+.macro KERNEL1x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ldr s0, [pA]
+ add pA , pA, #4
+
+ fmadd s16, s0, s8, s16
+.endm
+
+.macro SAVE1x1
+
+ fmul s8, s16, alpha0
+ str s8, [pCRow0]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
+ PROLOGUE
+
+strmm_kernel_begin:
+
+ .align 5
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+
+ fmov alpha0, s0
+ fmov alpha1, s0
+ fmov alpha2, s0
+ fmov alpha3, s0
+
+ lsl LDC, LDC, #2 // ldc = ldc * 4
+
+#if !defined(LEFT)
+ neg tempOffset, offset
+#endif
+ mov pB, origPB
+
+ mov counterJ, origN
+ asr counterJ, counterJ, #3 // J = J / 8
+ cmp counterJ, #0
+ ble strmm_kernel_L4_BEGIN
+
+/******************************************************************************/
+/******************************************************************************/
+
+strmm_kernel_L8_BEGIN:
+ mov pCRow0, pC // pCRow0 = C
+ add pC, pC, LDC, lsl #3
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+
+ mov pA, origPA // pA = start of A array
+
+/******************************************************************************/
+
+strmm_kernel_L8_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble strmm_kernel_L8_M4_BEGIN
+
+strmm_kernel_L8_M8_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #8
+#endif
+
+ asr counterL , tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt strmm_kernel_L8_M8_32
+
+ KERNEL8x8_I // do one in the K
+ KERNEL8x8_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble strmm_kernel_L8_M8_22a
+ .align 5
+
+strmm_kernel_L8_M8_22:
+
+ KERNEL8x8_M1
+ KERNEL8x8_M2
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L8_M8_22
+
+strmm_kernel_L8_M8_22a:
+
+ KERNEL8x8_M1
+ KERNEL8x8_E
+
+ b strmm_kernel_L8_M8_44
+
+strmm_kernel_L8_M8_32:
+
+ tst counterL, #1
+ ble strmm_kernel_L8_M8_40
+
+ KERNEL8x8_I
+ KERNEL8x8_E
+
+ b strmm_kernel_L8_M8_44
+
+strmm_kernel_L8_M8_40:
+
+ INIT8x8
+
+strmm_kernel_L8_M8_44:
+
+ ands counterL , tempK, #1
+ ble strmm_kernel_L8_M8_100
+
+strmm_kernel_L8_M8_46:
+
+ KERNEL8x8_SUB
+
+strmm_kernel_L8_M8_100:
+
+ SAVE8x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #8
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+strmm_kernel_L8_M8_END:
+ subs counterI, counterI, #1
+ bne strmm_kernel_L8_M8_20
+
+/******************************************************************************/
+
+strmm_kernel_L8_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble strmm_kernel_L8_END
+
+ tst counterI, #4
+ ble strmm_kernel_L8_M2_BEGIN
+
+strmm_kernel_L8_M4_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #8
+#endif
+
+ asr counterL , tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt strmm_kernel_L8_M4_32
+
+ KERNEL4x8_I // do one in the K
+ KERNEL4x8_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble strmm_kernel_L8_M4_22a
+ .align 5
+
+strmm_kernel_L8_M4_22:
+
+ KERNEL4x8_M1
+ KERNEL4x8_M2
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L8_M4_22
+
+strmm_kernel_L8_M4_22a:
+
+ KERNEL4x8_M1
+ KERNEL4x8_E
+
+ b strmm_kernel_L8_M4_44
+
+strmm_kernel_L8_M4_32:
+
+ tst counterL, #1
+ ble strmm_kernel_L8_M4_40
+
+ KERNEL4x8_I
+ KERNEL4x8_E
+
+ b strmm_kernel_L8_M4_44
+
+strmm_kernel_L8_M4_40:
+
+ INIT4x8
+
+strmm_kernel_L8_M4_44:
+
+ ands counterL , tempK, #1
+ ble strmm_kernel_L8_M4_100
+
+strmm_kernel_L8_M4_46:
+
+ KERNEL4x8_SUB
+
+strmm_kernel_L8_M4_100:
+
+ SAVE4x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #8
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+strmm_kernel_L8_M4_END:
+
+/******************************************************************************/
+
+strmm_kernel_L8_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble strmm_kernel_L8_END
+
+	tst	counterI, #2				// is a block of 2 rows left? (M & 2)
+ ble strmm_kernel_L8_M1_BEGIN
+
+strmm_kernel_L8_M2_20:
+
+ INIT2x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #8
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble strmm_kernel_L8_M2_40
+
+strmm_kernel_L8_M2_22:
+
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L8_M2_22
+
+
+strmm_kernel_L8_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L8_M2_100
+
+strmm_kernel_L8_M2_42:
+
+ KERNEL2x8_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L8_M2_42
+
+strmm_kernel_L8_M2_100:
+
+ SAVE2x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #8
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+
+strmm_kernel_L8_M2_END:
+
+/******************************************************************************/
+
+strmm_kernel_L8_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble strmm_kernel_L8_END
+
+strmm_kernel_L8_M1_20:
+
+ INIT1x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #2
+ add pA, pA, temp
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #8
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble strmm_kernel_L8_M1_40
+
+strmm_kernel_L8_M1_22:
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L8_M1_22
+
+
+strmm_kernel_L8_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L8_M1_100
+
+strmm_kernel_L8_M1_42:
+
+ KERNEL1x8_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L8_M1_42
+
+strmm_kernel_L8_M1_100:
+
+ SAVE1x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #8
+#endif
+ lsl temp, tempK, #2
+ add pA, pA, temp
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+
+strmm_kernel_L8_END:
+ lsl temp, origK, #5 // B = B + K * 4 * 8
+ add origPB, origPB, temp
+
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+ subs counterJ, counterJ , #1 // j--
+ bgt strmm_kernel_L8_BEGIN
+
+/******************************************************************************/
+/******************************************************************************/
+
+strmm_kernel_L4_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #7
+ ble strmm_kernel_L999
+
+ tst counterJ , #4
+ ble strmm_kernel_L2_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #2
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+
+ mov pA, origPA // pA = A
+
+/******************************************************************************/
+
+strmm_kernel_L4_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble strmm_kernel_L4_M4_BEGIN
+
+strmm_kernel_L4_M8_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #4
+#endif
+
+ asr counterL , tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt strmm_kernel_L4_M8_32
+
+ KERNEL8x4_I // do one in the K
+ KERNEL8x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble strmm_kernel_L4_M8_22a
+ .align 5
+
+strmm_kernel_L4_M8_22:
+
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M8_22
+
+strmm_kernel_L4_M8_22a:
+
+ KERNEL8x4_M1
+ KERNEL8x4_E
+
+ b strmm_kernel_L4_M8_44
+
+strmm_kernel_L4_M8_32:
+
+ tst counterL, #1
+ ble strmm_kernel_L4_M8_40
+
+ KERNEL8x4_I
+ KERNEL8x4_E
+
+ b strmm_kernel_L4_M8_44
+
+strmm_kernel_L4_M8_40:
+
+ INIT8x4
+
+strmm_kernel_L4_M8_44:
+
+ ands counterL , tempK, #1
+ ble strmm_kernel_L4_M8_100
+
+strmm_kernel_L4_M8_46:
+
+ KERNEL8x4_SUB
+
+strmm_kernel_L4_M8_100:
+
+ SAVE8x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+strmm_kernel_L4_M8_END:
+ subs counterI, counterI, #1
+ bne strmm_kernel_L4_M8_20
+
+/******************************************************************************/
+
+strmm_kernel_L4_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble strmm_kernel_L4_END
+
+ tst counterI, #4
+ ble strmm_kernel_L4_M2_BEGIN
+
+strmm_kernel_L4_M4_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #4
+#endif
+ asr counterL , tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt strmm_kernel_L4_M4_32
+
+	KERNEL4x4_I					// first K iteration: initializes the accumulators
+	KERNEL4x4_M2					// second K iteration
+
+ subs counterL, counterL, #2
+ ble strmm_kernel_L4_M4_22a
+ .align 5
+
+strmm_kernel_L4_M4_22:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M4_22
+
+strmm_kernel_L4_M4_22a:
+
+ KERNEL4x4_M1
+ KERNEL4x4_E
+
+ b strmm_kernel_L4_M4_44
+
+strmm_kernel_L4_M4_32:
+
+ tst counterL, #1
+ ble strmm_kernel_L4_M4_40
+
+ KERNEL4x4_I
+ KERNEL4x4_E
+
+ b strmm_kernel_L4_M4_44
+
+strmm_kernel_L4_M4_40:
+
+ INIT4x4
+
+strmm_kernel_L4_M4_44:
+
+ ands counterL , tempK, #1
+ ble strmm_kernel_L4_M4_100
+
+strmm_kernel_L4_M4_46:
+
+ KERNEL4x4_SUB
+
+strmm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+strmm_kernel_L4_M4_END:
+
+/******************************************************************************/
+
+strmm_kernel_L4_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble strmm_kernel_L4_END
+
+	tst	counterI, #2				// process 2 rows only if (M & 2) != 0
+ ble strmm_kernel_L4_M1_BEGIN
+
+strmm_kernel_L4_M2_20:
+
+ INIT2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #4
+#endif
+	asr	counterL , tempK, #3		// counterL = tempK / 8
+ cmp counterL , #0
+ ble strmm_kernel_L4_M2_40
+
+strmm_kernel_L4_M2_22:
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M2_22
+
+
+strmm_kernel_L4_M2_40:
+
+	ands	counterL , tempK, #7		// counterL = tempK % 8
+ ble strmm_kernel_L4_M2_100
+
+strmm_kernel_L4_M2_42:
+
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M2_42
+
+strmm_kernel_L4_M2_100:
+
+ SAVE2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+strmm_kernel_L4_M2_END:
+
+/******************************************************************************/
+
+strmm_kernel_L4_M1_BEGIN:
+
+	tst	counterI, #1				// process the last row only if (M & 1) != 0
+ ble strmm_kernel_L4_END
+
+strmm_kernel_L4_M1_20:
+
+ INIT1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ lsl temp, tempOffset, #2
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #4
+#endif
+	asr	counterL , tempK, #3		// counterL = tempK / 8
+ cmp counterL , #0
+ ble strmm_kernel_L4_M1_40
+
+strmm_kernel_L4_M1_22:
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M1_22
+
+
+strmm_kernel_L4_M1_40:
+
+	ands	counterL , tempK, #7		// counterL = tempK % 8
+ ble strmm_kernel_L4_M1_100
+
+strmm_kernel_L4_M1_42:
+
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M1_42
+
+strmm_kernel_L4_M1_100:
+
+ SAVE1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #2
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+strmm_kernel_L4_END:
+ add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+/******************************************************************************/
+/******************************************************************************/
+
+strmm_kernel_L2_BEGIN:   // fewer than 4 columns left in N; process 2 of them if (N & 2) != 0
+
+ mov counterJ , origN
+ tst counterJ , #3
+ ble strmm_kernel_L999
+
+ tst counterJ , #2
+ ble strmm_kernel_L1_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #1
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = A
+
+/******************************************************************************/
+
+strmm_kernel_L2_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI,#0
+ ble strmm_kernel_L2_M4_BEGIN
+
+strmm_kernel_L2_M8_20:
+
+ INIT8x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #2
+#endif
+	asr	counterL , tempK, #3		// counterL = tempK / 8
+ cmp counterL,#0
+ ble strmm_kernel_L2_M8_40
+ .align 5
+
+strmm_kernel_L2_M8_22:
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M8_22
+
+
+strmm_kernel_L2_M8_40:
+
+	ands	counterL , tempK, #7		// counterL = tempK % 8
+ ble strmm_kernel_L2_M8_100
+
+strmm_kernel_L2_M8_42:
+
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M8_42
+
+strmm_kernel_L2_M8_100:
+
+ SAVE8x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+strmm_kernel_L2_M8_END:
+
+ subs counterI, counterI, #1
+ bgt strmm_kernel_L2_M8_20
+
+/******************************************************************************/
+
+strmm_kernel_L2_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble strmm_kernel_L2_END
+
+ tst counterI, #4
+ ble strmm_kernel_L2_M2_BEGIN
+
+strmm_kernel_L2_M4_20:
+
+ INIT4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #2
+#endif
+	asr	counterL , tempK, #3		// counterL = tempK / 8
+ cmp counterL,#0
+ ble strmm_kernel_L2_M4_40
+ .align 5
+
+strmm_kernel_L2_M4_22:
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M4_22
+
+
+strmm_kernel_L2_M4_40:
+
+	ands	counterL , tempK, #7		// counterL = tempK % 8
+ ble strmm_kernel_L2_M4_100
+
+strmm_kernel_L2_M4_42:
+
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M4_42
+
+strmm_kernel_L2_M4_100:
+
+ SAVE4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+strmm_kernel_L2_M4_END:
+
+/******************************************************************************/
+
+strmm_kernel_L2_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble strmm_kernel_L2_END
+
+	tst	counterI, #2				// process 2 rows only if (M & 2) != 0
+ ble strmm_kernel_L2_M1_BEGIN
+
+strmm_kernel_L2_M2_20:
+
+ INIT2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #2
+#endif
+	asr	counterL , tempK, #3		// counterL = tempK / 8
+ cmp counterL,#0
+ ble strmm_kernel_L2_M2_40
+
+strmm_kernel_L2_M2_22:
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M2_22
+
+
+strmm_kernel_L2_M2_40:
+
+	ands	counterL , tempK, #7		// counterL = tempK % 8
+ ble strmm_kernel_L2_M2_100
+
+strmm_kernel_L2_M2_42:
+
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M2_42
+
+strmm_kernel_L2_M2_100:
+
+ SAVE2x2
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+
+strmm_kernel_L2_M2_END:
+
+/******************************************************************************/
+
+strmm_kernel_L2_M1_BEGIN:
+
+	tst	counterI, #1				// process the last row only if (M & 1) != 0
+ ble strmm_kernel_L2_END
+
+strmm_kernel_L2_M1_20:
+
+ INIT1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #2
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #2
+#endif
+	asr	counterL , tempK, #3		// counterL = tempK / 8
+ cmp counterL, #0
+ ble strmm_kernel_L2_M1_40
+
+strmm_kernel_L2_M1_22:
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M1_22
+
+
+strmm_kernel_L2_M1_40:
+
+	ands	counterL , tempK, #7		// counterL = tempK % 8
+ ble strmm_kernel_L2_M1_100
+
+strmm_kernel_L2_M1_42:
+
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M1_42
+
+strmm_kernel_L2_M1_100:
+
+ SAVE1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #2
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+strmm_kernel_L2_END:
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+ add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
+
+/******************************************************************************/
+/******************************************************************************/
+
+strmm_kernel_L1_BEGIN:
+
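+	// N remainder: process the final single column when (N & 1) != 0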
+ mov counterJ , origN
+ tst counterJ , #1
+ ble strmm_kernel_L999 // done
+
+
+ mov pCRow0, pC // pCRow0 = C
+	add	pC , pC , LDC				// advance pC to the next column of C
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = A
+
+/******************************************************************************/
+
+strmm_kernel_L1_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3
+ cmp counterI, #0
+ ble strmm_kernel_L1_M4_BEGIN
+
+strmm_kernel_L1_M8_20:
+
+ INIT8x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+ lsl temp, tempOffset, #2
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #1
+#endif
+	asr 	counterL , tempK, #3		// counterL = tempK / 8
+ cmp counterL , #0
+ ble strmm_kernel_L1_M8_40
+ .align 5
+
+strmm_kernel_L1_M8_22:
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M8_22
+
+
+strmm_kernel_L1_M8_40:
+
+	ands	counterL , tempK, #7		// counterL = tempK % 8
+ ble strmm_kernel_L1_M8_100
+
+strmm_kernel_L1_M8_42:
+
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M8_42
+
+strmm_kernel_L1_M8_100:
+
+ SAVE8x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #2
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+strmm_kernel_L1_M8_END:
+
+ subs counterI, counterI, #1
+ bgt strmm_kernel_L1_M8_20
+
+/******************************************************************************/
+
+strmm_kernel_L1_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble strmm_kernel_L1_END
+
+ tst counterI, #4
+ ble strmm_kernel_L1_M2_BEGIN
+
+strmm_kernel_L1_M4_20:
+
+ INIT4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #2
+ add pB, pB, temp
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #1
+#endif
+	asr 	counterL , tempK, #3		// counterL = tempK / 8
+ cmp counterL , #0
+ ble strmm_kernel_L1_M4_40
+ .align 5
+
+strmm_kernel_L1_M4_22:
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M4_22
+
+
+strmm_kernel_L1_M4_40:
+
+	ands	counterL , tempK, #7		// counterL = tempK % 8
+ ble strmm_kernel_L1_M4_100
+
+strmm_kernel_L1_M4_42:
+
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M4_42
+
+strmm_kernel_L1_M4_100:
+
+ SAVE4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #2
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+strmm_kernel_L1_M4_END:
+
+/******************************************************************************/
+
+strmm_kernel_L1_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble strmm_kernel_L1_END
+
+	tst	counterI, #2				// process 2 rows only if (M & 2) != 0
+ ble strmm_kernel_L1_M1_BEGIN
+
+strmm_kernel_L1_M2_20:
+
+ INIT2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #2
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #1
+#endif
+	asr 	counterL , tempK, #3		// counterL = tempK / 8
+ cmp counterL , #0
+ ble strmm_kernel_L1_M2_40
+
+strmm_kernel_L1_M2_22:
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M2_22
+
+
+strmm_kernel_L1_M2_40:
+
+	ands	counterL , tempK, #7		// counterL = tempK % 8
+ ble strmm_kernel_L1_M2_100
+
+strmm_kernel_L1_M2_42:
+
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M2_42
+
+strmm_kernel_L1_M2_100:
+
+ SAVE2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #2
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+strmm_kernel_L1_M2_END:
+
+/******************************************************************************/
+
+strmm_kernel_L1_M1_BEGIN:
+
+	tst	counterI, #1				// process the last row only if (M & 1) != 0
+ ble strmm_kernel_L1_END
+
+strmm_kernel_L1_M1_20:
+
+ INIT1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #2
+ add pB, pB, temp
+ lsl temp, tempOffset, #2
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #1
+#endif
+	asr 	counterL , tempK, #3		// counterL = tempK / 8
+ cmp counterL , #0
+ ble strmm_kernel_L1_M1_40
+
+strmm_kernel_L1_M1_22:
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M1_22
+
+
+strmm_kernel_L1_M1_40:
+
+	ands	counterL , tempK, #7		// counterL = tempK % 8
+ ble strmm_kernel_L1_M1_100
+
+strmm_kernel_L1_M1_42:
+
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M1_42
+
+strmm_kernel_L1_M1_100:
+
+ SAVE1x1
+
+strmm_kernel_L1_END:
+
+/******************************************************************************/
+
+strmm_kernel_L999:
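+	// Restore the registers saved in the prologue (d8-d17, x18-x28), release the stack frame and return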
+ mov x0, #0 // set return value
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+ ret
+
+ EPILOGUE
+
diff --git a/kernel/power/KERNEL b/kernel/power/KERNEL
index cb9ed848b..c3c86b310 100644
--- a/kernel/power/KERNEL
+++ b/kernel/power/KERNEL
@@ -1,57 +1,3 @@
-SGEMM_BETA = gemm_beta.S
-DGEMM_BETA = gemm_beta.S
-CGEMM_BETA = zgemm_beta.S
-ZGEMM_BETA = zgemm_beta.S
-
-
-ifndef SSYMV_U_KERNEL
-SSYMV_U_KERNEL = symv_U.S
-endif
-
-ifndef SSYMV_L_KERNEL
-SSYMV_L_KERNEL = symv_L.S
-endif
-
-ifndef DSYMV_U_KERNEL
-DSYMV_U_KERNEL = symv_U.S
-endif
-
-ifndef DSYMV_L_KERNEL
-DSYMV_L_KERNEL = symv_L.S
-endif
-
-ifndef CSYMV_U_KERNEL
-CSYMV_U_KERNEL = zsymv_U.S
-endif
-
-ifndef CSYMV_L_KERNEL
-CSYMV_L_KERNEL = zsymv_L.S
-endif
-
-ifndef ZSYMV_U_KERNEL
-ZSYMV_U_KERNEL = zsymv_U.S
-endif
-
-ifndef ZSYMV_L_KERNEL
-ZSYMV_L_KERNEL = zsymv_L.S
-endif
-
-ifndef CHEMV_U_KERNEL
-CHEMV_U_KERNEL = zsymv_U.S
-endif
-
-ifndef CHEMV_L_KERNEL
-CHEMV_L_KERNEL = zsymv_L.S
-endif
-
-ifndef ZHEMV_U_KERNEL
-ZHEMV_U_KERNEL = zsymv_U.S
-endif
-
-ifndef ZHEMV_L_KERNEL
-ZHEMV_L_KERNEL = zsymv_L.S
-endif
-
ifndef STRSMKERNEL_LN
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
endif
@@ -84,3 +30,23 @@ ifndef CTRSMKERNEL_RT
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
endif
+ifndef SGEMM_BETA
+SGEMM_BETA = gemm_beta.S
+endif
+
+ifndef DGEMM_BETA
+DGEMM_BETA = gemm_beta.S
+endif
+
+ifndef CGEMM_BETA
+CGEMM_BETA = zgemm_beta.S
+endif
+
+ifndef ZGEMM_BETA
+ZGEMM_BETA = zgemm_beta.S
+endif
+
+ifndef DSDOTKERNEL
+DSDOTKERNEL = ../generic/dot.c
+endif
+
diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8
new file mode 100644
index 000000000..760d568cd
--- /dev/null
+++ b/kernel/power/KERNEL.POWER8
@@ -0,0 +1,175 @@
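+# POWER8 kernel selection: hand-written assembly kernels where available, generic C implementations otherwise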
+#SGEMM_BETA = ../generic/gemm_beta.c
+#DGEMM_BETA = ../generic/gemm_beta.c
+#CGEMM_BETA = ../generic/zgemm_beta.c
+#ZGEMM_BETA = ../generic/zgemm_beta.c
+
+STRMMKERNEL = gemm_kernel_power6.S
+DTRMMKERNEL = dtrmm_kernel_16x4_power8.S
+CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
+ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
+
+SGEMMKERNEL = gemm_kernel_power6.S
+SGEMMONCOPY = ../generic/gemm_ncopy_4.c
+SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+SGEMMONCOPYOBJ = sgemm_oncopy.o
+SGEMMOTCOPYOBJ = sgemm_otcopy.o
+
+DGEMMKERNEL = dgemm_kernel_16x4_power8.S
+DGEMMINCOPY = ../generic/gemm_ncopy_16.c
+DGEMMITCOPY = ../generic/gemm_tcopy_16.c
+DGEMMONCOPY = gemm_ncopy_4.S
+DGEMMOTCOPY = gemm_tcopy_4.S
+DGEMMINCOPYOBJ = dgemm_incopy.o
+DGEMMITCOPYOBJ = dgemm_itcopy.o
+DGEMMONCOPYOBJ = dgemm_oncopy.o
+DGEMMOTCOPYOBJ = dgemm_otcopy.o
+
+CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
+CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+CGEMMONCOPYOBJ = cgemm_oncopy.o
+CGEMMOTCOPYOBJ = cgemm_otcopy.o
+
+ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
+ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
+ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c
+ZGEMMONCOPYOBJ = zgemm_oncopy.o
+ZGEMMOTCOPYOBJ = zgemm_otcopy.o
+ZGEMMINCOPYOBJ = zgemm_incopy.o
+ZGEMMITCOPYOBJ = zgemm_itcopy.o
+
+STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+#TODO: CGEMM3MKERNEL should use 4x4 block sizes.
+#CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S
+#ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S
+
+#Pure C for other kernels
+#SAMAXKERNEL = ../arm/amax.c
+#DAMAXKERNEL = ../arm/amax.c
+#CAMAXKERNEL = ../arm/zamax.c
+#ZAMAXKERNEL = ../arm/zamax.c
+#
+#SAMINKERNEL = ../arm/amin.c
+#DAMINKERNEL = ../arm/amin.c
+#CAMINKERNEL = ../arm/zamin.c
+#ZAMINKERNEL = ../arm/zamin.c
+#
+#SMAXKERNEL = ../arm/max.c
+#DMAXKERNEL = ../arm/max.c
+#
+#SMINKERNEL = ../arm/min.c
+#DMINKERNEL = ../arm/min.c
+#
+#ISAMAXKERNEL = ../arm/iamax.c
+#IDAMAXKERNEL = ../arm/iamax.c
+#ICAMAXKERNEL = ../arm/izamax.c
+#IZAMAXKERNEL = ../arm/izamax.c
+#
+#ISAMINKERNEL = ../arm/iamin.c
+#IDAMINKERNEL = ../arm/iamin.c
+#ICAMINKERNEL = ../arm/izamin.c
+#IZAMINKERNEL = ../arm/izamin.c
+#
+#ISMAXKERNEL = ../arm/imax.c
+#IDMAXKERNEL = ../arm/imax.c
+#
+#ISMINKERNEL = ../arm/imin.c
+#IDMINKERNEL = ../arm/imin.c
+#
+#SASUMKERNEL = ../arm/asum.c
+#DASUMKERNEL = ../arm/asum.c
+#CASUMKERNEL = ../arm/zasum.c
+#ZASUMKERNEL = ../arm/zasum.c
+#
+#SAXPYKERNEL = ../arm/axpy.c
+#DAXPYKERNEL = ../arm/axpy.c
+#CAXPYKERNEL = ../arm/zaxpy.c
+#ZAXPYKERNEL = ../arm/zaxpy.c
+#
+#SCOPYKERNEL = ../arm/copy.c
+#DCOPYKERNEL = ../arm/copy.c
+#CCOPYKERNEL = ../arm/zcopy.c
+#ZCOPYKERNEL = ../arm/zcopy.c
+#
+#SDOTKERNEL = ../arm/dot.c
+#DDOTKERNEL = ../arm/dot.c
+#CDOTKERNEL = ../arm/zdot.c
+#ZDOTKERNEL = ../arm/zdot.c
+#
+#SNRM2KERNEL = ../arm/nrm2.c
+#DNRM2KERNEL = ../arm/nrm2.c
+#CNRM2KERNEL = ../arm/znrm2.c
+#ZNRM2KERNEL = ../arm/znrm2.c
+#
+#SROTKERNEL = ../arm/rot.c
+#DROTKERNEL = ../arm/rot.c
+#CROTKERNEL = ../arm/zrot.c
+#ZROTKERNEL = ../arm/zrot.c
+#
+#SSCALKERNEL = ../arm/scal.c
+#DSCALKERNEL = ../arm/scal.c
+#CSCALKERNEL = ../arm/zscal.c
+#ZSCALKERNEL = ../arm/zscal.c
+#
+#SSWAPKERNEL = ../arm/swap.c
+#DSWAPKERNEL = ../arm/swap.c
+#CSWAPKERNEL = ../arm/zswap.c
+#ZSWAPKERNEL = ../arm/zswap.c
+#
+
+#SGEMVNKERNEL = ../arm/gemv_n.c
+#DGEMVNKERNEL = ../arm/gemv_n.c
+#CGEMVNKERNEL = ../arm/zgemv_n.c
+#ZGEMVNKERNEL = ../arm/zgemv_n.c
+#
+#SGEMVTKERNEL = ../arm/gemv_t.c
+#DGEMVTKERNEL = ../arm/gemv_t.c
+#CGEMVTKERNEL = ../arm/zgemv_t.c
+#ZGEMVTKERNEL = ../arm/zgemv_t.c
+
+
+#SSYMV_U_KERNEL = ../generic/symv_k.c
+#SSYMV_L_KERNEL = ../generic/symv_k.c
+#DSYMV_U_KERNEL = ../generic/symv_k.c
+#DSYMV_L_KERNEL = ../generic/symv_k.c
+#QSYMV_U_KERNEL = ../generic/symv_k.c
+#QSYMV_L_KERNEL = ../generic/symv_k.c
+#CSYMV_U_KERNEL = ../generic/zsymv_k.c
+#CSYMV_L_KERNEL = ../generic/zsymv_k.c
+#ZSYMV_U_KERNEL = ../generic/zsymv_k.c
+#ZSYMV_L_KERNEL = ../generic/zsymv_k.c
+#XSYMV_U_KERNEL = ../generic/zsymv_k.c
+#XSYMV_L_KERNEL = ../generic/zsymv_k.c
+
+#ZHEMV_U_KERNEL = ../generic/zhemv_k.c
+#ZHEMV_L_KERNEL = ../generic/zhemv_k.c
+
+LSAME_KERNEL = ../generic/lsame.c
+SCABS_KERNEL = ../generic/cabs.c
+DCABS_KERNEL = ../generic/cabs.c
+QCABS_KERNEL = ../generic/cabs.c
+
+#Dump kernel
+CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
+ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
diff --git a/kernel/power/def_vsx.h b/kernel/power/def_vsx.h
new file mode 100644
index 000000000..c2d29e268
--- /dev/null
+++ b/kernel/power/def_vsx.h
@@ -0,0 +1,64 @@
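+/* Numeric aliases vs0-vs63 for the POWER VSX vector-scalar registers */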
+#define vs0 0
+#define vs1 1
+#define vs2 2
+#define vs3 3
+#define vs4 4
+#define vs5 5
+#define vs6 6
+#define vs7 7
+#define vs8 8
+#define vs9 9
+#define vs10 10
+#define vs11 11
+#define vs12 12
+#define vs13 13
+#define vs14 14
+#define vs15 15
+#define vs16 16
+#define vs17 17
+#define vs18 18
+#define vs19 19
+#define vs20 20
+#define vs21 21
+#define vs22 22
+#define vs23 23
+#define vs24 24
+#define vs25 25
+#define vs26 26
+#define vs27 27
+#define vs28 28
+#define vs29 29
+#define vs30 30
+#define vs31 31
+#define vs32 32
+#define vs33 33
+#define vs34 34
+#define vs35 35
+#define vs36 36
+#define vs37 37
+#define vs38 38
+#define vs39 39
+#define vs40 40
+#define vs41 41
+#define vs42 42
+#define vs43 43
+#define vs44 44
+#define vs45 45
+#define vs46 46
+#define vs47 47
+#define vs48 48
+#define vs49 49
+#define vs50 50
+#define vs51 51
+#define vs52 52
+#define vs53 53
+#define vs54 54
+#define vs55 55
+#define vs56 56
+#define vs57 57
+#define vs58 58
+#define vs59 59
+#define vs60 60
+#define vs61 61
+#define vs62 62
+#define vs63 63
diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S
new file mode 100644
index 000000000..c67f31160
--- /dev/null
+++ b/kernel/power/dgemm_kernel_16x4_power8.S
@@ -0,0 +1,348 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#ifndef __64BIT__
+#define LOAD lwz
+#else
+#define LOAD ld
+#endif
+
+#ifdef __64BIT__
+#define STACKSIZE 320
+#define ALPHA_SP 296(SP)
+#define FZERO 304(SP)
+#else
+#define STACKSIZE 240
+#define ALPHA_SP 224(SP)
+#define FZERO 232(SP)
+#endif
+
+#define M r3
+#define N r4
+#define K r5
+
+#ifdef linux
+#ifndef __64BIT__
+#define A r6
+#define B r7
+#define C r8
+#define LDC r9
+#define OFFSET r10
+#else
+#define A r7
+#define B r8
+#define C r9
+#define LDC r10
+#define OFFSET r6
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+#define A r8
+#define B r9
+#define C r10
+#define LDC r7
+#define OFFSET r6
+#else
+#define A r7
+#define B r8
+#define C r9
+#define LDC r10
+#define OFFSET r6
+#endif
+#endif
+
+#define alpha_r vs18
+
+#define o0 0
+
+#define o8 r15
+#define o24 r16
+#define ALPHA r17
+#define L r18
+#define T1 r19
+#define KK r20
+#define BB r21
+#define I r22
+#define J r23
+#define AO r24
+#define BO r25
+#define CO r26
+#define o16 r27
+#define o32 r28
+#define o48 r29
+
+#define PRE r30
+#define T2 r31
+
+#include "dgemm_macros_16x4_power8.S"
+
+
+#ifndef NEEDPARAM
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
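+	/* Save the non-volatile registers: f14-f31 here, r15-r31 below */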
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+#ifdef __64BIT__
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+#else
+ stw r31, 144(SP)
+ stw r30, 148(SP)
+ stw r29, 152(SP)
+ stw r28, 156(SP)
+ stw r27, 160(SP)
+ stw r26, 164(SP)
+ stw r25, 168(SP)
+ stw r24, 172(SP)
+ stw r23, 176(SP)
+ stw r22, 180(SP)
+ stw r21, 184(SP)
+ stw r20, 188(SP)
+ stw r19, 192(SP)
+ stw r18, 196(SP)
+ stw r17, 200(SP)
+ stw r16, 204(SP)
+ stw r15, 208(SP)
+#endif
+
+ stfd f1, ALPHA_SP
+ stw r0, FZERO
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+ lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+
+ slwi LDC, LDC, BASE_SHIFT
+
+#if defined(TRMMKERNEL)
+#if defined(linux) && defined(__64BIT__)
+ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+ lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#else
+ lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+#endif
+
+
+ cmpwi cr0, M, 0
+ ble .L999_H1
+ cmpwi cr0, N, 0
+ ble .L999_H1
+ cmpwi cr0, K, 0
+ ble .L999_H1
+
+#ifdef __64BIT__
+ addi ALPHA, SP, 296
+#else
+ addi ALPHA, SP, 224
+#endif
+
+ li PRE, 256
+ li o8 , 8
+ li o16, 16
+ li o24, 24
+ li o32, 32
+ li o48, 48
+
+ lxvdsx alpha_r, 0, ALPHA
+
+#include "dgemm_logic_16x4_power8.S"
+
+.L999:
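+	/* Set the return value to 0 and restore the registers saved in the prologue */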
+ addi r3, 0, 0
+
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+#ifdef __64BIT__
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+#else
+ lwz r31, 144(SP)
+ lwz r30, 148(SP)
+ lwz r29, 152(SP)
+ lwz r28, 156(SP)
+ lwz r27, 160(SP)
+ lwz r26, 164(SP)
+ lwz r25, 168(SP)
+ lwz r24, 172(SP)
+ lwz r23, 176(SP)
+ lwz r22, 180(SP)
+ lwz r21, 184(SP)
+ lwz r20, 188(SP)
+ lwz r19, 192(SP)
+ lwz r18, 196(SP)
+ lwz r17, 200(SP)
+ lwz r16, 204(SP)
+ lwz r15, 208(SP)
+#endif
+
+ addi SP, SP, STACKSIZE
+
+ blr
+
+ EPILOGUE
+#endif
diff --git a/kernel/power/dgemm_logic_16x4_power8.S b/kernel/power/dgemm_logic_16x4_power8.S
new file mode 100644
index 000000000..49c438f61
--- /dev/null
+++ b/kernel/power/dgemm_logic_16x4_power8.S
@@ -0,0 +1,1683 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
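+	/* Outer loop over N in panels of 4 columns: J = N / 4 */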
+ srawi. J, N, 2
+ ble .LDGEMM_L4_END
+
+.LDGEMM_L4_BEGIN:
+
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 2
+ add C, C, T1
+ srawi. I, M, 4
+ ble .LDGEMM_L4x16_END
+
+.LDGEMM_L4x16_BEGIN:
+
+
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L4x16_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L4x16_SUB4
+
+.LDGEMM_L4x16_LOOP_START:
+
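+	/* Software-pipelined K loop, unrolled by 8; dcbt prefetches the A panel PRE (= 256) bytes ahead */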
+ dcbt AO, PRE
+ LOAD4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_I1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L4x16_LOOP_END
+
+ .align 5
+
+.LDGEMM_L4x16_LOOP:
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L4x16_LOOP
+
+.LDGEMM_L4x16_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ KERNEL4x16_E2
+
+ b .LDGEMM_L4x16_SUB1
+
+.LDGEMM_L4x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL4x16_SUBI1
+ dcbt AO, PRE
+ KERNEL4x16_SUB1
+ dcbt AO, PRE
+ KERNEL4x16_SUB1
+ dcbt AO, PRE
+ KERNEL4x16_SUB1
+
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+
+ b .LDGEMM_L4x16_SUB1
+
+.LDGEMM_L4x16_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL4x16_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L4x16_SAVE
+ b .LDGEMM_L4x16_SUB2
+
+.LDGEMM_L4x16_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L4x16_SAVE
+
+.LDGEMM_L4x16_SUB2:
+
+ KERNEL4x16_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L4x16_SUB2
+
+.LDGEMM_L4x16_SAVE:
+
+ SAVE4x16
+
+ addic. I, I, -1
+ bgt .LDGEMM_L4x16_BEGIN
+
+.LDGEMM_L4x16_END:
+
+.LDGEMM_L4x8_BEGIN:
+
+ andi. T2, M, 15
+ ble .LDGEMM_L4x1_END
+
+ andi. T1, M, 8
+ ble .LDGEMM_L4x8_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L4x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L4x8_SUB4
+
+.LDGEMM_L4x8_LOOP_START:
+
+ LOAD4x8_1
+ KERNEL4x8_I1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L4x8_LOOP_END
+
+ .align 5
+
+.LDGEMM_L4x8_LOOP:
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L4x8_LOOP
+
+.LDGEMM_L4x8_LOOP_END:
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_E2
+
+ b .LDGEMM_L4x8_SUB1
+
+.LDGEMM_L4x8_SUB4:
+
+ KERNEL4x8_SUBI1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+
+ b .LDGEMM_L4x8_SUB1
+
+.LDGEMM_L4x8_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL4x8_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L4x8_SAVE
+ b .LDGEMM_L4x8_SUB2
+
+.LDGEMM_L4x8_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L4x8_SAVE
+
+.LDGEMM_L4x8_SUB2:
+
+ KERNEL4x8_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L4x8_SUB2
+
+.LDGEMM_L4x8_SAVE:
+
+ SAVE4x8
+
+.LDGEMM_L4x8_END:
+
+.LDGEMM_L4x4_BEGIN:
+
+
+ andi. T1, M, 4
+ ble .LDGEMM_L4x4_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L4x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L4x4_SUB4
+
+.LDGEMM_L4x4_LOOP_START:
+
+ LOAD4x4_1
+ KERNEL4x4_I1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L4x4_LOOP_END
+
+ .align 5
+
+.LDGEMM_L4x4_LOOP:
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L4x4_LOOP
+
+.LDGEMM_L4x4_LOOP_END:
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_E2
+
+ b .LDGEMM_L4x4_SUB1
+
+.LDGEMM_L4x4_SUB4:
+
+ KERNEL4x4_SUBI1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+
+ b .LDGEMM_L4x4_SUB1
+
+.LDGEMM_L4x4_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL4x4_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L4x4_SAVE
+ b .LDGEMM_L4x4_SUB2
+
+.LDGEMM_L4x4_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L4x4_SAVE
+
+.LDGEMM_L4x4_SUB2:
+
+ KERNEL4x4_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L4x4_SUB2
+
+.LDGEMM_L4x4_SAVE:
+
+ SAVE4x4
+
+.LDGEMM_L4x4_END:
+
+.LDGEMM_L4x2_BEGIN:
+
+
+ andi. T1, M, 2
+ ble .LDGEMM_L4x2_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L4x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L4x2_SUB4
+
+.LDGEMM_L4x2_LOOP_START:
+
+ LOAD4x2_1
+ KERNEL4x2_I1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L4x2_LOOP_END
+
+ .align 5
+
+.LDGEMM_L4x2_LOOP:
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L4x2_LOOP
+
+.LDGEMM_L4x2_LOOP_END:
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_E2
+
+ b .LDGEMM_L4x2_SUB1
+
+.LDGEMM_L4x2_SUB4:
+
+ KERNEL4x2_SUBI1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+
+ b .LDGEMM_L4x2_SUB1
+
+.LDGEMM_L4x2_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL4x2_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L4x2_SAVE
+ b .LDGEMM_L4x2_SUB2
+
+.LDGEMM_L4x2_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L4x2_SAVE
+
+.LDGEMM_L4x2_SUB2:
+
+ KERNEL4x2_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L4x2_SUB2
+
+.LDGEMM_L4x2_SAVE:
+
+ SAVE4x2
+
+.LDGEMM_L4x2_END:
+
+.LDGEMM_L4x1_BEGIN:
+
+
+ andi. T1, M, 1
+ ble .LDGEMM_L4x1_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L4x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L4x1_SUB4
+
+.LDGEMM_L4x1_LOOP_START:
+
+ LOAD4x1_1
+ KERNEL4x1_I1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L4x1_LOOP_END
+
+ .align 5
+
+.LDGEMM_L4x1_LOOP:
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L4x1_LOOP
+
+.LDGEMM_L4x1_LOOP_END:
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_E2
+
+ b .LDGEMM_L4x1_SUB1
+
+.LDGEMM_L4x1_SUB4:
+
+ KERNEL4x1_SUBI1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+
+ b .LDGEMM_L4x1_SUB1
+
+.LDGEMM_L4x1_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL4x1_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L4x1_SAVE
+ b .LDGEMM_L4x1_SUB2
+
+.LDGEMM_L4x1_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L4x1_SAVE
+
+.LDGEMM_L4x1_SUB2:
+
+ KERNEL4x1_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L4x1_SUB2
+
+.LDGEMM_L4x1_SAVE:
+
+ SAVE4x1
+
+.LDGEMM_L4x1_END:
+
+ slwi T1, K, 5
+ add B, B, T1
+
+ addic. J, J, -1
+ bgt .LDGEMM_L4_BEGIN
+
+ andi. T2, N, 3
+ ble .L999
+
+.LDGEMM_L4_END:
+
+ b .LDGEMM_L2_BEGIN
+
+.L999_H1:
+
+ b .L999
+
+.LDGEMM_L2_BEGIN:
+
+ andi. T1, N, 2
+ ble .LDGEMM_L2_END
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 1
+ add C, C, T1
+ srawi. I, M, 4
+ ble .LDGEMM_L2x16_END
+
+.LDGEMM_L2x16_BEGIN:
+
+
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L2x16_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L2x16_SUB4
+
+.LDGEMM_L2x16_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_I1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L2x16_LOOP_END
+
+ .align 5
+
+.LDGEMM_L2x16_LOOP:
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L2x16_LOOP
+
+.LDGEMM_L2x16_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ KERNEL2x16_E2
+
+ b .LDGEMM_L2x16_SUB1
+
+.LDGEMM_L2x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL2x16_SUBI1
+ dcbt AO, PRE
+ KERNEL2x16_SUB1
+ dcbt AO, PRE
+ KERNEL2x16_SUB1
+ dcbt AO, PRE
+ KERNEL2x16_SUB1
+
+ KERNEL2x16_SUB1
+ KERNEL2x16_SUB1
+ KERNEL2x16_SUB1
+ KERNEL2x16_SUB1
+
+ b .LDGEMM_L2x16_SUB1
+
+.LDGEMM_L2x16_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x16_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L2x16_SAVE
+ b .LDGEMM_L2x16_SUB2
+
+.LDGEMM_L2x16_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L2x16_SAVE
+
+.LDGEMM_L2x16_SUB2:
+
+ KERNEL2x16_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L2x16_SUB2
+
+.LDGEMM_L2x16_SAVE:
+
+ SAVE2x16
+
+ addic. I, I, -1
+ bgt .LDGEMM_L2x16_BEGIN
+
+.LDGEMM_L2x16_END:
+
+.LDGEMM_L2x8_BEGIN:
+
+ andi. T2, M, 15
+ ble .LDGEMM_L2x1_END
+
+ andi. T1, M, 8
+ ble .LDGEMM_L2x8_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L2x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L2x8_SUB4
+
+.LDGEMM_L2x8_LOOP_START:
+
+ LOAD2x8_1
+ KERNEL2x8_I1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L2x8_LOOP_END
+
+ .align 5
+
+.LDGEMM_L2x8_LOOP:
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L2x8_LOOP
+
+.LDGEMM_L2x8_LOOP_END:
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_E2
+
+ b .LDGEMM_L2x8_SUB1
+
+.LDGEMM_L2x8_SUB4:
+
+ KERNEL2x8_SUBI1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+
+ b .LDGEMM_L2x8_SUB1
+
+.LDGEMM_L2x8_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x8_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L2x8_SAVE
+ b .LDGEMM_L2x8_SUB2
+
+.LDGEMM_L2x8_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L2x8_SAVE
+
+.LDGEMM_L2x8_SUB2:
+
+ KERNEL2x8_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L2x8_SUB2
+
+.LDGEMM_L2x8_SAVE:
+
+ SAVE2x8
+
+.LDGEMM_L2x8_END:
+
+.LDGEMM_L2x4_BEGIN:
+
+
+ andi. T1, M, 4
+ ble .LDGEMM_L2x4_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L2x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L2x4_SUB4
+
+.LDGEMM_L2x4_LOOP_START:
+
+ LOAD2x4_1
+ KERNEL2x4_I1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L2x4_LOOP_END
+
+ .align 5
+
+.LDGEMM_L2x4_LOOP:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L2x4_LOOP
+
+.LDGEMM_L2x4_LOOP_END:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_E2
+
+ b .LDGEMM_L2x4_SUB1
+
+.LDGEMM_L2x4_SUB4:
+
+ KERNEL2x4_SUBI1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ b .LDGEMM_L2x4_SUB1
+
+.LDGEMM_L2x4_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x4_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L2x4_SAVE
+ b .LDGEMM_L2x4_SUB2
+
+.LDGEMM_L2x4_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L2x4_SAVE
+
+.LDGEMM_L2x4_SUB2:
+
+ KERNEL2x4_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L2x4_SUB2
+
+.LDGEMM_L2x4_SAVE:
+
+ SAVE2x4
+
+.LDGEMM_L2x4_END:
+
+.LDGEMM_L2x2_BEGIN:
+
+
+ andi. T1, M, 2
+ ble .LDGEMM_L2x2_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L2x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L2x2_SUB4
+
+.LDGEMM_L2x2_LOOP_START:
+
+ LOAD2x2_1
+ KERNEL2x2_I1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L2x2_LOOP_END
+
+ .align 5
+
+.LDGEMM_L2x2_LOOP:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L2x2_LOOP
+
+.LDGEMM_L2x2_LOOP_END:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_E2
+
+ b .LDGEMM_L2x2_SUB1
+
+.LDGEMM_L2x2_SUB4:
+
+ KERNEL2x2_SUBI1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ b .LDGEMM_L2x2_SUB1
+
+.LDGEMM_L2x2_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x2_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L2x2_SAVE
+ b .LDGEMM_L2x2_SUB2
+
+.LDGEMM_L2x2_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L2x2_SAVE
+
+.LDGEMM_L2x2_SUB2:
+
+ KERNEL2x2_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L2x2_SUB2
+
+.LDGEMM_L2x2_SAVE:
+
+ SAVE2x2
+
+.LDGEMM_L2x2_END:
+
+.LDGEMM_L2x1_BEGIN:
+
+
+ andi. T1, M, 1
+ ble .LDGEMM_L2x1_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L2x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L2x1_SUB4
+
+.LDGEMM_L2x1_LOOP_START:
+
+ LOAD2x1_1
+ KERNEL2x1_I1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L2x1_LOOP_END
+
+ .align 5
+
+.LDGEMM_L2x1_LOOP:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L2x1_LOOP
+
+.LDGEMM_L2x1_LOOP_END:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_E2
+
+ b .LDGEMM_L2x1_SUB1
+
+.LDGEMM_L2x1_SUB4:
+
+ KERNEL2x1_SUBI1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ b .LDGEMM_L2x1_SUB1
+
+.LDGEMM_L2x1_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x1_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L2x1_SAVE
+ b .LDGEMM_L2x1_SUB2
+
+.LDGEMM_L2x1_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L2x1_SAVE
+
+.LDGEMM_L2x1_SUB2:
+
+ KERNEL2x1_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L2x1_SUB2
+
+.LDGEMM_L2x1_SAVE:
+
+ SAVE2x1
+
+.LDGEMM_L2x1_END:
+
+ slwi T1, K, 4
+ add B, B, T1
+
+.LDGEMM_L2_END:
+.LDGEMM_L1_BEGIN:
+
+ andi. T1, N, 1
+ ble .LDGEMM_L1_END
+ mr CO, C
+ mr AO, A
+ srawi. I, M, 4
+ ble .LDGEMM_L1x16_END
+
+.LDGEMM_L1x16_BEGIN:
+
+
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L1x16_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L1x16_SUB4
+
+.LDGEMM_L1x16_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_I1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L1x16_LOOP_END
+
+ .align 5
+
+.LDGEMM_L1x16_LOOP:
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L1x16_LOOP
+
+.LDGEMM_L1x16_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ KERNEL1x16_E2
+
+ b .LDGEMM_L1x16_SUB1
+
+.LDGEMM_L1x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL1x16_SUBI1
+ dcbt AO, PRE
+ KERNEL1x16_SUB1
+ dcbt AO, PRE
+ KERNEL1x16_SUB1
+ dcbt AO, PRE
+ KERNEL1x16_SUB1
+
+ KERNEL1x16_SUB1
+ KERNEL1x16_SUB1
+ KERNEL1x16_SUB1
+ KERNEL1x16_SUB1
+
+ b .LDGEMM_L1x16_SUB1
+
+.LDGEMM_L1x16_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x16_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L1x16_SAVE
+ b .LDGEMM_L1x16_SUB2
+
+.LDGEMM_L1x16_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L1x16_SAVE
+
+.LDGEMM_L1x16_SUB2:
+
+ KERNEL1x16_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L1x16_SUB2
+
+.LDGEMM_L1x16_SAVE:
+
+ SAVE1x16
+
+ addic. I, I, -1
+ bgt .LDGEMM_L1x16_BEGIN
+
+.LDGEMM_L1x16_END:
+
+.LDGEMM_L1x8_BEGIN:
+
+ andi. T2, M, 15
+ ble .LDGEMM_L1x1_END
+
+ andi. T1, M, 8
+ ble .LDGEMM_L1x8_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L1x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L1x8_SUB4
+
+.LDGEMM_L1x8_LOOP_START:
+
+ LOAD1x8_1
+ KERNEL1x8_I1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L1x8_LOOP_END
+
+ .align 5
+
+.LDGEMM_L1x8_LOOP:
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L1x8_LOOP
+
+.LDGEMM_L1x8_LOOP_END:
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_E2
+
+ b .LDGEMM_L1x8_SUB1
+
+.LDGEMM_L1x8_SUB4:
+
+ KERNEL1x8_SUBI1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+
+ b .LDGEMM_L1x8_SUB1
+
+.LDGEMM_L1x8_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x8_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L1x8_SAVE
+ b .LDGEMM_L1x8_SUB2
+
+.LDGEMM_L1x8_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L1x8_SAVE
+
+.LDGEMM_L1x8_SUB2:
+
+ KERNEL1x8_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L1x8_SUB2
+
+.LDGEMM_L1x8_SAVE:
+
+ SAVE1x8
+
+.LDGEMM_L1x8_END:
+
+.LDGEMM_L1x4_BEGIN:
+
+
+ andi. T1, M, 4
+ ble .LDGEMM_L1x4_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L1x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L1x4_SUB4
+
+.LDGEMM_L1x4_LOOP_START:
+
+ LOAD1x4_1
+ KERNEL1x4_I1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L1x4_LOOP_END
+
+ .align 5
+
+.LDGEMM_L1x4_LOOP:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L1x4_LOOP
+
+.LDGEMM_L1x4_LOOP_END:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_E2
+
+ b .LDGEMM_L1x4_SUB1
+
+.LDGEMM_L1x4_SUB4:
+
+ KERNEL1x4_SUBI1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ b .LDGEMM_L1x4_SUB1
+
+.LDGEMM_L1x4_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x4_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L1x4_SAVE
+ b .LDGEMM_L1x4_SUB2
+
+.LDGEMM_L1x4_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L1x4_SAVE
+
+.LDGEMM_L1x4_SUB2:
+
+ KERNEL1x4_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L1x4_SUB2
+
+.LDGEMM_L1x4_SAVE:
+
+ SAVE1x4
+
+.LDGEMM_L1x4_END:
+
+.LDGEMM_L1x2_BEGIN:
+
+
+ andi. T1, M, 2
+ ble .LDGEMM_L1x2_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L1x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L1x2_SUB4
+
+.LDGEMM_L1x2_LOOP_START:
+
+ LOAD1x2_1
+ KERNEL1x2_I1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L1x2_LOOP_END
+
+ .align 5
+
+.LDGEMM_L1x2_LOOP:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L1x2_LOOP
+
+.LDGEMM_L1x2_LOOP_END:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_E2
+
+ b .LDGEMM_L1x2_SUB1
+
+.LDGEMM_L1x2_SUB4:
+
+ KERNEL1x2_SUBI1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ b .LDGEMM_L1x2_SUB1
+
+.LDGEMM_L1x2_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x2_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L1x2_SAVE
+ b .LDGEMM_L1x2_SUB2
+
+.LDGEMM_L1x2_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L1x2_SAVE
+
+.LDGEMM_L1x2_SUB2:
+
+ KERNEL1x2_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L1x2_SUB2
+
+.LDGEMM_L1x2_SAVE:
+
+ SAVE1x2
+
+.LDGEMM_L1x2_END:
+
+.LDGEMM_L1x1_BEGIN:
+
+
+ andi. T1, M, 1
+ ble .LDGEMM_L1x1_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L1x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L1x1_SUB4
+
+.LDGEMM_L1x1_LOOP_START:
+
+ LOAD1x1_1
+ KERNEL1x1_I1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L1x1_LOOP_END
+
+ .align 5
+
+.LDGEMM_L1x1_LOOP:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L1x1_LOOP
+
+.LDGEMM_L1x1_LOOP_END:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_E2
+
+ b .LDGEMM_L1x1_SUB1
+
+.LDGEMM_L1x1_SUB4:
+
+ KERNEL1x1_SUBI1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ b .LDGEMM_L1x1_SUB1
+
+.LDGEMM_L1x1_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x1_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L1x1_SAVE
+ b .LDGEMM_L1x1_SUB2
+
+.LDGEMM_L1x1_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L1x1_SAVE
+
+.LDGEMM_L1x1_SUB2:
+
+ KERNEL1x1_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L1x1_SUB2
+
+.LDGEMM_L1x1_SAVE:
+
+ SAVE1x1
+
+.LDGEMM_L1x1_END:
+
+.LDGEMM_L1_END:
diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S
new file mode 100644
index 000000000..27c05e08e
--- /dev/null
+++ b/kernel/power/dgemm_macros_16x4_power8.S
@@ -0,0 +1,3435 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/*********************************************************************
+* Macros for N=4, M=16 *
+*********************************************************************/
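+
+/* Register usage for the 16x4 kernel: vs0-vs7 and vs8-vs15 hold two
+ * alternating sets of 16 A values, vs24-vs27 and vs28-vs31 hold the four
+ * broadcast B values (lxvdsx), and vs32-vs63 accumulate the 16x4 C tile.
+ * Naming convention used by every tile size below: _I1 starts a pipelined
+ * K loop (xvmuldp initializes the accumulators while the next set is
+ * loaded), _1 and _2 are the alternating pipelined steps, _E2 drains the
+ * last prefetched set without loading, and _SUBI1/_SUB1 are single,
+ * non-pipelined iterations for short loops and the K & 7 tail (_SUBI1
+ * initializes with xvmuldp, _SUB1 accumulates with xvmaddadp). */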
+
+.macro LOAD4x16_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_I1
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ xvmuldp vs44, vs4, vs25
+ xvmuldp vs45, vs5, vs25
+ xvmuldp vs46, vs6, vs25
+ xvmuldp vs47, vs7, vs25
+
+ addi AO, AO, 64
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+ xvmuldp vs50, vs2, vs26
+ xvmuldp vs51, vs3, vs26
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+
+ xvmuldp vs52, vs4, vs26
+ xvmuldp vs53, vs5, vs26
+ xvmuldp vs54, vs6, vs26
+ xvmuldp vs55, vs7, vs26
+
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+ xvmuldp vs58, vs2, vs27
+ xvmuldp vs59, vs3, vs27
+
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ xvmuldp vs60, vs4, vs27
+ xvmuldp vs61, vs5, vs27
+ xvmuldp vs62, vs6, vs27
+ xvmuldp vs63, vs7, vs27
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_1
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ xvmaddadp vs44, vs4, vs25
+ xvmaddadp vs45, vs5, vs25
+ xvmaddadp vs46, vs6, vs25
+ xvmaddadp vs47, vs7, vs25
+
+ addi AO, AO, 64
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+ xvmaddadp vs50, vs2, vs26
+ xvmaddadp vs51, vs3, vs26
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+
+ xvmaddadp vs52, vs4, vs26
+ xvmaddadp vs53, vs5, vs26
+ xvmaddadp vs54, vs6, vs26
+ xvmaddadp vs55, vs7, vs26
+
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+ xvmaddadp vs58, vs2, vs27
+ xvmaddadp vs59, vs3, vs27
+
+
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ xvmaddadp vs60, vs4, vs27
+ xvmaddadp vs61, vs5, vs27
+ xvmaddadp vs62, vs6, vs27
+ xvmaddadp vs63, vs7, vs27
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_2
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ xvmaddadp vs44, vs12, vs29
+ xvmaddadp vs45, vs13, vs29
+ xvmaddadp vs46, vs14, vs29
+ xvmaddadp vs47, vs15, vs29
+
+ addi AO, AO, 64
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+ xvmaddadp vs50, vs10, vs30
+ xvmaddadp vs51, vs11, vs30
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+
+ xvmaddadp vs52, vs12, vs30
+ xvmaddadp vs53, vs13, vs30
+ xvmaddadp vs54, vs14, vs30
+ xvmaddadp vs55, vs15, vs30
+
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+ xvmaddadp vs58, vs10, vs31
+ xvmaddadp vs59, vs11, vs31
+
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ xvmaddadp vs60, vs12, vs31
+ xvmaddadp vs61, vs13, vs31
+ xvmaddadp vs62, vs14, vs31
+ xvmaddadp vs63, vs15, vs31
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+ xvmaddadp vs44, vs12, vs29
+ xvmaddadp vs45, vs13, vs29
+ xvmaddadp vs46, vs14, vs29
+ xvmaddadp vs47, vs15, vs29
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+ xvmaddadp vs50, vs10, vs30
+ xvmaddadp vs51, vs11, vs30
+ xvmaddadp vs52, vs12, vs30
+ xvmaddadp vs53, vs13, vs30
+ xvmaddadp vs54, vs14, vs30
+ xvmaddadp vs55, vs15, vs30
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+ xvmaddadp vs58, vs10, vs31
+ xvmaddadp vs59, vs11, vs31
+ xvmaddadp vs60, vs12, vs31
+ xvmaddadp vs61, vs13, vs31
+ xvmaddadp vs62, vs14, vs31
+ xvmaddadp vs63, vs15, vs31
+
+.endm
+
+.macro KERNEL4x16_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+ xvmuldp vs44, vs4, vs25
+ xvmuldp vs45, vs5, vs25
+ xvmuldp vs46, vs6, vs25
+ xvmuldp vs47, vs7, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+ xvmuldp vs50, vs2, vs26
+ xvmuldp vs51, vs3, vs26
+ xvmuldp vs52, vs4, vs26
+ xvmuldp vs53, vs5, vs26
+ xvmuldp vs54, vs6, vs26
+ xvmuldp vs55, vs7, vs26
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+ xvmuldp vs58, vs2, vs27
+ xvmuldp vs59, vs3, vs27
+ xvmuldp vs60, vs4, vs27
+ xvmuldp vs61, vs5, vs27
+ xvmuldp vs62, vs6, vs27
+ xvmuldp vs63, vs7, vs27
+
+.endm
+
+.macro KERNEL4x16_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+ xvmaddadp vs44, vs4, vs25
+ xvmaddadp vs45, vs5, vs25
+ xvmaddadp vs46, vs6, vs25
+ xvmaddadp vs47, vs7, vs25
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+ xvmaddadp vs50, vs2, vs26
+ xvmaddadp vs51, vs3, vs26
+ xvmaddadp vs52, vs4, vs26
+ xvmaddadp vs53, vs5, vs26
+ xvmaddadp vs54, vs6, vs26
+ xvmaddadp vs55, vs7, vs26
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+ xvmaddadp vs58, vs2, vs27
+ xvmaddadp vs59, vs3, vs27
+ xvmaddadp vs60, vs4, vs27
+ xvmaddadp vs61, vs5, vs27
+ xvmaddadp vs62, vs6, vs27
+ xvmaddadp vs63, vs7, vs27
+
+.endm
+
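+/* SAVE4x16 writes one 16x4 tile of C: with TRMMKERNEL undefined it loads C,
+ * computes C += alpha * AB with xvmaddadp and stores it back; with
+ * TRMMKERNEL defined it stores alpha * AB directly (xvmuldp). The same
+ * GEMM/TRMM split applies to every SAVE macro in this file. */
+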
+.macro SAVE4x16
+
+ mr T1, CO
+ addi T2, T1, 64
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+
+ lxvd2x vs4, 0, T2
+ lxvd2x vs5, o16, T2
+ lxvd2x vs6, o32, T2
+ lxvd2x vs7, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+ xvmaddadp vs4, vs36, alpha_r
+ xvmaddadp vs5, vs37, alpha_r
+ xvmaddadp vs6, vs38, alpha_r
+ xvmaddadp vs7, vs39, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+ xvmuldp vs4, vs36, alpha_r
+ xvmuldp vs5, vs37, alpha_r
+ xvmuldp vs6, vs38, alpha_r
+ xvmuldp vs7, vs39, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ dcbt T1, PRE
+
+ stxvd2x vs4, 0, T2
+ stxvd2x vs5, o16, T2
+ stxvd2x vs6, o32, T2
+ stxvd2x vs7, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+
+ lxvd2x vs12, 0, T2
+ lxvd2x vs13, o16, T2
+ lxvd2x vs14, o32, T2
+ lxvd2x vs15, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+ xvmaddadp vs10, vs42, alpha_r
+ xvmaddadp vs11, vs43, alpha_r
+ xvmaddadp vs12, vs44, alpha_r
+ xvmaddadp vs13, vs45, alpha_r
+ xvmaddadp vs14, vs46, alpha_r
+ xvmaddadp vs15, vs47, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+ xvmuldp vs10, vs42, alpha_r
+ xvmuldp vs11, vs43, alpha_r
+ xvmuldp vs12, vs44, alpha_r
+ xvmuldp vs13, vs45, alpha_r
+ xvmuldp vs14, vs46, alpha_r
+ xvmuldp vs15, vs47, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ dcbt T1, PRE
+
+ stxvd2x vs12, 0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+
+ lxvd2x vs4, 0, T2
+ lxvd2x vs5, o16, T2
+ lxvd2x vs6, o32, T2
+ lxvd2x vs7, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs48, alpha_r
+ xvmaddadp vs1, vs49, alpha_r
+ xvmaddadp vs2, vs50, alpha_r
+ xvmaddadp vs3, vs51, alpha_r
+ xvmaddadp vs4, vs52, alpha_r
+ xvmaddadp vs5, vs53, alpha_r
+ xvmaddadp vs6, vs54, alpha_r
+ xvmaddadp vs7, vs55, alpha_r
+#else
+ xvmuldp vs0, vs48, alpha_r
+ xvmuldp vs1, vs49, alpha_r
+ xvmuldp vs2, vs50, alpha_r
+ xvmuldp vs3, vs51, alpha_r
+ xvmuldp vs4, vs52, alpha_r
+ xvmuldp vs5, vs53, alpha_r
+ xvmuldp vs6, vs54, alpha_r
+ xvmuldp vs7, vs55, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ dcbt T1, PRE
+
+ stxvd2x vs4, 0, T2
+ stxvd2x vs5, o16, T2
+ stxvd2x vs6, o32, T2
+ stxvd2x vs7, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+
+ lxvd2x vs12, 0, T2
+ lxvd2x vs13, o16, T2
+ lxvd2x vs14, o32, T2
+ lxvd2x vs15, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs56, alpha_r
+ xvmaddadp vs9, vs57, alpha_r
+ xvmaddadp vs10, vs58, alpha_r
+ xvmaddadp vs11, vs59, alpha_r
+ xvmaddadp vs12, vs60, alpha_r
+ xvmaddadp vs13, vs61, alpha_r
+ xvmaddadp vs14, vs62, alpha_r
+ xvmaddadp vs15, vs63, alpha_r
+#else
+ xvmuldp vs8, vs56, alpha_r
+ xvmuldp vs9, vs57, alpha_r
+ xvmuldp vs10, vs58, alpha_r
+ xvmuldp vs11, vs59, alpha_r
+ xvmuldp vs12, vs60, alpha_r
+ xvmuldp vs13, vs61, alpha_r
+ xvmuldp vs14, vs62, alpha_r
+ xvmuldp vs15, vs63, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ dcbt T1, PRE
+
+ stxvd2x vs12, 0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ addi CO, CO, 128
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=8 *
+*********************************************************************/
+
+.macro LOAD4x8_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_I1
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ xvmuldp vs50, vs2, vs26
+ xvmuldp vs51, vs3, vs26
+
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+ xvmuldp vs58, vs2, vs27
+ xvmuldp vs59, vs3, vs27
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_1
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ xvmaddadp vs50, vs2, vs26
+ xvmaddadp vs51, vs3, vs26
+
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+ xvmaddadp vs58, vs2, vs27
+ xvmaddadp vs59, vs3, vs27
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_2
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ xvmaddadp vs50, vs10, vs30
+ xvmaddadp vs51, vs11, vs30
+
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+ xvmaddadp vs58, vs10, vs31
+ xvmaddadp vs59, vs11, vs31
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+ xvmaddadp vs50, vs10, vs30
+ xvmaddadp vs51, vs11, vs30
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+ xvmaddadp vs58, vs10, vs31
+ xvmaddadp vs59, vs11, vs31
+
+.endm
+
+.macro KERNEL4x8_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+ xvmuldp vs50, vs2, vs26
+ xvmuldp vs51, vs3, vs26
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+ xvmuldp vs58, vs2, vs27
+ xvmuldp vs59, vs3, vs27
+
+.endm
+
+.macro KERNEL4x8_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+ xvmaddadp vs50, vs2, vs26
+ xvmaddadp vs51, vs3, vs26
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+ xvmaddadp vs58, vs2, vs27
+ xvmaddadp vs59, vs3, vs27
+
+.endm
+
+.macro SAVE4x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+ xvmaddadp vs10, vs42, alpha_r
+ xvmaddadp vs11, vs43, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+ xvmuldp vs10, vs42, alpha_r
+ xvmuldp vs11, vs43, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs48, alpha_r
+ xvmaddadp vs1, vs49, alpha_r
+ xvmaddadp vs2, vs50, alpha_r
+ xvmaddadp vs3, vs51, alpha_r
+#else
+ xvmuldp vs0, vs48, alpha_r
+ xvmuldp vs1, vs49, alpha_r
+ xvmuldp vs2, vs50, alpha_r
+ xvmuldp vs3, vs51, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs56, alpha_r
+ xvmaddadp vs9, vs57, alpha_r
+ xvmaddadp vs10, vs58, alpha_r
+ xvmaddadp vs11, vs59, alpha_r
+#else
+ xvmuldp vs8, vs56, alpha_r
+ xvmuldp vs9, vs57, alpha_r
+ xvmuldp vs10, vs58, alpha_r
+ xvmuldp vs11, vs59, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ addi CO, CO, 64
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=4 *
+*********************************************************************/
+
+.macro LOAD4x4_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x4_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+
+.endm
+
+.macro KERNEL4x4_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+
+.endm
+
+.macro KERNEL4x4_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+
+.endm
+
+.macro KERNEL4x4_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+
+.endm
+
+.macro KERNEL4x4_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+
+.endm
+
+.macro KERNEL4x4_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+
+.endm
+
+.macro SAVE4x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs48, alpha_r
+ xvmaddadp vs1, vs49, alpha_r
+#else
+ xvmuldp vs0, vs48, alpha_r
+ xvmuldp vs1, vs49, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs56, alpha_r
+ xvmaddadp vs9, vs57, alpha_r
+#else
+ xvmuldp vs8, vs56, alpha_r
+ xvmuldp vs9, vs57, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+
+ addi CO, CO, 32
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=2 *
+*********************************************************************/
+
+.macro LOAD4x2_1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x2_I1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+
+ xvmuldp vs40, vs0, vs25
+
+ xvmuldp vs48, vs0, vs26
+
+ xvmuldp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x2_1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+
+ xvmaddadp vs40, vs0, vs25
+
+ xvmaddadp vs48, vs0, vs26
+
+ xvmaddadp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x2_2
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs8, vs28
+
+ xvmaddadp vs40, vs8, vs29
+
+ xvmaddadp vs48, vs8, vs30
+
+ xvmaddadp vs56, vs8, vs31
+
+.endm
+
+.macro KERNEL4x2_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+
+ xvmaddadp vs40, vs8, vs29
+
+ xvmaddadp vs48, vs8, vs30
+
+ xvmaddadp vs56, vs8, vs31
+
+.endm
+
+.macro KERNEL4x2_SUBI1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+
+ xvmuldp vs40, vs0, vs25
+
+ xvmuldp vs48, vs0, vs26
+
+ xvmuldp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x2_SUB1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+
+ xvmaddadp vs40, vs0, vs25
+
+ xvmaddadp vs48, vs0, vs26
+
+ xvmaddadp vs56, vs0, vs27
+
+.endm
+
+.macro SAVE4x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs48, alpha_r
+#else
+ xvmuldp vs0, vs48, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs56, alpha_r
+#else
+ xvmuldp vs8, vs56, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+
+ addi CO, CO, 16
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=1 *
+*********************************************************************/
+
+.macro LOAD4x1_1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+ lxsdx vs26, o16, BO
+ lxsdx vs27, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x1_I1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+ lxsdx vs29, o8, BO
+ lxsdx vs30, o16, BO
+ lxsdx vs31, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmuldp vs32, vs0, vs24
+
+ xsmuldp vs40, vs0, vs25
+
+ xsmuldp vs48, vs0, vs26
+
+ xsmuldp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x1_1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+ lxsdx vs29, o8, BO
+ lxsdx vs30, o16, BO
+ lxsdx vs31, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs0, vs24
+
+ xsmaddadp vs40, vs0, vs25
+
+ xsmaddadp vs48, vs0, vs26
+
+ xsmaddadp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x1_2
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+ lxsdx vs26, o16, BO
+ lxsdx vs27, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs8, vs28
+
+ xsmaddadp vs40, vs8, vs29
+
+ xsmaddadp vs48, vs8, vs30
+
+ xsmaddadp vs56, vs8, vs31
+
+.endm
+
+.macro KERNEL4x1_E2
+
+
+ xsmaddadp vs32, vs8, vs28
+
+ xsmaddadp vs40, vs8, vs29
+
+ xsmaddadp vs48, vs8, vs30
+
+ xsmaddadp vs56, vs8, vs31
+
+.endm
+
+.macro KERNEL4x1_SUBI1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+ lxsdx vs26, o16, BO
+ lxsdx vs27, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmuldp vs32, vs0, vs24
+
+ xsmuldp vs40, vs0, vs25
+
+ xsmuldp vs48, vs0, vs26
+
+ xsmuldp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x1_SUB1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+ lxsdx vs26, o16, BO
+ lxsdx vs27, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs0, vs24
+
+ xsmaddadp vs40, vs0, vs25
+
+ xsmaddadp vs48, vs0, vs26
+
+ xsmaddadp vs56, vs0, vs27
+
+.endm
+
+.macro SAVE4x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxsdx vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs0, vs32, alpha_r
+#else
+ xsmuldp vs0, vs32, alpha_r
+#endif
+
+ stxsdx vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxsdx vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs8, vs40, alpha_r
+#else
+ xsmuldp vs8, vs40, alpha_r
+#endif
+
+ stxsdx vs8, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxsdx vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs0, vs48, alpha_r
+#else
+ xsmuldp vs0, vs48, alpha_r
+#endif
+
+ stxsdx vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxsdx vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs8, vs56, alpha_r
+#else
+ xsmuldp vs8, vs56, alpha_r
+#endif
+
+ stxsdx vs8, 0, T1
+
+ addi CO, CO, 8
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=16 *
+*********************************************************************/
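+
+/* The N=2 macros follow the same scheme with only two broadcast B values
+ * (vs24/vs25 and vs28/vs29) and accumulators vs32-vs47 for the 16-wide
+ * tile; the narrower M tiles use correspondingly fewer registers. */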
+
+.macro LOAD2x16_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+.endm
+
+.macro KERNEL2x16_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+ xvmuldp vs44, vs4, vs25
+ xvmuldp vs45, vs5, vs25
+ xvmuldp vs46, vs6, vs25
+ xvmuldp vs47, vs7, vs25
+
+.endm
+
+.macro KERNEL2x16_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+ xvmaddadp vs44, vs4, vs25
+ xvmaddadp vs45, vs5, vs25
+ xvmaddadp vs46, vs6, vs25
+ xvmaddadp vs47, vs7, vs25
+
+.endm
+
+.macro KERNEL2x16_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+ xvmaddadp vs44, vs12, vs29
+ xvmaddadp vs45, vs13, vs29
+ xvmaddadp vs46, vs14, vs29
+ xvmaddadp vs47, vs15, vs29
+
+.endm
+
+.macro KERNEL2x16_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+ xvmaddadp vs44, vs12, vs29
+ xvmaddadp vs45, vs13, vs29
+ xvmaddadp vs46, vs14, vs29
+ xvmaddadp vs47, vs15, vs29
+
+.endm
+
+.macro KERNEL2x16_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+ xvmuldp vs44, vs4, vs25
+ xvmuldp vs45, vs5, vs25
+ xvmuldp vs46, vs6, vs25
+ xvmuldp vs47, vs7, vs25
+
+.endm
+
+.macro KERNEL2x16_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+ xvmaddadp vs44, vs4, vs25
+ xvmaddadp vs45, vs5, vs25
+ xvmaddadp vs46, vs6, vs25
+ xvmaddadp vs47, vs7, vs25
+
+.endm
+
+.macro SAVE2x16
+
+ mr T1, CO
+ addi T2, T1, 64
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+
+ lxvd2x vs4, 0, T2
+ lxvd2x vs5, o16, T2
+ lxvd2x vs6, o32, T2
+ lxvd2x vs7, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+ xvmaddadp vs4, vs36, alpha_r
+ xvmaddadp vs5, vs37, alpha_r
+ xvmaddadp vs6, vs38, alpha_r
+ xvmaddadp vs7, vs39, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+ xvmuldp vs4, vs36, alpha_r
+ xvmuldp vs5, vs37, alpha_r
+ xvmuldp vs6, vs38, alpha_r
+ xvmuldp vs7, vs39, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ stxvd2x vs4, 0, T2
+ stxvd2x vs5, o16, T2
+ stxvd2x vs6, o32, T2
+ stxvd2x vs7, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+
+ lxvd2x vs12, 0, T2
+ lxvd2x vs13, o16, T2
+ lxvd2x vs14, o32, T2
+ lxvd2x vs15, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+ xvmaddadp vs10, vs42, alpha_r
+ xvmaddadp vs11, vs43, alpha_r
+ xvmaddadp vs12, vs44, alpha_r
+ xvmaddadp vs13, vs45, alpha_r
+ xvmaddadp vs14, vs46, alpha_r
+ xvmaddadp vs15, vs47, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+ xvmuldp vs10, vs42, alpha_r
+ xvmuldp vs11, vs43, alpha_r
+ xvmuldp vs12, vs44, alpha_r
+ xvmuldp vs13, vs45, alpha_r
+ xvmuldp vs14, vs46, alpha_r
+ xvmuldp vs15, vs47, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ stxvd2x vs12, 0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ addi CO, CO, 128
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=8                                               *
+*********************************************************************/
+
+.macro LOAD2x8_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x8_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+.endm
+
+.macro KERNEL2x8_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+.endm
+
+.macro KERNEL2x8_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+.endm
+
+.macro KERNEL2x8_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+.endm
+
+.macro KERNEL2x8_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+.endm
+
+.macro KERNEL2x8_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+.endm
+
+.macro SAVE2x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+ xvmaddadp vs10, vs42, alpha_r
+ xvmaddadp vs11, vs43, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+ xvmuldp vs10, vs42, alpha_r
+ xvmuldp vs11, vs43, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ addi CO, CO, 64
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=4 *
+*********************************************************************/
+
+.macro LOAD2x4_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x4_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+.endm
+
+.macro KERNEL2x4_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+
+.endm
+
+.macro KERNEL2x4_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+
+.endm
+
+.macro KERNEL2x4_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+
+.endm
+
+.macro KERNEL2x4_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+.endm
+
+.macro KERNEL2x4_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+
+.endm
+
+.macro SAVE2x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+
+ addi CO, CO, 32
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=2 *
+*********************************************************************/
+
+.macro LOAD2x2_1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x2_I1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+
+ xvmuldp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x2_1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+
+ xvmaddadp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x2_2
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs8, vs28
+
+ xvmaddadp vs40, vs8, vs29
+
+.endm
+
+.macro KERNEL2x2_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+
+ xvmaddadp vs40, vs8, vs29
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+
+ xvmuldp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x2_SUB1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+
+ xvmaddadp vs40, vs0, vs25
+
+.endm
+
+.macro SAVE2x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+
+ addi CO, CO, 16
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=1 *
+*********************************************************************/
+
+.macro LOAD2x1_1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x1_I1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+ lxsdx vs29, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmuldp vs32, vs0, vs24
+
+ xsmuldp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x1_1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+ lxsdx vs29, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs0, vs24
+
+ xsmaddadp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x1_2
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs8, vs28
+
+ xsmaddadp vs40, vs8, vs29
+
+.endm
+
+.macro KERNEL2x1_E2
+
+
+ xsmaddadp vs32, vs8, vs28
+
+ xsmaddadp vs40, vs8, vs29
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmuldp vs32, vs0, vs24
+
+ xsmuldp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x1_SUB1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs0, vs24
+
+ xsmaddadp vs40, vs0, vs25
+
+.endm
+
+.macro SAVE2x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxsdx vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs0, vs32, alpha_r
+#else
+ xsmuldp vs0, vs32, alpha_r
+#endif
+
+ stxsdx vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxsdx vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs8, vs40, alpha_r
+#else
+ xsmuldp vs8, vs40, alpha_r
+#endif
+
+ stxsdx vs8, 0, T1
+
+ addi CO, CO, 8
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=16 *
+*********************************************************************/
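+
+/* The N=1 macros broadcast a single B value (vs24 or vs28) and accumulate
+ * into vs32 and up (vs32-vs39 for the 16-wide tile). */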
+
+.macro LOAD1x16_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+.endm
+
+.macro KERNEL1x16_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+.endm
+
+.macro KERNEL1x16_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+.endm
+
+.macro KERNEL1x16_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+.endm
+
+.macro KERNEL1x16_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+.endm
+
+.macro KERNEL1x16_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+.endm
+
+.macro KERNEL1x16_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+.endm
+
+.macro SAVE1x16
+
+ mr T1, CO
+ addi T2, T1, 64
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+
+ lxvd2x vs4, 0, T2
+ lxvd2x vs5, o16, T2
+ lxvd2x vs6, o32, T2
+ lxvd2x vs7, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+ xvmaddadp vs4, vs36, alpha_r
+ xvmaddadp vs5, vs37, alpha_r
+ xvmaddadp vs6, vs38, alpha_r
+ xvmaddadp vs7, vs39, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+ xvmuldp vs4, vs36, alpha_r
+ xvmuldp vs5, vs37, alpha_r
+ xvmuldp vs6, vs38, alpha_r
+ xvmuldp vs7, vs39, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ stxvd2x vs4, 0, T2
+ stxvd2x vs5, o16, T2
+ stxvd2x vs6, o32, T2
+ stxvd2x vs7, o48, T2
+
+ addi CO, CO, 128
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=8                                               *
+*********************************************************************/
+
+.macro LOAD1x8_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x8_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+.endm
+
+.macro KERNEL1x8_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+.endm
+
+.macro KERNEL1x8_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+.endm
+
+.macro SAVE1x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ addi CO, CO, 64
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=4 *
+*********************************************************************/
+
+.macro LOAD1x4_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x4_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+.endm
+
+.macro KERNEL1x4_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+.endm
+
+.macro KERNEL1x4_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+.endm
+
+.macro SAVE1x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+
+ addi CO, CO, 32
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=2 *
+*********************************************************************/
+
+.macro LOAD1x2_1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x2_I1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x2_1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x2_2
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs8, vs28
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+
+.endm
+
+.macro KERNEL1x2_SUBI1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+
+.endm
+
+.macro SAVE1x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+
+ addi CO, CO, 16
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=1 *
+*********************************************************************/
+
+.macro LOAD1x1_1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x1_I1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmuldp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x1_1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x1_2
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs8, vs28
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+ xsmaddadp vs32, vs8, vs28
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmuldp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs0, vs24
+
+.endm
+
+.macro SAVE1x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxsdx vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs0, vs32, alpha_r
+#else
+ xsmuldp vs0, vs32, alpha_r
+#endif
+
+ stxsdx vs0, 0, T1
+
+ addi CO, CO, 8
+
+.endm
+
diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S
new file mode 100644
index 000000000..2294128a2
--- /dev/null
+++ b/kernel/power/dtrmm_kernel_16x4_power8.S
@@ -0,0 +1,362 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#ifndef __64BIT__
+#define LOAD lwz
+#else
+#define LOAD ld
+#endif
+
+#ifdef __64BIT__
+#define STACKSIZE 320
+#define ALPHA_SP 296(SP)
+#define FZERO 304(SP)
+#else
+#define STACKSIZE 240
+#define ALPHA_SP 224(SP)
+#define FZERO 232(SP)
+#endif
+
+#define M r3
+#define N r4
+#define K r5
+
+#ifdef linux
+#ifndef __64BIT__
+#define A r6
+#define B r7
+#define C r8
+#define LDC r9
+#define OFFSET r10
+#else
+#define A r7
+#define B r8
+#define C r9
+#define LDC r10
+#define OFFSET r6
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+#define A r8
+#define B r9
+#define C r10
+#define LDC r7
+#define OFFSET r6
+#else
+#define A r7
+#define B r8
+#define C r9
+#define LDC r10
+#define OFFSET r6
+#endif
+#endif
+
+#define alpha_r vs18
+
+#define o0 0
+
+#define K1 r13
+#define KKK r14
+#define o8 r15
+#define o24 r16
+#define ALPHA r17
+#define L r18
+#define T1 r19
+#define KK r20
+#define BB r21
+#define I r22
+#define J r23
+#define AO r24
+#define BO r25
+#define CO r26
+#define o16 r27
+#define o32 r28
+#define o48 r29
+
+#define PRE r30
+#define T2 r31
+
+#include "dgemm_macros_16x4_power8.S"
+
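+/* This TRMM kernel reuses the DGEMM 16x4 load/compute/save macros included
+ * above; the TRMM-specific behaviour (no load of C and xvmuldp instead of
+ * xvmaddadp in the SAVE macros) is selected via the TRMMKERNEL conditionals,
+ * and the per-tile KK/OFFSET bookkeeping lives in dtrmm_logic_16x4_power8.S,
+ * included below. */
+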
+
+#ifndef NEEDPARAM
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+#ifdef __64BIT__
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+ std r13, 288(SP)
+#else
+ stw r31, 144(SP)
+ stw r30, 148(SP)
+ stw r29, 152(SP)
+ stw r28, 156(SP)
+ stw r27, 160(SP)
+ stw r26, 164(SP)
+ stw r25, 168(SP)
+ stw r24, 172(SP)
+ stw r23, 176(SP)
+ stw r22, 180(SP)
+ stw r21, 184(SP)
+ stw r20, 188(SP)
+ stw r19, 192(SP)
+ stw r18, 196(SP)
+ stw r17, 200(SP)
+ stw r16, 204(SP)
+ stw r15, 208(SP)
+ stw r14, 212(SP)
+ stw r13, 216(SP)
+#endif
+
+ stfd f1, ALPHA_SP
+ stw r0, FZERO
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+ lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+
+ slwi LDC, LDC, BASE_SHIFT
+
+#if defined(TRMMKERNEL)
+#if defined(linux) && defined(__64BIT__)
+ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+ lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#else
+ lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+#endif
+
+ mr KK, OFFSET
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ neg KK, KK
+#endif
+
+ cmpwi cr0, M, 0
+ ble .L999_H1
+ cmpwi cr0, N, 0
+ ble .L999_H1
+ cmpwi cr0, K, 0
+ ble .L999_H1
+
+#ifdef __64BIT__
+ addi ALPHA, SP, 296
+#else
+ addi ALPHA, SP, 224
+#endif
+
+ li PRE, 256
+ li o8 , 8
+ li o16, 16
+ li o24, 24
+ li o32, 32
+ li o48, 48
+
+ lxvdsx alpha_r, 0, ALPHA
+
+#include "dtrmm_logic_16x4_power8.S"
+
+.L999:
+ addi r3, 0, 0
+
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+#ifdef __64BIT__
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+ ld r13, 288(SP)
+#else
+ lwz r31, 144(SP)
+ lwz r30, 148(SP)
+ lwz r29, 152(SP)
+ lwz r28, 156(SP)
+ lwz r27, 160(SP)
+ lwz r26, 164(SP)
+ lwz r25, 168(SP)
+ lwz r24, 172(SP)
+ lwz r23, 176(SP)
+ lwz r22, 180(SP)
+ lwz r21, 184(SP)
+ lwz r20, 188(SP)
+ lwz r19, 192(SP)
+ lwz r18, 196(SP)
+ lwz r17, 200(SP)
+ lwz r16, 204(SP)
+ lwz r15, 208(SP)
+ lwz r14, 212(SP)
+ lwz r13, 216(SP)
+#endif
+
+ addi SP, SP, STACKSIZE
+
+ blr
+
+ EPILOGUE
+#endif
diff --git a/kernel/power/dtrmm_logic_16x4_power8.S b/kernel/power/dtrmm_logic_16x4_power8.S
new file mode 100644
index 000000000..a4340c598
--- /dev/null
+++ b/kernel/power/dtrmm_logic_16x4_power8.S
@@ -0,0 +1,2239 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
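+/* Loop structure: J iterates over panels of 4 columns of B (N/4); within each
+ * panel, I iterates over tiles of 16 rows of A (M/16), followed by remainder
+ * tiles of 8, 4, 2 and 1 rows.  Panels of 2 and then 1 columns handle the
+ * remaining columns of B.  For the TRMM offset handling, KK tracks the current
+ * diagonal position: AO and BO are advanced by KK elements before each tile
+ * (depending on LEFT/TRANSA), and KK is incremented by the tile height (LEFT)
+ * or by the panel width (right-sided case) afterwards. */
+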
+
+ srawi. J, N, 2
+ ble .LDTRMM_L4_END
+
+.LDTRMM_L4_BEGIN:
+
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 2
+ add C, C, T1
+
+#if defined(LEFT)
+ mr KK, OFFSET // OFFSET -> KK
+#endif
+
+ srawi. I, M, 4
+ ble .LDTRMM_L4x16_END
+
+.LDTRMM_L4x16_BEGIN:
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 7 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L4x16_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L4x16_SUB4
+
+.LDTRMM_L4x16_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_I1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L4x16_LOOP_END
+
+ .align 5
+
+.LDTRMM_L4x16_LOOP:
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L4x16_LOOP
+
+.LDTRMM_L4x16_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ KERNEL4x16_E2
+
+ b .LDTRMM_L4x16_SUB1
+
+.LDTRMM_L4x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL4x16_SUBI1
+ dcbt AO, PRE
+ KERNEL4x16_SUB1
+ dcbt AO, PRE
+ KERNEL4x16_SUB1
+ dcbt AO, PRE
+ KERNEL4x16_SUB1
+
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+
+ b .LDTRMM_L4x16_SUB1
+
+.LDTRMM_L4x16_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x16_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L4x16_SAVE
+ b .LDTRMM_L4x16_SUB2
+
+.LDTRMM_L4x16_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L4x16_SAVE
+
+.LDTRMM_L4x16_SUB2:
+
+ KERNEL4x16_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L4x16_SUB2
+
+.LDTRMM_L4x16_SAVE:
+
+ SAVE4x16
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 16 // KK += Number of values in A
+#endif
+
+
+ addic. I, I, -1
+ bgt .LDTRMM_L4x16_BEGIN
+
+.LDTRMM_L4x16_END:
+
+.LDTRMM_L4x8_BEGIN:
+ andi. T2, M, 15
+ ble .LDTRMM_L4x1_END
+
+ andi. T1, M, 8
+ ble .LDTRMM_L4x8_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 6 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L4x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L4x8_SUB4
+
+.LDTRMM_L4x8_LOOP_START:
+
+ LOAD4x8_1
+ KERNEL4x8_I1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L4x8_LOOP_END
+
+ .align 5
+
+.LDTRMM_L4x8_LOOP:
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L4x8_LOOP
+
+.LDTRMM_L4x8_LOOP_END:
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_E2
+
+ b .LDTRMM_L4x8_SUB1
+
+.LDTRMM_L4x8_SUB4:
+
+ KERNEL4x8_SUBI1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+
+ b .LDTRMM_L4x8_SUB1
+
+.LDTRMM_L4x8_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x8_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L4x8_SAVE
+ b .LDTRMM_L4x8_SUB2
+
+.LDTRMM_L4x8_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L4x8_SAVE
+
+.LDTRMM_L4x8_SUB2:
+
+ KERNEL4x8_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L4x8_SUB2
+
+.LDTRMM_L4x8_SAVE:
+
+ SAVE4x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L4x8_END:
+
+.LDTRMM_L4x4_BEGIN:
+
+ andi. T1, M, 4
+ ble .LDTRMM_L4x4_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 5 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L4x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L4x4_SUB4
+
+.LDTRMM_L4x4_LOOP_START:
+
+ LOAD4x4_1
+ KERNEL4x4_I1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L4x4_LOOP_END
+
+ .align 5
+
+.LDTRMM_L4x4_LOOP:
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L4x4_LOOP
+
+.LDTRMM_L4x4_LOOP_END:
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_E2
+
+ b .LDTRMM_L4x4_SUB1
+
+.LDTRMM_L4x4_SUB4:
+
+ KERNEL4x4_SUBI1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+
+ b .LDTRMM_L4x4_SUB1
+
+.LDTRMM_L4x4_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x4_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L4x4_SAVE
+ b .LDTRMM_L4x4_SUB2
+
+.LDTRMM_L4x4_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L4x4_SAVE
+
+.LDTRMM_L4x4_SUB2:
+
+ KERNEL4x4_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L4x4_SUB2
+
+.LDTRMM_L4x4_SAVE:
+
+ SAVE4x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L4x4_END:
+
+.LDTRMM_L4x2_BEGIN:
+
+ andi. T1, M, 2
+ ble .LDTRMM_L4x2_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 4 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L4x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L4x2_SUB4
+
+.LDTRMM_L4x2_LOOP_START:
+
+ LOAD4x2_1
+ KERNEL4x2_I1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L4x2_LOOP_END
+
+ .align 5
+
+.LDTRMM_L4x2_LOOP:
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L4x2_LOOP
+
+.LDTRMM_L4x2_LOOP_END:
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_E2
+
+ b .LDTRMM_L4x2_SUB1
+
+.LDTRMM_L4x2_SUB4:
+
+ KERNEL4x2_SUBI1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+
+ b .LDTRMM_L4x2_SUB1
+
+.LDTRMM_L4x2_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x2_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L4x2_SAVE
+ b .LDTRMM_L4x2_SUB2
+
+.LDTRMM_L4x2_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L4x2_SAVE
+
+.LDTRMM_L4x2_SUB2:
+
+ KERNEL4x2_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L4x2_SUB2
+
+.LDTRMM_L4x2_SAVE:
+
+ SAVE4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L4x2_END:
+
+.LDTRMM_L4x1_BEGIN:
+
+ andi. T1, M, 1
+ ble .LDTRMM_L4x1_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 3 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L4x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L4x1_SUB4
+
+.LDTRMM_L4x1_LOOP_START:
+
+ LOAD4x1_1
+ KERNEL4x1_I1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L4x1_LOOP_END
+
+ .align 5
+
+.LDTRMM_L4x1_LOOP:
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L4x1_LOOP
+
+.LDTRMM_L4x1_LOOP_END:
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_E2
+
+ b .LDTRMM_L4x1_SUB1
+
+.LDTRMM_L4x1_SUB4:
+
+ KERNEL4x1_SUBI1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+
+ b .LDTRMM_L4x1_SUB1
+
+.LDTRMM_L4x1_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x1_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L4x1_SAVE
+ b .LDTRMM_L4x1_SUB2
+
+.LDTRMM_L4x1_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L4x1_SAVE
+
+.LDTRMM_L4x1_SUB2:
+
+ KERNEL4x1_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L4x1_SUB2
+
+.LDTRMM_L4x1_SAVE:
+
+ SAVE4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L4x1_END:
+
+ slwi T1, K, 5
+ add B, B, T1
+
+#if !defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in B
+#endif
+
+
+ addic. J, J, -1
+ bgt .LDTRMM_L4_BEGIN
+
+ andi. T2, N, 3
+ ble .L999
+
+.LDTRMM_L4_END:
+
+ b .LDTRMM_L2_BEGIN
+
+.L999_H1:
+
+ b .L999
+
+.LDTRMM_L2_BEGIN:
+
+ andi. T1, N, 2
+ ble .LDTRMM_L2_END
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 1
+ add C, C, T1
+
+#if defined(LEFT)
+ mr KK, OFFSET // OFFSET -> KK
+#endif
+
+ srawi. I, M, 4
+ ble .LDTRMM_L2x16_END
+
+.LDTRMM_L2x16_BEGIN:
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 7 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L2x16_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L2x16_SUB4
+
+.LDTRMM_L2x16_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_I1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L2x16_LOOP_END
+
+ .align 5
+
+.LDTRMM_L2x16_LOOP:
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L2x16_LOOP
+
+.LDTRMM_L2x16_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ KERNEL2x16_E2
+
+ b .LDTRMM_L2x16_SUB1
+
+.LDTRMM_L2x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL2x16_SUBI1
+ dcbt AO, PRE
+ KERNEL2x16_SUB1
+ dcbt AO, PRE
+ KERNEL2x16_SUB1
+ dcbt AO, PRE
+ KERNEL2x16_SUB1
+
+ KERNEL2x16_SUB1
+ KERNEL2x16_SUB1
+ KERNEL2x16_SUB1
+ KERNEL2x16_SUB1
+
+ b .LDTRMM_L2x16_SUB1
+
+.LDTRMM_L2x16_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x16_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L2x16_SAVE
+ b .LDTRMM_L2x16_SUB2
+
+.LDTRMM_L2x16_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L2x16_SAVE
+
+.LDTRMM_L2x16_SUB2:
+
+ KERNEL2x16_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L2x16_SUB2
+
+.LDTRMM_L2x16_SAVE:
+
+ SAVE2x16
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 16 // KK += Number of values in A
+#endif
+
+
+ addic. I, I, -1
+ bgt .LDTRMM_L2x16_BEGIN
+
+.LDTRMM_L2x16_END:
+
+.LDTRMM_L2x8_BEGIN:
+ andi. T2, M, 15
+ ble .LDTRMM_L2x1_END
+
+ andi. T1, M, 8
+ ble .LDTRMM_L2x8_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 6 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L2x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L2x8_SUB4
+
+.LDTRMM_L2x8_LOOP_START:
+
+ LOAD2x8_1
+ KERNEL2x8_I1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L2x8_LOOP_END
+
+ .align 5
+
+.LDTRMM_L2x8_LOOP:
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L2x8_LOOP
+
+.LDTRMM_L2x8_LOOP_END:
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_E2
+
+ b .LDTRMM_L2x8_SUB1
+
+.LDTRMM_L2x8_SUB4:
+
+ KERNEL2x8_SUBI1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+
+ b .LDTRMM_L2x8_SUB1
+
+.LDTRMM_L2x8_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x8_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L2x8_SAVE
+ b .LDTRMM_L2x8_SUB2
+
+.LDTRMM_L2x8_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L2x8_SAVE
+
+.LDTRMM_L2x8_SUB2:
+
+ KERNEL2x8_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L2x8_SUB2
+
+.LDTRMM_L2x8_SAVE:
+
+ SAVE2x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L2x8_END:
+
+.LDTRMM_L2x4_BEGIN:
+
+ andi. T1, M, 4
+ ble .LDTRMM_L2x4_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 5 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L2x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L2x4_SUB4
+
+.LDTRMM_L2x4_LOOP_START:
+
+ LOAD2x4_1
+ KERNEL2x4_I1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L2x4_LOOP_END
+
+ .align 5
+
+.LDTRMM_L2x4_LOOP:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L2x4_LOOP
+
+.LDTRMM_L2x4_LOOP_END:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_E2
+
+ b .LDTRMM_L2x4_SUB1
+
+.LDTRMM_L2x4_SUB4:
+
+ KERNEL2x4_SUBI1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ b .LDTRMM_L2x4_SUB1
+
+.LDTRMM_L2x4_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x4_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L2x4_SAVE
+ b .LDTRMM_L2x4_SUB2
+
+.LDTRMM_L2x4_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L2x4_SAVE
+
+.LDTRMM_L2x4_SUB2:
+
+ KERNEL2x4_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L2x4_SUB2
+
+.LDTRMM_L2x4_SAVE:
+
+ SAVE2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L2x4_END:
+
+.LDTRMM_L2x2_BEGIN:
+
+ andi. T1, M, 2
+ ble .LDTRMM_L2x2_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 4 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L2x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L2x2_SUB4
+
+.LDTRMM_L2x2_LOOP_START:
+
+ LOAD2x2_1
+ KERNEL2x2_I1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L2x2_LOOP_END
+
+ .align 5
+
+.LDTRMM_L2x2_LOOP:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L2x2_LOOP
+
+.LDTRMM_L2x2_LOOP_END:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_E2
+
+ b .LDTRMM_L2x2_SUB1
+
+.LDTRMM_L2x2_SUB4:
+
+ KERNEL2x2_SUBI1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ b .LDTRMM_L2x2_SUB1
+
+.LDTRMM_L2x2_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x2_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L2x2_SAVE
+ b .LDTRMM_L2x2_SUB2
+
+.LDTRMM_L2x2_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L2x2_SAVE
+
+.LDTRMM_L2x2_SUB2:
+
+ KERNEL2x2_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L2x2_SUB2
+
+.LDTRMM_L2x2_SAVE:
+
+ SAVE2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L2x2_END:
+
+.LDTRMM_L2x1_BEGIN:
+
+ andi. T1, M, 1
+ ble .LDTRMM_L2x1_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 3 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L2x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L2x1_SUB4
+
+.LDTRMM_L2x1_LOOP_START:
+
+ LOAD2x1_1
+ KERNEL2x1_I1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L2x1_LOOP_END
+
+ .align 5
+
+.LDTRMM_L2x1_LOOP:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L2x1_LOOP
+
+.LDTRMM_L2x1_LOOP_END:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_E2
+
+ b .LDTRMM_L2x1_SUB1
+
+.LDTRMM_L2x1_SUB4:
+
+ KERNEL2x1_SUBI1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ b .LDTRMM_L2x1_SUB1
+
+.LDTRMM_L2x1_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x1_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L2x1_SAVE
+ b .LDTRMM_L2x1_SUB2
+
+.LDTRMM_L2x1_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L2x1_SAVE
+
+.LDTRMM_L2x1_SUB2:
+
+ KERNEL2x1_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L2x1_SUB2
+
+.LDTRMM_L2x1_SAVE:
+
+ SAVE2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L2x1_END:
+
+ slwi T1, K, 4
+ add B, B, T1
+
+#if !defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in B
+#endif
+
+
+.LDTRMM_L2_END:
+.LDTRMM_L1_BEGIN:
+
+ andi. T1, N, 1
+ ble .LDTRMM_L1_END
+ mr CO, C
+ mr AO, A
+
+#if defined(LEFT)
+ mr KK, OFFSET // OFFSET -> KK
+#endif
+
+ srawi. I, M, 4
+ ble .LDTRMM_L1x16_END
+
+.LDTRMM_L1x16_BEGIN:
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 7 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L1x16_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L1x16_SUB4
+
+.LDTRMM_L1x16_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_I1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L1x16_LOOP_END
+
+ .align 5
+
+.LDTRMM_L1x16_LOOP:
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L1x16_LOOP
+
+.LDTRMM_L1x16_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ KERNEL1x16_E2
+
+ b .LDTRMM_L1x16_SUB1
+
+.LDTRMM_L1x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL1x16_SUBI1
+ dcbt AO, PRE
+ KERNEL1x16_SUB1
+ dcbt AO, PRE
+ KERNEL1x16_SUB1
+ dcbt AO, PRE
+ KERNEL1x16_SUB1
+
+ KERNEL1x16_SUB1
+ KERNEL1x16_SUB1
+ KERNEL1x16_SUB1
+ KERNEL1x16_SUB1
+
+ b .LDTRMM_L1x16_SUB1
+
+.LDTRMM_L1x16_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x16_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L1x16_SAVE
+ b .LDTRMM_L1x16_SUB2
+
+.LDTRMM_L1x16_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L1x16_SAVE
+
+.LDTRMM_L1x16_SUB2:
+
+ KERNEL1x16_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L1x16_SUB2
+
+.LDTRMM_L1x16_SAVE:
+
+ SAVE1x16
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 16 // KK += Number of values in A
+#endif
+
+
+ addic. I, I, -1
+ bgt .LDTRMM_L1x16_BEGIN
+
+.LDTRMM_L1x16_END:
+
+.LDTRMM_L1x8_BEGIN:
+ andi. T2, M, 15
+ ble .LDTRMM_L1x1_END
+
+ andi. T1, M, 8
+ ble .LDTRMM_L1x8_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 6 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L1x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L1x8_SUB4
+
+.LDTRMM_L1x8_LOOP_START:
+
+ LOAD1x8_1
+ KERNEL1x8_I1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L1x8_LOOP_END
+
+ .align 5
+
+.LDTRMM_L1x8_LOOP:
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L1x8_LOOP
+
+.LDTRMM_L1x8_LOOP_END:
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_E2
+
+ b .LDTRMM_L1x8_SUB1
+
+.LDTRMM_L1x8_SUB4:
+
+ KERNEL1x8_SUBI1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+
+ b .LDTRMM_L1x8_SUB1
+
+.LDTRMM_L1x8_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x8_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L1x8_SAVE
+ b .LDTRMM_L1x8_SUB2
+
+.LDTRMM_L1x8_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L1x8_SAVE
+
+.LDTRMM_L1x8_SUB2:
+
+ KERNEL1x8_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L1x8_SUB2
+
+.LDTRMM_L1x8_SAVE:
+
+ SAVE1x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L1x8_END:
+
+.LDTRMM_L1x4_BEGIN:
+
+ andi. T1, M, 4
+ ble .LDTRMM_L1x4_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 5 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L1x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L1x4_SUB4
+
+.LDTRMM_L1x4_LOOP_START:
+
+ LOAD1x4_1
+ KERNEL1x4_I1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L1x4_LOOP_END
+
+ .align 5
+
+.LDTRMM_L1x4_LOOP:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L1x4_LOOP
+
+.LDTRMM_L1x4_LOOP_END:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_E2
+
+ b .LDTRMM_L1x4_SUB1
+
+.LDTRMM_L1x4_SUB4:
+
+ KERNEL1x4_SUBI1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ b .LDTRMM_L1x4_SUB1
+
+.LDTRMM_L1x4_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x4_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L1x4_SAVE
+ b .LDTRMM_L1x4_SUB2
+
+.LDTRMM_L1x4_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L1x4_SAVE
+
+.LDTRMM_L1x4_SUB2:
+
+ KERNEL1x4_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L1x4_SUB2
+
+.LDTRMM_L1x4_SAVE:
+
+ SAVE1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L1x4_END:
+
+.LDTRMM_L1x2_BEGIN:
+
+ andi. T1, M, 2
+ ble .LDTRMM_L1x2_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 4 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L1x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L1x2_SUB4
+
+.LDTRMM_L1x2_LOOP_START:
+
+ LOAD1x2_1
+ KERNEL1x2_I1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L1x2_LOOP_END
+
+ .align 5
+
+.LDTRMM_L1x2_LOOP:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L1x2_LOOP
+
+.LDTRMM_L1x2_LOOP_END:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_E2
+
+ b .LDTRMM_L1x2_SUB1
+
+.LDTRMM_L1x2_SUB4:
+
+ KERNEL1x2_SUBI1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ b .LDTRMM_L1x2_SUB1
+
+.LDTRMM_L1x2_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x2_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L1x2_SAVE
+ b .LDTRMM_L1x2_SUB2
+
+.LDTRMM_L1x2_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L1x2_SAVE
+
+.LDTRMM_L1x2_SUB2:
+
+ KERNEL1x2_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L1x2_SUB2
+
+.LDTRMM_L1x2_SAVE:
+
+ SAVE1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L1x2_END:
+
+.LDTRMM_L1x1_BEGIN:
+
+ andi. T1, M, 1
+ ble .LDTRMM_L1x1_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 3 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L1x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L1x1_SUB4
+
+.LDTRMM_L1x1_LOOP_START:
+
+ LOAD1x1_1
+ KERNEL1x1_I1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L1x1_LOOP_END
+
+ .align 5
+
+.LDTRMM_L1x1_LOOP:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L1x1_LOOP
+
+.LDTRMM_L1x1_LOOP_END:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_E2
+
+ b .LDTRMM_L1x1_SUB1
+
+.LDTRMM_L1x1_SUB4:
+
+ KERNEL1x1_SUBI1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ b .LDTRMM_L1x1_SUB1
+
+.LDTRMM_L1x1_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x1_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L1x1_SAVE
+ b .LDTRMM_L1x1_SUB2
+
+.LDTRMM_L1x1_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L1x1_SAVE
+
+.LDTRMM_L1x1_SUB2:
+
+ KERNEL1x1_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L1x1_SUB2
+
+.LDTRMM_L1x1_SAVE:
+
+ SAVE1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L1x1_END:
+
+#if !defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in B
+#endif
+
+
+.LDTRMM_L1_END:
diff --git a/kernel/power/gemm_ncopy_4.S b/kernel/power/gemm_ncopy_4.S
index a4dcc49c1..c6e69b4fc 100644
--- a/kernel/power/gemm_ncopy_4.S
+++ b/kernel/power/gemm_ncopy_4.S
@@ -107,6 +107,11 @@
#ifdef PPCG4
#define PREFETCHSIZE 16
#define PREFETCHWSIZE 72
+#endif
+
+#ifdef POWER8
+#define PREFETCHSIZE 16
+#define PREFETCHWSIZE 72
#endif
PROLOGUE
@@ -193,7 +198,7 @@ LL(12):
STFD c12, 14 * SIZE(B)
STFD c16, 15 * SIZE(B)
-#ifdef POWER6
+#if defined(POWER6) || defined(POWER8)
dcbtst PREA, AO1
dcbtst PREA, AO2
dcbtst PREA, AO3
diff --git a/kernel/power/gemm_tcopy_4.S b/kernel/power/gemm_tcopy_4.S
index 1b6af4801..30513447e 100644
--- a/kernel/power/gemm_tcopy_4.S
+++ b/kernel/power/gemm_tcopy_4.S
@@ -111,6 +111,11 @@
#ifdef PPCG4
#define PREFETCHSIZE 16
#define PREFETCHWSIZE 48
+#endif
+
+#ifdef POWER8
+#define PREFETCHSIZE 16
+#define PREFETCHWSIZE 48
#endif
PROLOGUE
@@ -224,7 +229,7 @@ LL(12):
STFD c15, 14 * SIZE(B1)
STFD c16, 15 * SIZE(B1)
-#ifdef POWER6
+#if defined(POWER6) || defined(POWER8)
dcbtst PREA, AO1
dcbtst PREA, AO2
dcbtst PREA, AO3
diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S
index 77587ecb1..02160bd61 100644
--- a/kernel/power/gemv_n.S
+++ b/kernel/power/gemv_n.S
@@ -174,6 +174,12 @@
#define PREFETCHSIZE_C 40
#endif
+#ifdef POWER8
+#define PREFETCHSIZE_A 96
+#define PREFETCHSIZE_C 40
+#endif
+
+
#ifndef NEEDPARAM
#ifndef __64BIT__
diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S
index 817a60b86..457753065 100644
--- a/kernel/power/gemv_t.S
+++ b/kernel/power/gemv_t.S
@@ -139,6 +139,11 @@
#define PREFETCHSIZE_C 8
#endif
+#ifdef POWER8
+#define PREFETCHSIZE_A 96
+#define PREFETCHSIZE_C 8
+#endif
+
#define y01 f0
#define y02 f1
#define y03 f2
diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S
new file mode 100644
index 000000000..a7665f749
--- /dev/null
+++ b/kernel/power/zgemm_kernel_8x2_power8.S
@@ -0,0 +1,367 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#ifndef __64BIT__
+#define LOAD lwz
+#else
+#define LOAD ld
+#endif
+
+#ifdef __64BIT__
+#define STACKSIZE 320
+#define ALPHA_R_SP 296(SP)
+#define ALPHA_I_SP 304(SP)
+#define FZERO 312(SP)
+#else
+#define STACKSIZE 256
+#define ALPHA_R_SP 224(SP)
+#define ALPHA_I_SP 232(SP)
+#define FZERO 240(SP)
+#endif
+
+#define M r3
+#define N r4
+#define K r5
+
+#ifdef linux
+#ifndef __64BIT__
+#define A r6
+#define B r7
+#define C r8
+#define LDC r9
+#define OFFSET r10
+#else
+#define A r8
+#define B r9
+#define C r10
+#define LDC r6
+#define OFFSET r7
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+#define A r10
+#define B r6
+#define C r7
+#define LDC r8
+#define OFFSET r9
+#else
+#define A r8
+#define B r9
+#define C r10
+#define LDC r6
+#define OFFSET r7
+#endif
+#endif
+
+#define o0 0
+#define alpha_r vs30
+#define alpha_i vs31
+
+#define L r15
+#define ALPHA r16
+#define o24 r17
+#define T2 r19
+#define KK r20
+#define o8 r21
+#define I r22
+#define J r23
+#define AO r24
+#define BO r25
+#define CO r26
+#define o16 r27
+#define o32 r28
+#define o48 r29
+
+#define PRE r30
+#define T1 r31
+
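+// PRE holds the software-prefetch distance (set to 256 below and used as the
+// index operand of dcbt in the loop code); o8..o48 hold small constant offsets
+// used as index registers for the VSX loads and stores.
+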
+#ifndef NEEDPARAM
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+#ifdef __64BIT__
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+#else
+ stw r31, 144(SP)
+ stw r30, 148(SP)
+ stw r29, 152(SP)
+ stw r28, 156(SP)
+ stw r27, 160(SP)
+ stw r26, 164(SP)
+ stw r25, 168(SP)
+ stw r24, 172(SP)
+ stw r23, 176(SP)
+ stw r22, 180(SP)
+ stw r21, 184(SP)
+ stw r20, 188(SP)
+ stw r19, 192(SP)
+ stw r18, 196(SP)
+ stw r17, 200(SP)
+ stw r16, 204(SP)
+ stw r15, 208(SP)
+#endif
+
+ stfd f1, ALPHA_R_SP
+ stfd f2, ALPHA_I_SP
+ stw r0, FZERO
+
+#ifdef linux
+#ifdef __64BIT__
+ ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+ ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+ lwz B, FRAMESLOT(0) + STACKSIZE(SP)
+ lwz C, FRAMESLOT(1) + STACKSIZE(SP)
+ lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
+#else
+ lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+
+#ifdef TRMMKERNEL
+#if defined(linux) && defined(__64BIT__)
+ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+ lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
+#else
+ lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ neg KK, OFFSET
+#endif
+#endif
+
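+// The LOAD*/KERNEL*/SAVE* compute macros are defined in zgemm_macros_8x2_power8.S;
+// the loop nest that drives them is pulled in below from zgemm_logic_8x2_power8.S.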
+#include "zgemm_macros_8x2_power8.S"
+
+ cmpwi cr0, M, 0
+ ble .L999
+ cmpwi cr0, N, 0
+ ble .L999
+ cmpwi cr0, K, 0
+ ble .L999
+
+ slwi LDC, LDC, ZBASE_SHIFT
+ li PRE, 256
+ li o8 , 8
+ li o16 , 16
+ li o24 , 24
+ li o32 , 32
+ li o48 , 48
+
+#ifdef __64BIT__
+ addi ALPHA, SP, 296
+#else
+ addi ALPHA, SP, 224
+#endif
+
+ lxvdsx alpha_r, 0, ALPHA
+ lxvdsx alpha_i, o8, ALPHA
+
+ .align 5
+
+#include "zgemm_logic_8x2_power8.S"
+
+.L999:
+ addi r3, 0, 0
+
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+#ifdef __64BIT__
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+#else
+ lwz r31, 144(SP)
+ lwz r30, 148(SP)
+ lwz r29, 152(SP)
+ lwz r28, 156(SP)
+ lwz r27, 160(SP)
+ lwz r26, 164(SP)
+ lwz r25, 168(SP)
+ lwz r24, 172(SP)
+ lwz r23, 176(SP)
+ lwz r22, 180(SP)
+ lwz r21, 184(SP)
+ lwz r20, 188(SP)
+ lwz r19, 192(SP)
+ lwz r18, 196(SP)
+ lwz r17, 200(SP)
+ lwz r16, 204(SP)
+ lwz r15, 208(SP)
+#endif
+
+ addi SP, SP, STACKSIZE
+
+ blr
+
+ EPILOGUE
+#endif
diff --git a/kernel/power/zgemm_logic_8x2_power8.S b/kernel/power/zgemm_logic_8x2_power8.S
new file mode 100644
index 000000000..5fcade5bf
--- /dev/null
+++ b/kernel/power/zgemm_logic_8x2_power8.S
@@ -0,0 +1,901 @@
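+// Loop nest: J = N >> 1 walks pairs of B columns (the odd column, if any, is
+// handled by the .LZGEMM_L1_* code further down). Within each column pair,
+// I = M >> 3 walks 8-row panels of A, with 4-, 2- and 1-row tail cases.
+// The K loop is unrolled by 8 (L = K >> 3); the K & 7 remainder iterations go
+// through the *_SUB2 paths.
+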
+ srawi. J, N, 1
+ ble .LZGEMM_L2_END
+
+.LZGEMM_L2_BEGIN:
+
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 1
+ add C, C, T1
+ srawi. I, M, 3
+ ble .LZGEMM_L2x8_END
+
+.LZGEMM_L2x8_BEGIN:
+
+
+ mr BO, B
+ srawi. L, K, 3
+ ble .LZGEMM_L2x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LZGEMM_L2x8_SUB4
+
+.LZGEMM_L2x8_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_I1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ addic. L, L, -2
+ ble .LZGEMM_L2x8_LOOP_END
+
+ .align 5
+
+.LZGEMM_L2x8_LOOP:
+
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ addic. L, L, -1
+ bgt .LZGEMM_L2x8_LOOP
+
+.LZGEMM_L2x8_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ KERNEL2x8_E2
+
+ b .LZGEMM_L2x8_SUB1
+
+.LZGEMM_L2x8_SUB4:
+
+ dcbt AO, PRE
+ KERNEL2x8_SUBI1
+ dcbt AO, PRE
+ KERNEL2x8_SUB1
+ dcbt AO, PRE
+ KERNEL2x8_SUB1
+ dcbt AO, PRE
+ KERNEL2x8_SUB1
+
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+
+ b .LZGEMM_L2x8_SUB1
+
+.LZGEMM_L2x8_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x8_SUBI1
+
+ addic. L, L, -1
+ ble .LZGEMM_L2x8_SAVE
+ b .LZGEMM_L2x8_SUB2
+
+.LZGEMM_L2x8_SUB1:
+
+ andi. L, K, 7
+ ble .LZGEMM_L2x8_SAVE
+
+.LZGEMM_L2x8_SUB2:
+
+ KERNEL2x8_SUB1
+
+ addic. L, L, -1
+ bgt .LZGEMM_L2x8_SUB2
+
+.LZGEMM_L2x8_SAVE:
+
+ SAVE2x8
+
+ addic. I, I, -1
+ bgt .LZGEMM_L2x8_BEGIN
+
+.LZGEMM_L2x8_END:
+
+.LZGEMM_L2x4_BEGIN:
+
+ andi. T2, M, 7
+ ble .LZGEMM_L2x1_END
+
+ andi. T1, M, 4
+ ble .LZGEMM_L2x4_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LZGEMM_L2x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LZGEMM_L2x4_SUB4
+
+.LZGEMM_L2x4_LOOP_START:
+
+ LOAD2x4_1
+ KERNEL2x4_I1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -2
+ ble .LZGEMM_L2x4_LOOP_END
+
+ .align 5
+
+.LZGEMM_L2x4_LOOP:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -1
+ bgt .LZGEMM_L2x4_LOOP
+
+.LZGEMM_L2x4_LOOP_END:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_E2
+
+ b .LZGEMM_L2x4_SUB1
+
+.LZGEMM_L2x4_SUB4:
+
+ KERNEL2x4_SUBI1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ b .LZGEMM_L2x4_SUB1
+
+.LZGEMM_L2x4_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x4_SUBI1
+
+ addic. L, L, -1
+ ble .LZGEMM_L2x4_SAVE
+ b .LZGEMM_L2x4_SUB2
+
+.LZGEMM_L2x4_SUB1:
+
+ andi. L, K, 7
+ ble .LZGEMM_L2x4_SAVE
+
+.LZGEMM_L2x4_SUB2:
+
+ KERNEL2x4_SUB1
+
+ addic. L, L, -1
+ bgt .LZGEMM_L2x4_SUB2
+
+.LZGEMM_L2x4_SAVE:
+
+ SAVE2x4
+
+.LZGEMM_L2x4_END:
+
+.LZGEMM_L2x2_BEGIN:
+
+
+ andi. T1, M, 2
+ ble .LZGEMM_L2x2_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LZGEMM_L2x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LZGEMM_L2x2_SUB4
+
+.LZGEMM_L2x2_LOOP_START:
+
+ LOAD2x2_1
+ KERNEL2x2_I1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -2
+ ble .LZGEMM_L2x2_LOOP_END
+
+ .align 5
+
+.LZGEMM_L2x2_LOOP:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -1
+ bgt .LZGEMM_L2x2_LOOP
+
+.LZGEMM_L2x2_LOOP_END:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_E2
+
+ b .LZGEMM_L2x2_SUB1
+
+.LZGEMM_L2x2_SUB4:
+
+ KERNEL2x2_SUBI1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ b .LZGEMM_L2x2_SUB1
+
+.LZGEMM_L2x2_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x2_SUBI1
+
+ addic. L, L, -1
+ ble .LZGEMM_L2x2_SAVE
+ b .LZGEMM_L2x2_SUB2
+
+.LZGEMM_L2x2_SUB1:
+
+ andi. L, K, 7
+ ble .LZGEMM_L2x2_SAVE
+
+.LZGEMM_L2x2_SUB2:
+
+ KERNEL2x2_SUB1
+
+ addic. L, L, -1
+ bgt .LZGEMM_L2x2_SUB2
+
+.LZGEMM_L2x2_SAVE:
+
+ SAVE2x2
+
+.LZGEMM_L2x2_END:
+
+.LZGEMM_L2x1_BEGIN:
+
+
+ andi. T1, M, 1
+ ble .LZGEMM_L2x1_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LZGEMM_L2x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LZGEMM_L2x1_SUB4
+
+.LZGEMM_L2x1_LOOP_START:
+
+ LOAD2x1_1
+ KERNEL2x1_I1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -2
+ ble .LZGEMM_L2x1_LOOP_END
+
+ .align 5
+
+.LZGEMM_L2x1_LOOP:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -1
+ bgt .LZGEMM_L2x1_LOOP
+
+.LZGEMM_L2x1_LOOP_END:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_E2
+
+ b .LZGEMM_L2x1_SUB1
+
+.LZGEMM_L2x1_SUB4:
+
+ KERNEL2x1_SUBI1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ b .LZGEMM_L2x1_SUB1
+
+.LZGEMM_L2x1_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x1_SUBI1
+
+ addic. L, L, -1
+ ble .LZGEMM_L2x1_SAVE
+ b .LZGEMM_L2x1_SUB2
+
+.LZGEMM_L2x1_SUB1:
+
+ andi. L, K, 7
+ ble .LZGEMM_L2x1_SAVE
+
+.LZGEMM_L2x1_SUB2:
+
+ KERNEL2x1_SUB1
+
+ addic. L, L, -1
+ bgt .LZGEMM_L2x1_SUB2
+
+.LZGEMM_L2x1_SAVE:
+
+ SAVE2x1
+
+.LZGEMM_L2x1_END:
+
+ slwi T1, K, 5
+ add B, B, T1
+
+ addic. J, J, -1
+ bgt .LZGEMM_L2_BEGIN
+
+ andi. T2, N, 1
+ ble .L999
+
+.LZGEMM_L2_END:
+
+ b .LZGEMM_L1_BEGIN
+
+.L999_H1:
+
+ b .L999
+
+.LZGEMM_L1_BEGIN:
+
+ andi. T1, N, 1
+ ble .LZGEMM_L1_END
+ mr CO, C
+ mr AO, A
+ srawi. I, M, 3
+ ble .LZGEMM_L1x8_END
+
+.LZGEMM_L1x8_BEGIN:
+
+
+ mr BO, B
+ srawi. L, K, 3
+ ble .LZGEMM_L1x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LZGEMM_L1x8_SUB4
+
+.LZGEMM_L1x8_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_I1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ addic. L, L, -2
+ ble .LZGEMM_L1x8_LOOP_END
+
+ .align 5
+
+.LZGEMM_L1x8_LOOP:
+
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ addic. L, L, -1
+ bgt .LZGEMM_L1x8_LOOP
+
+.LZGEMM_L1x8_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ KERNEL1x8_E2
+
+ b .LZGEMM_L1x8_SUB1
+
+.LZGEMM_L1x8_SUB4:
+
+ dcbt AO, PRE
+ KERNEL1x8_SUBI1
+ dcbt AO, PRE
+ KERNEL1x8_SUB1
+ dcbt AO, PRE
+ KERNEL1x8_SUB1
+ dcbt AO, PRE
+ KERNEL1x8_SUB1
+
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+
+ b .LZGEMM_L1x8_SUB1
+
+.LZGEMM_L1x8_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x8_SUBI1
+
+ addic. L, L, -1
+ ble .LZGEMM_L1x8_SAVE
+ b .LZGEMM_L1x8_SUB2
+
+.LZGEMM_L1x8_SUB1:
+
+ andi. L, K, 7
+ ble .LZGEMM_L1x8_SAVE
+
+.LZGEMM_L1x8_SUB2:
+
+ KERNEL1x8_SUB1
+
+ addic. L, L, -1
+ bgt .LZGEMM_L1x8_SUB2
+
+.LZGEMM_L1x8_SAVE:
+
+ SAVE1x8
+
+ addic. I, I, -1
+ bgt .LZGEMM_L1x8_BEGIN
+
+.LZGEMM_L1x8_END:
+
+.LZGEMM_L1x4_BEGIN:
+
+ andi. T2, M, 7
+ ble .LZGEMM_L1x1_END
+
+ andi. T1, M, 4
+ ble .LZGEMM_L1x4_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LZGEMM_L1x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LZGEMM_L1x4_SUB4
+
+.LZGEMM_L1x4_LOOP_START:
+
+ LOAD1x4_1
+ KERNEL1x4_I1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -2
+ ble .LZGEMM_L1x4_LOOP_END
+
+ .align 5
+
+.LZGEMM_L1x4_LOOP:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -1
+ bgt .LZGEMM_L1x4_LOOP
+
+.LZGEMM_L1x4_LOOP_END:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_E2
+
+ b .LZGEMM_L1x4_SUB1
+
+.LZGEMM_L1x4_SUB4:
+
+ KERNEL1x4_SUBI1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ b .LZGEMM_L1x4_SUB1
+
+.LZGEMM_L1x4_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x4_SUBI1
+
+ addic. L, L, -1
+ ble .LZGEMM_L1x4_SAVE
+ b .LZGEMM_L1x4_SUB2
+
+.LZGEMM_L1x4_SUB1:
+
+ andi. L, K, 7
+ ble .LZGEMM_L1x4_SAVE
+
+.LZGEMM_L1x4_SUB2:
+
+ KERNEL1x4_SUB1
+
+ addic. L, L, -1
+ bgt .LZGEMM_L1x4_SUB2
+
+.LZGEMM_L1x4_SAVE:
+
+ SAVE1x4
+
+.LZGEMM_L1x4_END:
+
+.LZGEMM_L1x2_BEGIN:
+
+
+ andi. T1, M, 2
+ ble .LZGEMM_L1x2_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LZGEMM_L1x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LZGEMM_L1x2_SUB4
+
+.LZGEMM_L1x2_LOOP_START:
+
+ LOAD1x2_1
+ KERNEL1x2_I1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -2
+ ble .LZGEMM_L1x2_LOOP_END
+
+ .align 5
+
+.LZGEMM_L1x2_LOOP:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -1
+ bgt .LZGEMM_L1x2_LOOP
+
+.LZGEMM_L1x2_LOOP_END:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_E2
+
+ b .LZGEMM_L1x2_SUB1
+
+.LZGEMM_L1x2_SUB4:
+
+ KERNEL1x2_SUBI1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ b .LZGEMM_L1x2_SUB1
+
+.LZGEMM_L1x2_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x2_SUBI1
+
+ addic. L, L, -1
+ ble .LZGEMM_L1x2_SAVE
+ b .LZGEMM_L1x2_SUB2
+
+.LZGEMM_L1x2_SUB1:
+
+ andi. L, K, 7
+ ble .LZGEMM_L1x2_SAVE
+
+.LZGEMM_L1x2_SUB2:
+
+ KERNEL1x2_SUB1
+
+ addic. L, L, -1
+ bgt .LZGEMM_L1x2_SUB2
+
+.LZGEMM_L1x2_SAVE:
+
+ SAVE1x2
+
+.LZGEMM_L1x2_END:
+
+.LZGEMM_L1x1_BEGIN:
+
+
+ andi. T1, M, 1
+ ble .LZGEMM_L1x1_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LZGEMM_L1x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LZGEMM_L1x1_SUB4
+
+.LZGEMM_L1x1_LOOP_START:
+
+ LOAD1x1_1
+ KERNEL1x1_I1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -2
+ ble .LZGEMM_L1x1_LOOP_END
+
+ .align 5
+
+.LZGEMM_L1x1_LOOP:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -1
+ bgt .LZGEMM_L1x1_LOOP
+
+.LZGEMM_L1x1_LOOP_END:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_E2
+
+ b .LZGEMM_L1x1_SUB1
+
+.LZGEMM_L1x1_SUB4:
+
+ KERNEL1x1_SUBI1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ b .LZGEMM_L1x1_SUB1
+
+.LZGEMM_L1x1_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x1_SUBI1
+
+ addic. L, L, -1
+ ble .LZGEMM_L1x1_SAVE
+ b .LZGEMM_L1x1_SUB2
+
+.LZGEMM_L1x1_SUB1:
+
+ andi. L, K, 7
+ ble .LZGEMM_L1x1_SAVE
+
+.LZGEMM_L1x1_SUB2:
+
+ KERNEL1x1_SUB1
+
+ addic. L, L, -1
+ bgt .LZGEMM_L1x1_SUB2
+
+.LZGEMM_L1x1_SAVE:
+
+ SAVE1x1
+
+.LZGEMM_L1x1_END:
+
+.LZGEMM_L1_END:
diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S
new file mode 100644
index 000000000..701ec65c8
--- /dev/null
+++ b/kernel/power/zgemm_macros_8x2_power8.S
@@ -0,0 +1,3110 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xssubdp
+ #define XSFADD_I1 xsadddp
+ #define XSFADD_I2 xsadddp
+
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xsadddp
+ #define XSFADD_I1 xssubdp
+ #define XSFADD_I2 xsadddp
+
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xsadddp
+ #define XSFADD_I1 xsadddp
+ #define XSFADD_I2 xssubdp
+
+#else // CC || CR || RC || RR
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xssubdp
+ #define XSFADD_I1 xssubdp
+ #define XSFADD_I2 xssubdp
+
+#endif
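+
+// The XSFADD_* selections above set the signs used when the SAVE* macros fold
+// the partial products together: XSFADD_R1/XSFADD_R2 combine realA*realB and
+// imagA*imagB into the real part of the result, and XSFADD_I1/XSFADD_I2 combine
+// realA*imagB and imagA*realB into the imaginary part, with the add/sub choice
+// depending on which of A and B is conjugated.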
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
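+
+// Register usage in the 2x8 macros: the A panel is double-buffered in
+// vs0-vs7 / vs8-vs15, the splatted B values in vs16-vs19 / vs20-vs23, and the
+// 32 accumulators live in vs32-vs63. KERNEL2x8_1 computes on one buffer while
+// loading the other, so a _1/_2 pair covers two iterations of K.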
+
+.macro LOAD2x8_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+
+.endm
+
+.macro KERNEL2x8_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs12, o0, AO // load real,imag from A
+ lxvd2x vs13, o16, AO // load real,imag from A
+ lxvd2x vs14, o32, AO // load real,imag from A
+ lxvd2x vs15, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmuldp vs40, vs4, vs16 // real*real, imag*real
+ xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmuldp vs42, vs5, vs16 // real*real, imag*real
+ xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmuldp vs44, vs6, vs16 // real*real, imag*real
+ xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmuldp vs46, vs7, vs16 // real*real, imag*real
+ xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
+
+ xvmuldp vs48, vs0, vs18 // real*real, imag*real
+ xvmuldp vs49, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs50, vs1, vs18 // real*real, imag*real
+ xvmuldp vs51, vs1, vs19 // real*imag, imag*imag
+ xvmuldp vs52, vs2, vs18 // real*real, imag*real
+ xvmuldp vs53, vs2, vs19 // real*imag, imag*imag
+ xvmuldp vs54, vs3, vs18 // real*real, imag*real
+ xvmuldp vs55, vs3, vs19 // real*imag, imag*imag
+ xvmuldp vs56, vs4, vs18 // real*real, imag*real
+ xvmuldp vs57, vs4, vs19 // real*imag, imag*imag
+ xvmuldp vs58, vs5, vs18 // real*real, imag*real
+ xvmuldp vs59, vs5, vs19 // real*imag, imag*imag
+ xvmuldp vs60, vs6, vs18 // real*real, imag*real
+ xvmuldp vs61, vs6, vs19 // real*imag, imag*imag
+ xvmuldp vs62, vs7, vs18 // real*real, imag*real
+ xvmuldp vs63, vs7, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x8_1
+
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+
+ xvmaddadp vs40, vs4, vs16 // real*real, imag*real
+ xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmaddadp vs42, vs5, vs16 // real*real, imag*real
+ xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
+
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ xvmaddadp vs44, vs6, vs16 // real*real, imag*real
+ xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
+
+ addi AO, AO, 64
+
+ xvmaddadp vs46, vs7, vs16 // real*real, imag*real
+ xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs48, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs50, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
+
+ lxvd2x vs12, o0, AO // load real,imag from A
+ lxvd2x vs13, o16, AO // load real,imag from A
+
+ xvmaddadp vs52, vs2, vs18 // real*real, imag*real
+ xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
+ xvmaddadp vs54, vs3, vs18 // real*real, imag*real
+ xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
+
+ lxvd2x vs14, o32, AO // load real,imag from A
+ lxvd2x vs15, o48, AO // load real,imag from A
+
+ xvmaddadp vs56, vs4, vs18 // real*real, imag*real
+ xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
+ xvmaddadp vs58, vs5, vs18 // real*real, imag*real
+ xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ xvmaddadp vs60, vs6, vs18 // real*real, imag*real
+ xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
+ xvmaddadp vs62, vs7, vs18 // real*real, imag*real
+ xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL2x8_2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ xvmaddadp vs40, vs12, vs20 // real*real, imag*real
+ xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
+ xvmaddadp vs42, vs13, vs20 // real*real, imag*real
+ xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
+
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ xvmaddadp vs44, vs14, vs20 // real*real, imag*real
+ xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
+ xvmaddadp vs46, vs15, vs20 // real*real, imag*real
+ xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
+
+ addi AO, AO, 64
+
+ xvmaddadp vs48, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs50, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+
+ xvmaddadp vs52, vs10, vs22 // real*real, imag*real
+ xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
+ xvmaddadp vs54, vs11, vs22 // real*real, imag*real
+ xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
+
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ xvmaddadp vs56, vs12, vs22 // real*real, imag*real
+ xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
+ xvmaddadp vs58, vs13, vs22 // real*real, imag*real
+ xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
+
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ xvmaddadp vs60, vs14, vs22 // real*real, imag*real
+ xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
+ xvmaddadp vs62, vs15, vs22 // real*real, imag*real
+ xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL2x8_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+ xvmaddadp vs40, vs12, vs20 // real*real, imag*real
+ xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
+ xvmaddadp vs42, vs13, vs20 // real*real, imag*real
+ xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
+ xvmaddadp vs44, vs14, vs20 // real*real, imag*real
+ xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
+ xvmaddadp vs46, vs15, vs20 // real*real, imag*real
+ xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs48, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs50, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag
+ xvmaddadp vs52, vs10, vs22 // real*real, imag*real
+ xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
+ xvmaddadp vs54, vs11, vs22 // real*real, imag*real
+ xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
+ xvmaddadp vs56, vs12, vs22 // real*real, imag*real
+ xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
+ xvmaddadp vs58, vs13, vs22 // real*real, imag*real
+ xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
+ xvmaddadp vs60, vs14, vs22 // real*real, imag*real
+ xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
+ xvmaddadp vs62, vs15, vs22 // real*real, imag*real
+ xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x8_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmuldp vs40, vs4, vs16 // real*real, imag*real
+ xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmuldp vs42, vs5, vs16 // real*real, imag*real
+ xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmuldp vs44, vs6, vs16 // real*real, imag*real
+ xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmuldp vs46, vs7, vs16 // real*real, imag*real
+ xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
+
+ xvmuldp vs48, vs0, vs18 // real*real, imag*real
+ xvmuldp vs49, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs50, vs1, vs18 // real*real, imag*real
+ xvmuldp vs51, vs1, vs19 // real*imag, imag*imag
+ xvmuldp vs52, vs2, vs18 // real*real, imag*real
+ xvmuldp vs53, vs2, vs19 // real*imag, imag*imag
+ xvmuldp vs54, vs3, vs18 // real*real, imag*real
+ xvmuldp vs55, vs3, vs19 // real*imag, imag*imag
+ xvmuldp vs56, vs4, vs18 // real*real, imag*real
+ xvmuldp vs57, vs4, vs19 // real*imag, imag*imag
+ xvmuldp vs58, vs5, vs18 // real*real, imag*real
+ xvmuldp vs59, vs5, vs19 // real*imag, imag*imag
+ xvmuldp vs60, vs6, vs18 // real*real, imag*real
+ xvmuldp vs61, vs6, vs19 // real*imag, imag*imag
+ xvmuldp vs62, vs7, vs18 // real*real, imag*real
+ xvmuldp vs63, vs7, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x8_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmaddadp vs40, vs4, vs16 // real*real, imag*real
+ xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmaddadp vs42, vs5, vs16 // real*real, imag*real
+ xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmaddadp vs44, vs6, vs16 // real*real, imag*real
+ xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmaddadp vs46, vs7, vs16 // real*real, imag*real
+ xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs48, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs50, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
+ xvmaddadp vs52, vs2, vs18 // real*real, imag*real
+ xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
+ xvmaddadp vs54, vs3, vs18 // real*real, imag*real
+ xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
+ xvmaddadp vs56, vs4, vs18 // real*real, imag*real
+ xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
+ xvmaddadp vs58, vs5, vs18 // real*real, imag*real
+ xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
+ xvmaddadp vs60, vs6, vs18 // real*real, imag*real
+ xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
+ xvmaddadp vs62, vs7, vs18 // real*real, imag*real
+ xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
+
+
+.endm
+
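+// SAVE2x8: for each of the 8 rows and 2 columns, reduce the accumulator pair
+// (e.g. vs32/vs33) into one complex value via xxswapd and XSFADD_*, scale by
+// alpha_r/alpha_i, add the existing C values unless TRMMKERNEL is defined,
+// store 8 complex doubles per B column, and finally advance CO by 128 bytes.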
+.macro SAVE2x8
+
+
+ mr T1, CO
+ addi T2, T1, 64
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+ lxvd2x vs20, o0, T2
+ lxvd2x vs21, o16, T2
+ lxvd2x vs22, o32, T2
+ lxvd2x vs23, o48, T2
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs36 // realA*realB
+ XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
+
+ xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs36 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs37 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs38 // realA*realB
+ XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
+
+ xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs38 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs39 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs40 // realA*realB
+ XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
+
+ xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs40 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs41 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs42 // realA*realB
+ XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
+
+ xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs42 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs43 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs44 // realA*realB
+ XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
+
+ xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs44 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs45 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs46 // realA*realB
+ XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
+
+ xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs46 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs47 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+ xvadddp vs12, vs12, vs20
+ xvadddp vs13, vs13, vs21
+ xvadddp vs14, vs14, vs22
+ xvadddp vs15, vs15, vs23
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+ stxvd2x vs12, o0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+ lxvd2x vs20, o0, T2
+ lxvd2x vs21, o16, T2
+ lxvd2x vs22, o32, T2
+ lxvd2x vs23, o48, T2
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs49, vs49 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs48 // realA*realB
+ XSFADD_R2 vs0, vs0, vs49 // imagA*imagB
+
+ xxswapd vs48, vs48 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs49, vs49 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs48 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs49 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs51, vs51 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs50 // realA*realB
+ XSFADD_R2 vs0, vs0, vs51 // imagA*imagB
+
+ xxswapd vs50, vs50 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs51, vs51 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs50 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs51 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs53, vs53 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs52 // realA*realB
+ XSFADD_R2 vs0, vs0, vs53 // imagA*imagB
+
+ xxswapd vs52, vs52 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs53, vs53 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs52 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs53 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs55, vs55 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs54 // realA*realB
+ XSFADD_R2 vs0, vs0, vs55 // imagA*imagB
+
+ xxswapd vs54, vs54 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs55, vs55 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs54 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs55 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs57, vs57 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs56 // realA*realB
+ XSFADD_R2 vs0, vs0, vs57 // imagA*imagB
+
+ xxswapd vs56, vs56 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs57, vs57 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs56 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs57 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs59, vs59 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs58 // realA*realB
+ XSFADD_R2 vs0, vs0, vs59 // imagA*imagB
+
+ xxswapd vs58, vs58 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs59, vs59 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs58 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs59 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs61, vs61 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs60 // realA*realB
+ XSFADD_R2 vs0, vs0, vs61 // imagA*imagB
+
+ xxswapd vs60, vs60 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs61, vs61 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs60 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs61 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs63, vs63 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs62 // realA*realB
+ XSFADD_R2 vs0, vs0, vs63 // imagA*imagB
+
+ xxswapd vs62, vs62 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs63, vs63 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs62 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs63 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+ xvadddp vs12, vs12, vs20
+ xvadddp vs13, vs13, vs21
+ xvadddp vs14, vs14, vs22
+ xvadddp vs15, vs15, vs23
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+ stxvd2x vs12, o0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+ addi CO, CO, 128
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro LOAD2x4_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+
+.endm
+
+.macro KERNEL2x4_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+
+ xvmuldp vs40, vs0, vs18 // real*real, imag*real
+ xvmuldp vs41, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs42, vs1, vs18 // real*real, imag*real
+ xvmuldp vs43, vs1, vs19 // real*imag, imag*imag
+ xvmuldp vs44, vs2, vs18 // real*real, imag*real
+ xvmuldp vs45, vs2, vs19 // real*imag, imag*imag
+ xvmuldp vs46, vs3, vs18 // real*real, imag*real
+ xvmuldp vs47, vs3, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs40, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs42, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag
+ xvmaddadp vs44, vs2, vs18 // real*real, imag*real
+ xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag
+ xvmaddadp vs46, vs3, vs18 // real*real, imag*real
+ xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs40, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs42, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag
+ xvmaddadp vs44, vs10, vs22 // real*real, imag*real
+ xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag
+ xvmaddadp vs46, vs11, vs22 // real*real, imag*real
+ xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs40, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs42, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag
+ xvmaddadp vs44, vs10, vs22 // real*real, imag*real
+ xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag
+ xvmaddadp vs46, vs11, vs22 // real*real, imag*real
+ xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+
+ xvmuldp vs40, vs0, vs18 // real*real, imag*real
+ xvmuldp vs41, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs42, vs1, vs18 // real*real, imag*real
+ xvmuldp vs43, vs1, vs19 // real*imag, imag*imag
+ xvmuldp vs44, vs2, vs18 // real*real, imag*real
+ xvmuldp vs45, vs2, vs19 // real*imag, imag*imag
+ xvmuldp vs46, vs3, vs18 // real*real, imag*real
+ xvmuldp vs47, vs3, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs40, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs42, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag
+ xvmaddadp vs44, vs2, vs18 // real*real, imag*real
+ xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag
+ xvmaddadp vs46, vs3, vs18 // real*real, imag*real
+ xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE2x4
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs36 // realA*realB
+ XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
+
+ xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs36 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs37 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs38 // realA*realB
+ XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
+
+ xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs38 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs39 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs40 // realA*realB
+ XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
+
+ xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs40 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs41 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs42 // realA*realB
+ XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
+
+ xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs42 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs43 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs44 // realA*realB
+ XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
+
+ xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs44 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs45 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs46 // realA*realB
+ XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
+
+ xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs46 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs47 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 64
+
+.endm
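+
+// Illustrative sketch (plain C, assuming the non-conjugated case; the XSFADD_*
+// macros carry the sign flips for the conjugate variants): for every output
+// element the even/odd accumulator pair holds the four partial sums
+//     rr = sum_k a_re*b_re;   ir = sum_k a_im*b_re;
+//     ri = sum_k a_re*b_im;   ii = sum_k a_im*b_im;
+// and the SAVE macros reduce and scale them roughly like
+//     double re   = rr - ii;                        // XSFADD_R1 / XSFADD_R2
+//     double im   = ri + ir;                        // XSFADD_I1 / XSFADD_I2
+//     double c_re = re * alpha_r - im * alpha_i;    // xsmuldp, xssubdp
+//     double c_im = re * alpha_i + im * alpha_r;    // xsmuldp, xsadddp
+//     C[j] = (trmm ? 0 : C[j]) + c_re + I * c_im;   // xvadddp + stxvd2x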
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
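+
+// Naming convention used throughout these kernels: _I1 starts the accumulators
+// with xvmuldp while preloading the next A/B block into the vs8../vs20.. set,
+// _1 and _2 are the two halves of the software pipeline (each computes on one
+// register set while loading the other), _E2 drains the pipeline without
+// further loads, and _SUBI1/_SUB1 handle the unpipelined remainder iterations.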
+
+.macro LOAD2x2_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+
+.endm
+
+.macro KERNEL2x2_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+
+ xvmuldp vs36, vs0, vs18 // real*real, imag*real
+ xvmuldp vs37, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs38, vs1, vs18 // real*real, imag*real
+ xvmuldp vs39, vs1, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs36, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs38, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs36, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs38, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs36, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs38, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+
+ xvmuldp vs36, vs0, vs18 // real*real, imag*real
+ xvmuldp vs37, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs38, vs1, vs18 // real*real, imag*real
+ xvmuldp vs39, vs1, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs36, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs38, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE2x2
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs36 // realA*realB
+ XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
+
+ xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs36 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs37 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs38 // realA*realB
+ XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
+
+ xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs38 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs39 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
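+
+// For M=1 a single complex element of A (vs0 or vs8) is multiplied against both
+// splatted B columns, giving the four accumulators vs32-vs35; SAVE2x1 then
+// writes one element per C column, stepping the pointer by LDC in between.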
+
+.macro LOAD2x1_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+
+.endm
+
+.macro KERNEL2x1_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+
+ xvmuldp vs34, vs0, vs18 // real*real, imag*real
+ xvmuldp vs35, vs0, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs34, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs34, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs34, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+
+ xvmuldp vs34, vs0, vs18 // real*real, imag*real
+ xvmuldp vs35, vs0, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs34, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE2x1
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+
+#endif
+
+ stxvd2x vs8, o0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+
+#endif
+
+ stxvd2x vs8, o0, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
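+
+// With N=1 only one (real,imag) pair of B is splatted (vs16/vs17, or vs20/vs21
+// in the pipelined variants), so each KERNEL1x8 step issues 16 FMAs against the
+// eight complex A elements held in vs0-vs7 or vs8-vs15.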
+
+.macro LOAD1x8_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+
+.endm
+
+.macro KERNEL1x8_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs12, o0, AO // load real,imag from A
+ lxvd2x vs13, o16, AO // load real,imag from A
+ lxvd2x vs14, o32, AO // load real,imag from A
+ lxvd2x vs15, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmuldp vs40, vs4, vs16 // real*real, imag*real
+ xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmuldp vs42, vs5, vs16 // real*real, imag*real
+ xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmuldp vs44, vs6, vs16 // real*real, imag*real
+ xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmuldp vs46, vs7, vs16 // real*real, imag*real
+ xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs12, o0, AO // load real,imag from A
+ lxvd2x vs13, o16, AO // load real,imag from A
+ lxvd2x vs14, o32, AO // load real,imag from A
+ lxvd2x vs15, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmaddadp vs40, vs4, vs16 // real*real, imag*real
+ xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmaddadp vs42, vs5, vs16 // real*real, imag*real
+ xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmaddadp vs44, vs6, vs16 // real*real, imag*real
+ xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmaddadp vs46, vs7, vs16 // real*real, imag*real
+ xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+ xvmaddadp vs40, vs12, vs20 // real*real, imag*real
+ xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
+ xvmaddadp vs42, vs13, vs20 // real*real, imag*real
+ xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
+ xvmaddadp vs44, vs14, vs20 // real*real, imag*real
+ xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
+ xvmaddadp vs46, vs15, vs20 // real*real, imag*real
+ xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+ xvmaddadp vs40, vs12, vs20 // real*real, imag*real
+ xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
+ xvmaddadp vs42, vs13, vs20 // real*real, imag*real
+ xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
+ xvmaddadp vs44, vs14, vs20 // real*real, imag*real
+ xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
+ xvmaddadp vs46, vs15, vs20 // real*real, imag*real
+ xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmuldp vs40, vs4, vs16 // real*real, imag*real
+ xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmuldp vs42, vs5, vs16 // real*real, imag*real
+ xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmuldp vs44, vs6, vs16 // real*real, imag*real
+ xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmuldp vs46, vs7, vs16 // real*real, imag*real
+ xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmaddadp vs40, vs4, vs16 // real*real, imag*real
+ xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmaddadp vs42, vs5, vs16 // real*real, imag*real
+ xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmaddadp vs44, vs6, vs16 // real*real, imag*real
+ xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmaddadp vs46, vs7, vs16 // real*real, imag*real
+ xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x8
+
+
+ mr T1, CO
+ addi T2, T1, 64
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+ lxvd2x vs20, o0, T2
+ lxvd2x vs21, o16, T2
+ lxvd2x vs22, o32, T2
+ lxvd2x vs23, o48, T2
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs36 // realA*realB
+ XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
+
+ xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs36 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs37 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs38 // realA*realB
+ XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
+
+ xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs38 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs39 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs40 // realA*realB
+ XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
+
+ xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs40 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs41 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs42 // realA*realB
+ XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
+
+ xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs42 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs43 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs44 // realA*realB
+ XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
+
+ xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs44 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs45 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs46 // realA*realB
+ XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
+
+ xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs46 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs47 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+ xvadddp vs12, vs12, vs20
+ xvadddp vs13, vs13, vs21
+ xvadddp vs14, vs14, vs22
+ xvadddp vs15, vs15, vs23
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+ stxvd2x vs12, o0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+ addi CO, CO, 128
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro LOAD1x4_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+
+.endm
+
+.macro KERNEL1x4_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x4
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs36 // realA*realB
+ XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
+
+ xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs36 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs37 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs38 // realA*realB
+ XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
+
+ xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs38 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs39 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro LOAD1x2_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+
+.endm
+
+.macro KERNEL1x2_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x2
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro LOAD1x1_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+
+.endm
+
+.macro KERNEL1x1_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x1
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+
+#endif
+
+ stxvd2x vs8, o0, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 16
+
+.endm
+
diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S
index 23e0177c0..f93439986 100644
--- a/kernel/power/zgemv_n.S
+++ b/kernel/power/zgemv_n.S
@@ -170,6 +170,11 @@
#define PREFETCHSIZE_C 24
#endif
+#ifdef POWER8
+#define PREFETCHSIZE_A 24
+#define PREFETCHSIZE_C 24
+#endif
+
#ifndef XCONJ
#define FMADDR FMADD
#define FMSUBR FNMSUB
diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S
index c0bad3152..9c6f510c2 100644
--- a/kernel/power/zgemv_t.S
+++ b/kernel/power/zgemv_t.S
@@ -144,6 +144,12 @@
#define PREFETCHSIZE_C 8
#endif
+#ifdef POWER8
+#define PREFETCHSIZE_A 24
+#define PREFETCHSIZE_C 8
+#endif
+
+
#if !(defined(CONJ) && defined(XCONJ))
#define FMADDR FMADD
#define FMSUBR FNMSUB
diff --git a/kernel/power/ztrmm_kernel_8x2_power8.S b/kernel/power/ztrmm_kernel_8x2_power8.S
new file mode 100644
index 000000000..8b953765e
--- /dev/null
+++ b/kernel/power/ztrmm_kernel_8x2_power8.S
@@ -0,0 +1,377 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#ifndef __64BIT__
+#define LOAD lwz
+#else
+#define LOAD ld
+#endif
+
+#ifdef __64BIT__
+#define STACKSIZE 320
+#define ALPHA_R_SP 296(SP)
+#define ALPHA_I_SP 304(SP)
+#define FZERO 312(SP)
+#else
+#define STACKSIZE 256
+#define ALPHA_R_SP 224(SP)
+#define ALPHA_I_SP 232(SP)
+#define FZERO 240(SP)
+#endif
+
+#define M r3
+#define N r4
+#define K r5
+
+#ifdef linux
+#ifndef __64BIT__
+#define A r6
+#define B r7
+#define C r8
+#define LDC r9
+#define OFFSET r10
+#else
+#define A r8
+#define B r9
+#define C r10
+#define LDC r6
+#define OFFSET r7
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+#define A r10
+#define B r6
+#define C r7
+#define LDC r8
+#define OFFSET r9
+#else
+#define A r8
+#define B r9
+#define C r10
+#define LDC r6
+#define OFFSET r7
+#endif
+#endif
+
+#define o0 0
+#define alpha_r vs30
+#define alpha_i vs31
+
+#define KKK r13
+#define K1 r14
+#define L r15
+#define ALPHA r16
+#define o24 r17
+#define T2 r19
+#define KK r20
+#define o8 r21
+#define I r22
+#define J r23
+#define AO r24
+#define BO r25
+#define CO r26
+#define o16 r27
+#define o32 r28
+#define o48 r29
+
+#define PRE r30
+#define T1 r31
+
+#ifndef NEEDPARAM
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+#ifdef __64BIT__
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+ std r13, 288(SP)
+#else
+ stw r31, 144(SP)
+ stw r30, 148(SP)
+ stw r29, 152(SP)
+ stw r28, 156(SP)
+ stw r27, 160(SP)
+ stw r26, 164(SP)
+ stw r25, 168(SP)
+ stw r24, 172(SP)
+ stw r23, 176(SP)
+ stw r22, 180(SP)
+ stw r21, 184(SP)
+ stw r20, 188(SP)
+ stw r19, 192(SP)
+ stw r18, 196(SP)
+ stw r17, 200(SP)
+ stw r16, 204(SP)
+ stw r15, 208(SP)
+ stw r14, 212(SP)
+ stw r13, 216(SP)
+#endif
+
+ stfd f1, ALPHA_R_SP
+ stfd f2, ALPHA_I_SP
+ stw r0, FZERO
+
+#ifdef linux
+#ifdef __64BIT__
+ ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+ ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+ lwz B, FRAMESLOT(0) + STACKSIZE(SP)
+ lwz C, FRAMESLOT(1) + STACKSIZE(SP)
+ lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
+#else
+ lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+
+#ifdef TRMMKERNEL
+#if defined(linux) && defined(__64BIT__)
+ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+ lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
+#else
+ lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ neg KK, OFFSET
+#endif
+#endif
+
+#include "zgemm_macros_8x2_power8.S"
+
+ cmpwi cr0, M, 0
+ ble .L999
+ cmpwi cr0, N, 0
+ ble .L999
+ cmpwi cr0, K, 0
+ ble .L999
+
+ slwi LDC, LDC, ZBASE_SHIFT
+ li PRE, 256
+ li o8 , 8
+ li o16 , 16
+ li o24 , 24
+ li o32 , 32
+ li o48 , 48
+
+#ifdef __64BIT__
+ addi ALPHA, SP, 296
+#else
+ addi ALPHA, SP, 224
+#endif
+
+ lxsdx alpha_r, 0, ALPHA
+ lxsdx alpha_i, o8, ALPHA
+
+ .align 4
+
+#include "ztrmm_logic_8x2_power8.S"
+
+.L999:
+ addi r3, 0, 0
+
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+#ifdef __64BIT__
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+ ld r13, 288(SP)
+#else
+ lwz r31, 144(SP)
+ lwz r30, 148(SP)
+ lwz r29, 152(SP)
+ lwz r28, 156(SP)
+ lwz r27, 160(SP)
+ lwz r26, 164(SP)
+ lwz r25, 168(SP)
+ lwz r24, 172(SP)
+ lwz r23, 176(SP)
+ lwz r22, 180(SP)
+ lwz r21, 184(SP)
+ lwz r20, 188(SP)
+ lwz r19, 192(SP)
+ lwz r18, 196(SP)
+ lwz r17, 200(SP)
+ lwz r16, 204(SP)
+ lwz r15, 208(SP)
+ lwz r14, 212(SP)
+ lwz r13, 216(SP)
+#endif
+
+ addi SP, SP, STACKSIZE
+
+ blr
+
+ EPILOGUE
+#endif
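Everything in the prologue above is bookkeeping: it saves f14-f31 and r13-r31, fetches LDC and OFFSET from the caller's frame, loads alpha into vs30/vs31, and converts LDC from elements to bytes with slwi LDC, LDC, ZBASE_SHIFT so the column stepping in the logic file can use plain adds. Assuming double-complex elements of 16 bytes (ZBASE_SHIFT == 4), the address arithmetic corresponds to this C sketch (names are illustrative):

```
#include <stdint.h>

typedef struct { double r, i; } zdouble;   /* one double-complex element, 16 bytes */

/* Column j of C when LDC is given in elements, mirroring
 * "slwi LDC, LDC, ZBASE_SHIFT" followed by repeated adds of LDC. */
static zdouble *column_ptr(zdouble *c, int64_t ldc_elems, int64_t j)
{
    int64_t ldc_bytes = ldc_elems << 4;                /* elements -> bytes */
    return (zdouble *)((char *)c + j * ldc_bytes);
}

int main(void)
{
    zdouble c[8] = {{0}};
    return column_ptr(c, 2, 3) == &c[6] ? 0 : 1;       /* 3 columns of 2 elements */
}
```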
diff --git a/kernel/power/ztrmm_logic_8x2_power8.S b/kernel/power/ztrmm_logic_8x2_power8.S
new file mode 100644
index 000000000..f422b17b1
--- /dev/null
+++ b/kernel/power/ztrmm_logic_8x2_power8.S
@@ -0,0 +1,1237 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+ srawi. J, N, 1
+ ble .LZTRMM_L2_END
+
+.LZTRMM_L2_BEGIN:
+
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 1
+ add C, C, T1
+
+#if defined(LEFT)
+ mr KK, OFFSET // OFFSET -> KK
+#endif
+
+ srawi. I, M, 3
+ ble .LZTRMM_L2x8_END
+
+.LZTRMM_L2x8_BEGIN:
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 7 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LZTRMM_L2x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LZTRMM_L2x8_SUB4
+
+.LZTRMM_L2x8_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_I1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ addic. L, L, -2
+ ble .LZTRMM_L2x8_LOOP_END
+
+ .align 5
+
+.LZTRMM_L2x8_LOOP:
+
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ addic. L, L, -1
+ bgt .LZTRMM_L2x8_LOOP
+
+.LZTRMM_L2x8_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ KERNEL2x8_E2
+
+ b .LZTRMM_L2x8_SUB1
+
+.LZTRMM_L2x8_SUB4:
+
+ dcbt AO, PRE
+ KERNEL2x8_SUBI1
+ dcbt AO, PRE
+ KERNEL2x8_SUB1
+ dcbt AO, PRE
+ KERNEL2x8_SUB1
+ dcbt AO, PRE
+ KERNEL2x8_SUB1
+
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+
+ b .LZTRMM_L2x8_SUB1
+
+.LZTRMM_L2x8_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x8_SUBI1
+
+ addic. L, L, -1
+ ble .LZTRMM_L2x8_SAVE
+ b .LZTRMM_L2x8_SUB2
+
+.LZTRMM_L2x8_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LZTRMM_L2x8_SAVE
+
+.LZTRMM_L2x8_SUB2:
+
+ KERNEL2x8_SUB1
+
+ addic. L, L, -1
+ bgt .LZTRMM_L2x8_SUB2
+
+.LZTRMM_L2x8_SAVE:
+
+ SAVE2x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in A
+#endif
+
+
+ addic. I, I, -1
+ bgt .LZTRMM_L2x8_BEGIN
+
+.LZTRMM_L2x8_END:
+
+.LZTRMM_L2x4_BEGIN:
+ andi. T2, M, 7
+ ble .LZTRMM_L2x1_END
+
+ andi. T1, M, 4
+ ble .LZTRMM_L2x4_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 6 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LZTRMM_L2x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LZTRMM_L2x4_SUB4
+
+.LZTRMM_L2x4_LOOP_START:
+
+ LOAD2x4_1
+ KERNEL2x4_I1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -2
+ ble .LZTRMM_L2x4_LOOP_END
+
+ .align 5
+
+.LZTRMM_L2x4_LOOP:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -1
+ bgt .LZTRMM_L2x4_LOOP
+
+.LZTRMM_L2x4_LOOP_END:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_E2
+
+ b .LZTRMM_L2x4_SUB1
+
+.LZTRMM_L2x4_SUB4:
+
+ KERNEL2x4_SUBI1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ b .LZTRMM_L2x4_SUB1
+
+.LZTRMM_L2x4_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x4_SUBI1
+
+ addic. L, L, -1
+ ble .LZTRMM_L2x4_SAVE
+ b .LZTRMM_L2x4_SUB2
+
+.LZTRMM_L2x4_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LZTRMM_L2x4_SAVE
+
+.LZTRMM_L2x4_SUB2:
+
+ KERNEL2x4_SUB1
+
+ addic. L, L, -1
+ bgt .LZTRMM_L2x4_SUB2
+
+.LZTRMM_L2x4_SAVE:
+
+ SAVE2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in A
+#endif
+
+
+.LZTRMM_L2x4_END:
+
+.LZTRMM_L2x2_BEGIN:
+
+ andi. T1, M, 2
+ ble .LZTRMM_L2x2_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 5 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LZTRMM_L2x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LZTRMM_L2x2_SUB4
+
+.LZTRMM_L2x2_LOOP_START:
+
+ LOAD2x2_1
+ KERNEL2x2_I1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -2
+ ble .LZTRMM_L2x2_LOOP_END
+
+ .align 5
+
+.LZTRMM_L2x2_LOOP:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -1
+ bgt .LZTRMM_L2x2_LOOP
+
+.LZTRMM_L2x2_LOOP_END:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_E2
+
+ b .LZTRMM_L2x2_SUB1
+
+.LZTRMM_L2x2_SUB4:
+
+ KERNEL2x2_SUBI1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ b .LZTRMM_L2x2_SUB1
+
+.LZTRMM_L2x2_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x2_SUBI1
+
+ addic. L, L, -1
+ ble .LZTRMM_L2x2_SAVE
+ b .LZTRMM_L2x2_SUB2
+
+.LZTRMM_L2x2_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LZTRMM_L2x2_SAVE
+
+.LZTRMM_L2x2_SUB2:
+
+ KERNEL2x2_SUB1
+
+ addic. L, L, -1
+ bgt .LZTRMM_L2x2_SUB2
+
+.LZTRMM_L2x2_SAVE:
+
+ SAVE2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in A
+#endif
+
+
+.LZTRMM_L2x2_END:
+
+.LZTRMM_L2x1_BEGIN:
+
+ andi. T1, M, 1
+ ble .LZTRMM_L2x1_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 4 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LZTRMM_L2x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LZTRMM_L2x1_SUB4
+
+.LZTRMM_L2x1_LOOP_START:
+
+ LOAD2x1_1
+ KERNEL2x1_I1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -2
+ ble .LZTRMM_L2x1_LOOP_END
+
+ .align 5
+
+.LZTRMM_L2x1_LOOP:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -1
+ bgt .LZTRMM_L2x1_LOOP
+
+.LZTRMM_L2x1_LOOP_END:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_E2
+
+ b .LZTRMM_L2x1_SUB1
+
+.LZTRMM_L2x1_SUB4:
+
+ KERNEL2x1_SUBI1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ b .LZTRMM_L2x1_SUB1
+
+.LZTRMM_L2x1_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x1_SUBI1
+
+ addic. L, L, -1
+ ble .LZTRMM_L2x1_SAVE
+ b .LZTRMM_L2x1_SUB2
+
+.LZTRMM_L2x1_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LZTRMM_L2x1_SAVE
+
+.LZTRMM_L2x1_SUB2:
+
+ KERNEL2x1_SUB1
+
+ addic. L, L, -1
+ bgt .LZTRMM_L2x1_SUB2
+
+.LZTRMM_L2x1_SAVE:
+
+ SAVE2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in A
+#endif
+
+
+.LZTRMM_L2x1_END:
+
+ slwi T1, K, 5
+ add B, B, T1
+
+#if !defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in B
+#endif
+
+
+ addic. J, J, -1
+ bgt .LZTRMM_L2_BEGIN
+
+ andi. T2, N, 1
+ ble .L999
+
+.LZTRMM_L2_END:
+
+ b .LZTRMM_L1_BEGIN
+
+.L999_H1:
+
+ b .L999
+
+.LZTRMM_L1_BEGIN:
+
+ andi. T1, N, 1
+ ble .LZTRMM_L1_END
+ mr CO, C
+ mr AO, A
+
+#if defined(LEFT)
+ mr KK, OFFSET // OFFSET -> KK
+#endif
+
+ srawi. I, M, 3
+ ble .LZTRMM_L1x8_END
+
+.LZTRMM_L1x8_BEGIN:
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 7 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LZTRMM_L1x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LZTRMM_L1x8_SUB4
+
+.LZTRMM_L1x8_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_I1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ addic. L, L, -2
+ ble .LZTRMM_L1x8_LOOP_END
+
+ .align 5
+
+.LZTRMM_L1x8_LOOP:
+
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ addic. L, L, -1
+ bgt .LZTRMM_L1x8_LOOP
+
+.LZTRMM_L1x8_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ KERNEL1x8_E2
+
+ b .LZTRMM_L1x8_SUB1
+
+.LZTRMM_L1x8_SUB4:
+
+ dcbt AO, PRE
+ KERNEL1x8_SUBI1
+ dcbt AO, PRE
+ KERNEL1x8_SUB1
+ dcbt AO, PRE
+ KERNEL1x8_SUB1
+ dcbt AO, PRE
+ KERNEL1x8_SUB1
+
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+
+ b .LZTRMM_L1x8_SUB1
+
+.LZTRMM_L1x8_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x8_SUBI1
+
+ addic. L, L, -1
+ ble .LZTRMM_L1x8_SAVE
+ b .LZTRMM_L1x8_SUB2
+
+.LZTRMM_L1x8_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LZTRMM_L1x8_SAVE
+
+.LZTRMM_L1x8_SUB2:
+
+ KERNEL1x8_SUB1
+
+ addic. L, L, -1
+ bgt .LZTRMM_L1x8_SUB2
+
+.LZTRMM_L1x8_SAVE:
+
+ SAVE1x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in A
+#endif
+
+
+ addic. I, I, -1
+ bgt .LZTRMM_L1x8_BEGIN
+
+.LZTRMM_L1x8_END:
+
+.LZTRMM_L1x4_BEGIN:
+ andi. T2, M, 7
+ ble .LZTRMM_L1x1_END
+
+ andi. T1, M, 4
+ ble .LZTRMM_L1x4_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 6 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LZTRMM_L1x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LZTRMM_L1x4_SUB4
+
+.LZTRMM_L1x4_LOOP_START:
+
+ LOAD1x4_1
+ KERNEL1x4_I1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -2
+ ble .LZTRMM_L1x4_LOOP_END
+
+ .align 5
+
+.LZTRMM_L1x4_LOOP:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -1
+ bgt .LZTRMM_L1x4_LOOP
+
+.LZTRMM_L1x4_LOOP_END:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_E2
+
+ b .LZTRMM_L1x4_SUB1
+
+.LZTRMM_L1x4_SUB4:
+
+ KERNEL1x4_SUBI1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ b .LZTRMM_L1x4_SUB1
+
+.LZTRMM_L1x4_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x4_SUBI1
+
+ addic. L, L, -1
+ ble .LZTRMM_L1x4_SAVE
+ b .LZTRMM_L1x4_SUB2
+
+.LZTRMM_L1x4_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LZTRMM_L1x4_SAVE
+
+.LZTRMM_L1x4_SUB2:
+
+ KERNEL1x4_SUB1
+
+ addic. L, L, -1
+ bgt .LZTRMM_L1x4_SUB2
+
+.LZTRMM_L1x4_SAVE:
+
+ SAVE1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in A
+#endif
+
+
+.LZTRMM_L1x4_END:
+
+.LZTRMM_L1x2_BEGIN:
+
+ andi. T1, M, 2
+ ble .LZTRMM_L1x2_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 5 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LZTRMM_L1x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LZTRMM_L1x2_SUB4
+
+.LZTRMM_L1x2_LOOP_START:
+
+ LOAD1x2_1
+ KERNEL1x2_I1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -2
+ ble .LZTRMM_L1x2_LOOP_END
+
+ .align 5
+
+.LZTRMM_L1x2_LOOP:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -1
+ bgt .LZTRMM_L1x2_LOOP
+
+.LZTRMM_L1x2_LOOP_END:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_E2
+
+ b .LZTRMM_L1x2_SUB1
+
+.LZTRMM_L1x2_SUB4:
+
+ KERNEL1x2_SUBI1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ b .LZTRMM_L1x2_SUB1
+
+.LZTRMM_L1x2_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x2_SUBI1
+
+ addic. L, L, -1
+ ble .LZTRMM_L1x2_SAVE
+ b .LZTRMM_L1x2_SUB2
+
+.LZTRMM_L1x2_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LZTRMM_L1x2_SAVE
+
+.LZTRMM_L1x2_SUB2:
+
+ KERNEL1x2_SUB1
+
+ addic. L, L, -1
+ bgt .LZTRMM_L1x2_SUB2
+
+.LZTRMM_L1x2_SAVE:
+
+ SAVE1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in A
+#endif
+
+
+.LZTRMM_L1x2_END:
+
+.LZTRMM_L1x1_BEGIN:
+
+ andi. T1, M, 1
+ ble .LZTRMM_L1x1_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 4 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LZTRMM_L1x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LZTRMM_L1x1_SUB4
+
+.LZTRMM_L1x1_LOOP_START:
+
+ LOAD1x1_1
+ KERNEL1x1_I1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -2
+ ble .LZTRMM_L1x1_LOOP_END
+
+ .align 5
+
+.LZTRMM_L1x1_LOOP:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -1
+ bgt .LZTRMM_L1x1_LOOP
+
+.LZTRMM_L1x1_LOOP_END:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_E2
+
+ b .LZTRMM_L1x1_SUB1
+
+.LZTRMM_L1x1_SUB4:
+
+ KERNEL1x1_SUBI1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ b .LZTRMM_L1x1_SUB1
+
+.LZTRMM_L1x1_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x1_SUBI1
+
+ addic. L, L, -1
+ ble .LZTRMM_L1x1_SAVE
+ b .LZTRMM_L1x1_SUB2
+
+.LZTRMM_L1x1_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LZTRMM_L1x1_SAVE
+
+.LZTRMM_L1x1_SUB2:
+
+ KERNEL1x1_SUB1
+
+ addic. L, L, -1
+ bgt .LZTRMM_L1x1_SUB2
+
+.LZTRMM_L1x1_SAVE:
+
+ SAVE1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in A
+#endif
+
+
+.LZTRMM_L1x1_END:
+
+#if !defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in B
+#endif
+
+
+.LZTRMM_L1_END:
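Every tile in the logic file above follows the same shape: in the TRMM branches the A and B pointers first skip KK iterations (for the 8x2 double-complex tile that is KK<<7 bytes of A and KK<<5 bytes of B), the effective trip count is either K-KK or KK plus the tile width depending on LEFT/TRANSA, and the k loop runs unrolled by eight (srawi. L, K1, 3) with an andi. K1, 7 remainder. A hypothetical C rendering of that blocking for the branch that applies the pointer skip (pointer names and strides are illustrative, strides given in doubles):

```
#include <stdint.h>

/* Model of one 8x2 ZTRMM tile's k loop: skip kk iterations, then an
 * 8-way unrolled body plus a scalar remainder.  Per k step, A advances
 * 8 complex values (16 doubles) and B advances 2 complex values (4 doubles). */
static void ztrmm_tile_8x2_model(int64_t k, int64_t kk,
                                 const double *a, const double *b)
{
    const double *ao = a + kk * 16;      /* slwi T2, KK, 7  (128 bytes per k) */
    const double *bo = b + kk * 4;       /* slwi T1, KK, 5  ( 32 bytes per k) */
    int64_t k1 = k - kk;                 /* trip count in the same LEFT/TRANSA branch */

    for (int64_t l = 0; l < (k1 >> 3); l++) {   /* eight KERNEL2x8 steps       */
        ao += 8 * 16;
        bo += 8 * 4;
    }
    for (int64_t l = 0; l < (k1 & 7); l++) {    /* KERNEL2x8_SUB1 remainder    */
        ao += 16;
        bo += 4;
    }
    (void)ao; (void)bo;
}

int main(void)
{
    static double a[4096], b[1024];
    ztrmm_tile_8x2_model(19, 3, a, b);
    return 0;
}
```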
diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL
index 2dcc8658b..4874711bb 100644
--- a/kernel/x86_64/KERNEL
+++ b/kernel/x86_64/KERNEL
@@ -389,19 +389,19 @@ DGEMVTKERNEL = dgemv_t.S
endif
ifndef CGEMVNKERNEL
-CGEMVNKERNEL = cgemv_n.S
+CGEMVNKERNEL = cgemv_n_4.c
endif
ifndef CGEMVTKERNEL
-CGEMVTKERNEL = cgemv_t.S
+CGEMVTKERNEL = cgemv_t_4.c
endif
ifndef ZGEMVNKERNEL
-ZGEMVNKERNEL = zgemv_n.S
+ZGEMVNKERNEL = zgemv_n_4.c
endif
ifndef ZGEMVTKERNEL
-ZGEMVTKERNEL = zgemv_t.S
+ZGEMVTKERNEL = zgemv_t_4.c
endif
ifndef QGEMVNKERNEL
diff --git a/kernel/x86_64/KERNEL.BARCELONA b/kernel/x86_64/KERNEL.BARCELONA
index 313c62d7c..70f3d6058 100644
--- a/kernel/x86_64/KERNEL.BARCELONA
+++ b/kernel/x86_64/KERNEL.BARCELONA
@@ -1,6 +1,3 @@
-ZGEMVNKERNEL = zgemv_n_dup.S
-ZGEMVTKERNEL = zgemv_t.S
-
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER
index c8ccae1ea..90834d9ca 100644
--- a/kernel/x86_64/KERNEL.BULLDOZER
+++ b/kernel/x86_64/KERNEL.BULLDOZER
@@ -18,7 +18,7 @@ SSYMV_L_KERNEL = ssymv_L.c
SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t_4.c
-ZGEMVNKERNEL = zgemv_n_dup.S
+ZGEMVNKERNEL = zgemv_n_4.c
ZGEMVTKERNEL = zgemv_t_4.c
DGEMVNKERNEL = dgemv_n_bulldozer.S
diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER
index 6c726a6e9..3ad142063 100644
--- a/kernel/x86_64/KERNEL.PILEDRIVER
+++ b/kernel/x86_64/KERNEL.PILEDRIVER
@@ -11,7 +11,7 @@ ZAXPYKERNEL = zaxpy.c
SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t_4.c
-ZGEMVNKERNEL = zgemv_n_dup.S
+ZGEMVNKERNEL = zgemv_n_4.c
ZGEMVTKERNEL = zgemv_t_4.c
DGEMVNKERNEL = dgemv_n_bulldozer.S
diff --git a/kernel/x86_64/KERNEL.STEAMROLLER b/kernel/x86_64/KERNEL.STEAMROLLER
index 5291cc624..f14c82303 100644
--- a/kernel/x86_64/KERNEL.STEAMROLLER
+++ b/kernel/x86_64/KERNEL.STEAMROLLER
@@ -24,7 +24,7 @@ SGEMVTKERNEL = sgemv_t_4.c
DGEMVNKERNEL = dgemv_n_4.c
DGEMVTKERNEL = dgemv_t_4.c
-ZGEMVNKERNEL = zgemv_n_dup.S
+ZGEMVNKERNEL = zgemv_n_4.c
ZGEMVTKERNEL = zgemv_t_4.c
DCOPYKERNEL = dcopy_bulldozer.S
diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c
index ff8058549..d60e4475d 100644
--- a/kernel/x86_64/cgemv_n_4.c
+++ b/kernel/x86_64/cgemv_n_4.c
@@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(HASWELL)
#include "cgemv_n_microk_haswell-4.c"
+#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
+#include "cgemv_n_microk_bulldozer-4.c"
#endif
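The one-line change to cgemv_n_4.c simply lets the Bulldozer family (BULLDOZER, PILEDRIVER, STEAMROLLER) pull in the new FMA4 microkernel. The driver follows the usual OpenBLAS convention: an included microkernel defines HAVE_KERNEL_4x4 (and friends) and supplies the matching function, and a portable C loop is compiled only when nothing claimed the symbol. A sketch of that pattern, not the literal contents of cgemv_n_4.c:

```
#include "common.h"   /* BLASLONG, FLOAT */

/* Portable fallback used only when no arch-specific microkernel was
 * included above (an include would have defined HAVE_KERNEL_4x4). */
#ifndef HAVE_KERNEL_4x4
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    BLASLONG i;
    for (i = 0; i < 2 * n; i += 2) {
        int j;
        for (j = 0; j < 4; j++) {
            FLOAT ar = ap[j][i], ai = ap[j][i + 1];
            FLOAT xr = x[2 * j], xi = x[2 * j + 1];
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            y[i]     += ar * xr - ai * xi;
            y[i + 1] += ar * xi + ai * xr;
#else
            y[i]     += ar * xr + ai * xi;
            y[i + 1] += ar * xi - ai * xr;
#endif
        }
    }
}
#endif
```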
diff --git a/kernel/x86_64/cgemv_n_microk_bulldozer-4.c b/kernel/x86_64/cgemv_n_microk_bulldozer-4.c
new file mode 100644
index 000000000..a74b41269
--- /dev/null
+++ b/kernel/x86_64/cgemv_n_microk_bulldozer-4.c
@@ -0,0 +1,541 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_4x4 1
+static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG register i = 0;
+ BLASLONG register n1 = n & -8 ;
+ BLASLONG register n2 = n & 4 ;
+
+ __asm__ __volatile__
+ (
+
+ "vbroadcastss (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
+ "vbroadcastss 8(%2), %%ymm2 \n\t" // real part x1
+ "vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1
+ "vbroadcastss 16(%2), %%ymm4 \n\t" // real part x2
+ "vbroadcastss 20(%2), %%ymm5 \n\t" // imag part x2
+ "vbroadcastss 24(%2), %%ymm6 \n\t" // real part x3
+ "vbroadcastss 28(%2), %%ymm7 \n\t" // imag part x3
+
+ "cmpq $0 , %1 \n\t"
+ "je 2f \n\t"
+
+ ".align 16 \n\t"
+ "1: \n\t"
+ "prefetcht0 384(%4,%0,4) \n\t"
+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
+ "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
+
+ "prefetcht0 384(%5,%0,4) \n\t"
+ "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
+ "vmovups 32(%5,%0,4), %%ymm11 \n\t" // 4 complex values form a1
+
+ "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "prefetcht0 384(%6,%0,4) \n\t"
+ "vmovups (%6,%0,4), %%ymm8 \n\t" // 4 complex values form a2
+ "vmovups 32(%6,%0,4), %%ymm9 \n\t" // 4 complex values form a2
+
+ "vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vfmaddps %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmaddps %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "prefetcht0 384(%7,%0,4) \n\t"
+ "vmovups (%7,%0,4), %%ymm10 \n\t" // 4 complex values form a3
+ "vmovups 32(%7,%0,4), %%ymm11 \n\t" // 4 complex values form a3
+
+ "vfmaddps %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddps %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vfmaddps %%ymm14, %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmaddps %%ymm15, %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vfmaddps %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddps %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vfmaddps %%ymm14, %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmaddps %%ymm15, %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "prefetcht0 384(%3,%0,4) \n\t"
+ "vmovups (%3,%0,4), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,4), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
+ "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
+ "vmovups %%ymm13, 32(%3,%0,4) \n\t"
+
+ "addq $16, %0 \n\t"
+ "subq $8 , %1 \n\t"
+ "jnz 1b \n\t"
+
+ "2: \n\t"
+
+ "cmpq $4, %8 \n\t"
+ "jne 3f \n\t"
+
+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
+ "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
+
+ "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vmovups (%6,%0,4), %%ymm8 \n\t" // 4 complex values form a2
+ "vmovups (%7,%0,4), %%ymm10 \n\t" // 4 complex values form a3
+
+ "vfmaddps %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddps %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vfmaddps %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddps %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vmovups (%3,%0,4), %%ymm10 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
+ "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+#endif
+
+ "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
+
+ "3: \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n1), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]), // 7
+ "r" (n2) // 8
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
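cgemv_kernel_4x4 above leans on two instruction-set features of the Bulldozer family: the four-operand FMA4 form vfmaddps (dst = a*b + c, so the accumulator register survives), and the vpermilps/vaddsubps pair that turns two real-valued products into a packed complex multiply-accumulate. In scalar terms each lane pair does the following (a minimal model, not part of the patch):

```
#include <stdio.h>

/* One lane pair of the vmulps/vpermilps/vaddsubps sequence:
 *   t0 = (a_r*x_r, a_i*x_r)                 vmulps with broadcast x_r
 *   t1 = (a_r*x_i, a_i*x_i)                 vmulps with broadcast x_i
 *   y += addsub(t0, swap(t1))               vpermilps 0xb1 + vaddsubps
 *      = (a_r*x_r - a_i*x_i, a_i*x_r + a_r*x_i)
 * i.e. y += a*x in complex arithmetic (non-conjugated case shown). */
static void caxpy_lane(float a_r, float a_i, float x_r, float x_i,
                       float *y_r, float *y_i)
{
    float t0_r = a_r * x_r, t0_i = a_i * x_r;
    float t1_r = a_r * x_i, t1_i = a_i * x_i;
    *y_r += t0_r - t1_i;     /* even lane of vaddsubps subtracts */
    *y_i += t0_i + t1_r;     /* odd lane adds                    */
}

int main(void)
{
    float yr = 0.0f, yi = 0.0f;
    caxpy_lane(1.0f, 2.0f, 3.0f, -1.0f, &yr, &yi);   /* (1+2i)*(3-i) = 5+5i */
    printf("%g %g\n", yr, yi);
    return 0;
}
```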
+#define HAVE_KERNEL_4x2 1
+static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG register i = 0;
+ BLASLONG register n1 = n & -8 ;
+ BLASLONG register n2 = n & 4 ;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vbroadcastss (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
+ "vbroadcastss 8(%2), %%ymm2 \n\t" // real part x1
+ "vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1
+
+ "cmpq $0 , %1 \n\t"
+ "je 2f \n\t"
+
+ // ".align 16 \n\t"
+ "1: \n\t"
+ "prefetcht0 384(%4,%0,4) \n\t"
+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
+ "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
+
+ "prefetcht0 384(%5,%0,4) \n\t"
+ "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
+ "vmovups 32(%5,%0,4), %%ymm11 \n\t" // 4 complex values form a1
+
+ "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vfmaddps %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmaddps %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "prefetcht0 384(%3,%0,4) \n\t"
+ "vmovups (%3,%0,4), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,4), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
+ "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
+ "vmovups %%ymm13, 32(%3,%0,4) \n\t"
+
+ "addq $16, %0 \n\t"
+ "subq $8 , %1 \n\t"
+ "jnz 1b \n\t"
+
+ "2: \n\t"
+
+ "cmpq $4, %6 \n\t"
+ "jne 3f \n\t"
+
+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
+ "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
+
+ "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vmovups (%3,%0,4), %%ymm10 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
+ "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+#endif
+
+ "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
+
+ "3: \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n1), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (n2) // 6
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+#define HAVE_KERNEL_4x1 1
+static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG register i = 0;
+ BLASLONG register n1 = n & -8 ;
+ BLASLONG register n2 = n & 4 ;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vbroadcastss (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
+
+ "cmpq $0 , %1 \n\t"
+ "je 2f \n\t"
+
+ // ".align 16 \n\t"
+ "1: \n\t"
+ "prefetcht0 384(%4,%0,4) \n\t"
+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
+ "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
+
+ "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "prefetcht0 384(%3,%0,4) \n\t"
+ "vmovups (%3,%0,4), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,4), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
+ "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "addq $16, %0 \n\t"
+ "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "subq $8 , %1 \n\t"
+ "vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
+ "vmovups %%ymm13,-32(%3,%0,4) \n\t"
+
+ "jnz 1b \n\t"
+
+ "2: \n\t"
+
+ "cmpq $4, %5 \n\t"
+ "jne 3f \n\t"
+
+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
+
+ "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vmovups (%3,%0,4), %%ymm10 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
+ "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+#endif
+
+ "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
+
+ "3: \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n1), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap), // 4
+ "r" (n2) // 5
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+#define HAVE_KERNEL_ADDY 1
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline));
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i)
+{
+ BLASLONG i;
+
+ if ( inc_dest != 2 )
+ {
+
+ FLOAT temp_r;
+ FLOAT temp_i;
+		for ( i=0; i<n; i++ )
+ {
+
+ __asm__ __volatile__
+ (
+
+ "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
+ "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1
+ "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1
+ "vbroadcastsd 32(%2), %%ymm4 \n\t" // real part x2
+ "vbroadcastsd 40(%2), %%ymm5 \n\t" // imag part x2
+ "vbroadcastsd 48(%2), %%ymm6 \n\t" // real part x3
+ "vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3
+
+
+ ".align 16 \n\t"
+ "1: \n\t"
+ "prefetcht0 512(%4,%0,8) \n\t"
+
+ "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
+ "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
+
+ "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1
+ "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1
+
+ "prefetcht0 512(%5,%0,8) \n\t"
+
+ "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vfmaddpd %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddpd %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "prefetcht0 512(%6,%0,8) \n\t"
+
+ "vfmaddpd %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmaddpd %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a2
+ "vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a2
+
+ "vfmaddpd %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddpd %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vmovups (%7,%0,8), %%ymm10 \n\t" // 2 complex values form a3
+ "vmovups 32(%7,%0,8), %%ymm11 \n\t" // 2 complex values form a3
+
+ "vfmaddpd %%ymm14, %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmaddpd %%ymm15, %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "prefetcht0 512(%7,%0,8) \n\t"
+
+ "vfmaddpd %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddpd %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vfmaddpd %%ymm14, %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmaddpd %%ymm15, %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vmovups (%3,%0,8), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,8), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
+ "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
+ "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
+ "vmovups %%ymm13, 32(%3,%0,8) \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz 1b \n\t"
+
+ "2: \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]) // 7
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+ }
+ else
+ {
+
+ __asm__ __volatile__
+ (
+
+ "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
+ "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1
+ "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1
+ "vbroadcastsd 32(%2), %%ymm4 \n\t" // real part x2
+ "vbroadcastsd 40(%2), %%ymm5 \n\t" // imag part x2
+ "vbroadcastsd 48(%2), %%ymm6 \n\t" // real part x3
+ "vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3
+
+
+ ".align 16 \n\t"
+ "1: \n\t"
+
+ "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
+ "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
+
+ "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1
+ "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1
+
+
+ "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vfmaddpd %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddpd %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+
+ "vfmaddpd %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmaddpd %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a2
+ "vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a2
+
+ "vfmaddpd %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddpd %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vmovups (%7,%0,8), %%ymm10 \n\t" // 2 complex values form a3
+ "vmovups 32(%7,%0,8), %%ymm11 \n\t" // 2 complex values form a3
+
+ "vfmaddpd %%ymm14, %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmaddpd %%ymm15, %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+
+ "vfmaddpd %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddpd %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vfmaddpd %%ymm14, %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmaddpd %%ymm15, %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vmovups (%3,%0,8), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,8), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
+ "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
+ "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
+ "vmovups %%ymm13, 32(%3,%0,8) \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz 1b \n\t"
+
+ "2: \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]) // 7
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+
+
+ }
+
+
+}
+
+#define HAVE_KERNEL_4x2 1
+static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
+ "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1
+ "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1
+
+
+ // ".align 16 \n\t"
+ "1: \n\t"
+	"vmovups	(%4,%0,8), %%ymm8              \n\t" // 2 complex values from a0
+	"vmovups      32(%4,%0,8), %%ymm9              \n\t" // 2 complex values from a0
+
+	"vmovups	(%5,%0,8), %%ymm10             \n\t" // 2 complex values from a1
+	"vmovups      32(%5,%0,8), %%ymm11             \n\t" // 2 complex values from a1
+
+ "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vfmaddpd %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddpd %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vfmaddpd %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmaddpd %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+
+ "vmovups (%3,%0,8), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,8), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
+ "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
+ "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
+ "vmovups %%ymm13, 32(%3,%0,8) \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz 1b \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]) // 5
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+#define HAVE_KERNEL_4x1 1
+static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
+
+ // ".align 16 \n\t"
+ "1: \n\t"
+	"vmovups	(%4,%0,8), %%ymm8              \n\t" // 2 complex values from a0
+	"vmovups      32(%4,%0,8), %%ymm9              \n\t" // 2 complex values from a0
+
+ "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vmovups (%3,%0,8), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,8), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
+ "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
+ "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
+ "vmovups %%ymm13, 32(%3,%0,8) \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz 1b \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap) // 4
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
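
For readers who prefer not to decode the AVX inline assembly, all three kernels accumulate one, two or four columns of A times the matching elements of x into y, four complex doubles per loop iteration, with the CONJ/XCONJ branches selecting how the real and imaginary products are combined. The plain-C sketch below mirrors only the non-conjugated path of zgemv_kernel_4x1; the name zgemv_ref_4x1 is hypothetical and the sketch is an illustrative reference, not part of the patch.

```c
#include <stddef.h>

/* Hypothetical scalar reference for the non-conjugated path of
 * zgemv_kernel_4x1: y += A(:,0) * x0 over n interleaved complex doubles. */
static void zgemv_ref_4x1(size_t n, const double *ap, const double *x, double *y)
{
	double x_r = x[0];   /* real part of x0, broadcast by vbroadcastsd */
	double x_i = x[1];   /* imaginary part of x0                       */

	for (size_t i = 0; i < n; i++) {
		double a_r = ap[2 * i];
		double a_i = ap[2 * i + 1];
		/* complex multiply-accumulate, matching the
		 * vmulpd/vpermilpd/vaddsubpd sequence in the
		 * !CONJ && !XCONJ branch                      */
		y[2 * i]     += a_r * x_r - a_i * x_i;
		y[2 * i + 1] += a_r * x_i + a_i * x_r;
	}
}
```

The 4x2 and 4x4 variants interleave the same computation for two and four columns so that the loads of a1..a3 overlap with the multiplies, which is the main point of the hand-written assembly.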
+
+
+
+
+#define HAVE_KERNEL_ADDY 1
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline));
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i)
+{
+ BLASLONG i;
+
+ if ( inc_dest != 2 )
+ {
+
+ FLOAT temp_r;
+ FLOAT temp_i;
+		for ( i=0; i<n; i++ )
diff --git a/lapack-netlib/SRC/cgeev.f b/lapack-netlib/SRC/cgeev.f
-* If INFO > 0 from CHSEQR, then quit
+* If INFO .NE. 0 from CHSEQR, then quit
*
- IF( INFO.GT.0 )
+ IF( INFO.NE.0 )
$ GO TO 50
*
IF( WANTVL .OR. WANTVR ) THEN
diff --git a/lapack-netlib/SRC/cgetc2.f b/lapack-netlib/SRC/cgetc2.f
index fac6b56820..99eb69d92 100644
--- a/lapack-netlib/SRC/cgetc2.f
+++ b/lapack-netlib/SRC/cgetc2.f
@@ -145,15 +145,33 @@
INTRINSIC ABS, CMPLX, MAX
* ..
* .. Executable Statements ..
+*
+ INFO = 0
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
*
* Set constants to control overflow
*
- INFO = 0
EPS = SLAMCH( 'P' )
SMLNUM = SLAMCH( 'S' ) / EPS
BIGNUM = ONE / SMLNUM
CALL SLABAD( SMLNUM, BIGNUM )
*
+* Handle the case N=1 by itself
+*
+ IF( N.EQ.1 ) THEN
+ IPIV( 1 ) = 1
+ JPIV( 1 ) = 1
+ IF( ABS( A( 1, 1 ) ).LT.SMLNUM ) THEN
+ INFO = 1
+ A( 1, 1 ) = CMPLX( SMLNUM, ZERO )
+ END IF
+ RETURN
+ END IF
+*
* Factorize A using complete pivoting.
* Set pivots less than SMIN to SMIN
*
diff --git a/lapack-netlib/SRC/cggev3.f b/lapack-netlib/SRC/cggev3.f
index 4a000fe10..decdae509 100644
--- a/lapack-netlib/SRC/cggev3.f
+++ b/lapack-netlib/SRC/cggev3.f
@@ -339,16 +339,16 @@
$ LDVL, VR, LDVR, WORK, -1, IERR )
LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
CALL CHGEQZ( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB,
- $ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK,
- $ -1, WORK, IERR )
+ $ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK, -1,
+ $ RWORK, IERR )
LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
ELSE
CALL CGGHD3( 'N', 'N', N, 1, N, A, LDA, B, LDB, VL, LDVL,
$ VR, LDVR, WORK, -1, IERR )
LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
CALL CHGEQZ( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB,
- $ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK,
- $ -1, WORK, IERR )
+ $ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK, -1,
+ $ RWORK, IERR )
LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
END IF
WORK( 1 ) = CMPLX( LWKOPT )
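
The cggev3.f (and zggev3.f below) fix passes RWORK rather than WORK as the real-workspace argument of CHGEQZ/ZHGEQZ during the LWORK = -1 size query, so the complex work array is no longer handed to a real dummy argument. The code path being fixed is the usual two-pass workspace query; the C sketch below is illustrative only, the prototype is an assumption based on the documented CGGEV3 interface, and the integer widths would differ on an ILP64 build.

```c
#include <complex.h>
#include <stdlib.h>

/* Assumed prototype for the Fortran symbol (LP64 integers). */
extern void cggev3_(const char *jobvl, const char *jobvr, const int *n,
                    float complex *a, const int *lda,
                    float complex *b, const int *ldb,
                    float complex *alpha, float complex *beta,
                    float complex *vl, const int *ldvl,
                    float complex *vr, const int *ldvr,
                    float complex *work, const int *lwork,
                    float *rwork, int *info);

/* Two-pass pattern: query the optimal LWORK with lwork = -1, then allocate. */
static int eigenvalues_only(int n, float complex *a, float complex *b,
                            float complex *alpha, float complex *beta)
{
	int lwork = -1, info = 0, ldv = 1;
	float complex vdummy[1], wkopt;
	float *rwork = malloc(sizeof(float) * 8 * (size_t)n); /* RWORK >= 8*N */

	/* size query: the routine returns the optimal LWORK in WORK(1) */
	cggev3_("N", "N", &n, a, &n, b, &n, alpha, beta,
	        vdummy, &ldv, vdummy, &ldv, &wkopt, &lwork, rwork, &info);

	lwork = (int)crealf(wkopt);
	float complex *work = malloc(sizeof(float complex) * (size_t)lwork);

	/* actual computation with the allocated workspace */
	cggev3_("N", "N", &n, a, &n, b, &n, alpha, beta,
	        vdummy, &ldv, vdummy, &ldv, work, &lwork, rwork, &info);

	free(work);
	free(rwork);
	return info;
}
```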
diff --git a/lapack-netlib/SRC/dgeev.f b/lapack-netlib/SRC/dgeev.f
index dd60db69e..328eaa39c 100644
--- a/lapack-netlib/SRC/dgeev.f
+++ b/lapack-netlib/SRC/dgeev.f
@@ -418,9 +418,9 @@
$ WORK( IWRK ), LWORK-IWRK+1, INFO )
END IF
*
-* If INFO > 0 from DHSEQR, then quit
+* If INFO .NE. 0 from DHSEQR, then quit
*
- IF( INFO.GT.0 )
+ IF( INFO.NE.0 )
$ GO TO 50
*
IF( WANTVL .OR. WANTVR ) THEN
diff --git a/lapack-netlib/SRC/dgetc2.f b/lapack-netlib/SRC/dgetc2.f
index 7e43a0236..3cd7eeb2b 100644
--- a/lapack-netlib/SRC/dgetc2.f
+++ b/lapack-netlib/SRC/dgetc2.f
@@ -145,15 +145,33 @@
INTRINSIC ABS, MAX
* ..
* .. Executable Statements ..
+*
+ INFO = 0
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
*
* Set constants to control overflow
*
- INFO = 0
EPS = DLAMCH( 'P' )
SMLNUM = DLAMCH( 'S' ) / EPS
BIGNUM = ONE / SMLNUM
CALL DLABAD( SMLNUM, BIGNUM )
*
+* Handle the case N=1 by itself
+*
+ IF( N.EQ.1 ) THEN
+ IPIV( 1 ) = 1
+ JPIV( 1 ) = 1
+ IF( ABS( A( 1, 1 ) ).LT.SMLNUM ) THEN
+ INFO = 1
+ A( 1, 1 ) = SMLNUM
+ END IF
+ RETURN
+ END IF
+*
* Factorize A using complete pivoting.
* Set pivots less than SMIN to SMIN.
*
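
All four getc2 precisions receive the same guard. Rendered as C (the function below is a hypothetical illustration, not part of LAPACK), the new prologue amounts to:

```c
#include <math.h>

/* Hypothetical C rendering of the new DGETC2 prologue: quick return for
 * n == 0, and an explicit n == 1 case that flags (and perturbs) a pivot
 * smaller than SMLNUM instead of entering the complete-pivoting loop. */
static int getc2_prologue(int n, double *a, int *ipiv, int *jpiv, double smlnum)
{
	if (n == 0)
		return 0;                      /* nothing to factorize */

	if (n == 1) {
		ipiv[0] = 1;
		jpiv[0] = 1;
		if (fabs(a[0]) < smlnum) {
			a[0] = smlnum;         /* perturb the singular pivot  */
			return 1;              /* INFO = 1: U was perturbed   */
		}
		return 0;
	}

	return -1;  /* n >= 2: caller continues with complete pivoting */
}
```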
diff --git a/lapack-netlib/SRC/sgeev.f b/lapack-netlib/SRC/sgeev.f
index 89dbe08c8..667de0afe 100644
--- a/lapack-netlib/SRC/sgeev.f
+++ b/lapack-netlib/SRC/sgeev.f
@@ -418,9 +418,9 @@
$ WORK( IWRK ), LWORK-IWRK+1, INFO )
END IF
*
-* If INFO > 0 from SHSEQR, then quit
+* If INFO .NE. 0 from SHSEQR, then quit
*
- IF( INFO.GT.0 )
+ IF( INFO.NE.0 )
$ GO TO 50
*
IF( WANTVL .OR. WANTVR ) THEN
diff --git a/lapack-netlib/SRC/sgetc2.f b/lapack-netlib/SRC/sgetc2.f
index 3c3880d4e..598446519 100644
--- a/lapack-netlib/SRC/sgetc2.f
+++ b/lapack-netlib/SRC/sgetc2.f
@@ -145,15 +145,33 @@
INTRINSIC ABS, MAX
* ..
* .. Executable Statements ..
+*
+ INFO = 0
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
*
* Set constants to control overflow
*
- INFO = 0
EPS = SLAMCH( 'P' )
SMLNUM = SLAMCH( 'S' ) / EPS
BIGNUM = ONE / SMLNUM
CALL SLABAD( SMLNUM, BIGNUM )
*
+* Handle the case N=1 by itself
+*
+ IF( N.EQ.1 ) THEN
+ IPIV( 1 ) = 1
+ JPIV( 1 ) = 1
+ IF( ABS( A( 1, 1 ) ).LT.SMLNUM ) THEN
+ INFO = 1
+ A( 1, 1 ) = SMLNUM
+ END IF
+ RETURN
+ END IF
+*
* Factorize A using complete pivoting.
* Set pivots less than SMIN to SMIN.
*
diff --git a/lapack-netlib/SRC/zgeev.f b/lapack-netlib/SRC/zgeev.f
index d4520805f..a518b4cd9 100644
--- a/lapack-netlib/SRC/zgeev.f
+++ b/lapack-netlib/SRC/zgeev.f
@@ -404,9 +404,9 @@
$ WORK( IWRK ), LWORK-IWRK+1, INFO )
END IF
*
-* If INFO > 0 from ZHSEQR, then quit
+* If INFO .NE. 0 from ZHSEQR, then quit
*
- IF( INFO.GT.0 )
+ IF( INFO.NE.0 )
$ GO TO 50
*
IF( WANTVL .OR. WANTVR ) THEN
diff --git a/lapack-netlib/SRC/zgetc2.f b/lapack-netlib/SRC/zgetc2.f
index 3179612f5..bf59415b5 100644
--- a/lapack-netlib/SRC/zgetc2.f
+++ b/lapack-netlib/SRC/zgetc2.f
@@ -145,15 +145,33 @@
INTRINSIC ABS, DCMPLX, MAX
* ..
* .. Executable Statements ..
+*
+ INFO = 0
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
*
* Set constants to control overflow
*
- INFO = 0
EPS = DLAMCH( 'P' )
SMLNUM = DLAMCH( 'S' ) / EPS
BIGNUM = ONE / SMLNUM
CALL DLABAD( SMLNUM, BIGNUM )
*
+* Handle the case N=1 by itself
+*
+ IF( N.EQ.1 ) THEN
+ IPIV( 1 ) = 1
+ JPIV( 1 ) = 1
+ IF( ABS( A( 1, 1 ) ).LT.SMLNUM ) THEN
+ INFO = 1
+ A( 1, 1 ) = DCMPLX( SMLNUM, ZERO )
+ END IF
+ RETURN
+ END IF
+*
* Factorize A using complete pivoting.
* Set pivots less than SMIN to SMIN
*
diff --git a/lapack-netlib/SRC/zggev3.f b/lapack-netlib/SRC/zggev3.f
index 1c4e832af..78337fd07 100644
--- a/lapack-netlib/SRC/zggev3.f
+++ b/lapack-netlib/SRC/zggev3.f
@@ -340,7 +340,7 @@
LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
CALL ZHGEQZ( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB,
$ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK, -1,
- $ WORK, IERR )
+ $ RWORK, IERR )
LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
ELSE
CALL ZGGHD3( JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, VL,
@@ -348,7 +348,7 @@
LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
CALL ZHGEQZ( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB,
$ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK, -1,
- $ WORK, IERR )
+ $ RWORK, IERR )
LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
END IF
WORK( 1 ) = DCMPLX( LWKOPT )
diff --git a/lapack-netlib/TESTING/nep.in b/lapack-netlib/TESTING/nep.in
index ed6869b80..af427fbde 100644
--- a/lapack-netlib/TESTING/nep.in
+++ b/lapack-netlib/TESTING/nep.in
@@ -10,7 +10,7 @@ NEP: Data file for testing Nonsymmetric Eigenvalue Problem routines
0 5 7 3 200 Values of INIBL (nibble crossover point)
1 2 4 2 1 Values of ISHFTS (number of simultaneous shifts)
0 1 2 0 1 Values of IACC22 (select structured matrix multiply: 0, 1 or 2)
-30.0 Threshold value
+40.0 Threshold value
T Put T to test the error exits
1 Code to interpret the seed
NEP 21
diff --git a/param.h b/param.h
index 962f80ef3..31125d8e4 100644
--- a/param.h
+++ b/param.h
@@ -1959,6 +1959,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
+#if defined(POWER8)
+
+#define SNUMOPT 4
+#define DNUMOPT 8
+
+#define GEMM_DEFAULT_OFFSET_A 384
+#define GEMM_DEFAULT_OFFSET_B 1024
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M 4
+#define SGEMM_DEFAULT_UNROLL_N 4
+#define DGEMM_DEFAULT_UNROLL_M 16
+#define DGEMM_DEFAULT_UNROLL_N 4
+#define CGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+#define ZGEMM_DEFAULT_UNROLL_M 8
+#define ZGEMM_DEFAULT_UNROLL_N 2
+
+#define SGEMM_DEFAULT_P 992
+#define DGEMM_DEFAULT_P 480
+#define CGEMM_DEFAULT_P 488
+#define ZGEMM_DEFAULT_P 240
+
+#define SGEMM_DEFAULT_Q 504
+#define DGEMM_DEFAULT_Q 720
+#define CGEMM_DEFAULT_Q 400
+#define ZGEMM_DEFAULT_Q 360
+
+#define SGEMM_DEFAULT_R 28800
+#define DGEMM_DEFAULT_R 14400
+#define ZGEMM_DEFAULT_R 7200
+
+#define SYMV_P 8
+
+#endif
+
+
#if defined(SPARC) && defined(V7)
#define SNUMOPT 4
diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt
index 6c7788d97..dfa42df67 100644
--- a/utest/CMakeLists.txt
+++ b/utest/CMakeLists.txt
@@ -5,6 +5,13 @@ set(OpenBLAS_utest_src
test_amax.c
)
+if (NOT NO_LAPACK)
+set(OpenBLAS_utest_src
+ ${OpenBLAS_utest_src}
+ test_potrs.c
+ )
+endif()
+
set(OpenBLAS_utest_bin openblas_utest)
add_executable(${OpenBLAS_utest_bin} ${OpenBLAS_utest_src})
diff --git a/utest/Makefile b/utest/Makefile
index 716b1c784..9f9808920 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -11,6 +11,10 @@ include $(TOPDIR)/Makefile.system
OBJS=utest_main.o test_amax.o
#test_rot.o test_swap.o test_axpy.o test_dotu.o test_rotmg.o test_dsdot.o test_fork.o
+ifneq ($(NO_LAPACK), 1)
+OBJS += test_potrs.o
+endif
+
all : run_test
$(UTESTBIN): $(OBJS)
diff --git a/utest/ctest.h b/utest/ctest.h
index 01c50f73b..a62103ff5 100644
--- a/utest/ctest.h
+++ b/utest/ctest.h
@@ -1,4 +1,4 @@
-/* Copyright 2011-2015 Bas van den Berg
+/* Copyright 2011-2016 Bas van den Berg
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -58,6 +58,10 @@ struct ctest {
#define __CTEST_APPLE
#endif
+#ifdef __MINGW32__
+#undef CTEST_SEGFAULT
+#endif
+
#if defined(_WIN32) && defined(_MSC_VER)
#define __CTEST_MSVC
#endif
@@ -212,6 +216,9 @@ void assert_not_equal(intmax_t exp, intmax_t real, const char* caller, int line)
void assert_not_equal_u(uintmax_t exp, uintmax_t real, const char* caller, int line);
#define ASSERT_NOT_EQUAL_U(exp, real) assert_not_equal_u(exp, real, __FILE__, __LINE__)
+void assert_interval(intmax_t exp1, intmax_t exp2, intmax_t real, const char* caller, int line);
+#define ASSERT_INTERVAL(exp1, exp2, real) assert_interval(exp1, exp2, real, __FILE__, __LINE__)
+
void assert_null(void* real, const char* caller, int line);
#define ASSERT_NULL(real) assert_null((void*)real, __FILE__, __LINE__)
@@ -511,6 +518,12 @@ void assert_not_equal_u(uintmax_t exp, uintmax_t real, const char* caller, int l
}
}
+void assert_interval(intmax_t exp1, intmax_t exp2, intmax_t real, const char* caller, int line) {
+ if (real < exp1 || real > exp2) {
+ CTEST_ERR("%s:%d expected %" PRIdMAX "-%" PRIdMAX ", got %" PRIdMAX, caller, line, exp1, exp2, real);
+ }
+}
+
void assert_dbl_near(double exp, double real, double tol, const char* caller, int line) {
double diff = exp - real;
double absdiff = diff;
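
The new assert_interval/ASSERT_INTERVAL pair fails a test when a value lies outside an inclusive range, which is handy for quantities that may legitimately vary between runs. A minimal usage sketch (the suite and test names here are made up):

```c
/* Illustrative only: suite and test names are hypothetical. */
CTEST(utest_example, value_in_range)
{
	int nthreads = 4;                 /* value under test              */
	ASSERT_INTERVAL(1, 64, nthreads); /* passes when 1 <= value <= 64  */
}
```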
diff --git a/utest/openblas_utest.h b/utest/openblas_utest.h
index fb70fdc27..abe381a92 100644
--- a/utest/openblas_utest.h
+++ b/utest/openblas_utest.h
@@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "ctest.h"
#include <complex.h>
+#include <math.h>
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13
diff --git a/utest/test_potrs.c b/utest/test_potrs.c
new file mode 100644
index 000000000..41b3f6492
--- /dev/null
+++ b/utest/test_potrs.c
@@ -0,0 +1,96 @@
+/*****************************************************************************
+Copyright (c) 2011-2016, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ 3. Neither the name of the OpenBLAS project nor the names of
+ its contributors may be used to endorse or promote products
+ derived from this software without specific prior written
+ permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "openblas_utest.h"
+
+/*
+void BLASFUNC(cpotrf)(char*, BLASINT*, complex float*, BLASINT*, BLASINT*);
+void BLASFUNC(zpotrs)(char*, BLASINT*, BLASINT*, complex double*,
+ BLASINT*, complex double*, BLASINT*, BLASINT*);
+*/
+
+
+//https://github.com/xianyi/OpenBLAS/issues/695
+CTEST(potrf, bug_695){
+
+ openblas_complex_float A1[100] = {5.8525753+0.0*I, -0.79540455-0.7066077*I, 0.98274714-1.3824869*I, 2.619998-1.8532984*I, -1.8306153+1.2336911*I, 0.32275113-0.015575029*I, 2.1968813-1.0640624*I, 0.27894387-0.97911835*I, 3.0476584-0.18548489*I, 0.3842994-0.7050991*I,
+ -0.79540455+0.7066077*I, 8.313246+0.0*I, -1.8076122+0.8882447*I, 0.47806996-0.48494184*I, 0.5096429+0.5395974*I, -0.7285097+0.10360408*I, -1.1760061+2.7146957*I, -0.4271084-0.042899966*I, -1.7228563-2.8335886*I, 1.8942566-0.6389735*I,
+ 0.98274714+1.3824869*I, -1.8076122-0.8882447*I, 9.367975+0.0*I, -0.1838578-0.6468568*I, -1.8338387-0.7064959*I, 0.041852742+0.6556877*I, 2.5673025-1.9732997*I, -1.1148382+0.15693812*I, 2.4704504+1.0389464*I, 1.0858271+1.298006*I,
+ 2.619998+1.8532984*I, 0.47806996+0.48494184*I, -0.1838578+0.6468568*I, 3.1117508+0.0*I, -1.956626-0.22825956*I, 0.07081801+0.31801307*I, 0.3698375+0.5400855*I, 0.80686307-1.5315914*I, 1.5649154+1.6229297*I, -0.112077385-1.2014246*I,
+ -1.8306153-1.2336911*I, 0.5096429-0.5395974*I, -1.8338387+0.7064959*I, -1.956626+0.22825956*I, 3.6439795+0.0*I, -0.2594722-0.48786148*I, -0.47636223+0.27821827*I, -0.61608654+2.01858*I, -2.7767487-1.7693765*I, 0.048102796+0.9741874*I,
+ 0.32275113+0.015575029*I, -0.7285097-0.10360408*I, 0.041852742-0.6556877*I, 0.07081801-0.31801307*I, -0.2594722+0.48786148*I, 3.624376+0.0*I, -1.6697118-0.4017511*I, -1.4397877+0.7550918*I, -0.31456697+1.0403451*I, -0.31978557-0.13701046*I,
+ 2.1968813+1.0640624*I, -1.1760061-2.7146957*I, 2.5673025+1.9732997*I, 0.3698375-0.5400855*I, -0.47636223-0.27821827*I, -1.6697118+0.4017511*I, 6.8273163+0.0*I, -0.10051322-0.24303961*I, 1.4415971-0.29750675*I, 1.221786+0.85654986*I,
+ 0.27894387+0.97911835*I, -0.4271084+0.042899966*I, -1.1148382-0.15693812*I, 0.80686307+1.5315914*I, -0.61608654-2.01858*I, -1.4397877-0.7550918*I, -0.10051322+0.24303961*I, 3.4057708+0.0*I, -0.5856801+1.0203559*I, 0.7103452-0.8422135*I,
+ 3.0476584+0.18548489*I, -1.7228563+2.8335886*I, 2.4704504-1.0389464*I, 1.5649154-1.6229297*I, -2.7767487+1.7693765*I, -0.31456697-1.0403451*I, 1.4415971+0.29750675*I, -0.5856801-1.0203559*I, 7.005772+0.0*I, -0.9617417+1.2486815*I,
+ 0.3842994+0.7050991*I, 1.8942566+0.6389735*I, 1.0858271-1.298006*I, -0.112077385+1.2014246*I, 0.048102796-0.9741874*I, -0.31978557+0.13701046*I, 1.221786-0.85654986*I, 0.7103452+0.8422135*I, -0.9617417-1.2486815*I, 3.4629636+0.0*I};
+ char up = 'U';
+
+ blasint n=10;
+ blasint info[1];
+ BLASFUNC(cpotrf)(&up, &n, (float*)(A1), &n, info);
+ //printf("%g+%g*I\n", creal(A1[91]), cimag(A1[91]));
+
+ openblas_complex_double A2[100] = {3.0607147216796875+0.0*I, -0.5905849933624268-0.29020825028419495*I, 0.321084201335907+0.45168760418891907*I, 0.8387917876243591-0.644718587398529*I, -0.3642411530017853+0.051274992525577545*I, 0.8071482181549072+0.33944568037986755*I, 0.013674172572791576+0.21422699093818665*I, 0.35476258397102356+0.42408594489097595*I, -0.5991537570953369-0.23082709312438965*I, -0.0600702166557312-0.2113417387008667*I,
+ -0.7954045534133911+0.7066076993942261*I, 2.807175397872925+0.0*I, -0.1691000759601593+0.313548743724823*I, -0.30911174416542053+0.7447023987770081*I, -0.22347848117351532+0.03316075727343559*I, -0.4088296890258789-1.0214389562606812*I, -0.2344931811094284+0.08056317269802094*I, 0.793269693851471-0.17507623136043549*I, 0.03163455054163933+0.20559945702552795*I, 0.13581633567810059-0.2110036462545395*I,
+ 0.9827471375465393+1.3824869394302368*I, -1.8076121807098389-0.8882446885108948*I, 2.3277781009674072+0.0*I, 0.830405056476593-0.19296252727508545*I, 0.1394239068031311-0.5260677933692932*I, 1.239942193031311-0.09915469586849213*I, 0.06731037050485611-0.059320636093616486*I, 0.11507681757211685-0.1984301060438156*I, -0.6843825578689575+0.4647614359855652*I, 1.213119387626648-0.7757048010826111*I,
+ 2.619997978210449+1.8532984256744385*I, 0.4780699610710144+0.48494184017181396*I, -0.18385779857635498+0.6468567848205566*I, 2.0811400413513184+0.0*I, -0.035075582563877106+0.09732913225889206*I, 0.27337002754211426-0.9032229781150818*I, -0.8374675512313843+0.0479498989880085*I, 0.6916252374649048+0.45711082220077515*I, 0.1883818507194519+0.06482727080583572*I, -0.32384994626045227+0.05857187137007713*I,
+ -1.8306152820587158-1.2336910963058472*I, 0.5096428990364075-0.5395973920822144*I, -1.833838701248169+0.7064958810806274*I, -1.956626057624817+0.22825956344604492*I, 1.706615924835205+0.0*I, -0.2895336151123047+0.17579378187656403*I, -0.923172116279602-0.4530014097690582*I, 0.5040621757507324-0.37026339769363403*I, -0.2824432849884033-1.0374568700790405*I, 0.1399831622838974+0.4977008104324341*I,
+ 0.32275113463401794+0.015575028955936432*I, -0.7285097241401672-0.10360407829284668*I, 0.041852742433547974-0.655687689781189*I, 0.07081800699234009-0.318013072013855*I, -0.25947219133377075+0.4878614842891693*I, 1.5735365152359009+0.0*I, -0.2647853195667267-0.26654252409935*I, -0.6190430521965027-0.24699924886226654*I, -0.6288471221923828+0.48154571652412415*I, 0.02446540631353855-0.2611822783946991*I,
+ 2.1968812942504883+1.0640623569488525*I, -1.1760060787200928-2.714695692062378*I, 2.5673024654388428+1.9732997417449951*I, 0.3698374927043915-0.54008549451828*I, -0.4763622283935547-0.27821826934814453*I, -1.6697118282318115+0.4017511010169983*I, 1.2674795389175415+0.0*I, 0.3079095482826233-0.07258892804384232*I, -0.5929520130157471-0.038360968232154846*I, 0.04388086497783661-0.025549031794071198*I,
+ 0.27894386649131775+0.9791183471679688*I, -0.42710840702056885+0.0428999662399292*I, -1.1148382425308228-0.1569381207227707*I, 0.8068630695343018+1.5315914154052734*I, -0.6160865426063538-2.0185799598693848*I, -1.439787745475769-0.7550917863845825*I, -0.10051321983337402+0.24303960800170898*I, 0.9066106081008911+0.0*I, 0.05315789580345154-0.06136537343263626*I, -0.21304509043693542+0.6494344472885132*I,
+ 3.0476584434509277+0.1854848861694336*I, -1.7228562831878662+2.8335886001586914*I, 2.4704504013061523-1.0389463901519775*I, 1.564915418624878-1.6229296922683716*I, -2.7767486572265625+1.769376516342163*I, -0.314566969871521-1.0403450727462769*I, 1.4415971040725708+0.29750674962997437*I, -0.5856801271438599-1.0203559398651123*I, 0.5668219923973083+0.0*I, 0.033351436257362366-0.07832501083612442*I,
+ 0.3842993974685669+0.7050991058349609*I, 1.894256591796875+0.6389734745025635*I, 1.085827112197876-1.2980060577392578*I, -0.11207738518714905+1.2014245986938477*I, 0.04810279607772827-0.9741873741149902*I, -0.31978556513786316+0.13701045513153076*I, 1.2217860221862793-0.856549859046936*I, 0.7103452086448669+0.84221351146698*I, -0.9617416858673096-1.2486815452575684*I, 0.0756804421544075+0.0*I};
+ openblas_complex_double B[20] = {-0.21782716937787788-0.9222220085490986*I, -0.7620356655676837+0.15533508334193666*I, -0.905011814118756+0.2847570854574069*I, -0.3451346708401685+1.076948486041297*I, 0.25336108035924787+0.975317836492159*I, 0.11192755545114-0.1603741874112385*I, -0.20604111555491242+0.10570814584017311*I, -1.0568488936791578-0.06025820467086475*I, -0.6650468984506477-0.5000967284800251*I, -1.0509472322215125+0.5022165705328413*I,
+ -0.727775859267237+0.50638268521728*I, 0.39947219167701153-0.4576746001199889*I, -0.7122162951294634-0.630289556702497*I, 0.9870834574024372-0.2825689605519449*I, 0.0628393808469436-0.1253397353973715*I, 0.8439562576196216+1.0850814110398734*I, 0.562377322638969-0.2578030745663871*I, 0.12696236014017806-0.09853584666755086*I, -0.023682508769195098+0.18093440285319276*I, -0.7264975746431271+0.31670415674097235*I};
+ char lo = 'L';
+ blasint nrhs = 2;
+ BLASFUNC(zpotrs)(&lo, &n, &nrhs, (double*)(A2), &n, (double*)(B), &n, info);
+
+ // note that this is exactly equal to A1
+ openblas_complex_float A3[100] = {5.8525753+0.0*I, -0.79540455-0.7066077*I, 0.98274714-1.3824869*I, 2.619998-1.8532984*I, -1.8306153+1.2336911*I, 0.32275113-0.015575029*I, 2.1968813-1.0640624*I, 0.27894387-0.97911835*I, 3.0476584-0.18548489*I, 0.3842994-0.7050991*I,
+ -0.79540455+0.7066077*I, 8.313246+0.0*I, -1.8076122+0.8882447*I, 0.47806996-0.48494184*I, 0.5096429+0.5395974*I, -0.7285097+0.10360408*I, -1.1760061+2.7146957*I, -0.4271084-0.042899966*I, -1.7228563-2.8335886*I, 1.8942566-0.6389735*I,
+ 0.98274714+1.3824869*I, -1.8076122-0.8882447*I, 9.367975+0.0*I, -0.1838578-0.6468568*I, -1.8338387-0.7064959*I, 0.041852742+0.6556877*I, 2.5673025-1.9732997*I, -1.1148382+0.15693812*I, 2.4704504+1.0389464*I, 1.0858271+1.298006*I,
+ 2.619998+1.8532984*I, 0.47806996+0.48494184*I, -0.1838578+0.6468568*I, 3.1117508+0.0*I, -1.956626-0.22825956*I, 0.07081801+0.31801307*I, 0.3698375+0.5400855*I, 0.80686307-1.5315914*I, 1.5649154+1.6229297*I, -0.112077385-1.2014246*I,
+ -1.8306153-1.2336911*I, 0.5096429-0.5395974*I, -1.8338387+0.7064959*I, -1.956626+0.22825956*I, 3.6439795+0.0*I, -0.2594722-0.48786148*I, -0.47636223+0.27821827*I, -0.61608654+2.01858*I, -2.7767487-1.7693765*I, 0.048102796+0.9741874*I,
+ 0.32275113+0.015575029*I, -0.7285097-0.10360408*I, 0.041852742-0.6556877*I, 0.07081801-0.31801307*I, -0.2594722+0.48786148*I, 3.624376+0.0*I, -1.6697118-0.4017511*I, -1.4397877+0.7550918*I, -0.31456697+1.0403451*I, -0.31978557-0.13701046*I,
+ 2.1968813+1.0640624*I, -1.1760061-2.7146957*I, 2.5673025+1.9732997*I, 0.3698375-0.5400855*I, -0.47636223-0.27821827*I, -1.6697118+0.4017511*I, 6.8273163+0.0*I, -0.10051322-0.24303961*I, 1.4415971-0.29750675*I, 1.221786+0.85654986*I,
+ 0.27894387+0.97911835*I, -0.4271084+0.042899966*I, -1.1148382-0.15693812*I, 0.80686307+1.5315914*I, -0.61608654-2.01858*I, -1.4397877-0.7550918*I, -0.10051322+0.24303961*I, 3.4057708+0.0*I, -0.5856801+1.0203559*I, 0.7103452-0.8422135*I,
+ 3.0476584+0.18548489*I, -1.7228563+2.8335886*I, 2.4704504-1.0389464*I, 1.5649154-1.6229297*I, -2.7767487+1.7693765*I, -0.31456697-1.0403451*I, 1.4415971+0.29750675*I, -0.5856801-1.0203559*I, 7.005772+0.0*I, -0.9617417+1.2486815*I,
+ 0.3842994+0.7050991*I, 1.8942566+0.6389735*I, 1.0858271-1.298006*I, -0.112077385+1.2014246*I, 0.048102796-0.9741874*I, -0.31978557+0.13701046*I, 1.221786-0.85654986*I, 0.7103452+0.8422135*I, -0.9617417-1.2486815*I, 3.4629636+0.0*I};
+
+ BLASFUNC(cpotrf)(&up, &n, (float*)(A3), &n, info);
+ // printf("%g+%g*I\n", creal(A3[91]), cimag(A3[91]));
+ if(isnan(CREAL(A3[91])) || isnan(CIMAG(A3[91]))) {
+ CTEST_ERR("%s:%d got NaN", __FILE__, __LINE__);
+ }
+}