From 894fde9bfe36fe1988b595d3529a7f808a5a6534 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Dec 2021 21:21:47 +0100 Subject: [PATCH 01/77] Update version to 0.3.19.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c1d69da13..913017c63 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 19) +set(OpenBLAS_PATCH_VERSION 19.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") From 8cec83bdfb82effda2075309af5ca36df79f1a8e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Dec 2021 21:22:19 +0100 Subject: [PATCH 02/77] Update version to 0.3.19.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 3359860b9..4b4b9bcf9 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.19 +VERSION = 0.3.19.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 8d9b9c6b2a6f015cafcf3e0e568874a1aabcc223 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 21 Dec 2021 09:22:59 +0800 Subject: [PATCH 03/77] loongarch64: Optimize dgemm_kernel --- kernel/loongarch64/KERNEL.LOONGSON3R5 | 15 +- kernel/loongarch64/dgemm_kernel_16x4.S | 4250 ++++++++++++++++++++++++ kernel/loongarch64/dgemm_ncopy_16.S | 691 ++++ kernel/loongarch64/dgemm_ncopy_4.S | 237 ++ kernel/loongarch64/dgemm_tcopy_16.S | 710 ++++ kernel/loongarch64/dgemm_tcopy_4.S | 270 ++ param.h | 10 +- 7 files changed, 6177 insertions(+), 6 deletions(-) create mode 100644 kernel/loongarch64/dgemm_kernel_16x4.S create mode 100644 kernel/loongarch64/dgemm_ncopy_16.S create mode 100644 kernel/loongarch64/dgemm_ncopy_4.S create mode 100644 kernel/loongarch64/dgemm_tcopy_16.S create mode 100644 kernel/loongarch64/dgemm_tcopy_4.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index cce4093e3..bb0441ab2 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -1 +1,14 @@ -#TODO: Add loongarch64 SIMD optimizations +DGEMMKERNEL = dgemm_kernel_16x4.S +DGEMMINCOPY = dgemm_ncopy_16.S +DGEMMITCOPY = dgemm_tcopy_16.S +DGEMMONCOPY = dgemm_ncopy_4.S +DGEMMOTCOPY = dgemm_tcopy_4.S +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c diff --git a/kernel/loongarch64/dgemm_kernel_16x4.S b/kernel/loongarch64/dgemm_kernel_16x4.S new file mode 100644 index 000000000..13faa977e --- /dev/null +++ b/kernel/loongarch64/dgemm_kernel_16x4.S @@ -0,0 +1,4250 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA $f0 // param 4: alpha +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc + +#ifdef TRMMKERNEL +#define OFFSET $r11 // param 9: offset +#endif +#define OFF $r12 + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define T0 $r25 /* !! DO NOT USE $r21 and $r22 !! */ +#define T1 $r26 +#define T2 $r27 +#define ZERO $r0 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define D0 $xr7 +#define D1 $xr8 +#define D2 $xr9 +#define D3 $xr10 +#define D4 $xr11 +#define D5 $xr12 +#define D6 $xr13 +#define D7 $xr14 +#define D8 $xr15 +#define D9 $xr16 +#define D10 $xr17 +#define D11 $xr18 +#define D12 $xr19 +#define D13 $xr20 +#define D14 $xr21 +#define D15 $xr22 +#define VALPHA $xr23 + +/* Prefetch interval */ +#define A_PRE 0x200 +#define B_PRE 0x100 + + PROLOGUE + + addi.d $sp, $sp, -56 + /* Store regs */ + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST ALPHA, $sp, 48 + + /* VALPHA = {ALPHA, ALPHA, ALPHA, ALPHA} */ + xvld VALPHA, $sp, 48 + xvreplve0.d VALPHA, VALPHA + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, ZERO, OFFSET +#else + xor OFF, OFF, OFF +#endif + + /* if (!(N >> 2)) goto L_N3 */ + srai.d J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + beq ZERO, J, .L_N3 + +.L_J1: /* J-- && This loop include Condition 1 */ + +/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! 
************************* +* dgemm_core_16x4 */ + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + addi.d J, J, -1 /* J-- */ + add.d C2, C1, T0 + add.d C3, C2, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_M8 + +.L_I1: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + /* Calculate the first set of D0~D15, + * avoidig set 0 operation + * Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + preld 0, C0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + preld 0, C0, 0x40 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + xvldrepl.d U4, B0, 0x08 + preld 0, C1, 0x00 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + preld 0, C1, 0x40 + xvfmul.d D6, U2, U4 + xvfmul.d D7, U3, U4 + + xvldrepl.d U4, B0, 0x10 + preld 0, C2, 0x00 + /* line 3 */ + xvfmul.d D8, U0, U4 + xvfmul.d D9, U1, U4 + preld 0, C2, 0x40 + xvfmul.d D10, U2, U4 + xvfmul.d D11, U3, U4 + + xvldrepl.d U4, B0, 0x18 + preld 0, C3, 0x00 + /* line 4 */ + xvfmul.d D12, U0, U4 + xvfmul.d D13, U1, U4 + preld 0, C3, 0x40 + xvfmul.d D14, U2, U4 + xvfmul.d D15, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_L7 */ + beq ZERO,TL, .L_L7 + + /* Calculate 8 sets of D0~D15 */ +.L_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-2***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, 
D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-3***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-4***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-5***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-6***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-7***/ + /* Load 16 * 64 from 
A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-8***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_TL1 + + /* Maybe we need calculate the last + * 7 sets of D0~D15? + */ +.L_L7: + /* if (!(L & 7)) goto L_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_L0 + +.L_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_L71 + +.L_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D6, D6, VALPHA + xvfmul.d D7, D7, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA + xvfmul.d D10, D10, VALPHA + xvfmul.d D11, D11, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA + xvfmul.d D14, D14, VALPHA + xvfmul.d D15, D15, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvld U2, C1, 0x40 + xvld U3, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + xvfmadd.d D6, D6, VALPHA, U2 + xvfmadd.d D7, 
D7, VALPHA, U3 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvld U1, C2, 0x20 + xvld U2, C2, 0x40 + xvld U3, C2, 0x60 + xvfmadd.d D8, D8, VALPHA, U0 + xvfmadd.d D9, D9, VALPHA, U1 + xvfmadd.d D10, D10, VALPHA, U2 + xvfmadd.d D11, D11, VALPHA, U3 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvld U1, C3, 0x20 + xvld U2, C3, 0x40 + xvld U3, C3, 0x60 + xvfmadd.d D12, D12, VALPHA, U0 + xvfmadd.d D13, D13, VALPHA, U1 + xvfmadd.d D14, D14, VALPHA, U2 + xvfmadd.d D15, D15, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + xvst D6, C1, 0x40 + xvst D7, C1, 0x60 + /* Store C2 */ + xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + xvst D10, C2, 0x40 + xvst D11, C2, 0x60 + /* Store C3 */ + xvst D12, C3, 0x00 + xvst D13, C3, 0x20 + xvst D14, C3, 0x40 + xvst D15, C3, 0x60 + + /* Add stride for C */ + addi.d C0, C0, 0x80 + addi.d C1, C1, 0x80 + addi.d C2, C2, 0x80 + addi.d C3, C3, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -16 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_I1 + +.L_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_M0 + + andi I, M, 8 + beq ZERO,I, .L_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif // #if defined(TRMMKERNEL) + + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + xvfmul.d D9, U1, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + xvfmul.d D13, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M8_L7 */ + beq ZERO,TL, .L_M8_L7 + +.L_M8_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + 
xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M8_TL1 + +.L_M8_L7: + /* if (!(L & 7)) goto L_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M8_L0 + +.L_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M8_L71 + +.L_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D4, D4, 
VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvld U1, C2, 0x20 + xvfmadd.d D8, D8, VALPHA, U0 + xvfmadd.d D9, D9, VALPHA, U1 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvld U1, C3, 0x20 + xvfmadd.d D12, D12, VALPHA, U0 + xvfmadd.d D13, D13, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + /* Store C2 */ + xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + /* Store C3 */ + xvst D12, C3, 0x00 + xvst D13, C3, 0x20 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + addi.d C2, C2, 0x40 + addi.d C3, C3, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -8 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 8)) End************/ + +.L_M4: + andi I, M, 4 + beq ZERO,I, .L_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 + +.L_M4_TL1: /* TL-- */ + /***8-1***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, 
B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M4_TL1 + +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 + +.L_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_L71 + +.L_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + /* Store C1 */ + xvst D4, C1, 0x00 + /* Store C2 */ + xvst D8, C2, 0x00 + /* Store C3 */ + xvst D12, C3, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + addi.d C2, C2, 0x20 + addi.d C3, C3, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -4 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ + +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 
+#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 + +.L_M2_TL1: /* TL-- */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 + +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 + +.L_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, 
U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M2_L71 + +.L_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D8, C2, 0x00, 0x00 + xvstelm.d D12, C3, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + xvstelm.d D4, C1, 0x08, 0x01 + xvstelm.d D8, C2, 0x08, 0x01 + xvstelm.d D12, C3, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + addi.d C2, C2, 0x10 + addi.d C3, C3, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -2 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ + +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 + +.L_M1_TL1: /* TL-- */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + 
xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M1_TL1 + +.L_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M1_L0 + +.L_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M1_L71 + +.L_M1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D8, C2, 0x00, 0x00 + xvstelm.d D12, C3, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + addi.d C2, C2, 0x08 + addi.d C3, C3, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -1 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 1) ) End************/ + +.L_M0: + /* Add stride for B and C + * B += (K * 32) + * C += (LDC * 32) + */ + 
/* since the array type is double, + * so we must mul 32 + */ + slli.d T0, K, 5 + slli.d T1, LDC, 5 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x04 +#endif + + blt ZERO, J, .L_J1 + +//////////////// go back to L_J1 ///////////////// +///////////////////////////////////////////////// +/************************ Condition 1 if((N >> 2) && (M >> 4)) END !!! ************************/ + +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + +/************************* Condition 2 if((N & 2) && (M >> 4)) START !!! ************************* +* dgemm_core_16x2 */ + + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N3_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N3_M8 + +.L_N3_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + xvfmul.d D6, U2, U4 + xvfmul.d D7, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_L7 */ + beq ZERO,TL, .L_N3_L7 + +.L_N3_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-2***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-3***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-4***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 
0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-5***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-6***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-7***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-8***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_TL1 + +.L_N3_L7: + /* if (!(L & 7)) goto L_N3_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_L0 + +.L_N3_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_L71 + +.L_N3_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D6, D6, VALPHA + xvfmul.d D7, D7, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvld U2, C1, 0x40 + xvld U3, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + xvfmadd.d D6, D6, VALPHA, U2 + xvfmadd.d D7, D7, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 
0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + xvst D6, C1, 0x40 + xvst D7, C1, 0x60 + + /* Add stride for C */ + addi.d C0, C0, 0x80 + addi.d C1, C1, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -16 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N3_I1 + +.L_N3_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N3_M0 + + andi I, M, 8 + beq ZERO,I, .L_N3_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M8_L7 */ + beq ZERO,TL, .L_N3_M8_L7 + +.L_N3_M8_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + 
xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M8_TL1 + +.L_N3_M8_L7: + /* if (!(L & 7)) goto L_N3_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M8_L0 + +.L_N3_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M8_L71 + +.L_N3_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -8 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2) && (M & 8) ) End************/ + +.L_N3_M4: + andi I, M, 4 + beq ZERO,I, .L_N3_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M4_L7 */ + beq ZERO,TL, .L_N3_M4_L7 + +.L_N3_M4_TL1: /* TL-- */ + /***8-1***/ + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + 
xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M4_TL1 + +.L_N3_M4_L7: + /* if (!(L & 7)) goto L_N3_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M4_L0 + +.L_N3_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M4_L71 + +.L_N3_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + /* Store C1 */ + xvst D4, C1, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 4) ) End************/ + +.L_N3_M2: + andi I, M, 2 + beq ZERO,I, .L_N3_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M2_L7 */ + beq ZERO,TL, .L_N3_M2_L7 + +.L_N3_M2_TL1: /* TL-- */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 
0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M2_TL1 + +.L_N3_M2_L7: + /* if (!(L & 7)) goto L_N3_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M2_L0 + +.L_N3_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M2_L71 + +.L_N3_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + xvstelm.d D4, C1, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 2) ) End************/ + +.L_N3_M1: + andi I, M, 1 + beq ZERO,I, .L_N3_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + 
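+    /* The first k step was already issued with xvfmul.d above, so L is
+     * decremented before being split: TL = L >> 3 drives the 8-way
+     * unrolled loop below, and the L & 7 remainder is handled one step
+     * at a time at .L_N3_M1_L71. Each step computes, in effect,
+     * d0 += a[k] * b0[k] and d4 += a[k] * b1[k] on the packed buffers. */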
srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M1_L7 */ + beq ZERO,TL, .L_N3_M1_L7 + +.L_N3_M1_TL1: /* TL-- */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M1_TL1 + +.L_N3_M1_L7: + /* if (!(L & 7)) goto L_N3_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M1_L0 + +.L_N3_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M1_L71 + +.L_N3_M1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 1) ) End************/ + +.L_N3_M0: + /* Add stride for B and C + * B += (K * 16) + * C += (LDC * 16) + */ + /* since the array type is double, + * so we must mul 16 + */ + slli.d T0, K, 4 + slli.d T1, LDC, 4 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x02 +#endif + + /* We must reinit I */ + srai.d I, M, 4 /* I = bm >> 4 */ + +/************************* Condition 2 if((N & 2) && (M >> 4)) End !!! 
************************* +* dgemm_core_16x2 */ + +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + +/************************* Condition 3 if((N & 1) && (M >> 4)) START !!! ************************* +* dgemm_core_16x1 */ + + move C0, C + move A0, A + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N1_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N1_M8 + +.L_N1_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_L7 */ + beq ZERO,TL, .L_N1_L7 + +.L_N1_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-2***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-3***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-4***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-5***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-6***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-7***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d 
D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-8***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_TL1 + +.L_N1_L7: + /* if (!(L & 7)) goto L_N1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_L0 + +.L_N1_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_L71 + +.L_N1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + + /* Add stride for C */ + addi.d C0, C0, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -16 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N1_I1 + +.L_N1_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N1_M0 + + andi I, M, 8 + beq ZERO,I, .L_N1_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M8_L7 */ + beq ZERO,TL, .L_N1_M8_L7 + +.L_N1_M8_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, 
D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M8_TL1 + +.L_N1_M8_L7: + /* if (!(L & 7)) goto L_N1_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M8_L0 + +.L_N1_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M8_L71 + +.L_N1_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -8 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1) && (M & 8) ) End************/ + +.L_N1_M4: + andi I, M, 4 + beq ZERO,I, .L_N1_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M4_L7 */ + beq ZERO,TL, .L_N1_M4_L7 + +.L_N1_M4_TL1: /* TL-- */ + /***8-1***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 
0x20 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M4_TL1 + +.L_N1_M4_L7: + /* if (!(L & 7)) goto L_N1_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M4_L0 + +.L_N1_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M4_L71 + +.L_N1_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1) && (M & 4) ) End************/ + +.L_N1_M2: + andi I, M, 2 + beq ZERO,I, .L_N1_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M2_L7 */ + beq ZERO,TL, .L_N1_M2_L7 + +.L_N1_M2_TL1: /* TL-- */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + 
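+    /* Only D0 is live in this n=1 column: every unrolled step is a single
+     * broadcast (xvldrepl.d from B0) and multiply-accumulate into D0,
+     * after which A0 advances by two doubles (0x10) and B0 by one (0x08). */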
addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M2_TL1 + +.L_N1_M2_L7: + /* if (!(L & 7)) goto L_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M2_L0 + +.L_N1_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M2_L71 + +.L_N1_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 2) ) End************/ + +.L_N1_M1: + andi I, M, 1 + beq ZERO,I, .L_N1_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 + +.L_N1_M1_TL1: /* TL-- */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 + +.L_N1_M1_L7: + /* 
if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 + +.L_N1_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 + +.L_N1_M1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 1) ) End************/ + +.L_N1_M0: + +/************************* Condition 3 if((N & 1) && (M >> 4)) End !!! ************************* +* dgemm_core_16x1 */ + +.L_N0: + /* Restore regs */ + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LD $f23, $sp, 40 + addi.d $sp, $sp, 56 + + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_16.S b/kernel/loongarch64/dgemm_ncopy_16.S new file mode 100644 index 000000000..95c879031 --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_16.S @@ -0,0 +1,691 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define S11 $r24 +#define S12 $r25 +#define S13 $r26 +#define S14 $r27 +#define S15 $r28 +#define S16 $r29 +#define TD $r30 +#define TS $r31 +#define TL $r7 +#define T0 $r6 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 + + PROLOGUE + + addi.d $sp, $sp, -0x90 + SDARG $r23, $sp, 0x00 + SDARG $r24, $sp, 0x08 + SDARG $r25, $sp, 0x10 + SDARG $r26, $sp, 0x18 + SDARG $r27, $sp, 0x20 + SDARG $r28, $sp, 0x28 + SDARG $r29, $sp, 0x30 + SDARG $r30, $sp, 0x38 + SDARG $r31, $sp, 0x40 + ST $f23, $sp, 0x48 + ST $f24, $sp, 0x50 + ST $f25, $sp, 0x58 + ST $f26, $sp, 0x60 + ST $f27, $sp, 0x68 + ST $f28, $sp, 0x70 + ST $f29, $sp, 0x78 + ST $f30, $sp, 0x80 + ST $f31, $sp, 0x88 + + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x04 + beq J, ZERO, .L_N8 + +.L_J1: /* J-- */ + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x03 + add.d S3, S2, TL + addi.d J, J, -1 + add.d S4, S3, TL + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d S9, S7, T0 + add.d S10, S8, T0 + add.d S11, S9, T0 + add.d S12, S10, T0 + add.d S13, S11, T0 + add.d S14, S12, T0 + add.d S15, S13, T0 + add.d S16, S14, T0 + add.d TS, S15, T0 + beq I, ZERO, .L_I7 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + xvld U8, S9, 0x00 + xvld U9, S10, 0x00 + xvld U10, S11, 0x00 + xvld U11, S12, 0x00 + xvld U12, S13, 0x00 + xvld U13, S14, 0x00 + xvld U14, S15, 0x00 + xvld U15, S16, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + xvpackev.d D4, U5, U4 + xvpackod.d D5, U5, U4 + xvpackev.d D6, U7, U6 + xvpackod.d D7, U7, U6 + + xvpackev.d D8, U9, U8 + xvpackod.d D9, U9, U8 + xvpackev.d D10, U11, U10 + xvpackod.d D11, U11, U10 + xvpackev.d D12, U13, U12 + xvpackod.d D13, U13, U12 + xvpackev.d D14, U15, U14 + xvpackod.d D15, U15, U14 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U4, D4, D4 + xvpermi.q D4, D6, 0x02 // 1 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 4 + xvand.v U5, D5, D5 + xvpermi.q D5, D7, 0x02 // 5 + xvpermi.q D2, U0, 0x31 // 8 + xvpermi.q D6, U4, 0x31 // 9 + xvpermi.q D3, U1, 0x31 // 12 + xvpermi.q D7, U5, 0x31 // 13 + + xvand.v U8, D8, D8 + xvpermi.q D8, D10, 0x02 // 2 + 
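+    /* The xvand.v copies above are plain register moves that keep the
+     * pack-even/pack-odd results alive while the xvpermi.q permutes
+     * recombine their 128-bit halves; the numbered stores below then write
+     * the transposed data so that every four consecutive xvst (tags 0..3,
+     * 4..7, ...) form one 16-element row of the packed panel. */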
xvand.v U12, D12, D12 + xvpermi.q D12, D14, 0x02 // 3 + xvand.v U9, D9, D9 + xvpermi.q D9, D11, 0x02 // 6 + xvand.v U13, D13, D13 + xvpermi.q D13, D15, 0x02 // 7 + xvpermi.q D10, U8, 0x31 // 10 + xvpermi.q D14, U12, 0x31 // 11 + xvpermi.q D11, U9, 0x31 // 14 + xvpermi.q D15, U13, 0x31 // 15 + + xvst D0, TD, 0x00 // 0 + xvst D4, TD, 0x20 // 1 + xvst D8, TD, 0x40 // 2 + xvst D12, TD, 0x60 // 3 + xvst D1, TD, 0x80 // 4 + xvst D5, TD, 0xA0 // 5 + xvst D9, TD, 0xC0 // 6 + xvst D13, TD, 0xE0 // 7 + addi.d TD, TD, 0x100 + xvst D2, TD, 0x00 // 8 + xvst D6, TD, 0x20 // 9 + xvst D10, TD, 0x40 // 10 + xvst D14, TD, 0x60 // 11 + xvst D3, TD, 0x80 // 12 + xvst D7, TD, 0xA0 // 13 + xvst D11, TD, 0xC0 // 14 + xvst D15, TD, 0xE0 // 15 + addi.d TD, TD, 0x100 + + xvld U0, S1, 0x20 + xvld U1, S2, 0x20 + xvld U2, S3, 0x20 + xvld U3, S4, 0x20 + xvld U4, S5, 0x20 + xvld U5, S6, 0x20 + xvld U6, S7, 0x20 + xvld U7, S8, 0x20 + xvld U8, S9, 0x20 + xvld U9, S10, 0x20 + xvld U10, S11, 0x20 + xvld U11, S12, 0x20 + xvld U12, S13, 0x20 + xvld U13, S14, 0x20 + xvld U14, S15, 0x20 + xvld U15, S16, 0x20 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + xvpackev.d D4, U5, U4 + xvpackod.d D5, U5, U4 + xvpackev.d D6, U7, U6 + xvpackod.d D7, U7, U6 + + xvpackev.d D8, U9, U8 + xvpackod.d D9, U9, U8 + xvpackev.d D10, U11, U10 + xvpackod.d D11, U11, U10 + xvpackev.d D12, U13, U12 + xvpackod.d D13, U13, U12 + xvpackev.d D14, U15, U14 + xvpackod.d D15, U15, U14 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U4, D4, D4 + xvpermi.q D4, D6, 0x02 // 1 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 4 + xvand.v U5, D5, D5 + xvpermi.q D5, D7, 0x02 // 5 + xvpermi.q D2, U0, 0x31 // 8 + xvpermi.q D6, U4, 0x31 // 9 + xvpermi.q D3, U1, 0x31 // 12 + xvpermi.q D7, U5, 0x31 // 13 + + xvand.v U8, D8, D8 + xvpermi.q D8, D10, 0x02 // 2 + xvand.v U12, D12, D12 + xvpermi.q D12, D14, 0x02 // 3 + xvand.v U9, D9, D9 + xvpermi.q D9, D11, 0x02 // 6 + xvand.v U13, D13, D13 + xvpermi.q D13, D15, 0x02 // 7 + xvpermi.q D10, U8, 0x31 // 10 + xvpermi.q D14, U12, 0x31 // 11 + xvpermi.q D11, U9, 0x31 // 14 + xvpermi.q D15, U13, 0x31 // 15 + + xvst D0, TD, 0x00 // 0 + xvst D4, TD, 0x20 // 1 + xvst D8, TD, 0x40 // 2 + xvst D12, TD, 0x60 // 3 + xvst D1, TD, 0x80 // 4 + xvst D5, TD, 0xA0 // 5 + xvst D9, TD, 0xC0 // 6 + xvst D13, TD, 0xE0 // 7 + addi.d TD, TD, 0x100 + xvst D2, TD, 0x00 // 8 + xvst D6, TD, 0x20 // 9 + xvst D10, TD, 0x40 // 10 + xvst D14, TD, 0x60 // 11 + xvst D3, TD, 0x80 // 12 + xvst D7, TD, 0xA0 // 13 + xvst D11, TD, 0xC0 // 14 + xvst D15, TD, 0xE0 // 15 + addi.d TD, TD, 0x100 + + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + addi.d S9, S9, 0x40 + addi.d S10, S10, 0x40 + addi.d S11, S11, 0x40 + addi.d S12, S12, 0x40 + addi.d S13, S13, 0x40 + addi.d S14, S14, 0x40 + addi.d S15, S15, 0x40 + addi.d S16, S16, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_I7: + andi I, M, 0x07 + beq I, ZERO, .L_I0 + +.L_II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + fst.d F4, TD, 0x20 + addi.d S5, S5, 0x08 + fst.d F5, TD, 0x28 + addi.d S6, S6, 0x08 + fst.d F6, TD, 
0x30 + addi.d S7, S7, 0x08 + fst.d F7, TD, 0x38 + addi.d S8, S8, 0x08 + addi.d TD, TD, 0x40 + + fld.d F0, S9, 0x00 + fld.d F1, S10, 0x00 + fld.d F2, S11, 0x00 + fld.d F3, S12, 0x00 + fld.d F4, S13, 0x00 + fld.d F5, S14, 0x00 + fld.d F6, S15, 0x00 + fld.d F7, S16, 0x00 + + fst.d F0, TD, 0x00 + addi.d S9, S9, 0x08 + fst.d F1, TD, 0x08 + addi.d S10, S10, 0x08 + fst.d F2, TD, 0x10 + addi.d S11, S11, 0x08 + fst.d F3, TD, 0x18 + addi.d S12, S12, 0x08 + fst.d F4, TD, 0x20 + addi.d S13, S13, 0x08 + fst.d F5, TD, 0x28 + addi.d S14, S14, 0x08 + fst.d F6, TD, 0x30 + addi.d S15, S15, 0x08 + fst.d F7, TD, 0x38 + addi.d S16, S16, 0x08 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_II1 + +.L_I0: + blt ZERO, J, .L_J1 + +.L_N8: + andi J, N, 0x08 + beq ZERO, J, .L_N4 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x03 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d TS, S7, T0 + beq I, ZERO, .L_8I3 + +.L_8I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + xvpackev.d D4, U5, U4 + xvpackod.d D5, U5, U4 + xvpackev.d D6, U7, U6 + xvpackod.d D7, U7, U6 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U4, D4, D4 + xvpermi.q D4, D6, 0x02 // 1 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 2 + xvand.v U5, D5, D5 + xvpermi.q D5, D7, 0x02 // 3 + xvpermi.q D2, U0, 0x31 // 4 + xvpermi.q D6, U4, 0x31 // 5 + xvpermi.q D3, U1, 0x31 // 6 + xvpermi.q D7, U5, 0x31 // 7 + + xvst D0, TD, 0x00 + xvst D4, TD, 0x20 + xvst D1, TD, 0x40 + xvst D5, TD, 0x60 + xvst D2, TD, 0x80 + xvst D6, TD, 0xA0 + xvst D3, TD, 0xC0 + xvst D7, TD, 0xE0 + addi.d TD, TD, 0x100 + + xvld U0, S1, 0x20 + xvld U1, S2, 0x20 + xvld U2, S3, 0x20 + xvld U3, S4, 0x20 + xvld U4, S5, 0x20 + xvld U5, S6, 0x20 + xvld U6, S7, 0x20 + xvld U7, S8, 0x20 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + xvpackev.d D4, U5, U4 + xvpackod.d D5, U5, U4 + xvpackev.d D6, U7, U6 + xvpackod.d D7, U7, U6 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U4, D4, D4 + xvpermi.q D4, D6, 0x02 // 1 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 2 + xvand.v U5, D5, D5 + xvpermi.q D5, D7, 0x02 // 3 + xvpermi.q D2, U0, 0x31 // 4 + xvpermi.q D6, U4, 0x31 // 5 + xvpermi.q D3, U1, 0x31 // 6 + xvpermi.q D7, U5, 0x31 // 7 + + xvst D0, TD, 0x00 + xvst D4, TD, 0x20 + xvst D1, TD, 0x40 + xvst D5, TD, 0x60 + xvst D2, TD, 0x80 + xvst D6, TD, 0xA0 + xvst D3, TD, 0xC0 + xvst D7, TD, 0xE0 + addi.d TD, TD, 0x100 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_8I1 + +.L_8I3: + andi I, M, 0x07 + beq I, ZERO, .L_N4 + +.L_8I11: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + fst.d F4, TD, 0x20 + addi.d S5, S5, 0x08 + fst.d F5, TD, 0x28 + addi.d S6, S6, 0x08 + fst.d F6, TD, 0x30 + addi.d S7, S7, 0x08 + fst.d F7, TD, 0x38 + addi.d S8, S8, 0x08 + + addi.d TD, TD, 
0x40 + addi.d I, I, -1 + blt ZERO, I, .L_8I11 + +.L_N4: + andi J, N, 0x04 + beq ZERO, J, .L_N2 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + beq I, ZERO, .L_I3 + +.L_4I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 1 + xvpermi.q D2, U0, 0x31 // 2 + xvpermi.q D3, U1, 0x31 // 3 + + xvst D0, TD, 0x00 + xvst D1, TD, 0x20 + xvst D2, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_4I1 + +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_N2 + +.L_4II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_4II1 + +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x01 + add.d TS, S2, TL + beq I, ZERO, .L_NI1 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + + xvpermi.q D0, D1, 0x02 // 0 + + xvst D0, TD, 0x00 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_2I1 + +.L_NI1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 + +.L_M1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d TD, TD, 0x08 + addi.d M, M, -1 + blt ZERO, M, .L_M1 + +.L_N0: + LDARG $r23, $sp, 0x00 + LDARG $r24, $sp, 0x08 + LDARG $r25, $sp, 0x10 + LDARG $r26, $sp, 0x18 + LDARG $r27, $sp, 0x20 + LDARG $r28, $sp, 0x28 + LDARG $r29, $sp, 0x30 + LDARG $r30, $sp, 0x38 + LDARG $r31, $sp, 0x40 + LD $f23, $sp, 0x48 + LD $f24, $sp, 0x50 + LD $f25, $sp, 0x58 + LD $f26, $sp, 0x60 + LD $f27, $sp, 0x68 + LD $f28, $sp, 0x70 + LD $f29, $sp, 0x78 + LD $f30, $sp, 0x80 + LD $f31, $sp, 0x88 + addi.d $sp, $sp, 0x90 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_4.S b/kernel/loongarch64/dgemm_ncopy_4.S new file mode 100644 index 000000000..b1f322a06 --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_4.S @@ -0,0 +1,237 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr14 +#define D1 $xr8 +#define D2 $xr9 +#define D3 $xr10 +#define D4 $xr11 +#define D5 $xr12 +#define D6 $xr13 +#define D7 $xr15 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x02 + beq J, ZERO, .L_N2 + +.L_J1: /* J-- */ + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + addi.d J, J, -1 + beq I, ZERO, .L_I3 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 1 + xvpermi.q D2, U0, 0x31 // 2 + xvpermi.q D3, U1, 0x31 // 3 + + xvst D0, TD, 0x00 + xvst D1, TD, 0x20 + xvst D2, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_I0 + +.L_II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_II1 + +.L_I0: + blt ZERO, J, .L_J1 + +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d TS, S2, TL + beq I, ZERO, .L_2I3 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + + xvand.v U0, D0, D0 + xvpermi.q D0, D1, 0x02 // 0 + xvpermi.q D1, U0, 0x31 // 1 + + xvst D0, TD, 0x00 + xvst D1, TD, 0x20 + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d TD, TD, 
0x40 + addi.d I, I, -1 + blt ZERO, I, .L_2I1 + +.L_2I3: + andi I, M, 0x03 + beq ZERO, I, .L_N1 + +.L_2II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fst.d F0, TD, 0x00 + addi.d I, I, -1 + fst.d F1, TD, 0x08 + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + blt ZERO, I, .L_2II1 + +.L_N1: + andi J, N, 0x01 + beq ZERO, J, .L_N0 + + move S1, TS + srai.d I, M, 0x02 + beq ZERO, I, .L_1I3 + +.L_1I1: + xvld U0, S1, 0x00 + addi.d S1, S1, 0x20 + xvst U0, TD, 0x00 + addi.d I, I, -1 + addi.d TD, TD, 0x20 + blt ZERO, I, .L_1I1 + +.L_1I3: + andi I, M, 0x03 + beq ZERO, I, .L_N0 + +.L_1II1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d I, I, -1 + addi.d TD, TD, 0x08 + blt ZERO, I, .L_1II1 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_16.S b/kernel/loongarch64/dgemm_tcopy_16.S new file mode 100644 index 000000000..afafe5b37 --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_16.S @@ -0,0 +1,710 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define P5 $r27 +#define T0 $r28 +#define T1 $r29 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 + + PROLOGUE + + addi.d $sp, $sp, -56 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + SDARG $r29, $sp, 48 + + move S0, SRC + move P0, DST + + srai.d T0, N, 0x04 + srai.d T1, N, 0x03 + slli.d T0, T0, 0x04 + slli.d T1, T1, 0x03 + mul.d P2, M, T0 + mul.d P3, M, T1 + slli.d P2, P2, 0x03 + slli.d P3, P3, 0x03 + add.d P2, DST, P2 + add.d P3, DST, P3 + + srai.d T0, N, 0x02 + srai.d T1, N, 0x01 + slli.d T0, T0, 0x02 + slli.d T1, T1, 0x01 + mul.d P4, M, T0 + mul.d P5, M, T1 + slli.d P4, P4, 0x03 + slli.d P5, P5, 0x03 + add.d P4, DST, P4 + add.d P5, DST, P5 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x03 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x07 + beq ZERO, J, .L_M7 + +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d S0, S7, T0 + + move P1, P0 + addi.d P0, P0, 0x400 + + srai.d I, N, 0x04 + addi.d J, J, -1 + beq ZERO, I, .L_N15 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S3, 0x00 + xvld U1, S3, 0x20 + xvld U2, S3, 0x40 + xvld U3, S3, 0x60 + xvld U4, S4, 0x00 + xvld U5, S4, 0x20 + xvld U6, S4, 0x40 + xvld U7, S4, 0x60 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + xvst U4, P1, 0x180 + xvst U5, P1, 0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S5, 0x40 + xvld U3, S5, 0x60 + xvld U4, S6, 0x00 + xvld U5, S6, 0x20 + xvld U6, S6, 0x40 + xvld U7, S6, 0x60 + + xvst U0, P1, 0x200 + xvst U1, P1, 0x220 + xvst U2, P1, 0x240 + xvst U3, P1, 0x260 + xvst U4, P1, 0x280 + xvst U5, P1, 0x2A0 + xvst U6, P1, 0x2C0 + xvst U7, P1, 0x2E0 + + xvld U0, S7, 0x00 + xvld U1, S7, 0x20 + xvld U2, S7, 0x40 + xvld U3, S7, 0x60 + xvld U4, S8, 0x00 + xvld U5, S8, 0x20 + xvld U6, S8, 0x40 + xvld U7, S8, 0x60 + + xvst U0, P1, 0x300 + xvst U1, P1, 0x320 + xvst U2, P1, 0x340 + xvst U3, P1, 0x360 + xvst U4, P1, 0x380 + xvst U5, P1, 0x3A0 + xvst U6, P1, 0x3C0 + xvst U7, P1, 0x3E0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d S3, S3, 0x80 + addi.d S4, S4, 0x80 + addi.d S5, S5, 0x80 + addi.d S6, S6, 0x80 + addi.d S7, S7, 0x80 + addi.d S8, S8, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, 
.L_I1 + +.L_N15: + andi I, N, 0x08 + beq ZERO, I, .L_N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + xvst U4, P2, 0x80 + xvst U5, P2, 0xA0 + xvst U6, P2, 0xC0 + xvst U7, P2, 0xE0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S6, 0x00 + xvld U3, S6, 0x20 + xvld U4, S7, 0x00 + xvld U5, S7, 0x20 + xvld U6, S8, 0x00 + xvld U7, S8, 0x20 + + xvst U0, P2, 0x100 + xvst U1, P2, 0x120 + xvst U2, P2, 0x140 + xvst U3, P2, 0x160 + xvst U4, P2, 0x180 + xvst U5, P2, 0x1A0 + xvst U6, P2, 0x1C0 + xvst U7, P2, 0x1E0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + addi.d P2, P2, 0x200 + +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + xvst U2, P3, 0x40 + xvst U3, P3, 0x60 + xvst U4, P3, 0x80 + xvst U5, P3, 0xA0 + xvst U6, P3, 0xC0 + xvst U7, P3, 0xE0 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S5, S5, 0x20 + addi.d S6, S6, 0x20 + addi.d S7, S7, 0x20 + addi.d S8, S8, 0x20 + addi.d P3, P3, 0x100 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + xvpermi.q U4, U5, 0x02 + xvpermi.q U6, U7, 0x02 + + xvst U0, P4, 0x00 + xvst U2, P4, 0x20 + xvst U4, P4, 0x40 + xvst U6, P4, 0x60 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d P4, P4, 0x80 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + fst.d F2, P5, 0x10 + fst.d F3, P5, 0x18 + fst.d F4, P5, 0x20 + fst.d F5, P5, 0x28 + fst.d F6, P5, 0x30 + fst.d F7, P5, 0x38 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d P5, P5, 0x40 + +.L_N0: + blt ZERO, J, .L_J1 + +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 + + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x200 + + srai.d I, N, 0x04 + beq ZERO, I, .L_4N15 + +.L_4I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S3, 0x00 + xvld U1, S3, 0x20 + xvld U2, S3, 0x40 + xvld U3, S3, 0x60 + xvld U4, S4, 0x00 + xvld U5, S4, 0x20 + xvld U6, S4, 0x40 + xvld U7, S4, 0x60 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + xvst U4, P1, 0x180 + xvst U5, P1, 
0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d S3, S3, 0x80 + addi.d S4, S4, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_4I1 + +.L_4N15: + andi I, N, 0x08 + beq ZERO, I, .L_4N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + xvst U4, P2, 0x80 + xvst U5, P2, 0xA0 + xvst U6, P2, 0xC0 + xvst U7, P2, 0xE0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d P2, P2, 0x100 + +.L_4N7: + andi I, N, 0x04 + beq ZERO, I, .L_4N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + xvst U2, P3, 0x40 + xvst U3, P3, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d P3, P3, 0x80 + +.L_4N3: + andi I, N, 0x02 + beq ZERO, I, .L_4N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + + xvst U0, P4, 0x00 + xvst U2, P4, 0x20 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P4, P4, 0x40 + +.L_4N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + fst.d F2, P5, 0x10 + fst.d F3, P5, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P5, P5, 0x20 + +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x100 + + srai.d I, N, 0x04 + beq ZERO, I, .L_2N15 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_2I1 + +.L_2N15: + andi I, N, 0x08 + beq ZERO, I, .L_2N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d P2, P2, 0x80 + +.L_2N7: + andi I, N, 0x04 + beq ZERO, I, .L_2N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d P3, P3, 0x40 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpermi.q U0, U1, 0x02 + + xvst U0, P4, 0x00 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P4, P4, 0x20 + +.L_2N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P5, P5, 0x10 + +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + add.d S2, S0, TL + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x04 + beq ZERO, I, .L_1N15 + +.L_1I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 
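+    /* Tail of the copy with a single source row left: each pass moves
+     * sixteen contiguous doubles (U0-U3) straight into the current
+     * 16-wide panel, then P1 jumps ahead by T1 (M * 0x80 bytes) to the
+     * next panel. */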
+ xvst U3, P1, 0x60 + + addi.d S1, S1, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_1I1 + +.L_1N15: + andi I, N, 0x08 + beq ZERO, I, .L_1N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + + addi.d S1, S1, 0x40 + addi.d P2, P2, 0x40 + +.L_1N7: + andi I, N, 0x04 + beq ZERO, I, .L_1N3 + + xvld U0, S1, 0x00 + + xvst U0, P3, 0x00 + + addi.d S1, S1, 0x20 + addi.d P3, P3, 0x20 + +.L_1N3: + andi I, N, 0x02 + beq ZERO, I, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + + addi.d S1, S1, 0x10 + addi.d P4, P4, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P5, 0x00 + + addi.d S1, S1, 0x08 + addi.d P5, P5, 0x08 + +.L_M0: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + LDARG $r29, $sp, 48 + addi.d $sp, $sp, 56 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_4.S b/kernel/loongarch64/dgemm_tcopy_4.S new file mode 100644 index 000000000..700989ca1 --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_4.S @@ -0,0 +1,270 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define P0 $r16 +#define P1 $r17 +#define P2 $r18 +#define P3 $r19 +#define T0 $r20 +#define T1 $r23 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move S0, SRC + move P0, DST + + srai.d T0, N, 0x02 + slli.d T0, T0, 0x02 + srai.d T1, N, 0x01 + slli.d T1, T1, 0x01 + mul.d T0, M, T0 + mul.d T1, M, T1 + slli.d T0, T0, 0x03 + slli.d T1, T1, 0x03 + add.d P2, DST, T0 + add.d P3, DST, T1 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x02 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x05 + beq ZERO, J, .L_M3 + +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x02 + addi.d J, J, -1 + beq ZERO, I, .L_N3 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + add.d P1, P1, T1 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + + xvst U0, P2, 0x00 + xvst U2, P2, 0x20 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P2, P2, 0x40 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + fst.d F2, P3, 0x10 + fst.d F3, P3, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P3, P3, 0x20 + +.L_N0: + blt ZERO, J, .L_J1 + +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_2N3 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_2I1 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpermi.q U0, U1, 0x02 + + xvst U0, P2, 0x00 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P2, P2, 0x20 + +.L_2N1: + addi.d I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P3, P3, 0x10 + +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + move P1, P0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_1N3 + +.L_1I1: + xvld U0, S1, 0x00 + + xvst U0, P1, 0x00 + + addi.d S1, S1, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_1I1 + +.L_1N3: + andi I, N, 0x02 + beq I, ZERO, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P2, 
0x00 + fst.d F1, P2, 0x08 + + addi.d S1, S1, 0x10 + addi.d P2, P2, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq I, ZERO, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P3, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/param.h b/param.h index 8dd2a7461..2dffaae3c 100644 --- a/param.h +++ b/param.h @@ -2852,35 +2852,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_N 4 #define XGEMM_DEFAULT_UNROLL_N 1 #define SGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_M 16 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 1 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_P sgemm_p -#define DGEMM_DEFAULT_P dgemm_p +#define DGEMM_DEFAULT_P 32 #define QGEMM_DEFAULT_P qgemm_p #define CGEMM_DEFAULT_P cgemm_p #define ZGEMM_DEFAULT_P zgemm_p #define XGEMM_DEFAULT_P xgemm_p #define SGEMM_DEFAULT_R sgemm_r -#define DGEMM_DEFAULT_R dgemm_r +#define DGEMM_DEFAULT_R 858 #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r #define SGEMM_DEFAULT_Q 128 -#define DGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 152 #define QGEMM_DEFAULT_Q 128 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 128 From e3c9947c0f4338abc437126283576b63a2203623 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Tue, 21 Dec 2021 11:19:27 +0100 Subject: [PATCH 04/77] prepare kernel for sve zgemm --- kernel/arm64/KERNEL.A64FX | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index 80be4ddd0..04be0fab9 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -169,15 +169,24 @@ CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif +ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +ZGEMMINCOPY = zgemm_ncopy_sve_v1.c +ZGEMMITCOPY = zgemm_tcopy_sve_v1.c ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c +DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c +DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +DSYMMUCOPY_M = symm_ucopy_sve.c +DSYMMLCOPY_M = symm_lcopy_sve.c + From 07fe5b19a4957cafe3864e4af0296eb575a2e2f3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Dec 2021 12:31:54 +0100 Subject: [PATCH 05/77] typecast function pointers --- driver/others/blas_server.c | 40 
++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index fa07a1ea4..ec79075fe 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -209,7 +209,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, - double *, BLASLONG, void *) = func; + double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], @@ -220,7 +221,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, - float *, BLASLONG, void *) = func; + float *, BLASLONG, void *) = (void (*) + (BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], @@ -232,7 +236,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / BFLOAT16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, - bfloat16 *, BLASLONG, void *) = func; + bfloat16 *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, bfloat16, + bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, + bfloat16 *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((bfloat16 *)args -> alpha)[0], @@ -243,7 +249,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / BLAS_STOBF16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, bfloat16 *, BLASLONG, - float *, BLASLONG, void *) = func; + float *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, bfloat16 *, BLASLONG, + float *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], @@ -254,7 +262,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / BLAS_DTOBF16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, bfloat16 *, BLASLONG, - double *, BLASLONG, void *) = func; + double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, bfloat16 *, BLASLONG, + double *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], @@ -271,7 +281,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* COMPLEX / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, - xdouble *, BLASLONG, void *) = func; + xdouble *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, + xdouble *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((xdouble *)args -> alpha)[0], @@ -285,7 +297,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* COMPLEX / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, - double *, BLASLONG, void *) = func; + double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, + 
double *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], @@ -297,7 +311,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* COMPLEX / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, - float *, BLASLONG, void *) = func; + float *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], @@ -425,7 +441,7 @@ blas_queue_t *tscq; #endif if (queue) { - int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; + int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = (int (*)(blas_arg_t *, void *, void *, void *, void *, BLASLONG))queue -> routine; atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1); @@ -503,7 +519,7 @@ blas_queue_t *tscq; legacy_exec(routine, queue -> mode, queue -> args, sb); } else if (queue -> mode & BLAS_PTHREAD) { - void (*pthreadcompat)(void *) = queue -> routine; + void (*pthreadcompat)(void *) = (void(*)(void*))queue -> routine; (pthreadcompat)(queue -> args); } else (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); @@ -871,13 +887,13 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ fprintf(STDERR, "\n"); #endif - routine = queue -> routine; + routine = (int (*)(blas_arg_t *, void *, void *, double *, double *, BLASLONG))queue -> routine; if (queue -> mode & BLAS_LEGACY) { legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); } else if (queue -> mode & BLAS_PTHREAD) { - void (*pthreadcompat)(void *) = queue -> routine; + void (*pthreadcompat)(void *) = (void (*)(void*))queue -> routine; (pthreadcompat)(queue -> args); } else (routine)(queue -> args, queue -> range_m, queue -> range_n, From d1ee6ff73fca6eecfb679d2a91c39ce91e80231b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Dec 2021 18:45:28 +0100 Subject: [PATCH 06/77] fix function typecasts --- kernel/x86_64/dasum.c | 2 +- kernel/x86_64/ddot.c | 2 +- kernel/x86_64/drot.c | 2 +- kernel/x86_64/sasum.c | 2 +- kernel/x86_64/srot.c | 2 +- kernel/x86_64/zdot.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c index 8af9e798b..a9c40f38f 100644 --- a/kernel/x86_64/dasum.c +++ b/kernel/x86_64/dasum.c @@ -114,7 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) #else mode = BLAS_DOUBLE | BLAS_REAL; #endif - blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { sumf += (*ptr); diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 5d0c32234..f3b9ee701 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -190,7 +190,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) #endif blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, y, inc_y, result, 0, - ( void *)dot_thread_function, nthreads); + (int (*)(void)) dot_thread_function, nthreads); ptr = (RETURN_TYPE *)result; for (i = 0; i < nthreads; i++) { diff --git a/kernel/x86_64/drot.c b/kernel/x86_64/drot.c index ab5048bd1..40c9cf19d 100644 
--- a/kernel/x86_64/drot.c +++ b/kernel/x86_64/drot.c @@ -196,7 +196,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT #else int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD; #endif - blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads); + blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (int (*)(void))rot_thread_function, nthreads); } #else rot_compute(n, x, inc_x, y, inc_y, c, s); diff --git a/kernel/x86_64/sasum.c b/kernel/x86_64/sasum.c index a021741c7..37a92468f 100644 --- a/kernel/x86_64/sasum.c +++ b/kernel/x86_64/sasum.c @@ -123,7 +123,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) #else mode = BLAS_DOUBLE | BLAS_REAL; #endif - blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { sumf += (*ptr); diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c index 587cf8e40..a49544616 100644 --- a/kernel/x86_64/srot.c +++ b/kernel/x86_64/srot.c @@ -198,7 +198,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT #else int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD; #endif - blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads); + blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (int (*)(void))rot_thread_function, nthreads); } #else rot_compute(n, x, inc_x, y, inc_y, c, s); diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index 50c8a2678..c52575d07 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -215,7 +215,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, y, inc_y, result, 0, - ( void *)zdot_thread_function, nthreads); + (int (*)(void))zdot_thread_function, nthreads); ptr = (OPENBLAS_COMPLEX_FLOAT *)result; for (i = 0; i < nthreads; i++) { From 64365c919e63baaef31f5c52d39ae53d77a98c85 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Dec 2021 18:47:35 +0100 Subject: [PATCH 07/77] fix function typecasts --- interface/axpy.c | 2 +- interface/scal.c | 2 +- interface/zaxpy.c | 4 ++-- interface/zscal.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/interface/axpy.c b/interface/axpy.c index eaa19f4df..5304ebec3 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -115,7 +115,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc #endif blas_level1_thread(mode, n, 0, 0, &alpha, - x, incx, y, incy, NULL, 0, (void *)AXPYU_K, nthreads); + x, incx, y, incy, NULL, 0, (int (*)(void))AXPYU_K, nthreads); } #endif diff --git a/interface/scal.c b/interface/scal.c index 6d07b1650..0a7fee640 100644 --- a/interface/scal.c +++ b/interface/scal.c @@ -102,7 +102,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ #else &alpha, #endif - x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads); + x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads); } #endif diff --git a/interface/zaxpy.c b/interface/zaxpy.c index da3b48ead..0e168606d 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -128,9 +128,9 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, 
blasint in blas_level1_thread(mode, n, 0, 0, ALPHA, x, incx, y, incy, NULL, 0, #ifndef CONJ - (void *)AXPYU_K, + (int (*)(void))AXPYU_K, #else - (void *)AXPYC_K, + (int (*)(void))AXPYC_K, #endif nthreads); } diff --git a/interface/zscal.c b/interface/zscal.c index bfaddc260..498377343 100644 --- a/interface/zscal.c +++ b/interface/zscal.c @@ -108,7 +108,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ mode = BLAS_SINGLE | BLAS_COMPLEX; #endif - blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads); + blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads); } #endif From c49d46f25f9c4f626f4a197b01bad749a9d5a7a6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Dec 2021 18:49:18 +0100 Subject: [PATCH 08/77] fix function typecast --- lapack/getrf/getrf_parallel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index fc410b0e7..fed5c1de5 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -662,7 +662,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0, - ipiv, 1, (void *)LASWP_PLUS, args -> nthreads); + ipiv, 1, (int (*)(void))LASWP_PLUS, args -> nthreads); is += bk; } From aecb4a5e8daab1b50ae34636001dfdb234948765 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Dec 2021 18:50:22 +0100 Subject: [PATCH 09/77] fix function typecasts --- lapack/lauum/lauum_L_parallel.c | 4 ++-- lapack/lauum/lauum_U_parallel.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack/lauum/lauum_L_parallel.c b/lapack/lauum/lauum_L_parallel.c index 0ebe3f069..1b32e4519 100644 --- a/lapack/lauum/lauum_L_parallel.c +++ b/lapack/lauum/lauum_L_parallel.c @@ -102,7 +102,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.c = a; syrk_thread(mode | BLAS_TRANSA_T | BLAS_TRANSB_N | BLAS_UPLO, - &newarg, NULL, NULL, (void *)HERK_LC, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))HERK_LC, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = i; @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + (i ) * COMPSIZE; gemm_thread_n(mode | BLAS_TRANSA_T, - &newarg, NULL, NULL, (void *)TRMM_LCLN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))TRMM_LCLN, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = bk; diff --git a/lapack/lauum/lauum_U_parallel.c b/lapack/lauum/lauum_U_parallel.c index 7214c9731..f5ea54c88 100644 --- a/lapack/lauum/lauum_U_parallel.c +++ b/lapack/lauum/lauum_U_parallel.c @@ -102,7 +102,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.c = a; syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, - &newarg, NULL, NULL, (void *)HERK_UN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))HERK_UN, sa, sb, args -> nthreads); newarg.m = i; newarg.n = bk; @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + ( i * lda) * COMPSIZE; gemm_thread_m(mode | BLAS_TRANSA_T | BLAS_RSIDE, - &newarg, NULL, NULL, (void *)TRMM_RCUN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))TRMM_RCUN, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = bk; 
From 6b407a16cb089492d3ad1e2a1f5fdb71f4ffdd94 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Dec 2021 18:51:28 +0100 Subject: [PATCH 10/77] fix function typecasts --- lapack/potrf/potrf_L_parallel.c | 4 ++-- lapack/potrf/potrf_U_parallel.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack/potrf/potrf_L_parallel.c b/lapack/potrf/potrf_L_parallel.c index 68ec8e22a..986816d1a 100644 --- a/lapack/potrf/potrf_L_parallel.c +++ b/lapack/potrf/potrf_L_parallel.c @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + (i + bk + i * lda) * COMPSIZE; gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, - &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))TRSM_RCLN, sa, sb, args -> nthreads); newarg.n = n - i - bk; newarg.k = bk; @@ -121,7 +121,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); #else syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, - &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))HERK_LN, sa, sb, args -> nthreads); #endif } } diff --git a/lapack/potrf/potrf_U_parallel.c b/lapack/potrf/potrf_U_parallel.c index 3b5d39511..cc6ff9912 100644 --- a/lapack/potrf/potrf_U_parallel.c +++ b/lapack/potrf/potrf_U_parallel.c @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; gemm_thread_n(mode | BLAS_TRANSA_T, - &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))TRSM_LCUN, sa, sb, args -> nthreads); newarg.n = n - i - bk; newarg.k = bk; @@ -121,7 +121,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); #else syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, - &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))HERK_UC, sa, sb, args -> nthreads); #endif } } From 9809931eb46c483ff3e6ab301a262eb879072450 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Dec 2021 18:53:55 +0100 Subject: [PATCH 11/77] clean up unused variables and unreachable statements --- cpuid_x86.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 72e95214e..6466bd148 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -323,9 +323,11 @@ int get_vendor(void){ int get_cputype(int gettype){ int eax, ebx, ecx, edx; +/* int extend_family, family; int extend_model, model; int type, stepping; +*/ int feature = 0; cpuid(1, &eax, &ebx, &ecx, &edx); @@ -428,7 +430,8 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ cpuid(0, &cpuid_level, &ebx, &ecx, &edx); if (cpuid_level > 1) { - int numcalls =0 ; + int numcalls; + cpuid(2, &eax, &ebx, &ecx, &edx); numcalls = BITMASK(eax, 0, 0xff); //FIXME some systems may require repeated calls to read all entries info[ 0] = BITMASK(eax, 8, 0xff); @@ -1637,7 +1640,6 @@ int get_cpuname(void){ else return CPUTYPE_BARCELONA; } - break; case 10: // Zen3 if(support_avx()) #ifndef NO_AVX2 @@ -2193,7 +2195,6 @@ int get_coretype(void){ else return CORE_NEHALEM; #endif - break; case 7: if (model == 10) @@ -2582,4 +2583,4 @@ void get_sse(void){ if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n"); } -//} \ No newline at end of file +//} From 
2db0b2e4453b0a502cf336f6288688c23246d202 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Thu, 23 Dec 2021 20:04:27 +0800 Subject: [PATCH 12/77] Fixed MSA enabled optimization on Loongson-3A4000 --- cpuid_mips.c | 6 +++--- cpuid_mips64.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpuid_mips.c b/cpuid_mips.c index 1946455d8..d787e7120 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -165,7 +165,7 @@ void get_cpuconfig(void){ }else{ printf("#define UNKNOWN\n"); } - if (!get_feature(msa)) printf("#define NO_MSA\n"); + if (!get_feature("msa")) printf("#define NO_MSA\n"); } void get_libname(void){ @@ -193,7 +193,7 @@ int get_feature(char *search) while (fgets(buffer, sizeof(buffer), infile)) { - if (!strncmp("Features", buffer, 8)) + if (!strncmp("Features", buffer, 8) || !strncmp("ASEs implemented", buffer, 16)) { p = strchr(buffer, ':') + 2; break; @@ -207,7 +207,7 @@ int get_feature(char *search) t = strtok(p," "); while( t = strtok(NULL," ")) { - if (!strcmp(t, search)) { return(1); } + if (strstr(t, search)) { return(1); } } #endif diff --git a/cpuid_mips64.c b/cpuid_mips64.c index 97743bc43..8753ee3f0 100644 --- a/cpuid_mips64.c +++ b/cpuid_mips64.c @@ -201,7 +201,7 @@ void get_cpuconfig(void){ printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 8\n"); } - if (!get_feature(msa)) printf("#define NO_MSA\n"); + if (!get_feature("msa")) printf("#define NO_MSA\n"); } void get_libname(void){ @@ -233,7 +233,7 @@ int get_feature(char *search) while (fgets(buffer, sizeof(buffer), infile)) { - if (!strncmp("Features", buffer, 8)) + if (!strncmp("Features", buffer, 8) || !strncmp("ASEs implemented", buffer, 16)) { p = strchr(buffer, ':') + 2; break; @@ -247,7 +247,7 @@ int get_feature(char *search) t = strtok(p," "); while( t = strtok(NULL," ")) { - if (!strcmp(t, search)) { return(1); } + if (strstr(t, search)) { return(1); } } #endif From e9a0e52201282ee1caec67475307aa7717b2bc31 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 24 Dec 2021 20:00:50 +0100 Subject: [PATCH 13/77] fix function typecast --- kernel/x86_64/casum.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/casum.c b/kernel/x86_64/casum.c index a1bd76f33..60feec0ce 100644 --- a/kernel/x86_64/casum.c +++ b/kernel/x86_64/casum.c @@ -130,7 +130,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) mode = BLAS_DOUBLE | BLAS_COMPLEX; #endif blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, - NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { sumf += (*ptr); From 7b146e590c1d93c62cb0a7590a3ca287bcde52c6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 24 Dec 2021 20:01:52 +0100 Subject: [PATCH 14/77] fix function typecast --- kernel/x86_64/zasum.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/zasum.c b/kernel/x86_64/zasum.c index 6e758e2e3..80e95a2c8 100644 --- a/kernel/x86_64/zasum.c +++ b/kernel/x86_64/zasum.c @@ -130,7 +130,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) mode = BLAS_DOUBLE | BLAS_COMPLEX; #endif blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, - NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { sumf += (*ptr); From 683a7548bf34f610f5bdedfac5c1dac425c66a59 Mon Sep 17 00:00:00 
2001 From: Bine Brank Date: Sat, 25 Dec 2021 11:46:41 +0100 Subject: [PATCH 15/77] added macros for sve zgemm kernels --- kernel/arm64/zgemm_kernel_sve_v1x4.S | 1159 ++++++++++++++++++++++++++ 1 file changed, 1159 insertions(+) create mode 100644 kernel/arm64/zgemm_kernel_sve_v1x4.S diff --git a/kernel/arm64/zgemm_kernel_sve_v1x4.S b/kernel/arm64/zgemm_kernel_sve_v1x4.S new file mode 100644 index 000000000..0fc966f8c --- /dev/null +++ b/kernel/arm64/zgemm_kernel_sve_v1x4.S @@ -0,0 +1,1159 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define alphaR x19 +#define alphaI x20 + +#define alphaz_R z10.d +#define alphaz_I z11.d +#define alpha0_R d10 +#define alphaV0_R v10.d[0] +#define alpha0_I d11 +#define alphaV0_I v11.d[0] + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x4_I + ld2d {z0.d, z1.d}, p1/z, [pA] + ld2d {z2.d, z3.d}, p1/z, [pA, lanes, lsl #4] // next one + add pA, pA, lanes, lsl #5 // pA += lanes*2*2*8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] +#if 
defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.d, p1/m, z0.d, z9.d +#else + fmla z17.d, p1/m, z0.d, z9.d +#endif + OP_ii z16.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + + fmla z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.d, p1/m, z0.d, z11.d +#else + fmla z19.d, p1/m, z0.d, z11.d +#endif + ld1rd z11.d, p0/z, [pB, 24] + + + fmla z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.d, p1/m, z0.d, z13.d +#else + fmla z21.d, p1/m, z0.d, z13.d +#endif + OP_ii z20.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + + fmla z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.d, p1/m, z0.d, z15.d +#else + fmla z23.d, p1/m, z0.d, z15.d +#endif + OP_ii z22.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2d {z2.d, z3.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2d {z2.d, z3.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes *2 * 8 + + OP_rr z16.d, p1/m, z2.d, z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_E + OP_rr z16.d, 
p1/m, z2.d, z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 64 + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2d {z24.d, z25.d}, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z25.d, z26.d}, p1, [pCRow0] + + add pCRow0, pCRow0, #32 + + ld2d {z26.d, z27.d}, p1/z, [pCRow0] + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow0] + + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld2d {z28.d, z29.d}, p1/z, [pCRow1] + fmla z28.d, p1/m, z20.d, alphaz_R + fmls z28.d, p1/m, z21.d, alphaz_I + fmla z29.d, p1/m, z20.d, alphaz_I + fmla z29.d, p1/m, z21.d, alphaz_R + st2d {z28.d, z29.d}, p1, [pCRow1] + + add pCRow1, pCRow1, #32 + + ld2d {z30.d, z31.d}, p1/z, [pCRow1] + fmla z30.d, p1/m, z22.d, alphaz_R + fmls z30.d, p1/m, z23.d, alphaz_I + fmla z31.d, p1/m, z22.d, alphaz_I + fmla z31.d, p1/m, z23.d, alphaz_R + st2d {z30.d, z31.d}, p1, [pCRow1] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 32 +.endm + +.macro SAVEv1x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] 
+ + ld2d {z24.d, z25.d}, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z25.d, z26.d}, p1, [pCRow0] + + add pCRow0, pCRow0, #32 + + ld2d {z26.d, z27.d}, p1/z, [pCRow0] + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow0] + + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 + +.endm + +/******************************************************************************/ + + +.macro INITv1x1 + dup z16.d, #0 + dup z17.d, #0 +.endm + + +.macro KERNELv1x1_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d +.endm + +.macro SAVEv1x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2d {z24.d, z25.d}, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z25.d, z26.d}, p1, [pCRow0] + + add pCRow0, pCRow0, #32 + + + add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, d0 + fmov alphaI, d1 + + lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble .Lzgemm_kernel_L2_BEGIN + +.Lzgemm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + + mov pA, origPA // pA = start of A array + +.Lzgemm_kernel_L4_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble .Lzgemm_kernel_L4_M2_BEGIN + + .align 5 +.Lzgemm_kernel_L4_M4_20: + + mov pB, origPB + asr counterL , origK, #3 + cmp counterL , #2 + blt .Lzgemm_kernel_L4_M4_32 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lzgemm_kernel_L4_M4_22a + + .align 5 +.Lzgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L4_M4_22 + + .align 5 +.Lzgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + 
KERNEL4x4_E + + b .Lzgemm_kernel_L4_M4_44 + + .align 5 +.Lzgemm_kernel_L4_M4_32: + + tst counterL, #1 + ble .Lzgemm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b .Lzgemm_kernel_L4_M4_44 + + +.Lzgemm_kernel_L4_M4_40: + + INIT4x4 + +.Lzgemm_kernel_L4_M4_44: + + ands counterL , origK, #7 + ble .Lzgemm_kernel_L4_M4_100 + + .align 5 +.Lzgemm_kernel_L4_M4_46: + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bne .Lzgemm_kernel_L4_M4_46 + +.Lzgemm_kernel_L4_M4_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVE4x4 + +.Lzgemm_kernel_L4_M4_END: + subs counterI, counterI, #1 + bne .Lzgemm_kernel_L4_M4_20 + +.Lzgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lzgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lzgemm_kernel_L4_M1_BEGIN + +.Lzgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lzgemm_kernel_L4_M2_40 + +.Lzgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L4_M2_22 + + +.Lzgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L4_M2_100 + +.Lzgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L4_M2_42 + +.Lzgemm_kernel_L4_M2_100: + + SAVE2x4 + +.Lzgemm_kernel_L4_M2_END: + + +.Lzgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lzgemm_kernel_L4_END + +.Lzgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lzgemm_kernel_L4_M1_40 + +.Lzgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L4_M1_22 + + +.Lzgemm_kernel_L4_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L4_M1_100 + +.Lzgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L4_M1_42 + +.Lzgemm_kernel_L4_M1_100: + + SAVE1x4 + + +.Lzgemm_kernel_L4_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 4 * 8 * 2 + + subs counterJ, counterJ , #1 // j-- + bgt .Lzgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lzgemm_kernel_L999 + + tst counterJ , #2 + ble .Lzgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + + + +.Lzgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble .Lzgemm_kernel_L2_M2_BEGIN + +.Lzgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lzgemm_kernel_L2_M4_40 + .align 5 + +.Lzgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_M4_22 + + +.Lzgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble 
.Lzgemm_kernel_L2_M4_100 + +.Lzgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_M4_42 + +.Lzgemm_kernel_L2_M4_100: + + SAVE4x2 + +.Lzgemm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt .Lzgemm_kernel_L2_M4_20 + + +.Lzgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lzgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lzgemm_kernel_L2_M1_BEGIN + +.Lzgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lzgemm_kernel_L2_M2_40 + +.Lzgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_M2_22 + + +.Lzgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L2_M2_100 + +.Lzgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_M2_42 + +.Lzgemm_kernel_L2_M2_100: + + SAVE2x2 + +.Lzgemm_kernel_L2_M2_END: + + +.Lzgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lzgemm_kernel_L2_END + +.Lzgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble .Lzgemm_kernel_L2_M1_40 + +.Lzgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_M1_22 + + +.Lzgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L2_M1_100 + +.Lzgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_M1_42 + +.Lzgemm_kernel_L2_M1_100: + + SAVE1x2 + + +.Lzgemm_kernel_L2_END: + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 2 * 8 * 2 + +/******************************************************************************/ + +.Lzgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lzgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + + + +.Lzgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble .Lzgemm_kernel_L1_M2_BEGIN + +.Lzgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lzgemm_kernel_L1_M4_40 + .align 5 + +.Lzgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_M4_22 + + +.Lzgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L1_M4_100 + +.Lzgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_M4_42 + +.Lzgemm_kernel_L1_M4_100: + + SAVE4x1 + +.Lzgemm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt .Lzgemm_kernel_L1_M4_20 + + +.Lzgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lzgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lzgemm_kernel_L1_M1_BEGIN + +.Lzgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble 
.Lzgemm_kernel_L1_M2_40 + +.Lzgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_M2_22 + + +.Lzgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L1_M2_100 + +.Lzgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_M2_42 + +.Lzgemm_kernel_L1_M2_100: + + SAVE2x1 + +.Lzgemm_kernel_L1_M2_END: + + +.Lzgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lzgemm_kernel_L1_END + +.Lzgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lzgemm_kernel_L1_M1_40 + +.Lzgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_M1_22 + + +.Lzgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L1_M1_100 + +.Lzgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_M1_42 + +.Lzgemm_kernel_L1_M1_100: + + SAVE1x1 + + +.Lzgemm_kernel_L1_END: + + +.Lzgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + From 878064f39463631e0daf78395248083f1c8b251f Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 26 Dec 2021 08:44:05 +0100 Subject: [PATCH 16/77] sve zgemm kernel --- kernel/arm64/zgemm_kernel_sve_v1x4.S | 544 +++++++-------------------- 1 file changed, 132 insertions(+), 412 deletions(-) diff --git a/kernel/arm64/zgemm_kernel_sve_v1x4.S b/kernel/arm64/zgemm_kernel_sve_v1x4.S index 0fc966f8c..1201d6dac 100644 --- a/kernel/arm64/zgemm_kernel_sve_v1x4.S +++ b/kernel/arm64/zgemm_kernel_sve_v1x4.S @@ -48,6 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow2 x14 #define pCRow3 x15 #define pA x16 +#define lanes x17 + #define alphaR x19 #define alphaI x20 @@ -168,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x4_I ld2d {z0.d, z1.d}, p1/z, [pA] - ld2d {z2.d, z3.d}, p1/z, [pA, lanes, lsl #4] // next one + ld2d {z2.d, z3.d}, p1/z, [pA, #2, mul vl] // next one add pA, pA, lanes, lsl #5 // pA += lanes*2*2*8 ld1rd z8.d, p0/z, [pB] @@ -561,17 +563,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prfm PLDL1KEEP, [origPA] fmov alphaR, d0 + dup alphaz_R, alphaR fmov alphaI, d1 + dup alphaz_I, alphaI lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 + ptrue p0.d // create true predicate mov pB, origPB +// Loop over N mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 ble .Lzgemm_kernel_L2_BEGIN +/******************************************************************************/ .Lzgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC @@ -582,204 +589,112 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mov pA, origPA // pA = start of A array -.Lzgemm_kernel_L4_M4_BEGIN: +.Lzgemm_kernel_L4_Mv1_BEGIN: - mov counterI, origM - asr counterI, counterI, #2 // counterI = counterI / 4 - cmp counterI, #0 - ble .Lzgemm_kernel_L4_M2_BEGIN +/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.d, counterI, origM + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension .align 5 -.Lzgemm_kernel_L4_M4_20: +.Lzgemm_kernel_L4_Mv1_20: mov pB, origPB + INITv1x4 // fill with zeros + asr counterL , origK, #3 cmp counterL , #2 - blt .Lzgemm_kernel_L4_M4_32 + blt .Lzgemm_kernel_L4_Mv1_32 - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 subs counterL, counterL, #2 // subtract 2 - ble .Lzgemm_kernel_L4_M4_22a + ble .Lzgemm_kernel_L4_Mv1_22a .align 5 -.Lzgemm_kernel_L4_M4_22: +.Lzgemm_kernel_L4_Mv1_22: - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L4_M4_22 + bgt .Lzgemm_kernel_L4_Mv1_22 .align 5 -.Lzgemm_kernel_L4_M4_22a: +.Lzgemm_kernel_L4_Mv1_22a: - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E - b .Lzgemm_kernel_L4_M4_44 + b .Lzgemm_kernel_L4_Mv1_44 .align 5 -.Lzgemm_kernel_L4_M4_32: +.Lzgemm_kernel_L4_Mv1_32: tst counterL, #1 - ble .Lzgemm_kernel_L4_M4_40 + ble .Lzgemm_kernel_L4_Mv1_40 - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E - b .Lzgemm_kernel_L4_M4_44 + b .Lzgemm_kernel_L4_Mv1_44 -.Lzgemm_kernel_L4_M4_40: +.Lzgemm_kernel_L4_Mv1_40: - INIT4x4 + INITv1x4 -.Lzgemm_kernel_L4_M4_44: +.Lzgemm_kernel_L4_Mv1_44: ands counterL , origK, #7 - ble .Lzgemm_kernel_L4_M4_100 + ble .Lzgemm_kernel_L4_Mv1_100 .align 5 -.Lzgemm_kernel_L4_M4_46: - KERNEL4x4_SUB +.Lzgemm_kernel_L4_Mv1_46: + KERNELv1x4_SUB subs counterL, counterL, #1 - bne .Lzgemm_kernel_L4_M4_46 + bne .Lzgemm_kernel_L4_Mv1_46 -.Lzgemm_kernel_L4_M4_100: +.Lzgemm_kernel_L4_Mv1_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] - SAVE4x4 + SAVEv1x4 -.Lzgemm_kernel_L4_M4_END: - subs counterI, counterI, #1 - bne .Lzgemm_kernel_L4_M4_20 +.Lzgemm_kernel_L4_Mv1_END: -.Lzgemm_kernel_L4_M2_BEGIN: + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + b.any .Lzgemm_kernel_L4_Mv1_20 - mov counterI, origM - tst counterI , #3 - ble .Lzgemm_kernel_L4_END - - tst counterI, #2 // counterI = counterI / 2 - ble .Lzgemm_kernel_L4_M1_BEGIN - -.Lzgemm_kernel_L4_M2_20: - - INIT2x4 - - mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 - cmp counterL , #0 - ble .Lzgemm_kernel_L4_M2_40 - -.Lzgemm_kernel_L4_M2_22: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - 
KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L4_M2_22 - - -.Lzgemm_kernel_L4_M2_40: - - ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L4_M2_100 - -.Lzgemm_kernel_L4_M2_42: - - KERNEL2x4_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L4_M2_42 - -.Lzgemm_kernel_L4_M2_100: - - SAVE2x4 - -.Lzgemm_kernel_L4_M2_END: - - -.Lzgemm_kernel_L4_M1_BEGIN: - - tst counterI, #1 // counterI = counterI % 2 - ble .Lzgemm_kernel_L4_END - -.Lzgemm_kernel_L4_M1_20: - - INIT1x4 - - mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 - cmp counterL , #0 - ble .Lzgemm_kernel_L4_M1_40 - -.Lzgemm_kernel_L4_M1_22: - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L4_M1_22 - - -.Lzgemm_kernel_L4_M1_40: - - ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L4_M1_100 - -.Lzgemm_kernel_L4_M1_42: - - KERNEL1x4_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L4_M1_42 - -.Lzgemm_kernel_L4_M1_100: - - SAVE1x4 .Lzgemm_kernel_L4_END: @@ -810,157 +725,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -.Lzgemm_kernel_L2_M4_BEGIN: +.Lzgemm_kernel_L2_Mv1_BEGIN: - mov counterI, origM - asr counterI, counterI, #2 // counterI = counterI / 4 - cmp counterI,#0 - ble .Lzgemm_kernel_L2_M2_BEGIN + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d -.Lzgemm_kernel_L2_M4_20: - INIT4x2 +.Lzgemm_kernel_L2_Mv1_20: + + INITv1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble .Lzgemm_kernel_L2_M4_40 + ble .Lzgemm_kernel_L2_Mv1_40 .align 5 -.Lzgemm_kernel_L2_M4_22: - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB +.Lzgemm_kernel_L2_Mv1_22: + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L2_M4_22 + bgt .Lzgemm_kernel_L2_Mv1_22 -.Lzgemm_kernel_L2_M4_40: +.Lzgemm_kernel_L2_Mv1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L2_M4_100 + ble .Lzgemm_kernel_L2_Mv1_100 -.Lzgemm_kernel_L2_M4_42: +.Lzgemm_kernel_L2_Mv1_42: - KERNEL4x2_SUB + KERNELv1x2_SUB subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L2_M4_42 + bgt .Lzgemm_kernel_L2_Mv1_42 -.Lzgemm_kernel_L2_M4_100: +.Lzgemm_kernel_L2_Mv1_100: - SAVE4x2 + SAVEv1x2 -.Lzgemm_kernel_L2_M4_END: - - subs counterI, counterI, #1 - bgt .Lzgemm_kernel_L2_M4_20 +.Lzgemm_kernel_L2_Mv1_END: -.Lzgemm_kernel_L2_M2_BEGIN: - - mov counterI, origM - tst counterI , #3 - ble .Lzgemm_kernel_L2_END - - tst counterI, #2 // counterI = counterI / 2 - ble .Lzgemm_kernel_L2_M1_BEGIN - -.Lzgemm_kernel_L2_M2_20: - - INIT2x2 - - mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 - cmp counterL,#0 - ble .Lzgemm_kernel_L2_M2_40 - -.Lzgemm_kernel_L2_M2_22: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L2_M2_22 - - -.Lzgemm_kernel_L2_M2_40: - - ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L2_M2_100 - -.Lzgemm_kernel_L2_M2_42: - - KERNEL2x2_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L2_M2_42 - 
-.Lzgemm_kernel_L2_M2_100: - - SAVE2x2 - -.Lzgemm_kernel_L2_M2_END: - - -.Lzgemm_kernel_L2_M1_BEGIN: - - tst counterI, #1 // counterI = counterI % 2 - ble .Lzgemm_kernel_L2_END - -.Lzgemm_kernel_L2_M1_20: - - INIT1x2 - - mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 - cmp counterL, #0 - ble .Lzgemm_kernel_L2_M1_40 - -.Lzgemm_kernel_L2_M1_22: - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L2_M1_22 - - -.Lzgemm_kernel_L2_M1_40: - - ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L2_M1_100 - -.Lzgemm_kernel_L2_M1_42: - - KERNEL1x2_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L2_M1_42 - -.Lzgemm_kernel_L2_M1_100: - - SAVE1x2 + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lzgemm_kernel_L2_Mv1_20 .Lzgemm_kernel_L2_END: @@ -981,163 +800,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pA, origPA // pA = A +.Lzgemm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d -.Lzgemm_kernel_L1_M4_BEGIN: +.Lzgemm_kernel_L1_Mv1_20: - mov counterI, origM - asr counterI, counterI, #2 // counterI = counterI / 4 - cmp counterI, #0 - ble .Lzgemm_kernel_L1_M2_BEGIN - -.Lzgemm_kernel_L1_M4_20: - - INIT4x1 + INITv1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble .Lzgemm_kernel_L1_M4_40 + ble .Lzgemm_kernel_L1_Mv1_40 .align 5 -.Lzgemm_kernel_L1_M4_22: - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB +.Lzgemm_kernel_L1_Mv1_22: + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L1_M4_22 + bgt .Lzgemm_kernel_L1_Mv1_22 -.Lzgemm_kernel_L1_M4_40: +.Lzgemm_kernel_L1_Mv1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L1_M4_100 + ble .Lzgemm_kernel_L1_Mv1_100 -.Lzgemm_kernel_L1_M4_42: +.Lzgemm_kernel_L1_Mv1_42: - KERNEL4x1_SUB + KERNELv1x1_SUB subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L1_M4_42 + bgt .Lzgemm_kernel_L1_Mv1_42 -.Lzgemm_kernel_L1_M4_100: +.Lzgemm_kernel_L1_Mv1_100: - SAVE4x1 + SAVEv1x1 -.Lzgemm_kernel_L1_M4_END: - - subs counterI, counterI, #1 - bgt .Lzgemm_kernel_L1_M4_20 - - -.Lzgemm_kernel_L1_M2_BEGIN: - - mov counterI, origM - tst counterI , #3 - ble .Lzgemm_kernel_L1_END - - tst counterI, #2 // counterI = counterI / 2 - ble .Lzgemm_kernel_L1_M1_BEGIN - -.Lzgemm_kernel_L1_M2_20: - - INIT2x1 - - mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 - cmp counterL , #0 - ble .Lzgemm_kernel_L1_M2_40 - -.Lzgemm_kernel_L1_M2_22: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L1_M2_22 - - -.Lzgemm_kernel_L1_M2_40: - - ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L1_M2_100 - -.Lzgemm_kernel_L1_M2_42: - - KERNEL2x1_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L1_M2_42 - -.Lzgemm_kernel_L1_M2_100: - - SAVE2x1 - -.Lzgemm_kernel_L1_M2_END: - - -.Lzgemm_kernel_L1_M1_BEGIN: - - tst counterI, #1 // counterI = counterI % 2 - ble .Lzgemm_kernel_L1_END - -.Lzgemm_kernel_L1_M1_20: - - INIT1x1 - - mov pB, 
origPB - asr counterL , origK, #3 // counterL = counterL / 8 - cmp counterL , #0 - ble .Lzgemm_kernel_L1_M1_40 - -.Lzgemm_kernel_L1_M1_22: - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L1_M1_22 - - -.Lzgemm_kernel_L1_M1_40: - - ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L1_M1_100 - -.Lzgemm_kernel_L1_M1_42: - - KERNEL1x1_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L1_M1_42 - -.Lzgemm_kernel_L1_M1_100: - - SAVE1x1 +.Lzgemm_kernel_L1_Mv1_END: + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lzgemm_kernel_L1_Mv1_20 .Lzgemm_kernel_L1_END: +/******************************************************************************/ .Lzgemm_kernel_L999: mov x0, #0 // set return value From 6ec4aab8754b4c0fa5a6dd359fe56ee755e04ee3 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 26 Dec 2021 17:05:46 +0100 Subject: [PATCH 17/77] zgemm sve copy routines --- kernel/arm64/zgemm_ncopy_sve_v1.c | 80 +++++++++++++++++++++++++++++++ kernel/arm64/zgemm_tcopy_sve_v1.c | 77 +++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 kernel/arm64/zgemm_ncopy_sve_v1.c create mode 100644 kernel/arm64/zgemm_tcopy_sve_v1.c diff --git a/kernel/arm64/zgemm_ncopy_sve_v1.c b/kernel/arm64/zgemm_ncopy_sve_v1.c new file mode 100644 index 000000000..be18e9708 --- /dev/null +++ b/kernel/arm64/zgemm_ncopy_sve_v1.c @@ -0,0 +1,80 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + svint64_t lda_vec = svindex_s64(0LL, lda * 2); + uint64_t sve_size = svcntd(); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { + + aoffset1 = aoffset; + + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64_t a_vec_real = svld1_gather_index(pg, (double *) aoffset1, lda_vec); + svfloat64_t a_vec_imag = svld1_gather_index(pg, ((double *) aoffset1) + 1, lda_vec); + svst2_f64(pg, (double *) boffset, svcreate2(a_vec_real, a_vec_imag)); + aoffset1 += 2; + boffset += active; + } + aoffset += sve_size * lda * 2; + + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + + + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} diff --git a/kernel/arm64/zgemm_tcopy_sve_v1.c b/kernel/arm64/zgemm_tcopy_sve_v1.c new file mode 100644 index 000000000..085e1fa40 --- /dev/null +++ b/kernel/arm64/zgemm_tcopy_sve_v1.c @@ -0,0 +1,77 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + uint64_t sve_size = svcntd(); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { + + aoffset1 = aoffset; + + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64x2_t a_vec = svld2(pg, (double *)aoffset1); + svst2_f64(pg, (double *) boffset, a_vec); + aoffset1 += lda * 2; + boffset += active * 2; + } + aoffset += sve_size * 2; + + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} From 6cae44d4f7afc6352a6521e717eff80f0220aded Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Dec 2021 19:06:55 +0100 Subject: [PATCH 18/77] Ensure that the right xerbla gets included in OSX DYNAMIC_ARCH builds --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 913017c63..decd8cc2d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -251,12 +251,14 @@ if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS) set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) set (CMAKE_Fortran_CREATE_SHARED_LIBRARY "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " + "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") else () set (CMAKE_C_CREATE_SHARED_LIBRARY "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " + "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") endif () endif() From 40b14e4957b9a5d9bbda30fc10aeeba485755f3c Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 29 Dec 2021 11:42:04 +0100 Subject: [PATCH 19/77] fix zgemm kernel --- kernel/arm64/zgemm_kernel_sve_v1x4.S | 59 +++++++++++++--------------- kernel/arm64/zgemm_ncopy_sve_v1.c | 2 +- kernel/arm64/zgemm_tcopy_sve_v1.c | 2 +- 3 files changed, 29 insertions(+), 34 deletions(-) diff --git a/kernel/arm64/zgemm_kernel_sve_v1x4.S b/kernel/arm64/zgemm_kernel_sve_v1x4.S index 1201d6dac..d5b35775c 100644 --- a/kernel/arm64/zgemm_kernel_sve_v1x4.S +++ b/kernel/arm64/zgemm_kernel_sve_v1x4.S @@ -53,12 +53,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define alphaR x19 #define alphaI x20 -#define alphaz_R z10.d -#define alphaz_I z11.d -#define alpha0_R d10 -#define alphaV0_R v10.d[0] -#define alpha0_I d11 -#define alphaV0_I v11.d[0] +#define alphaz_R z6.d +#define alphaz_I z7.d +#define alpha0_R d6 +#define alpha0_I d7 #define A_PRE_SIZE 2560 @@ -170,8 +168,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x4_I ld2d {z0.d, z1.d}, p1/z, [pA] - ld2d {z2.d, z3.d}, p1/z, [pA, #2, mul vl] // next one - add pA, pA, lanes, lsl #5 // pA += lanes*2*2*8 + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 + ld2d {z2.d, z3.d}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 ld1rd z8.d, p0/z, [pB] ld1rd z9.d, p0/z, [pB, 8] @@ -283,7 +282,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNELv1x4_M2 - ld2d {z2.d, z3.d}, p1/z, [pA] + ld2d {z0.d, z1.d}, p1/z, [pA] add pA, pA, lanes, lsl #4 // pA = pA + lanes *2 * 8 OP_rr z16.d, p1/m, z2.d, z8.d @@ -396,39 +395,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmls z24.d, p1/m, z17.d, alphaz_I fmla z25.d, p1/m, z16.d, alphaz_I fmla z25.d, p1/m, z17.d, alphaz_R - st2d {z25.d, z26.d}, p1, [pCRow0] + st2d {z24.d, z25.d}, p1, [pCRow0] - add pCRow0, pCRow0, #32 + add pCRow0, pCRow0, lanes, lsl #4 - ld2d {z26.d, z27.d}, p1/z, [pCRow0] + ld2d {z26.d, z27.d}, p1/z, [pCRow1] fmla z26.d, p1/m, z18.d, alphaz_R fmls z26.d, p1/m, z19.d, alphaz_I fmla z27.d, p1/m, z18.d, alphaz_I fmla z27.d, p1/m, z19.d, alphaz_R - st2d {z26.d, z27.d}, p1, [pCRow0] + st2d {z26.d, z27.d}, p1, [pCRow1] - add pCRow0, pCRow0, #32 + add pCRow1, pCRow1, lanes, lsl #4 prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - ld2d {z28.d, z29.d}, p1/z, [pCRow1] + ld2d {z28.d, z29.d}, p1/z, [pCRow2] fmla z28.d, p1/m, z20.d, alphaz_R fmls z28.d, p1/m, z21.d, alphaz_I fmla z29.d, p1/m, z20.d, alphaz_I fmla z29.d, p1/m, z21.d, alphaz_R - st2d {z28.d, z29.d}, p1, [pCRow1] + st2d {z28.d, z29.d}, p1, [pCRow2] - add pCRow1, pCRow1, #32 + add pCRow2, pCRow2, lanes, lsl #4 - ld2d {z30.d, z31.d}, p1/z, [pCRow1] + ld2d {z30.d, z31.d}, p1/z, [pCRow3] fmla z30.d, p1/m, z22.d, alphaz_R fmls z30.d, p1/m, z23.d, alphaz_I fmla z31.d, p1/m, z22.d, alphaz_I fmla z31.d, p1/m, z23.d, alphaz_R - st2d {z30.d, z31.d}, p1, [pCRow1] + st2d {z30.d, z31.d}, p1, [pCRow3] - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 + add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8 prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] @@ -474,24 +473,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmls z24.d, p1/m, z17.d, alphaz_I fmla z25.d, p1/m, z16.d, alphaz_I fmla z25.d, p1/m, z17.d, alphaz_R - st2d {z25.d, z26.d}, p1, [pCRow0] + st2d {z24.d, z25.d}, p1, [pCRow0] - add pCRow0, pCRow0, #32 + add pCRow0, pCRow0, lanes, lsl #4 - ld2d {z26.d, z27.d}, p1/z, [pCRow0] + ld2d {z26.d, z27.d}, p1/z, [pCRow1] fmla z26.d, p1/m, z18.d, alphaz_R fmls z26.d, p1/m, z19.d, alphaz_I fmla z27.d, p1/m, z18.d, alphaz_I fmla z27.d, p1/m, z19.d, alphaz_R - st2d {z26.d, z27.d}, p1, [pCRow0] + st2d {z26.d, z27.d}, p1, [pCRow1] - add pCRow0, pCRow0, #32 + add pCRow1, pCRow1, lanes, lsl #4 prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] - add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 - .endm /******************************************************************************/ @@ -526,10 +523,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmls z24.d, p1/m, z17.d, alphaz_I fmla z25.d, p1/m, z16.d, alphaz_I fmla z25.d, p1/m, z17.d, alphaz_R - st2d {z25.d, z26.d}, p1, [pCRow0] - - add pCRow0, pCRow0, #32 - + st2d {z24.d, z25.d}, p1, [pCRow0] add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 @@ -718,6 +712,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ble .Lzgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC + add pCRow1, pCRow0, LDC add pC,pC,LDC, lsl #1 diff --git a/kernel/arm64/zgemm_ncopy_sve_v1.c b/kernel/arm64/zgemm_ncopy_sve_v1.c index be18e9708..57035f4ff 100644 --- a/kernel/arm64/zgemm_ncopy_sve_v1.c +++ b/kernel/arm64/zgemm_ncopy_sve_v1.c @@ -65,7 +65,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ svfloat64_t a_vec_imag = svld1_gather_index(pg, ((double *) aoffset1) + 1, lda_vec); svst2_f64(pg, (double *) boffset, svcreate2(a_vec_real, a_vec_imag)); aoffset1 += 2; - boffset += active; + boffset += active * 2; } aoffset += sve_size * lda * 2; diff --git a/kernel/arm64/zgemm_tcopy_sve_v1.c b/kernel/arm64/zgemm_tcopy_sve_v1.c index 085e1fa40..32f217d7a 100644 --- a/kernel/arm64/zgemm_tcopy_sve_v1.c +++ b/kernel/arm64/zgemm_tcopy_sve_v1.c @@ -65,7 +65,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset1 += lda * 2; boffset += active * 2; } - aoffset += sve_size * 2; + aoffset += active * 2; j += svcntd(); pg = svwhilelt_b64(j, n); From ea3db69faa99dc4f7ad6641c9590f77ace7d6b03 Mon Sep 17 00:00:00 2001 From: jgillis Date: Wed, 29 Dec 2021 22:50:20 +0100 Subject: [PATCH 20/77] Fix cmake crosscompilation for core2 target Missing HAVE_SSE* cmake variables cause cc.cmake to forget about `-msse*` flags --- cmake/prebuild.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 259d9c738..232a6cc35 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -127,6 +127,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define DLOCAL_BUFFER_SIZE\t16384\n" "#define CLOCAL_BUFFER_SIZE\t16384\n" "#define ZLOCAL_BUFFER_SIZE\t16384\n") + set(HAVE_SSE 1) + set(HAVE_SSE2 1) + set(HAVE_SSE3 1) + set(HAVE_SSSE3 1) set(SGEMM_UNROLL_M 8) set(SGEMM_UNROLL_N 4) set(DGEMM_UNROLL_M 4) From f7b69128680323ae30ff5992c2ea9f7cc8db8973 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Thu, 30 Dec 2021 21:00:16 +0100 Subject: [PATCH 21/77] ztrmm sve copy kernels --- kernel/arm64/ztrmm_lncopy_sve_v1.c | 145 +++++++++++++++++++++++++++++ kernel/arm64/ztrmm_ltcopy_sve_v1.c | 143 ++++++++++++++++++++++++++++ kernel/arm64/ztrmm_uncopy_sve_v1.c | 145 +++++++++++++++++++++++++++++ 
kernel/arm64/ztrmm_utcopy_sve_v1.c | 141 ++++++++++++++++++++++++++++ 4 files changed, 574 insertions(+) create mode 100644 kernel/arm64/ztrmm_lncopy_sve_v1.c create mode 100644 kernel/arm64/ztrmm_ltcopy_sve_v1.c create mode 100644 kernel/arm64/ztrmm_uncopy_sve_v1.c create mode 100644 kernel/arm64/ztrmm_utcopy_sve_v1.c diff --git a/kernel/arm64/ztrmm_lncopy_sve_v1.c b/kernel/arm64/ztrmm_lncopy_sve_v1.c new file mode 100644 index 000000000..19c34ff41 --- /dev/null +++ b/kernel/arm64/ztrmm_lncopy_sve_v1.c @@ -0,0 +1,145 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + lda += lda; + + js = 0; + FLOAT *ao; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda*2); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda*2); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posY * 2 + posX * lda; + } else { + ao = a + posX * 2 + posY * lda; + } + + i = 0; + do + { + if (X > posY) { +#ifdef DOUBLE + svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#else + svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#endif + svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); + ao += 2; + b += n_active * 2; + X ++; + i ++; + } else + if (X < posY) { + ao += lda * 2; + b += n_active * 2; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = *(ao+k*lda+j); + b[temp++] = *(ao+k*lda+j+1); + } + b[temp++] = ONE; + b[temp++] = ZERO; + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k <= j; k++) { + b[temp++] = *(ao+k*lda+j); + b[temp++] = *(ao+k*lda+j+1); + } + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + } +#endif + ao += n_active * 2; + b += n_active*n_active * 2; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} diff --git a/kernel/arm64/ztrmm_ltcopy_sve_v1.c b/kernel/arm64/ztrmm_ltcopy_sve_v1.c new file mode 100644 index 000000000..c272db602 --- /dev/null +++ b/kernel/arm64/ztrmm_ltcopy_sve_v1.c @@ -0,0 +1,143 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + lda += lda; + + FLOAT *ao; + js = 0; +#ifdef DOUBLE + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posY * 2 + posX * lda; + } else { + ao = a + posX * 2 + posY * lda; + } + + i = 0; + do + { + if (X > posY) { + ao += 2; + b += n_active * 2; + X ++; + i ++; + } else + if (X < posY) { +#ifdef DOUBLE + svfloat64x2_t aj_vec = svld2(pn, ao); +#else + svfloat32x2_t aj_vec = svld2(pn, ao); +#endif + svst2(pn, b, aj_vec); + ao += lda * 2; + b += n_active * 2; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + b[temp++] = ONE; + b[temp++] = ZERO; + for (int k = j+1; k < n_active; k++) { + b[temp++] = *(ao+j*lda+k); + b[temp++] = *(ao+j*lda+k+1); + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + for (int k = j; k < n_active; k++) { + b[temp++] = *(ao+j*lda+k); + b[temp++] = *(ao+j*lda+k+1); + } + } +#endif + ao += n_active * lda * 2; + b += n_active*n_active * 2; + X += n_active; + i += n_active; + } + } while (i < m); + + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + + return 0; +} diff --git a/kernel/arm64/ztrmm_uncopy_sve_v1.c b/kernel/arm64/ztrmm_uncopy_sve_v1.c new file mode 100644 index 000000000..aaa217063 --- /dev/null +++ b/kernel/arm64/ztrmm_uncopy_sve_v1.c @@ -0,0 +1,145 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + lda += lda; + + js = 0; + FLOAT *ao; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda * 2); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda * 2); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posX * 2 + posY * lda; + } else { + ao = a + posY * 2 + posX * lda; + } + + i = 0; + do + { + if (X < posY) { +#ifdef DOUBLE + svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#else + svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#endif + svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); + ao += 2; + b += n_active * 2; + X ++; + i ++; + } else + if (X > posY) { + ao += lda * 2; + b += n_active * 2; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. 
*/ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + b[temp++] = ONE; + b[temp++] = ZERO; + for (int k = j+1; k < n_active; k++) { + b[temp++] = *(ao+k*lda+j); + b[temp++] = *(ao+k*lda+j+1); + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + for (int k = j; k < n_active; k++) { + b[temp++] = *(ao+k*lda+j); + b[temp++] = *(ao+k*lda+j+1); + } + } +#endif + ao += n_active * 2; + b += n_active*n_active * 2; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} diff --git a/kernel/arm64/ztrmm_utcopy_sve_v1.c b/kernel/arm64/ztrmm_utcopy_sve_v1.c new file mode 100644 index 000000000..c3e1f1b42 --- /dev/null +++ b/kernel/arm64/ztrmm_utcopy_sve_v1.c @@ -0,0 +1,141 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + lda += lda; + + FLOAT *ao; + js = 0; +#ifdef DOUBLE + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posX * 2 + posY * lda; + } else { + ao = a + posY * 2 + posX * lda; + } + + i = 0; + do + { + if (X < posY) { + ao += 2; + b += n_active * 2; + X ++; + i ++; + } else + if (X > posY) { +#ifdef DOUBLE + svfloat64x2_t aj_vec = svld2(pn, ao); +#else + svfloat32x2_t aj_vec = svld2(pn, ao); +#endif + svst2(pn, b, aj_vec); + ao += lda * 2; + b += n_active * 2; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = *(ao+j*lda+k); + b[temp++] = *(ao+j*lda+k+1); + } + b[temp++] = ONE; + b[temp++] = ZERO; + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k <= j; k++) { + b[temp++] = *(ao+j*lda+k); + b[temp++] = *(ao+j*lda+k+1); + } + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + } +#endif + ao += n_active * lda * 2; + b += n_active*n_active * 2; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} From b329e45288c2e7fc0ef15c4e8a7b3c8dfd74a930 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 1 Jan 2022 00:46:23 +0100 Subject: [PATCH 22/77] Guard against omp_get_num_places returning zero --- driver/others/memory.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index bd0553ca9..0f4cbb24d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -232,11 +232,11 @@ int get_num_procs(void); #else int get_num_procs(void) { static int nums = 0; - + int ret; #if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; - int ret; + #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) @@ -249,7 +249,8 @@ int get_num_procs(void) { #if defined(USE_OPENMP) #if _OPENMP >= 201511 - nums = omp_get_num_places(); + ret = omp_get_num_places(); + if (ret >0 ) nums = ret; #endif return nums; #endif @@ -1800,11 +1801,12 @@ int get_num_procs(void); int get_num_procs(void) { static int nums = 0; - + int ret; + #if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; - int ret; + #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) @@ -1818,7 +1820,8 @@ int get_num_procs(void) { #if defined(USE_OPENMP) /* if (omp_get_proc_bind() != omp_proc_bind_false) */ #if _OPENMP >= 201511 - nums = omp_get_num_places(); + ret = omp_get_num_places(); + if (ret >0 ) nums = ret; #endif return nums; #endif From 0140373802db2d910baa92bc7b31dba076fc205b Mon Sep 17 00:00:00 2001 From: Bine 
Brank Date: Sun, 2 Jan 2022 19:15:33 +0100 Subject: [PATCH 23/77] add sve ztrmm --- kernel/Makefile.L3 | 32 + kernel/arm64/KERNEL.A64FX | 12 +- kernel/arm64/ztrmm_kernel_sve_v1x4.S | 1006 ++++++++++++++++++++++++++ 3 files changed, 1044 insertions(+), 6 deletions(-) create mode 100644 kernel/arm64/ztrmm_kernel_sve_v1x4.S diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index d22bd46a5..da279b185 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -1739,29 +1739,61 @@ $(KDIR)ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_ $(KDIR)ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef ZTRMMUNCOPY_M +$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRMMLNCOPY_M +$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef ZTRMMUTCOPY_M +$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRMMLTCOPY_M +$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ 
$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index 04be0fab9..986b7ab47 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -182,11 +182,11 @@ ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) -DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c -DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c -DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c -DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c +ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c -DSYMMUCOPY_M = symm_ucopy_sve.c -DSYMMLCOPY_M = symm_lcopy_sve.c +ZSYMMUCOPY_M = symm_ucopy_sve.c +ZSYMMLCOPY_M = symm_lcopy_sve.c diff --git a/kernel/arm64/ztrmm_kernel_sve_v1x4.S b/kernel/arm64/ztrmm_kernel_sve_v1x4.S new file mode 100644 index 000000000..1a81b4da0 --- /dev/null +++ b/kernel/arm64/ztrmm_kernel_sve_v1x4.S @@ -0,0 +1,1006 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define lanes x17 + +#define alphaR x19 +#define alphaI x20 +#define temp x21 +#define tempOffset x22 +#define tempK x23 + +#define alphaz_R z6.d +#define alphaz_I z7.d +#define alpha0_R d6 +#define alpha0_I d7 + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x4_I + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 + ld2d {z2.d, z3.d}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, 
p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.d, p1/m, z0.d, z9.d +#else + fmla z17.d, p1/m, z0.d, z9.d +#endif + OP_ii z16.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + + fmla z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.d, p1/m, z0.d, z11.d +#else + fmla z19.d, p1/m, z0.d, z11.d +#endif + ld1rd z11.d, p0/z, [pB, 24] + + + fmla z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.d, p1/m, z0.d, z13.d +#else + fmla z21.d, p1/m, z0.d, z13.d +#endif + OP_ii z20.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + + fmla z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.d, p1/m, z0.d, z15.d +#else + fmla z23.d, p1/m, z0.d, z15.d +#endif + OP_ii z22.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2d {z2.d, z3.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes *2 * 8 + + OP_rr z16.d, p1/m, z2.d, z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pB, 
#B_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_E + OP_rr z16.d, p1/m, z2.d, z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 64 + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 + + eor z26.d, z16.d, z16.d + eor z27.d, z16.d, z16.d + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #4 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + eor z28.d, z16.d, z16.d + eor z29.d, z16.d, z16.d + fmla z28.d, p1/m, z20.d, alphaz_R + fmls z28.d, p1/m, z21.d, alphaz_I + fmla z29.d, p1/m, z20.d, alphaz_I + fmla z29.d, p1/m, z21.d, alphaz_R + st2d {z28.d, z29.d}, p1, [pCRow2] + + add pCRow2, pCRow2, lanes, lsl #4 + + eor z30.d, z16.d, z16.d + eor z31.d, z16.d, z16.d + fmla z30.d, p1/m, z22.d, alphaz_R + fmls z30.d, p1/m, z23.d, alphaz_I + fmla z31.d, p1/m, z22.d, alphaz_I + fmla z31.d, p1/m, z23.d, alphaz_R + st2d {z30.d, z31.d}, p1, [pCRow3] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + 
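+    // The OP_rr/OP_ir/OP_ii/OP_ri quartet is one k-step of a complex
+    // multiply-accumulate per column of C. With the non-conjugated mapping
+    // (NN/NT/TN/TT: OP_rr = fmla, OP_ii = fmls, OP_ri = OP_ir = fmla) each
+    // SVE lane computes, roughly,
+    //   Re(acc) += a_re*b_re - a_im*b_im
+    //   Im(acc) += a_im*b_re + a_re*b_im
+    // where z0/z1 are the de-interleaved real/imag lanes of A (ld2d) and
+    // z8..z11 the broadcast real/imag parts of the two B entries (ld1rd).
+    // The conjugated variants differ only in which products are negated.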
OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 32 +.endm + +.macro SAVEv1x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 + + eor z26.d, z16.d, z16.d + eor z27.d, z16.d, z16.d + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #4 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x1 + dup z16.d, #0 + dup z17.d, #0 +.endm + + +.macro KERNELv1x1_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d +.endm + +.macro SAVEv1x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, d0 + dup alphaz_R, alphaR + fmov alphaI, d1 + dup alphaz_I, alphaI + + lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 + ptrue p0.d // create true predicate + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB + +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble .Lztrmm_kernel_L2_BEGIN + +/******************************************************************************/ +.Lztrmm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = start of A array + +.Lztrmm_kernel_L4_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. 
This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.d, counterI, origM + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lztrmm_kernel_L4_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempOffset, #6 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #4 +#endif + INITv1x4 // fill with zeros + + asr counterL , tempK, #3 + cmp counterL , #2 + blt .Lztrmm_kernel_L4_Mv1_32 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lztrmm_kernel_L4_Mv1_22a + + .align 5 +.Lztrmm_kernel_L4_Mv1_22: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L4_Mv1_22 + + .align 5 +.Lztrmm_kernel_L4_Mv1_22a: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lztrmm_kernel_L4_Mv1_44 + + .align 5 +.Lztrmm_kernel_L4_Mv1_32: + + tst counterL, #1 + ble .Lztrmm_kernel_L4_Mv1_40 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lztrmm_kernel_L4_Mv1_44 + + +.Lztrmm_kernel_L4_Mv1_40: + + INITv1x4 + +.Lztrmm_kernel_L4_Mv1_44: + + ands counterL , tempK, #7 + ble .Lztrmm_kernel_L4_Mv1_100 + + .align 5 +.Lztrmm_kernel_L4_Mv1_46: + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lztrmm_kernel_L4_Mv1_46 + +.Lztrmm_kernel_L4_Mv1_100: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #4 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempK, #6 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lztrmm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + b.any .Lztrmm_kernel_L4_Mv1_20 + + + +.Lztrmm_kernel_L4_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 4 * 8 * 2 + +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Lztrmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +.Lztrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lztrmm_kernel_L999 + + tst counterJ , #2 + ble .Lztrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + add pCRow1, pCRow0, LDC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + + + +.Lztrmm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + +.Lztrmm_kernel_L2_Mv1_20: + + 
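+    // N-remainder path (2 columns), one SVE vector of M rows per iteration:
+    // z16..z19 hold the real/imag accumulators for the two columns. For TRMM
+    // the offset logic below advances pA/pB past the part of K this tile must
+    // not touch and trims the inner loop count to tempK before the v1x2
+    // kernel runs.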
INITv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lztrmm_kernel_L2_Mv1_40 + .align 5 + +.Lztrmm_kernel_L2_Mv1_22: + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L2_Mv1_22 + + +.Lztrmm_kernel_L2_Mv1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lztrmm_kernel_L2_Mv1_100 + +.Lztrmm_kernel_L2_Mv1_42: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L2_Mv1_42 + +.Lztrmm_kernel_L2_Mv1_100: + + SAVEv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #2 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +.Lztrmm_kernel_L2_Mv1_END: + + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lztrmm_kernel_L2_Mv1_20 + + +.Lztrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 2 * 8 * 2 + +/******************************************************************************/ + +.Lztrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lztrmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +.Lztrmm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + +.Lztrmm_kernel_L1_Mv1_20: + + INITv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , temp, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lztrmm_kernel_L1_Mv1_40 + .align 5 + +.Lztrmm_kernel_L1_Mv1_22: + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L1_Mv1_22 + + +.Lztrmm_kernel_L1_Mv1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lztrmm_kernel_L1_Mv1_100 + +.Lztrmm_kernel_L1_Mv1_42: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L1_Mv1_42 + +.Lztrmm_kernel_L1_Mv1_100: + + SAVEv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if 
defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #1 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +.Lztrmm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lztrmm_kernel_L1_Mv1_20 + +.Lztrmm_kernel_L1_END: + +/******************************************************************************/ + +.Lztrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + From ce329ab6869bd958cde05c1dcd39ce7c6bc02cd9 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Mon, 3 Jan 2022 15:56:05 +0100 Subject: [PATCH 24/77] add sve zhemm copy routines --- kernel/arm64/KERNEL.A64FX | 4 +- kernel/arm64/zhemm_ltcopy_sve.c | 106 +++++++++++++++++++++++++++++++ kernel/arm64/zhemm_utcopy_sve.c | 107 ++++++++++++++++++++++++++++++++ 3 files changed, 215 insertions(+), 2 deletions(-) create mode 100644 kernel/arm64/zhemm_ltcopy_sve.c create mode 100644 kernel/arm64/zhemm_utcopy_sve.c diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index 986b7ab47..ff5d3aa0e 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -187,6 +187,6 @@ ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c -ZSYMMUCOPY_M = symm_ucopy_sve.c -ZSYMMLCOPY_M = symm_lcopy_sve.c +ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c +ZHEMMUTCOPY_M = zhemm_utcopy_sve.c diff --git a/kernel/arm64/zhemm_ltcopy_sve.c b/kernel/arm64/zhemm_ltcopy_sve.c new file mode 100644 index 000000000..58e9ff589 --- /dev/null +++ b/kernel/arm64/zhemm_ltcopy_sve.c @@ -0,0 +1,106 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG offset, i; + + lda *= 2; + + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint64_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b64(offset, 0LL); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} diff --git a/kernel/arm64/zhemm_utcopy_sve.c b/kernel/arm64/zhemm_utcopy_sve.c new file mode 100644 index 000000000..9ddbf6cbd --- /dev/null +++ b/kernel/arm64/zhemm_utcopy_sve.c @@ -0,0 +1,107 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG offset, i; + + lda *= 2; + + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmul_z(pg, temp, lda); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint64_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + data_vec_imag = svneg_z(pg, data_vec_imag); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b64(offset, 0LL); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} From 68c414d3a6d9af7f8a686868feeddcd237977b05 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Tue, 4 Jan 2022 14:40:59 +0100 Subject: [PATCH 25/77] ztrmm sve copy functions --- kernel/arm64/ztrmm_lncopy_sve_v1.c | 14 
+++++++------- kernel/arm64/ztrmm_ltcopy_sve_v1.c | 12 ++++++------ kernel/arm64/ztrmm_uncopy_sve_v1.c | 14 +++++++------- kernel/arm64/ztrmm_utcopy_sve_v1.c | 12 ++++++------ 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/kernel/arm64/ztrmm_lncopy_sve_v1.c b/kernel/arm64/ztrmm_lncopy_sve_v1.c index 19c34ff41..d34f607ab 100644 --- a/kernel/arm64/ztrmm_lncopy_sve_v1.c +++ b/kernel/arm64/ztrmm_lncopy_sve_v1.c @@ -53,11 +53,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON js = 0; FLOAT *ao; #ifdef DOUBLE - svint64_t index = svindex_s64(0LL, lda*2); + svint64_t index = svindex_s64(0LL, lda); svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svint32_t index = svindex_s32(0, lda*2); + svint32_t index = svindex_s32(0, lda); svbool_t pn = svwhilelt_b32(js, n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif @@ -89,7 +89,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i ++; } else if (X < posY) { - ao += lda * 2; + ao += lda; b += n_active * 2; X ++; i ++; @@ -99,8 +99,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int temp = 0; for (int j = 0; j < n_active; j++) { for (int k = 0 ; k < j; k++) { - b[temp++] = *(ao+k*lda+j); - b[temp++] = *(ao+k*lda+j+1); + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); } b[temp++] = ONE; b[temp++] = ZERO; @@ -113,8 +113,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int temp = 0; for (int j = 0; j < n_active; j++) { for (int k = 0 ; k <= j; k++) { - b[temp++] = *(ao+k*lda+j); - b[temp++] = *(ao+k*lda+j+1); + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); } for (int k = j+1; k < n_active; k++) { b[temp++] = ZERO; diff --git a/kernel/arm64/ztrmm_ltcopy_sve_v1.c b/kernel/arm64/ztrmm_ltcopy_sve_v1.c index c272db602..7f34c9857 100644 --- a/kernel/arm64/ztrmm_ltcopy_sve_v1.c +++ b/kernel/arm64/ztrmm_ltcopy_sve_v1.c @@ -85,7 +85,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svfloat32x2_t aj_vec = svld2(pn, ao); #endif svst2(pn, b, aj_vec); - ao += lda * 2; + ao += lda; b += n_active * 2; X ++; i ++; @@ -101,8 +101,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[temp++] = ONE; b[temp++] = ZERO; for (int k = j+1; k < n_active; k++) { - b[temp++] = *(ao+j*lda+k); - b[temp++] = *(ao+j*lda+k+1); + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); } } #else @@ -113,12 +113,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[temp++] = ZERO; } for (int k = j; k < n_active; k++) { - b[temp++] = *(ao+j*lda+k); - b[temp++] = *(ao+j*lda+k+1); + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); } } #endif - ao += n_active * lda * 2; + ao += n_active * lda; b += n_active*n_active * 2; X += n_active; i += n_active; diff --git a/kernel/arm64/ztrmm_uncopy_sve_v1.c b/kernel/arm64/ztrmm_uncopy_sve_v1.c index aaa217063..7eb9452c9 100644 --- a/kernel/arm64/ztrmm_uncopy_sve_v1.c +++ b/kernel/arm64/ztrmm_uncopy_sve_v1.c @@ -53,11 +53,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON js = 0; FLOAT *ao; #ifdef DOUBLE - svint64_t index = svindex_s64(0LL, lda * 2); + svint64_t index = svindex_s64(0LL, lda); svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svint32_t index = svindex_s32(0, lda * 2); + svint32_t index = svindex_s32(0, lda); 
svbool_t pn = svwhilelt_b32(js, n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif @@ -89,7 +89,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i ++; } else if (X > posY) { - ao += lda * 2; + ao += lda; b += n_active * 2; X ++; i ++; @@ -105,8 +105,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[temp++] = ONE; b[temp++] = ZERO; for (int k = j+1; k < n_active; k++) { - b[temp++] = *(ao+k*lda+j); - b[temp++] = *(ao+k*lda+j+1); + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); } } #else @@ -117,8 +117,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[temp++] = ZERO; } for (int k = j; k < n_active; k++) { - b[temp++] = *(ao+k*lda+j); - b[temp++] = *(ao+k*lda+j+1); + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); } } #endif diff --git a/kernel/arm64/ztrmm_utcopy_sve_v1.c b/kernel/arm64/ztrmm_utcopy_sve_v1.c index c3e1f1b42..60c8ff3b4 100644 --- a/kernel/arm64/ztrmm_utcopy_sve_v1.c +++ b/kernel/arm64/ztrmm_utcopy_sve_v1.c @@ -85,7 +85,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svfloat32x2_t aj_vec = svld2(pn, ao); #endif svst2(pn, b, aj_vec); - ao += lda * 2; + ao += lda; b += n_active * 2; X ++; i ++; @@ -95,8 +95,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int temp = 0; for (int j = 0; j < n_active; j++) { for (int k = 0 ; k < j; k++) { - b[temp++] = *(ao+j*lda+k); - b[temp++] = *(ao+j*lda+k+1); + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); } b[temp++] = ONE; b[temp++] = ZERO; @@ -109,8 +109,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int temp = 0; for (int j = 0; j < n_active; j++) { for (int k = 0 ; k <= j; k++) { - b[temp++] = *(ao+j*lda+k); - b[temp++] = *(ao+j*lda+k+1); + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); } for (int k = j+1; k < n_active; k++) { b[temp++] = ZERO; @@ -118,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } } #endif - ao += n_active * lda * 2; + ao += n_active * lda; b += n_active*n_active * 2; X += n_active; i += n_active; From 2e2c02b762afd67fe3cfb49620ab9df721f1a8ea Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Tue, 4 Jan 2022 14:42:07 +0100 Subject: [PATCH 26/77] fix sve ztrmm kernel --- kernel/arm64/ztrmm_kernel_sve_v1x4.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/ztrmm_kernel_sve_v1x4.S b/kernel/arm64/ztrmm_kernel_sve_v1x4.S index 1a81b4da0..b71a3d39e 100644 --- a/kernel/arm64/ztrmm_kernel_sve_v1x4.S +++ b/kernel/arm64/ztrmm_kernel_sve_v1x4.S @@ -723,7 +723,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pB, pB, temp #endif #if defined(LEFT) - add tempOffset, tempOffset, #4 + add tempOffset, tempOffset, lanes #endif prfm PLDL1KEEP, [pA] @@ -856,7 +856,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pB, pB, temp #endif #if defined(LEFT) - add tempOffset, tempOffset, #4 + add tempOffset, tempOffset, lanes #endif .Lztrmm_kernel_L2_Mv1_END: @@ -923,7 +923,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add tempK, tempOffset, #1 #endif - asr counterL , temp, #3 // counterL = counterL / 8 + asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble .Lztrmm_kernel_L1_Mv1_40 .align 5 @@ -972,7 +972,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pB, pB, temp #endif #if defined(LEFT) - add tempOffset, tempOffset, #4 + add tempOffset, tempOffset, lanes #endif .Lztrmm_kernel_L1_Mv1_END: From 07fa6fa3b192f525f5bb8f36e7fc694095f53593 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 5 Jan 2022 08:57:51 +0100 Subject: [PATCH 27/77] configure Makefile for sve --- kernel/Makefile.L3 | 86 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 79 insertions(+), 7 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index da279b185..1c0931d96 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -1691,29 +1691,61 @@ $(KDIR)qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N $(KDIR)qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef CTRMMUNCOPY_M +$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif -$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c +ifdef CTRMMLNCOPY_M +$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ -$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c +else +$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ -$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif -$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c +ifdef CTRMMUTCOPY_M +$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else +$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) 
$(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef CTRMMLTCOPY_M +$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ -$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else +$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif $(KDIR)ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -1929,11 +1961,21 @@ $(KDIR)csymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N) $(KDIR)csymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ +ifdef CSYMMUCOPY_M +$(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +else $(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +endif +ifdef CSYMMLCOPY_M +$(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +endif $(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ @@ -1941,11 +1983,21 @@ $(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N) $(KDIR)zsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ +ifdef ZSYMMUCOPY_M +$(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +else $(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +endif +ifdef ZSYMMLCOPY_M +$(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +endif $(KDIR)xsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c 
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ @@ -1965,11 +2017,21 @@ $(KDIR)chemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N $(KDIR)chemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ +ifdef CHEMMUTCOPY_M +$(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +else $(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +endif +ifdef CHEMMLTCOPY_M +$(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +else $(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +endif $(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ @@ -1977,11 +2039,21 @@ $(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N $(KDIR)zhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ +ifdef ZHEMMUTCOPY_M +$(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +else $(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +endif +ifdef ZHEMMLTCOPY_M +$(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +else $(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +endif $(KDIR)xhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ From d30157d8914c812f97d1b4de7631ead7440b3d3e Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 5 Jan 2022 09:00:54 +0100 Subject: [PATCH 28/77] update configuration of kernels for A64FX and ARMV8SVE --- kernel/arm64/KERNEL.A64FX | 29 +++++++++++++------ kernel/arm64/KERNEL.ARMV8SVE | 54 +++++++++++++++++++++++++----------- 2 files changed, 59 insertions(+), 24 deletions(-) diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index ff5d3aa0e..76dda0c65 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -156,19 +156,30 @@ DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c DSYMMUCOPY_M = symm_ucopy_sve.c DSYMMLCOPY_M = symm_lcopy_sve.c -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +CGEMMINCOPY 
= cgemm_ncopy_sve_v1.c +CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +CHEMMLTCOPY_M = chemm_ltcopy_sve.c +CHEMMUTCOPY_M = chemm_utcopy_sve.c + +CSYMMUCOPY_M = zsymm_ucopy_sve.c +CSYMMLCOPY_M = zsymm_lcopy_sve.c + ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S @@ -190,3 +201,5 @@ ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c ZHEMMUTCOPY_M = zhemm_utcopy_sve.c +ZSYMMUCOPY_M = zsymm_ucopy_sve.c +ZSYMMLCOPY_M = zsymm_lcopy_sve.c diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 0364a929c..63dfde22f 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -156,28 +156,50 @@ DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c DSYMMUCOPY_M = symm_ucopy_sve.c DSYMMLCOPY_M = symm_lcopy_sve.c -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +CGEMMINCOPY = cgemm_ncopy_sve_v1.c +CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif +CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +CHEMMLTCOPY_M = chemm_ltcopy_sve.c +CHEMMUTCOPY_M = chemm_utcopy_sve.c + +CSYMMUCOPY_M = zsymm_ucopy_sve.c +CSYMMLCOPY_M = zsymm_lcopy_sve.c + +ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +ZGEMMINCOPY = zgemm_ncopy_sve_v1.c +ZGEMMITCOPY = zgemm_tcopy_sve_v1.c ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMUNCOPY_M = 
ztrmm_uncopy_sve_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c +ZHEMMUTCOPY_M = zhemm_utcopy_sve.c + +ZSYMMUCOPY_M = zsymm_ucopy_sve.c +ZSYMMLCOPY_M = zsymm_lcopy_sve.c From 87537b8c553a3d79ae2123b36716cc22a20280b1 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 5 Jan 2022 09:07:28 +0100 Subject: [PATCH 29/77] modify sve zgemmcopy kernels --- kernel/arm64/zgemm_ncopy_sve_v1.c | 3 +-- kernel/arm64/zgemm_tcopy_sve_v1.c | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/arm64/zgemm_ncopy_sve_v1.c b/kernel/arm64/zgemm_ncopy_sve_v1.c index 57035f4ff..8f9b4268a 100644 --- a/kernel/arm64/zgemm_ncopy_sve_v1.c +++ b/kernel/arm64/zgemm_ncopy_sve_v1.c @@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ IFLOAT *aoffset, *aoffset1, *boffset; svint64_t lda_vec = svindex_s64(0LL, lda * 2); - uint64_t sve_size = svcntd(); aoffset = a; boffset = b; @@ -67,7 +66,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset1 += 2; boffset += active * 2; } - aoffset += sve_size * lda * 2; + aoffset += active * lda * 2; j += svcntd(); pg = svwhilelt_b64(j, n); diff --git a/kernel/arm64/zgemm_tcopy_sve_v1.c b/kernel/arm64/zgemm_tcopy_sve_v1.c index 32f217d7a..c6e50bc1c 100644 --- a/kernel/arm64/zgemm_tcopy_sve_v1.c +++ b/kernel/arm64/zgemm_tcopy_sve_v1.c @@ -46,8 +46,6 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG j; IFLOAT *aoffset, *aoffset1, *boffset; - uint64_t sve_size = svcntd(); - aoffset = a; boffset = b; From 18102ae8c317c0e2ba371ecff2d35b72132976e3 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 5 Jan 2022 09:09:18 +0100 Subject: [PATCH 30/77] add cgemm ctrmm sve kernels --- kernel/arm64/cgemm_kernel_sve_v1x4.S | 874 ++++++++++++++++++++++ kernel/arm64/ctrmm_kernel_sve_v1x4.S | 1006 ++++++++++++++++++++++++++ 2 files changed, 1880 insertions(+) create mode 100644 kernel/arm64/cgemm_kernel_sve_v1x4.S create mode 100644 kernel/arm64/ctrmm_kernel_sve_v1x4.S diff --git a/kernel/arm64/cgemm_kernel_sve_v1x4.S b/kernel/arm64/cgemm_kernel_sve_v1x4.S new file mode 100644 index 000000000..38770f66b --- /dev/null +++ b/kernel/arm64/cgemm_kernel_sve_v1x4.S @@ -0,0 +1,874 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define lanes x17 + +#define alphaR w19 +#define alphaI w20 + +#define alphaz_R z6.s +#define alphaz_I z7.s +#define alpha0_R s4 +#define alpha0_I s5 + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv1x4_I + ld2w {z0.s, 
z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA += lanes*2*4 + ld2w {z2.s, z3.s}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #3 // pA += lanes*2*4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.s, p1/m, z0.s, z9.s +#else + fmla z17.s, p1/m, z0.s, z9.s +#endif + OP_ii z16.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + + fmla z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z1.s, z11.s +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.s, p1/m, z0.s, z11.s +#else + fmla z19.s, p1/m, z0.s, z11.s +#endif + ld1rw z11.s, p0/z, [pB, 12] + + + fmla z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.s, p1/m, z0.s, z13.s +#else + fmla z21.s, p1/m, z0.s, z13.s +#endif + OP_ii z20.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + + fmla z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.s, p1/m, z0.s, z15.s +#else + fmla z23.s, p1/m, z0.s, z15.s +#endif + OP_ii z22.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2w {z2.s, z3.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes *2 * 4 + + OP_rr z16.s, p1/m, z2.s, z8.s + OP_ir z17.s, p1/m, z3.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z2.s, 
z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_E + OP_rr z16.s, p1/m, z2.s, z8.s + OP_ir z17.s, p1/m, z3.s, z8.s + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s + + OP_rr z20.s, p1/m, z2.s, z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + + add pB, pB, 32 + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2w {z24.s, z25.s}, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 + + ld2w {z26.s, z27.s}, p1/z, [pCRow1] + fmla z26.s, p1/m, z18.s, alphaz_R + fmls z26.s, p1/m, z19.s, alphaz_I + fmla z27.s, p1/m, z18.s, alphaz_I + fmla z27.s, p1/m, z19.s, alphaz_R + st2w {z26.s, z27.s}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #3 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld2w {z28.s, z29.s}, p1/z, [pCRow2] + fmla z28.s, p1/m, z20.s, alphaz_R + fmls z28.s, p1/m, z21.s, alphaz_I + fmla z29.s, p1/m, z20.s, alphaz_I + fmla z29.s, p1/m, z21.s, alphaz_R + st2w {z28.s, z29.s}, p1, [pCRow2] + + add pCRow2, pCRow2, lanes, lsl #3 + + ld2w {z30.s, z31.s}, p1/z, [pCRow3] + fmla z30.s, p1/m, z22.s, alphaz_R + fmls z30.s, p1/m, z23.s, alphaz_I + fmla z31.s, p1/m, z22.s, alphaz_I + fmla z31.s, p1/m, z23.s, alphaz_R + st2w {z30.s, z31.s}, p1, [pCRow3] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld2w {z0.s, 
z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + + add pB, pB, 16 +.endm + +.macro SAVEv1x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2w {z24.s, z25.s}, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 + + ld2w {z26.s, z27.s}, p1/z, [pCRow1] + fmla z26.s, p1/m, z18.s, alphaz_R + fmls z26.s, p1/m, z19.s, alphaz_I + fmla z27.s, p1/m, z18.s, alphaz_I + fmla z27.s, p1/m, z19.s, alphaz_R + st2w {z26.s, z27.s}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #3 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x1 + dup z16.s, #0 + dup z17.s, #0 +.endm + + +.macro KERNELv1x1_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s +.endm + +.macro SAVEv1x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2w {z24.s, z25.s}, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, s0 + dup alphaz_R, alphaR + fmov alphaI, s1 + dup alphaz_I, alphaI + + lsl LDC, LDC, #3 // ldc = ldc * 2 * 4 + ptrue p0.s // create true predicate + + mov pB, origPB + +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble .Lcgemm_kernel_L2_BEGIN + +/******************************************************************************/ +.Lcgemm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + + mov pA, origPA // pA = start of A array + +.Lcgemm_kernel_L4_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. 
This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.s, counterI, origM + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lcgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 + cmp counterL , #2 + blt .Lcgemm_kernel_L4_Mv1_32 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lcgemm_kernel_L4_Mv1_22a + + .align 5 +.Lcgemm_kernel_L4_Mv1_22: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L4_Mv1_22 + + .align 5 +.Lcgemm_kernel_L4_Mv1_22a: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lcgemm_kernel_L4_Mv1_44 + + .align 5 +.Lcgemm_kernel_L4_Mv1_32: + + tst counterL, #1 + ble .Lcgemm_kernel_L4_Mv1_40 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lcgemm_kernel_L4_Mv1_44 + + +.Lcgemm_kernel_L4_Mv1_40: + + INITv1x4 + +.Lcgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Lcgemm_kernel_L4_Mv1_100 + + .align 5 +.Lcgemm_kernel_L4_Mv1_46: + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lcgemm_kernel_L4_Mv1_46 + +.Lcgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lcgemm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + b.any .Lcgemm_kernel_L4_Mv1_20 + + + +.Lcgemm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 4 * 2 + + subs counterJ, counterJ , #1 // j-- + bgt .Lcgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lcgemm_kernel_L999 + + tst counterJ , #2 + ble .Lcgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + add pCRow1, pCRow0, LDC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + + + +.Lcgemm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + +.Lcgemm_kernel_L2_Mv1_20: + + INITv1x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lcgemm_kernel_L2_Mv1_40 + .align 5 + +.Lcgemm_kernel_L2_Mv1_22: + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L2_Mv1_22 + + +.Lcgemm_kernel_L2_Mv1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lcgemm_kernel_L2_Mv1_100 + +.Lcgemm_kernel_L2_Mv1_42: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L2_Mv1_42 + +.Lcgemm_kernel_L2_Mv1_100: + + SAVEv1x2 + +.Lcgemm_kernel_L2_Mv1_END: + + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lcgemm_kernel_L2_Mv1_20 + + +.Lcgemm_kernel_L2_END: + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 2 * 4 * 2 + 
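+// Remaining odd column (N & 1) is handled below with the vector-by-1 micro-kernel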
+/******************************************************************************/ + +.Lcgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lcgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + +.Lcgemm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + +.Lcgemm_kernel_L1_Mv1_20: + + INITv1x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lcgemm_kernel_L1_Mv1_40 + .align 5 + +.Lcgemm_kernel_L1_Mv1_22: + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L1_Mv1_22 + + +.Lcgemm_kernel_L1_Mv1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lcgemm_kernel_L1_Mv1_100 + +.Lcgemm_kernel_L1_Mv1_42: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L1_Mv1_42 + +.Lcgemm_kernel_L1_Mv1_100: + + SAVEv1x1 + +.Lcgemm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lcgemm_kernel_L1_Mv1_20 + +.Lcgemm_kernel_L1_END: + +/******************************************************************************/ + +.Lcgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/ctrmm_kernel_sve_v1x4.S b/kernel/arm64/ctrmm_kernel_sve_v1x4.S new file mode 100644 index 000000000..242968f63 --- /dev/null +++ b/kernel/arm64/ctrmm_kernel_sve_v1x4.S @@ -0,0 +1,1006 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define lanes x17 + +#define alphaR w19 +#define alphaI w20 +#define temp x21 +#define tempOffset x22 +#define tempK x23 + +#define alphaz_R z6.s +#define alphaz_I z7.s +#define alpha0_R s6 +#define alpha0_I s7 + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 
+ dup z23.s, #0 +.endm + +.macro KERNELv1x4_I + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA += lanes*2*4 + ld2w {z2.s, z3.s}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #3 // pA += lanes*2*4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.s, p1/m, z0.s, z9.s +#else + fmla z17.s, p1/m, z0.s, z9.s +#endif + OP_ii z16.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + + fmla z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z1.s, z11.s +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.s, p1/m, z0.s, z11.s +#else + fmla z19.s, p1/m, z0.s, z11.s +#endif + ld1rw z11.s, p0/z, [pB, 12] + + + fmla z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.s, p1/m, z0.s, z13.s +#else + fmla z21.s, p1/m, z0.s, z13.s +#endif + OP_ii z20.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + + fmla z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.s, p1/m, z0.s, z15.s +#else + fmla z23.s, p1/m, z0.s, z15.s +#endif + OP_ii z22.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2w {z2.s, z3.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes *2 * 4 + + OP_rr z16.s, p1/m, z2.s, z8.s + OP_ir z17.s, p1/m, z3.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s 
+ ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z2.s, z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_E + OP_rr z16.s, p1/m, z2.s, z8.s + OP_ir z17.s, p1/m, z3.s, z8.s + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s + + OP_rr z20.s, p1/m, z2.s, z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + + add pB, pB, 32 + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 + + eor z26.d, z16.d, z16.d + eor z27.d, z16.d, z16.d + fmla z26.s, p1/m, z18.s, alphaz_R + fmls z26.s, p1/m, z19.s, alphaz_I + fmla z27.s, p1/m, z18.s, alphaz_I + fmla z27.s, p1/m, z19.s, alphaz_R + st2w {z26.s, z27.s}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #3 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + eor z28.d, z16.d, z16.d + eor z29.d, z16.d, z16.d + fmla z28.s, p1/m, z20.s, alphaz_R + fmls z28.s, p1/m, z21.s, alphaz_I + fmla z29.s, p1/m, z20.s, alphaz_I + fmla z29.s, p1/m, z21.s, alphaz_R + st2w {z28.s, z29.s}, p1, [pCRow2] + + add pCRow2, pCRow2, lanes, lsl #3 + + eor z30.d, z16.d, z16.d + eor z31.d, z16.d, z16.d + fmla z30.s, p1/m, z22.s, alphaz_R + fmls z30.s, p1/m, z23.s, alphaz_I + fmla z31.s, p1/m, z22.s, alphaz_I + fmla z31.s, p1/m, z23.s, alphaz_R + st2w {z30.s, z31.s}, p1, [pCRow3] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 
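+// z16-z19 accumulate the real/imaginary partial sums for a vector-by-2 block of C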
+ dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + + add pB, pB, 16 +.endm + +.macro SAVEv1x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 + + eor z26.d, z16.d, z16.d + eor z27.d, z16.d, z16.d + fmla z26.s, p1/m, z18.s, alphaz_R + fmls z26.s, p1/m, z19.s, alphaz_I + fmla z27.s, p1/m, z18.s, alphaz_I + fmla z27.s, p1/m, z19.s, alphaz_R + st2w {z26.s, z27.s}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #3 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x1 + dup z16.s, #0 + dup z17.s, #0 +.endm + + +.macro KERNELv1x1_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s +.endm + +.macro SAVEv1x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, s0 + dup alphaz_R, alphaR + fmov alphaI, s1 + dup alphaz_I, alphaI + + lsl LDC, LDC, #3 // ldc = ldc * 2 * 4 + ptrue p0.s // create true predicate + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB + +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble .Lctrmm_kernel_L2_BEGIN + +/******************************************************************************/ +.Lctrmm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, 
origPA // pA = start of A array + +.Lctrmm_kernel_L4_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.s, counterI, origM + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lctrmm_kernel_L4_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #4 +#endif + INITv1x4 // fill with zeros + + asr counterL , tempK, #3 + cmp counterL , #2 + blt .Lctrmm_kernel_L4_Mv1_32 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lctrmm_kernel_L4_Mv1_22a + + .align 5 +.Lctrmm_kernel_L4_Mv1_22: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #1 + bgt .Lctrmm_kernel_L4_Mv1_22 + + .align 5 +.Lctrmm_kernel_L4_Mv1_22a: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lctrmm_kernel_L4_Mv1_44 + + .align 5 +.Lctrmm_kernel_L4_Mv1_32: + + tst counterL, #1 + ble .Lctrmm_kernel_L4_Mv1_40 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lctrmm_kernel_L4_Mv1_44 + + +.Lctrmm_kernel_L4_Mv1_40: + + INITv1x4 + +.Lctrmm_kernel_L4_Mv1_44: + + ands counterL , tempK, #7 + ble .Lctrmm_kernel_L4_Mv1_100 + + .align 5 +.Lctrmm_kernel_L4_Mv1_46: + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lctrmm_kernel_L4_Mv1_46 + +.Lctrmm_kernel_L4_Mv1_100: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #4 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lctrmm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + b.any .Lctrmm_kernel_L4_Mv1_20 + + + +.Lctrmm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 * 2 + +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Lctrmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +.Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lctrmm_kernel_L999 + + tst counterJ , #2 + ble .Lctrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + add pCRow1, pCRow0, LDC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + + + +.Lctrmm_kernel_L2_Mv1_BEGIN: + + mov counterI, 
#0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + +.Lctrmm_kernel_L2_Mv1_20: + + INITv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lctrmm_kernel_L2_Mv1_40 + .align 5 + +.Lctrmm_kernel_L2_Mv1_22: + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lctrmm_kernel_L2_Mv1_22 + + +.Lctrmm_kernel_L2_Mv1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lctrmm_kernel_L2_Mv1_100 + +.Lctrmm_kernel_L2_Mv1_42: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lctrmm_kernel_L2_Mv1_42 + +.Lctrmm_kernel_L2_Mv1_100: + + SAVEv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #2 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lctrmm_kernel_L2_Mv1_END: + + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lctrmm_kernel_L2_Mv1_20 + + +.Lctrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 2 * 8 * 2 + +/******************************************************************************/ + +.Lctrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lctrmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +.Lctrmm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + +.Lctrmm_kernel_L1_Mv1_20: + + INITv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lctrmm_kernel_L1_Mv1_40 + .align 5 + +.Lctrmm_kernel_L1_Mv1_22: + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lctrmm_kernel_L1_Mv1_22 + + +.Lctrmm_kernel_L1_Mv1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lctrmm_kernel_L1_Mv1_100 + +.Lctrmm_kernel_L1_Mv1_42: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lctrmm_kernel_L1_Mv1_42 + +.Lctrmm_kernel_L1_Mv1_100: + + SAVEv1x1 + +#if 
(defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #1 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lctrmm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lctrmm_kernel_L1_Mv1_20 + +.Lctrmm_kernel_L1_END: + +/******************************************************************************/ + +.Lctrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + From 39ab2197048efca92d059f919987571cd92a903c Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 5 Jan 2022 09:12:22 +0100 Subject: [PATCH 31/77] sve copy functions for cgemm chemm zsymm --- kernel/arm64/cgemm_ncopy_sve_v1.c | 79 ++++++++++++++++ kernel/arm64/cgemm_tcopy_sve_v1.c | 75 +++++++++++++++ kernel/arm64/chemm_ltcopy_sve.c | 107 +++++++++++++++++++++ kernel/arm64/chemm_utcopy_sve.c | 108 +++++++++++++++++++++ kernel/arm64/zsymm_lcopy_sve.c | 150 ++++++++++++++++++++++++++++++ kernel/arm64/zsymm_ucopy_sve.c | 150 ++++++++++++++++++++++++++++++ param.h | 6 +- 7 files changed, 673 insertions(+), 2 deletions(-) create mode 100644 kernel/arm64/cgemm_ncopy_sve_v1.c create mode 100644 kernel/arm64/cgemm_tcopy_sve_v1.c create mode 100644 kernel/arm64/chemm_ltcopy_sve.c create mode 100644 kernel/arm64/chemm_utcopy_sve.c create mode 100644 kernel/arm64/zsymm_lcopy_sve.c create mode 100644 kernel/arm64/zsymm_ucopy_sve.c diff --git a/kernel/arm64/cgemm_ncopy_sve_v1.c b/kernel/arm64/cgemm_ncopy_sve_v1.c new file mode 100644 index 000000000..6aa44a8f6 --- /dev/null +++ b/kernel/arm64/cgemm_ncopy_sve_v1.c @@ -0,0 +1,79 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + svint32_t lda_vec = svindex_s32(0, lda * 2); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b32(j, n); + uint32_t active = svcntp_b32(svptrue_b32(), pg); + do { + + aoffset1 = aoffset; + + uint32_t i_cnt = m; + while (i_cnt--) { + svfloat32_t a_vec_real = svld1_gather_index(pg, (float *) aoffset1, lda_vec); + svfloat32_t a_vec_imag = svld1_gather_index(pg, ((float *) aoffset1) + 1, lda_vec); + svst2_f32(pg, (float *) boffset, svcreate2(a_vec_real, a_vec_imag)); + aoffset1 += 2; + boffset += active * 2; + } + aoffset += active * lda * 2; + + j += svcntw(); + pg = svwhilelt_b32(j, n); + active = svcntp_b32(svptrue_b32(), pg); + + + } while (svptest_any(svptrue_b32(), pg)); + + return 0; +} diff --git a/kernel/arm64/cgemm_tcopy_sve_v1.c b/kernel/arm64/cgemm_tcopy_sve_v1.c new file mode 100644 index 000000000..748cd954e --- /dev/null +++ b/kernel/arm64/cgemm_tcopy_sve_v1.c @@ -0,0 +1,75 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b32(j, n); + uint32_t active = svcntp_b32(svptrue_b32(), pg); + do { + + aoffset1 = aoffset; + + uint32_t i_cnt = m; + while (i_cnt--) { + svfloat32x2_t a_vec = svld2(pg, (float *)aoffset1); + svst2_f32(pg, (float *) boffset, a_vec); + aoffset1 += lda * 2; + boffset += active * 2; + } + aoffset += active * 2; + + j += svcntw(); + pg = svwhilelt_b32(j, n); + active = svcntp_b32(svptrue_b32(), pg); + + } while (svptest_any(svptrue_b32(), pg)); + + return 0; +} diff --git a/kernel/arm64/chemm_ltcopy_sve.c b/kernel/arm64/chemm_ltcopy_sve.c new file mode 100644 index 000000000..40cf9ea31 --- /dev/null +++ b/kernel/arm64/chemm_ltcopy_sve.c @@ -0,0 +1,107 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + int offset, i; + + lda *= 2; + + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t j = 0; + int32_t N = n; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint32_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b32(offset, 0); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + + return 0; +} diff --git a/kernel/arm64/chemm_utcopy_sve.c b/kernel/arm64/chemm_utcopy_sve.c new file mode 100644 index 000000000..440acdb1b --- /dev/null +++ b/kernel/arm64/chemm_utcopy_sve.c @@ -0,0 +1,108 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + int offset, i; + + lda *= 2; + + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t j = 0; + int32_t N = n; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, lda); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint32_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + data_vec_imag = svneg_z(pg, data_vec_imag); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b32(offset, 0); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + + return 0; +} diff --git a/kernel/arm64/zsymm_lcopy_sve.c b/kernel/arm64/zsymm_lcopy_sve.c new file mode 100644 index 000000000..6f18aa956 --- /dev/null +++ b/kernel/arm64/zsymm_lcopy_sve.c @@ -0,0 +1,150 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, offset; + lda *= 2; + +#if defined(DOUBLE) + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint64_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + +#else + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t N = n; + int32_t j = 0; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = 
svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint32_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/kernel/arm64/zsymm_ucopy_sve.c b/kernel/arm64/zsymm_ucopy_sve.c new file mode 100644 index 000000000..6be48cdaf --- /dev/null +++ b/kernel/arm64/zsymm_ucopy_sve.c @@ -0,0 +1,150 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, offset; + lda *= 2; + +#if defined(DOUBLE) + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmul_z(pg, temp, lda_vec); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint64_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + +#else + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t N = n; + int32_t j = 0; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, lda_vec); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint32_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/param.h b/param.h index 8dd2a7461..5d46991a2 100644 --- a/param.h +++ b/param.h @@ -3325,11 +3325,13 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define DGEMM_DEFAULT_UNROLL_MN 32 -#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_MN 32 -#define 
ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_MN 32 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 160 From 0c91d043ae8d2dba0c7d3eeb2f63d17d9776c7e9 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 5 Jan 2022 14:36:39 +0100 Subject: [PATCH 32/77] adapt CMake for SVE --- kernel/CMakeLists.txt | 50 ++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 9849ddc93..717c1ea72 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -323,35 +323,61 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) #hemm - GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "hemm_iutcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "hemm_iltcopy" false "" "" false ${float_type}) +if (NOT DEFINED ${float_char}HEMMUTCOPY_M) + set(HEMMUTCOPY_M "generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(HEMMLTCOPY_M "generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(HEMMUTCOPY_M "${KERNELDIR}/${${float_char}HEMMUTCOPY_M}") + set(HEMMLTCOPY_M "${KERNELDIR}/${${float_char}HEMMLTCOPY_M}") +endif() + GenerateNamedObjects(${HEMMUTCOPY_M} "" "hemm_iutcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${HEMMLTCOPY_M} "LOWER" "hemm_iltcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "hemm_outcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "hemm_oltcopy" false "" "" false ${float_type}) # symm for c and z +if (NOT DEFINED ${float_char}SYMMUCOPY_M) + set(SYMMUCOPY_M "generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_M}.c") + set(SYMMLCOPY_M "generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(SYMMUCOPY_M "${KERNELDIR}/${${float_char}SYMMUCOPY_M}") + set(SYMMLCOPY_M "${KERNELDIR}/${${float_char}SYMMLCOPY_M}") +endif() GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${SYMMUCOPY_M} "" "symm_iutcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) + +if (NOT DEFINED ${float_char}TRMMUNCOPY_M) + set(TRMMUNCOPY_M "generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMLNCOPY_M "generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMUTCOPY_M "generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMLTCOPY_M "generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(TRMMUNCOPY_M "${KERNELDIR}/${${float_char}TRMMUNCOPY_M}") + set(TRMMLNCOPY_M 
"${KERNELDIR}/${${float_char}TRMMLNCOPY_M}") + set(TRMMUTCOPY_M "${KERNELDIR}/${${float_char}TRMMUTCOPY_M}") + set(TRMMLTCOPY_M "${KERNELDIR}/${${float_char}TRMMLTCOPY_M}") +endif () + GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) From f33543d029199ee1bf0786e16ff0610a6711c726 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 5 Jan 2022 14:42:37 +0100 Subject: [PATCH 33/77] combine zchemm into single file --- kernel/arm64/KERNEL.A64FX | 4 +- kernel/arm64/KERNEL.ARMV8SVE | 4 +- kernel/arm64/chemm_ltcopy_sve.c | 107 ------------------------------- kernel/arm64/chemm_utcopy_sve.c | 108 -------------------------------- kernel/arm64/zhemm_ltcopy_sve.c | 66 +++++++++++++++++++ kernel/arm64/zhemm_utcopy_sve.c | 65 +++++++++++++++++++ 6 files changed, 135 insertions(+), 219 deletions(-) delete mode 100644 kernel/arm64/chemm_ltcopy_sve.c delete mode 100644 kernel/arm64/chemm_utcopy_sve.c diff --git 
a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index 76dda0c65..d74f0592d 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -174,8 +174,8 @@ CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c -CHEMMLTCOPY_M = chemm_ltcopy_sve.c -CHEMMUTCOPY_M = chemm_utcopy_sve.c +CHEMMLTCOPY_M = zhemm_ltcopy_sve.c +CHEMMUTCOPY_M = zhemm_utcopy_sve.c CSYMMUCOPY_M = zsymm_ucopy_sve.c CSYMMLCOPY_M = zsymm_lcopy_sve.c diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 63dfde22f..66de642a5 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -174,8 +174,8 @@ CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c -CHEMMLTCOPY_M = chemm_ltcopy_sve.c -CHEMMUTCOPY_M = chemm_utcopy_sve.c +CHEMMLTCOPY_M = zhemm_ltcopy_sve.c +CHEMMUTCOPY_M = zhemm_utcopy_sve.c CSYMMUCOPY_M = zsymm_ucopy_sve.c CSYMMLCOPY_M = zsymm_lcopy_sve.c diff --git a/kernel/arm64/chemm_ltcopy_sve.c b/kernel/arm64/chemm_ltcopy_sve.c deleted file mode 100644 index 40cf9ea31..000000000 --- a/kernel/arm64/chemm_ltcopy_sve.c +++ /dev/null @@ -1,107 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. 
*/ -/*********************************************************************/ - -#include -#include "common.h" -#include - -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ - - int offset, i; - - lda *= 2; - - uint32_t sve_size = svcntw(); - svint32_t posY_vec = svdup_s32(posY); - svint32_t posX_vec = svdup_s32(posX); - svint32_t lda_vec = svdup_s32(lda); - svint32_t one_vec = svdup_s32(1); - - int32_t j = 0; - int32_t N = n; - svbool_t pg = svwhilelt_b32(j, N); - int32_t active = svcntp_b32(svptrue_b32(), pg); - svint32_t index_neg = svindex_s32(0, -1); - svint32_t index = svindex_s32(0, 1); - - do { - offset = posX - posY; - svint32_t vec_off = svdup_s32(offset); - svbool_t cmp = svcmpgt(pg, vec_off, index_neg); - - svint32_t temp = svadd_z(pg, posX_vec, index); - svint32_t temp1 = svmul_z(pg, temp, 2); - temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); - svint32_t temp2 = svmul_z(pg, temp, lda_vec); - temp2 = svmla_z(pg, temp2, posY_vec, 2); - svint32_t gat_ind = svsel(cmp, temp1, temp2); - - i = m; - while (i>0) { - svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); - svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); - - gat_ind = svadd_m(cmp, gat_ind, lda_vec); - gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); - if (offset <= 0) { - svbool_t off_g = svwhilelt_b32(offset, 0); - data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); - } - - svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); - // dealing with ZERO separately - if (offset > -active && offset < 1) - b[ -2*offset + 1 ] = ZERO; - - b += active * 2; - offset --; - vec_off = svsub_z(pg, vec_off, one_vec); - cmp = svcmpgt(pg, vec_off, index_neg); - - i--; - } - - posX += sve_size; - posX_vec = svdup_s32(posX); - j += sve_size; - pg = svwhilelt_b32(j, N); - active = svcntp_b32(svptrue_b32(), pg); - } while (svptest_any(svptrue_b32(), pg)); - - return 0; -} diff --git a/kernel/arm64/chemm_utcopy_sve.c b/kernel/arm64/chemm_utcopy_sve.c deleted file mode 100644 index 440acdb1b..000000000 --- a/kernel/arm64/chemm_utcopy_sve.c +++ /dev/null @@ -1,108 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#include -#include "common.h" -#include - -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ - - int offset, i; - - lda *= 2; - - uint32_t sve_size = svcntw(); - svint32_t posY_vec = svdup_s32(posY); - svint32_t posX_vec = svdup_s32(posX); - svint32_t lda_vec = svdup_s32(lda); - svint32_t one_vec = svdup_s32(1); - - int32_t j = 0; - int32_t N = n; - svbool_t pg = svwhilelt_b32(j, N); - int32_t active = svcntp_b32(svptrue_b32(), pg); - svint32_t index_neg = svindex_s32(0, -1); - svint32_t index = svindex_s32(0, 1); - - do { - offset = posX - posY; - svint32_t vec_off = svdup_s32(offset); - svbool_t cmp = svcmpgt(pg, vec_off, index_neg); - - svint32_t temp = svadd_z(pg, posX_vec, index); - svint32_t temp1 = svmul_z(pg, temp, lda); - temp1 = svmla_z(pg, temp1, posY_vec, 2); - svint32_t temp2 = svmul_z(pg, temp, 2); - temp2 = svmla_z(pg, temp2, posY_vec, lda); - svint32_t gat_ind = svsel(cmp, temp1, temp2); - - i = m; - while (i>0) { - svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); - svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); - - gat_ind = svadd_m(cmp, gat_ind, 2); - gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); - data_vec_imag = svneg_z(pg, data_vec_imag); - if (offset <= 0) { - svbool_t off_g = svwhilelt_b32(offset, 0); - data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); - } - - svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); - // dealing with ZERO separately - if (offset > -active && offset < 1) - b[ -2*offset + 1 ] = ZERO; - - b += active * 2; - offset --; - vec_off = svsub_z(pg, vec_off, one_vec); - cmp = svcmpgt(pg, vec_off, index_neg); - - i--; - } - - posX += sve_size; - posX_vec = svdup_s32(posX); - j += sve_size; - pg = svwhilelt_b32(j, N); - active = svcntp_b32(svptrue_b32(), pg); - } while (svptest_any(svptrue_b32(), pg)); - - return 0; -} diff --git a/kernel/arm64/zhemm_ltcopy_sve.c b/kernel/arm64/zhemm_ltcopy_sve.c index 58e9ff589..37dbfe4e1 100644 --- a/kernel/arm64/zhemm_ltcopy_sve.c +++ b/kernel/arm64/zhemm_ltcopy_sve.c @@ -42,6 +42,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ +#if defined(DOUBLE) BLASLONG offset, i; lda *= 2; @@ -102,5 +103,70 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); +#else + + int offset, i; + + lda *= 2; + + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = 
svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t j = 0; + int32_t N = n; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint32_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b32(offset, 0); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + return 0; } diff --git a/kernel/arm64/zhemm_utcopy_sve.c b/kernel/arm64/zhemm_utcopy_sve.c index 9ddbf6cbd..21e03b7be 100644 --- a/kernel/arm64/zhemm_utcopy_sve.c +++ b/kernel/arm64/zhemm_utcopy_sve.c @@ -42,6 +42,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ +#if defined(DOUBLE) BLASLONG offset, i; lda *= 2; @@ -102,6 +103,70 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON pg = svwhilelt_b64(j, n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); +#else + int offset, i; + + lda *= 2; + + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t j = 0; + int32_t N = n; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, lda); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint32_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + data_vec_imag = svneg_z(pg, data_vec_imag); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b32(offset, 0); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b 
+= active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif return 0; } From bb33446b409a388b05d918dd251efd4b445e6f47 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Thu, 6 Jan 2022 10:26:11 +0100 Subject: [PATCH 34/77] fix makefile.L3 --- kernel/Makefile.L3 | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 1c0931d96..2a10ac980 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -1712,10 +1712,10 @@ $(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) $(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ else -$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ -$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ endif @@ -1726,10 +1726,10 @@ $(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) $(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ else -$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ -$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ endif @@ -1740,10 +1740,10 @@ $(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) $(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ else -$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ -$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ endif From cbcea149f0ed0bf966dafb5bd5b6612945b54858 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Thu, 6 Jan 2022 10:29:35 +0100 Subject: [PATCH 35/77] update contributors --- CONTRIBUTORS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 39ec96246..879aaebe3 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -201,3 +201,5 @@ In chronological order: * 
Bine Brank * [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE * [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM + * [2021-11-12] SVE kernels for SGEMM, STRMM and corresponding SVE copy functions + * [2022-01-06] SVE kernels for CGEMM, ZGEMM, CTRMM, ZTRMM and corresponding SVE copy functions From 19c8f615dc507e20aee724aedb572bdddc2cd497 Mon Sep 17 00:00:00 2001 From: Sunita Nadampalli Date: Fri, 7 Jan 2022 00:28:17 +0000 Subject: [PATCH 36/77] OpenBLAS: aarch64: Add neoverse-v1/n2 architecture specifics --- Makefile.arm64 | 60 +++++++++++ Makefile.system | 3 + TargetList.txt | 2 + cmake/arch.cmake | 2 +- cmake/prebuild.cmake | 62 ++++++++++- cpuid_arm64.c | 44 +++++++- driver/others/dynamic_arm64.c | 2 + getarch.c | 37 ++++++- kernel/arm64/KERNEL.NEOVERSEN2 | 189 +++++++++++++++++++++++++++++++++ kernel/arm64/KERNEL.NEOVERSEV1 | 189 +++++++++++++++++++++++++++++++++ param.h | 58 ++++++++++ 11 files changed, 641 insertions(+), 7 deletions(-) create mode 100644 kernel/arm64/KERNEL.NEOVERSEN2 create mode 100644 kernel/arm64/KERNEL.NEOVERSEV1 diff --git a/Makefile.arm64 b/Makefile.arm64 index 801601030..2eade8d78 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -78,6 +78,66 @@ endif endif endif +# Use a72 tunings because Neoverse-V1 is only available +# in GCC>=9.4 +ifeq ($(CORE), NEOVERSEV1) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) +ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) +CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 +endif +else +CCOMMON_OPT += -march=armv8.4-a -mtune=native +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.4-a -mtune=native +endif +endif +else +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +endif +endif +else +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +endif +endif +endif + +# Use a72 tunings because Neoverse-N2 is only available +# in GCC>=9.4 +ifeq ($(CORE), NEOVERSEN2) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) +ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) +CCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2 +endif +else +CCOMMON_OPT += -march=armv8.5-a -mtune=native +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.5-a -mtune=native +endif +endif +else +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +endif +endif +else +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +endif +endif +endif + # Use a53 tunings because a55 is only available in GCC>=8.1 ifeq ($(CORE), CORTEXA55) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) diff --git a/Makefile.system b/Makefile.system index 97fdc3f91..9203f49cb 100644 --- a/Makefile.system +++ b/Makefile.system @@ -374,6 +374,7 @@ else endif GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1) GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) +GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 4) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) 
$(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) endif @@ -654,6 +655,8 @@ DYNAMIC_CORE += CORTEXA57 DYNAMIC_CORE += CORTEXA72 DYNAMIC_CORE += CORTEXA73 DYNAMIC_CORE += NEOVERSEN1 +DYNAMIC_CORE += NEOVERSEV1 +DYNAMIC_CORE += NEOVERSEN2 DYNAMIC_CORE += CORTEXA55 DYNAMIC_CORE += FALKOR DYNAMIC_CORE += THUNDERX diff --git a/TargetList.txt b/TargetList.txt index b02a011d5..97c8a8f06 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -93,6 +93,8 @@ CORTEXA57 CORTEXA72 CORTEXA73 NEOVERSEN1 +NEOVERSEV1 +NEOVERSEN2 CORTEXA55 EMAG8180 FALKOR diff --git a/cmake/arch.cmake b/cmake/arch.cmake index d468eb60b..f4a135e82 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -44,7 +44,7 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 NEOVERSEV1 NEOVERSEN2 THUNDERX3T110) if (DYNAMIC_LIST) set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) endif () diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 259d9c738..5f12bb145 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -243,11 +243,11 @@ endif () "#define L1_CODE_ASSOCIATIVE\t4\n" "#define L1_DATA_SIZE\t65536\n" "#define L1_DATA_LINESIZE\t64\n" - "#define L1_DATA_ASSOCIATIVE\t2\n" + "#define L1_DATA_ASSOCIATIVE\t4\n" "#define L2_SIZE\t1048576\n\n" "#define L2_LINESIZE\t64\n" - "#define L2_ASSOCIATIVE\t16\n" - "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t48\n" "#define DTB_SIZE\t4096\n" "#define HAVE_VFPV4\n" "#define HAVE_VFPV3\n" @@ -263,6 +263,62 @@ endif () set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "NEOVERSEV1") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t4\n" + "#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t4\n" + "#define L2_SIZE\t1048576\n\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t48\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define HAVE_SVE\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "NEOVERSEN2") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t4\n" + "#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t2\n" + "#define L2_SIZE\t1048576\n\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t48\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define HAVE_SVE\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "FALKOR") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t65536\n" diff --git a/cpuid_arm64.c 
b/cpuid_arm64.c index 958e94abc..cc3a82815 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -43,6 +43,8 @@ size_t length64=sizeof(value64); #define CPU_CORTEXA72 4 #define CPU_CORTEXA73 5 #define CPU_NEOVERSEN1 11 +#define CPU_NEOVERSEV1 16 +#define CPU_NEOVERSEN2 17 // Qualcomm #define CPU_FALKOR 6 // Cavium @@ -71,6 +73,8 @@ static char *cpuname[] = { "TSV110", "EMAG8180", "NEOVERSEN1", + "NEOVERSEV1" + "NEOVERSEN2" "THUNDERX3T110", "VORTEX", "CORTEXA55", @@ -90,6 +94,8 @@ static char *cpuname_lower[] = { "tsv110", "emag8180", "neoversen1", + "neoversev1", + "neoversen2", "thunderx3t110", "vortex", "cortexa55", @@ -170,6 +176,10 @@ int detect(void) return CPU_CORTEXA73; else if (strstr(cpu_part, "0xd0c")) return CPU_NEOVERSEN1; + else if (strstr(cpu_part, "0xd40")) + return CPU_NEOVERSEV1; + else if (strstr(cpu_part, "0xd49")) + return CPU_NEOVERSEN2; else if (strstr(cpu_part, "0xd05")) return CPU_CORTEXA55; } @@ -338,11 +348,41 @@ void get_cpuconfig(void) printf("#define L1_DATA_ASSOCIATIVE 4\n"); printf("#define L2_SIZE 1048576\n"); printf("#define L2_LINESIZE 64\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + printf("#define DTB_DEFAULT_ENTRIES 48\n"); printf("#define DTB_SIZE 4096\n"); break; + case CPU_NEOVERSEV1: + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 4\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + printf("#define DTB_DEFAULT_ENTRIES 48\n"); + printf("#define DTB_SIZE 4096\n"); + break; + + case CPU_NEOVERSEN2: + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 4\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + printf("#define DTB_DEFAULT_ENTRIES 48\n"); + printf("#define DTB_SIZE 4096\n"); + break; + case CPU_FALKOR: printf("#define FALKOR\n"); printf("#define L1_CODE_SIZE 65536\n"); diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 04ceaaf6d..45ea9f113 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -147,6 +147,8 @@ static char *corename[] = { "tsv110", "emag8180", "neoversen1", + "neoversev1", + "neoversen2", "thunderx3t110", "cortexa55", "unknown" diff --git a/getarch.c b/getarch.c index 6063a2a1d..73bbf1892 100644 --- a/getarch.c +++ b/getarch.c @@ -1302,12 +1302,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8 " \ - "-march=armv8.2-a -mtune=cortex-a72" + "-march=armv8.2-a -mtune=neoverse-n1" #define LIBNAME "neoversen1" #define CORENAME "NEOVERSEN1" #else #endif +#ifdef FORCE_NEOVERSEV1 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "NEOVERSEV1" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DNEOVERSEV1 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ + "-march=armv8.4-a -mtune=neoverse-v1" +#define LIBNAME "neoversev1" +#define CORENAME "NEOVERSEV1" +#else +#endif + + +#ifdef FORCE_NEOVERSEN2 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "NEOVERSEN2" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DNEOVERSEN2 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ + "-march=armv8.5-a -mtune=neoverse-n2" +#define LIBNAME "neoversen2" +#define CORENAME "NEOVERSEN2" +#else +#endif + #ifdef FORCE_CORTEXA55 #define FORCE #define ARCHITECTURE "ARM64" diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 new file mode 100644 index 000000000..ea010db42 --- /dev/null +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -0,0 +1,189 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = 
gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(SGEMM_UNROLL_N), 16) +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +else +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +endif +ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +else +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +endif +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = 
zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 new file mode 100644 index 000000000..ea010db42 --- /dev/null +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -0,0 +1,189 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + 
+SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(SGEMM_UNROLL_N), 16) +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +else +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +endif +ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +else +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +endif +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/param.h b/param.h index 8dd2a7461..eb4dcb8f0 100644 --- a/param.h +++ b/param.h @@ -3307,6 +3307,64 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(NEOVERSEV1) + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + 
+#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#elif defined(NEOVERSEN2) + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #elif defined(ARMV8SVE) || defined(A64FX) /* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". From 15d4b379138b9a5b84a2fbc2d37cb47b33efdeec Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 8 Jan 2022 23:48:13 +0100 Subject: [PATCH 37/77] SkylakeX: match parameters to dgemm kernels for dyn/non-dyn --- param.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/param.h b/param.h index 2dffaae3c..4155131f0 100644 --- a/param.h +++ b/param.h @@ -1669,10 +1669,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define SGEMM_DEFAULT_UNROLL_M 16 -#ifndef DYNAMIC_ARCH -#define DGEMM_DEFAULT_UNROLL_M 16 -#else +#ifdef DYNAMIC_ARCH #define DGEMM_DEFAULT_UNROLL_M 4 +#else +#define DGEMM_DEFAULT_UNROLL_M 16 #endif #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 8 @@ -1680,10 +1680,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 -#ifndef DYNAMIC_ARCH -#define DGEMM_DEFAULT_UNROLL_N 2 -#else +#ifdef DYNAMIC_ARCH #define DGEMM_DEFAULT_UNROLL_N 8 +#else +#define DGEMM_DEFAULT_UNROLL_N 2 #endif #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 @@ -1718,17 +1718,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #define SGEMM_DEFAULT_P 448 +#ifndef DYNAMIC_ARCH #define DGEMM_DEFAULT_P 192 +#else +#define DGEMM_DEFAULT_P 384 +#endif #define CGEMM_DEFAULT_P 384 #define ZGEMM_DEFAULT_P 256 #define SGEMM_DEFAULT_Q 448 +#ifndef DYNAMIC_ARCH #define DGEMM_DEFAULT_Q 384 +#else +#define DGEMM_DEFAULT_Q 168 +#endif #define CGEMM_DEFAULT_Q 192 #define ZGEMM_DEFAULT_Q 128 #define SGEMM_DEFAULT_R sgemm_r +#ifndef DYNAMIC_ARCH #define DGEMM_DEFAULT_R 8640 +#else +#define DGEMM_DEFAULT_R 13824 +#endif #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r From f1ac59f20057cefe4dd45122954e2403f1330835 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 8 Jan 2022 23:48:58 +0100 Subject: [PATCH 38/77] Forward DYNAMIC_ARCH option to Makefile.prebuild --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 97fdc3f91..7909f677a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -277,7 +277,7 @@ HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS) # Generating Makefile.conf and config.h -DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) +DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) DYNAMIC_ARCH=$(DYNAMIC_ARCH) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf From 2573ccfb2e02abec3f537479d65b58c4d6e746f3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 8 Jan 2022 23:50:34 +0100 Subject: [PATCH 39/77] make DYNAMIC_ARCH option available to getarch_2nd/param.h --- Makefile.prebuild | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.prebuild b/Makefile.prebuild index d6395da7b..399db956f 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -3,6 +3,10 @@ export BINARY export USE_OPENMP +ifdef DYNAMIC_ARCH +override HOST_CFLAGS += -DDYNAMIC_ARCH +endif + ifdef TARGET_CORE TARGET_MAKE = Makefile_kernel.conf TARGET_CONF = config_kernel.h From be7e55880c91d626a667aff699447c3ba5ab280e Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 9 Jan 2022 19:40:04 +0100 Subject: [PATCH 40/77] sve trsm_kernel_LN --- kernel/arm64/trsm_kernel_LN_sve.c | 301 ++++++++++++++++++++++++++++++ 1 file changed, 301 insertions(+) create mode 100644 kernel/arm64/trsm_kernel_LN_sve.c diff --git a/kernel/arm64/trsm_kernel_LN_sve.c b/kernel/arm64/trsm_kernel_LN_sve.c new file mode 100644 index 000000000..8ca10036b --- /dev/null +++ b/kernel/arm64/trsm_kernel_LN_sve.c @@ -0,0 +1,301 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include "arm_sve.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = 0; k < i; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a -= m; + b -= 2 * n; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + 
BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + int sve_size = svcntd(); + +#if 0 + fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = m + offset; + + i = m % sve_size; + if (i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + + } + + i = sve_size; + if (i <= m) { + aa = a + ((m & ~(sve_size - 1)) - sve_size) * k * COMPSIZE; + cc = c + ((m & ~(sve_size - 1)) - sve_size) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + sve_size * kk * COMPSIZE, + b + sve_size * kk * COMPSIZE, + cc, + ldc); + } + + solve(sve_size, GEMM_UNROLL_N, + aa + (kk - sve_size) * sve_size * COMPSIZE, + b + (kk - sve_size) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa -= sve_size * k * COMPSIZE; + cc -= sve_size * COMPSIZE; + kk -= sve_size; + + i += sve_size; + } while (i <= m); + } + + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + i = m % sve_size; + if (i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + + } + + i = sve_size; + if (i <= m) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(sve_size, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + sve_size * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(sve_size, j, + aa + (kk - sve_size) * sve_size * COMPSIZE, + b + (kk - sve_size) * j * COMPSIZE, + cc, ldc); + + aa -= sve_size * k * COMPSIZE; + cc -= sve_size * COMPSIZE; + kk -= sve_size; + + i += sve_size; + } while (i <= m); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} From 098672b51b0c3a903be4be951ff60741cba43664 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 9 Jan 2022 20:11:47 +0100 Subject: [PATCH 41/77] add trsm_kernel_LT_sve --- kernel/arm64/trsm_kernel_LN_sve.c | 21 ++- kernel/arm64/trsm_kernel_LT_sve.c | 290 ++++++++++++++++++++++++++++++ 2 files changed, 307 insertions(+), 4 deletions(-) create mode 100644 kernel/arm64/trsm_kernel_LT_sve.c diff --git a/kernel/arm64/trsm_kernel_LN_sve.c b/kernel/arm64/trsm_kernel_LN_sve.c index 8ca10036b..c29c3b57a 100644 --- a/kernel/arm64/trsm_kernel_LN_sve.c +++ b/kernel/arm64/trsm_kernel_LN_sve.c @@ -47,9 +47,22 @@ static FLOAT dm1 = -1.; #define GEMM_KERNEL GEMM_KERNEL_N #endif -#if GEMM_DEFAULT_UNROLL_M == 16 -#define GEMM_UNROLL_M_SHIFT 4 +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 #endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define 
GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + #if GEMM_DEFAULT_UNROLL_N == 16 #define GEMM_UNROLL_N_SHIFT 4 #endif @@ -262,8 +275,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, i = sve_size; if (i <= m) { - aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; - cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + aa = a + ((m & ~(sve_size - 1)) - sve_size) * k * COMPSIZE; + cc = c + ((m & ~(sve_size - 1)) - sve_size) * COMPSIZE; do { if (k - kk > 0) { diff --git a/kernel/arm64/trsm_kernel_LT_sve.c b/kernel/arm64/trsm_kernel_LT_sve.c new file mode 100644 index 000000000..a35696836 --- /dev/null +++ b/kernel/arm64/trsm_kernel_LT_sve.c @@ -0,0 +1,290 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + int sve_size = svcntd(); + +#if 0 + fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = sve_size; + + while (i <= m) { + + if (kk > 0) { + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(sve_size, GEMM_UNROLL_N, + aa + kk * sve_size * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + kk += sve_size; + i += sve_size; + } + + i = m % sve_size; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += sve_size; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = sve_size; + + while (i <= m) { + if (kk > 
0) { + GEMM_KERNEL(sve_size, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(sve_size, j, + aa + kk * sve_size * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + kk += sve_size; + i += sve_size; + } + + i = sve_size % m; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} From a9e297e4764faa53b146de1b0c3ed82e2632e42c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Jan 2022 23:31:59 +0100 Subject: [PATCH 42/77] Fix handling of ifdef/ifndef --- cmake/utils.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index c5ee65384..56c1cb060 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -125,7 +125,7 @@ macro(ParseMakefileVars MAKEFILE_IN) if (NOT "${line_match}" STREQUAL "") #message(STATUS "${CMAKE_MATCH_1} first: ${CMAKE_MATCH_2}") set (ElseSeen 0) - if (DEFINED ${CMAKE_MATCH_2}) + if (${CMAKE_MATCH_2}) if (${CMAKE_MATCH_1} STREQUAL "ifdef") #message (STATUS "condition is true") set (IfElse 1) From e8939b3d30e090b162303fcfbec2e7479a98ca6c Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Mon, 10 Jan 2022 20:42:20 +0100 Subject: [PATCH 43/77] sve trsmRN and trsmRT --- kernel/arm64/trsm_kernel_LT_sve.c | 1 + kernel/arm64/trsm_kernel_RN_sve.c | 289 +++++++++++++++++++++++++++ kernel/arm64/trsm_kernel_RT_sve.c | 313 ++++++++++++++++++++++++++++++ 3 files changed, 603 insertions(+) create mode 100644 kernel/arm64/trsm_kernel_RN_sve.c create mode 100644 kernel/arm64/trsm_kernel_RT_sve.c diff --git a/kernel/arm64/trsm_kernel_LT_sve.c b/kernel/arm64/trsm_kernel_LT_sve.c index a35696836..7f5459702 100644 --- a/kernel/arm64/trsm_kernel_LT_sve.c +++ b/kernel/arm64/trsm_kernel_LT_sve.c @@ -37,6 +37,7 @@ /*********************************************************************/ #include "common.h" +#include "arm_sve.h" static FLOAT dm1 = -1.; diff --git a/kernel/arm64/trsm_kernel_RN_sve.c b/kernel/arm64/trsm_kernel_RN_sve.c new file mode 100644 index 000000000..2f6611c1c --- /dev/null +++ b/kernel/arm64/trsm_kernel_RN_sve.c @@ -0,0 +1,289 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include "arm_sve.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + int sve_size = svcntd(); + +#if 0 + fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + + while (j > 0) { + + aa = a; + cc = c; + + i = sve_size; + + if (i <= m) { + do { + if (kk > 0) { + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, kk, dm1, 
+#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(sve_size, GEMM_UNROLL_N, + aa + kk * sve_size * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + i += sve_size; + } while (i <= m); + } + + + i = m % sve_size; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += sve_size; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = sve_size; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(sve_size, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(sve_size, j, + aa + kk * sve_size * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + i += sve_size; + } + + i = m % sve_size; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/arm64/trsm_kernel_RT_sve.c b/kernel/arm64/trsm_kernel_RT_sve.c new file mode 100644 index 000000000..d93ebe7ad --- /dev/null +++ b/kernel/arm64/trsm_kernel_RT_sve.c @@ -0,0 +1,313 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include "arm_sve.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + int sve_size = svcntd(); + +#if 0 + fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = sve_size; + if (i <= m) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(sve_size, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + sve_size * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(sve_size, j, + aa + (kk - j) * sve_size * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + i += sve_size; + } while (i <= m); + } + + i = m % sve_size; 
+ if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = sve_size; + if (i <= m) { + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i += sve_size; + } while (i <= m); + } + + i = m % sve_size; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + From f87468ac916c7a64a9d8256bb6b81a36245f3bae Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Mon, 10 Jan 2022 21:45:37 +0100 Subject: [PATCH 44/77] trsm_lncopy_sve --- kernel/arm64/trsm_lncopy_sve.c | 114 +++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 kernel/arm64/trsm_lncopy_sve.c diff --git a/kernel/arm64/trsm_lncopy_sve.c b/kernel/arm64/trsm_lncopy_sve.c new file mode 100644 index 000000000..d96a1f383 --- /dev/null +++ b/kernel/arm64/trsm_lncopy_sve.c @@ -0,0 +1,114 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT *ao; + + jj = offset; + int js = 0; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + for (int k = 0; k < j; k++) { + *(b + j * n_active + k) = *(a + k * lda + j); + } + *(b + j * n_active + j) = INV(*(a + j * lda + j)); + } + } + + if (ii > jj) { + for (int j = 0; j < n_active; j++) { + svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); + svst1(pn, b, aj_vec); + ao++; + } + + } + + b += n_active * n_active; + + i += n_active; + ii += n_active; + } while (i < m); + + + a += n_active * lda; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} From 8071e179f1ba0c65da0841cc533d0f8d6b15c6ef Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Tue, 11 Jan 2022 21:16:38 +0100 Subject: [PATCH 45/77] add remaining sve trsm copy kernels --- kernel/arm64/trsm_ltcopy_sve.c | 114 +++++++++++++++++++++++++++++++++ kernel/arm64/trsm_uncopy_sve.c | 113 ++++++++++++++++++++++++++++++++ kernel/arm64/trsm_utcopy_sve.c | 114 +++++++++++++++++++++++++++++++++ 3 files changed, 341 insertions(+) create mode 100644 kernel/arm64/trsm_ltcopy_sve.c create mode 100644 kernel/arm64/trsm_uncopy_sve.c create mode 100644 kernel/arm64/trsm_utcopy_sve.c diff --git a/kernel/arm64/trsm_ltcopy_sve.c b/kernel/arm64/trsm_ltcopy_sve.c new file mode 100644 index 000000000..9012f7fe5 --- /dev/null +++ b/kernel/arm64/trsm_ltcopy_sve.c @@ -0,0 +1,114 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT *ao; + + jj = offset; + int js = 0; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + *(b + j * n_active + j) = INV(*(a + j * lda + j)); + for (int k = j+1; k < n_active; k++) { + *(b + j * n_active + k) = *(a + j * lda + k); + } + } + } + + if (ii < jj) { + for (int j = 0; j < n_active; j++) { + svfloat64_t aj_vec = svld1(pn, ao); + svst1(pn, b, aj_vec); + ao += lda; + } + + } + + b += n_active * n_active; + + i += n_active; + ii += n_active; + } while (i < m); + + + a += n_active; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/trsm_uncopy_sve.c b/kernel/arm64/trsm_uncopy_sve.c new file mode 100644 index 000000000..242e99f60 --- /dev/null +++ b/kernel/arm64/trsm_uncopy_sve.c @@ -0,0 +1,113 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT *ao; + + jj = offset; + int js = 0; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + *(b + j * n_active + j) = INV(*(a + j * lda + j)); + for (int k = j+1; k < n_active; k++) { + *(b + j * n_active + k) = *(a + k * lda + j); + } + } + } + + if (ii < jj) { + for (int j = 0; j < n_active; j++) { + svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); + svst1(pn, b, aj_vec); + ao++; + } + } + + b += n_active * n_active; + + i += n_active; + ii += n_active; + } while (i < m); + + + a += n_active * lda; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/trsm_utcopy_sve.c b/kernel/arm64/trsm_utcopy_sve.c new file mode 100644 index 000000000..9eefb8c18 --- /dev/null +++ b/kernel/arm64/trsm_utcopy_sve.c @@ -0,0 +1,114 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT *ao; + + jj = offset; + int js = 0; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + for (int k = 0; k < j; k++) { + *(b + j * n_active + k) = *(a + j * lda + k); + } + *(b + j * n_active + j) = INV(*(a + j * lda + j)); + } + } + + if (ii > jj) { + for (int j = 0; j < n_active; j++) { + svfloat64_t aj_vec = svld1(pn, ao); + svst1(pn, b, aj_vec); + ao += lda; + } + + } + + b += n_active * n_active; + + i += n_active; + ii += n_active; + } while (i < m); + + + a += n_active; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} From aaa2b1a861623eb012288c2b401fa923933da55c Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sat, 15 Jan 2022 21:02:14 +0100 Subject: [PATCH 46/77] fix sve dtrsm kernels --- kernel/arm64/trsm_kernel_LN_sve.c | 20 ++++++++++-------- kernel/arm64/trsm_kernel_LT_sve.c | 2 +- kernel/arm64/trsm_kernel_RT_sve.c | 12 +++++------ kernel/arm64/trsm_lncopy_sve.c | 30 
+++++++++++++-------------- kernel/arm64/trsm_ltcopy_sve.c | 32 ++++++++++++++--------------- kernel/arm64/trsm_uncopy_sve.c | 29 +++++++++++++------------- kernel/arm64/trsm_utcopy_sve.c | 34 +++++++++++++++---------------- 7 files changed, 79 insertions(+), 80 deletions(-) diff --git a/kernel/arm64/trsm_kernel_LN_sve.c b/kernel/arm64/trsm_kernel_LN_sve.c index c29c3b57a..57f79ac3a 100644 --- a/kernel/arm64/trsm_kernel_LN_sve.c +++ b/kernel/arm64/trsm_kernel_LN_sve.c @@ -182,8 +182,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, i = m % sve_size; if (i) { - aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; - cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; if (k - kk > 0) { GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, @@ -205,10 +205,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, } + int mod = i; i = sve_size; if (i <= m) { - aa = a + ((m & ~(sve_size - 1)) - sve_size) * k * COMPSIZE; - cc = c + ((m & ~(sve_size - 1)) - sve_size) * COMPSIZE; + aa = a + (m - mod - sve_size) * k * COMPSIZE; + cc = c + (m - mod - sve_size) * COMPSIZE; do { if (k - kk > 0) { @@ -217,7 +218,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, ZERO, #endif aa + sve_size * kk * COMPSIZE, - b + sve_size * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc); } @@ -251,8 +252,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, i = m % sve_size; if (i) { - aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; - cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; if (k - kk > 0) { GEMM_KERNEL(i, j, k - kk, dm1, @@ -273,10 +274,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, } + int mod = i; i = sve_size; if (i <= m) { - aa = a + ((m & ~(sve_size - 1)) - sve_size) * k * COMPSIZE; - cc = c + ((m & ~(sve_size - 1)) - sve_size) * COMPSIZE; + aa = a + (m - mod - sve_size) * k * COMPSIZE; + cc = c + (m - mod - sve_size) * COMPSIZE; do { if (k - kk > 0) { diff --git a/kernel/arm64/trsm_kernel_LT_sve.c b/kernel/arm64/trsm_kernel_LT_sve.c index 7f5459702..8c6a57a6d 100644 --- a/kernel/arm64/trsm_kernel_LT_sve.c +++ b/kernel/arm64/trsm_kernel_LT_sve.c @@ -257,7 +257,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, i += sve_size; } - i = sve_size % m; + i = m % sve_size; if (i) { if (kk > 0) { GEMM_KERNEL(i, j, kk, dm1, diff --git a/kernel/arm64/trsm_kernel_RT_sve.c b/kernel/arm64/trsm_kernel_RT_sve.c index d93ebe7ad..efafc9d11 100644 --- a/kernel/arm64/trsm_kernel_RT_sve.c +++ b/kernel/arm64/trsm_kernel_RT_sve.c @@ -258,23 +258,23 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, if (i <= m) { do { if (k - kk > 0) { - GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif - aa + GEMM_UNROLL_M * kk * COMPSIZE, + aa + sve_size * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc); } - solve(GEMM_UNROLL_M, GEMM_UNROLL_N, - aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + solve(sve_size, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * sve_size * COMPSIZE, b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); - aa += GEMM_UNROLL_M * k * COMPSIZE; - cc += GEMM_UNROLL_M * COMPSIZE; + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; i += sve_size; } while (i <= m); } diff --git a/kernel/arm64/trsm_lncopy_sve.c b/kernel/arm64/trsm_lncopy_sve.c index d96a1f383..7f480dcad 100644 --- a/kernel/arm64/trsm_lncopy_sve.c 
+++ b/kernel/arm64/trsm_lncopy_sve.c @@ -48,17 +48,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ - BLASLONG i, ii, j, jj; + BLASLONG i, ii, jj; FLOAT *ao; jj = offset; - int js = 0; #ifdef DOUBLE + int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else + int32_t js = 0; svint32_t index = svindex_s32(0, lda); svbool_t pn = svwhilelt_b32(js, n); int n_active = svcntp_b32(svptrue_b32(), pn); @@ -74,25 +75,24 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT if (ii == jj) { for (int j = 0; j < n_active; j++) { for (int k = 0; k < j; k++) { - *(b + j * n_active + k) = *(a + k * lda + j); + *(b + j * n_active + k) = *(ao + k * lda + j); } - *(b + j * n_active + j) = INV(*(a + j * lda + j)); + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); } - } - - if (ii > jj) { - for (int j = 0; j < n_active; j++) { + ao += n_active; + b += n_active * n_active; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); svst1(pn, b, aj_vec); - ao++; } - + ao++; + b += n_active; + i++; + ii++; } - - b += n_active * n_active; - - i += n_active; - ii += n_active; } while (i < m); diff --git a/kernel/arm64/trsm_ltcopy_sve.c b/kernel/arm64/trsm_ltcopy_sve.c index 9012f7fe5..d7b2a4e8d 100644 --- a/kernel/arm64/trsm_ltcopy_sve.c +++ b/kernel/arm64/trsm_ltcopy_sve.c @@ -48,18 +48,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ - BLASLONG i, ii, j, jj; + BLASLONG i, ii, jj; FLOAT *ao; jj = offset; - int js = 0; #ifdef DOUBLE - svint64_t index = svindex_s64(0LL, lda); + int64_t js = 0; svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svint32_t index = svindex_s32(0, lda); + int32_t js = 0; svbool_t pn = svwhilelt_b32(js, n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif @@ -73,26 +72,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT if (ii == jj) { for (int j = 0; j < n_active; j++) { - *(b + j * n_active + j) = INV(*(a + j * lda + j)); + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); for (int k = j+1; k < n_active; k++) { - *(b + j * n_active + k) = *(a + j * lda + k); + *(b + j * n_active + k) = *(ao + j * lda + k); } } - } - - if (ii < jj) { - for (int j = 0; j < n_active; j++) { + b += n_active * n_active; + ao += lda * n_active; + i += n_active; + ii += n_active; + } else { + if (ii < jj) { svfloat64_t aj_vec = svld1(pn, ao); svst1(pn, b, aj_vec); - ao += lda; } - + ao += lda; + b += n_active; + i ++; + ii ++; } - - b += n_active * n_active; - - i += n_active; - ii += n_active; } while (i < m); diff --git a/kernel/arm64/trsm_uncopy_sve.c b/kernel/arm64/trsm_uncopy_sve.c index 242e99f60..b2851452b 100644 --- a/kernel/arm64/trsm_uncopy_sve.c +++ b/kernel/arm64/trsm_uncopy_sve.c @@ -48,17 +48,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ - BLASLONG i, ii, j, jj; + BLASLONG i, ii, jj; FLOAT *ao; jj = offset; - int js = 0; #ifdef DOUBLE + int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else + int32_t js = 0; svint32_t index = svindex_s32(0, lda); svbool_t pn = svwhilelt_b32(js, n); int n_active = svcntp_b32(svptrue_b32(), pn); @@ -73,25 +74,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, 
FLOAT if (ii == jj) { for (int j = 0; j < n_active; j++) { - *(b + j * n_active + j) = INV(*(a + j * lda + j)); + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); for (int k = j+1; k < n_active; k++) { - *(b + j * n_active + k) = *(a + k * lda + j); + *(b + j * n_active + k) = *(ao + k * lda + j); } } - } - - if (ii < jj) { - for (int j = 0; j < n_active; j++) { + ao += n_active; + b += n_active * n_active; + i += n_active; + ii += n_active; + } else { + if (ii < jj) { svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); svst1(pn, b, aj_vec); - ao++; } + ao++; + b += n_active; + i++; + ii++; } - - b += n_active * n_active; - - i += n_active; - ii += n_active; } while (i < m); diff --git a/kernel/arm64/trsm_utcopy_sve.c b/kernel/arm64/trsm_utcopy_sve.c index 9eefb8c18..558955801 100644 --- a/kernel/arm64/trsm_utcopy_sve.c +++ b/kernel/arm64/trsm_utcopy_sve.c @@ -48,18 +48,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ - BLASLONG i, ii, j, jj; + BLASLONG i, ii, jj; FLOAT *ao; jj = offset; - int js = 0; #ifdef DOUBLE - svint64_t index = svindex_s64(0LL, lda); + int64_t js = 0; svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svint32_t index = svindex_s32(0, lda); + int32_t js = 0; svbool_t pn = svwhilelt_b32(js, n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif @@ -74,25 +73,24 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT if (ii == jj) { for (int j = 0; j < n_active; j++) { for (int k = 0; k < j; k++) { - *(b + j * n_active + k) = *(a + j * lda + k); + *(b + j * n_active + k) = *(ao + j * lda + k); } - *(b + j * n_active + j) = INV(*(a + j * lda + j)); + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); } - } - - if (ii > jj) { - for (int j = 0; j < n_active; j++) { + ao += lda * n_active; + b += n_active * n_active; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { svfloat64_t aj_vec = svld1(pn, ao); svst1(pn, b, aj_vec); - ao += lda; } - - } - - b += n_active * n_active; - - i += n_active; - ii += n_active; + ao += lda; + b += n_active; + i ++; + ii ++; + } } while (i < m); From f1315288a8d9f4e06da7b7ccb9a37f04ded95c5f Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sat, 15 Jan 2022 22:27:25 +0100 Subject: [PATCH 47/77] add sve ztrsm --- kernel/arm64/KERNEL.A64FX | 43 +++++++---- kernel/arm64/trsm_kernel_LN_sve.c | 4 + kernel/arm64/trsm_kernel_LT_sve.c | 4 + kernel/arm64/trsm_kernel_RN_sve.c | 4 + kernel/arm64/trsm_kernel_RT_sve.c | 4 + kernel/arm64/trsm_lncopy_sve.c | 9 ++- kernel/arm64/trsm_ltcopy_sve.c | 9 ++- kernel/arm64/trsm_uncopy_sve.c | 9 ++- kernel/arm64/trsm_utcopy_sve.c | 9 ++- kernel/arm64/ztrsm_lncopy_sve.c | 119 ++++++++++++++++++++++++++++++ kernel/arm64/ztrsm_ltcopy_sve.c | 115 +++++++++++++++++++++++++++++ kernel/arm64/ztrsm_uncopy_sve.c | 119 ++++++++++++++++++++++++++++++ kernel/arm64/ztrsm_utcopy_sve.c | 115 +++++++++++++++++++++++++++++ 13 files changed, 539 insertions(+), 24 deletions(-) create mode 100644 kernel/arm64/ztrsm_lncopy_sve.c create mode 100644 kernel/arm64/ztrsm_ltcopy_sve.c create mode 100644 kernel/arm64/ztrsm_uncopy_sve.c create mode 100644 kernel/arm64/ztrsm_utcopy_sve.c diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index d74f0592d..bd25f7cd8 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -20,25 +20,36 @@ IDMAXKERNEL = ../arm/imax.c ISMINKERNEL = ../arm/imin.c IDMINKERNEL = ../arm/imin.c -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = 
../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_sve.c +STRSMKERNEL_LT = trsm_kernel_LT_sve.c +STRSMKERNEL_RN = trsm_kernel_RN_sve.c +STRSMKERNEL_RT = trsm_kernel_RT_sve.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_LN = trsm_kernel_LN_sve.c +DTRSMKERNEL_LT = trsm_kernel_LT_sve.c +DTRSMKERNEL_RN = trsm_kernel_RN_sve.c +DTRSMKERNEL_RT = trsm_kernel_RT_sve.c -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +TRSMCOPYLN_M = trsm_lncopy_sve.c +TRSMCOPYLT_M = trsm_ltcopy_sve.c +TRSMCOPYUN_M = trsm_uncopy_sve.c +TRSMCOPYUT_M = trsm_utcopy_sve.c + +CTRSMKERNEL_LN = trsm_kernel_LN_sve.c +CTRSMKERNEL_LT = trsm_kernel_LT_sve.c +CTRSMKERNEL_RN = trsm_kernel_RN_sve.c +CTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c +ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c +ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c +ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c +ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c +ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c SAMAXKERNEL = amax.S DAMAXKERNEL = amax.S diff --git a/kernel/arm64/trsm_kernel_LN_sve.c b/kernel/arm64/trsm_kernel_LN_sve.c index 57f79ac3a..fa1c6e984 100644 --- a/kernel/arm64/trsm_kernel_LN_sve.c +++ b/kernel/arm64/trsm_kernel_LN_sve.c @@ -167,7 +167,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, BLASLONG i, j; FLOAT *aa, *cc; BLASLONG kk; +#ifdef DOUBLE int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif #if 0 fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", diff --git a/kernel/arm64/trsm_kernel_LT_sve.c b/kernel/arm64/trsm_kernel_LT_sve.c index 8c6a57a6d..2cbb2aafb 100644 --- a/kernel/arm64/trsm_kernel_LT_sve.c +++ b/kernel/arm64/trsm_kernel_LT_sve.c @@ -157,7 +157,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *aa, *cc; BLASLONG kk; BLASLONG i, j, jj; +#ifdef DOUBLE int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif #if 0 fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", diff --git a/kernel/arm64/trsm_kernel_RN_sve.c b/kernel/arm64/trsm_kernel_RN_sve.c index 2f6611c1c..5e4e8d9b1 100644 --- a/kernel/arm64/trsm_kernel_RN_sve.c +++ b/kernel/arm64/trsm_kernel_RN_sve.c @@ -157,7 +157,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *aa, *cc; BLASLONG kk; BLASLONG i, j, jj; +#ifdef DOUBLE int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif #if 0 fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", diff --git a/kernel/arm64/trsm_kernel_RT_sve.c b/kernel/arm64/trsm_kernel_RT_sve.c index efafc9d11..c376c0e33 100644 --- a/kernel/arm64/trsm_kernel_RT_sve.c +++ b/kernel/arm64/trsm_kernel_RT_sve.c @@ -169,7 +169,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, BLASLONG i, j; FLOAT *aa, *cc; BLASLONG kk; +#ifdef DOUBLE int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif #if 0 fprintf(stderr, "TRSM RT KERNEL m = %3ld n = 
%3ld k = %3ld offset = %3ld\n", diff --git a/kernel/arm64/trsm_lncopy_sve.c b/kernel/arm64/trsm_lncopy_sve.c index 7f480dcad..5a9d4194a 100644 --- a/kernel/arm64/trsm_lncopy_sve.c +++ b/kernel/arm64/trsm_lncopy_sve.c @@ -59,9 +59,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else + int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32(js, N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -85,7 +86,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii += n_active; } else { if (ii > jj) { +#ifdef DOUBLE svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); +#else + svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); +#endif svst1(pn, b, aj_vec); } ao++; @@ -105,7 +110,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32(js, N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_ltcopy_sve.c b/kernel/arm64/trsm_ltcopy_sve.c index d7b2a4e8d..ac4019e26 100644 --- a/kernel/arm64/trsm_ltcopy_sve.c +++ b/kernel/arm64/trsm_ltcopy_sve.c @@ -58,8 +58,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else + int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32(js, N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -83,7 +84,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii += n_active; } else { if (ii < jj) { +#ifdef DOUBLE svfloat64_t aj_vec = svld1(pn, ao); +#else + svfloat32_t aj_vec = svld1(pn, ao); +#endif svst1(pn, b, aj_vec); } ao += lda; @@ -103,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32(js, N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_uncopy_sve.c b/kernel/arm64/trsm_uncopy_sve.c index b2851452b..8fdcd0f4b 100644 --- a/kernel/arm64/trsm_uncopy_sve.c +++ b/kernel/arm64/trsm_uncopy_sve.c @@ -59,9 +59,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else + int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32(js, N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -85,7 +86,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii += n_active; } else { if (ii < jj) { +#ifdef DOUBLE svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); +#else + svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); +#endif svst1(pn, b, aj_vec); } ao++; @@ -105,7 +110,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = 
svwhilelt_b32(js, N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_utcopy_sve.c b/kernel/arm64/trsm_utcopy_sve.c index 558955801..0f5f0dccd 100644 --- a/kernel/arm64/trsm_utcopy_sve.c +++ b/kernel/arm64/trsm_utcopy_sve.c @@ -58,8 +58,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else + int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32(js, N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -83,7 +84,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii += n_active; } else { if (ii > jj) { +#ifdef DOUBLE svfloat64_t aj_vec = svld1(pn, ao); +#else + svfloat32_t aj_vec = svld1(pn, ao); +#endif svst1(pn, b, aj_vec); } ao += lda; @@ -103,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32(js, N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_lncopy_sve.c b/kernel/arm64/ztrsm_lncopy_sve.c new file mode 100644 index 000000000..eb7cd0294 --- /dev/null +++ b/kernel/arm64/ztrsm_lncopy_sve.c @@ -0,0 +1,119 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + lda *= 2; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + for (int k = 0; k < j; k++) { + *(b + 2*j * n_active + 2*k) = *(ao + k * lda + 2*j); + *(b + 2*j * n_active + 2*k + 1) = *(ao + k * lda + 2*j + 1); + } + compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); + //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); + } + ao += n_active * 2; + b += n_active * n_active * 2; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { +#ifdef DOUBLE + svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#else + svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#endif + svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); + } + ao += 2; + b += n_active * 2; + i++; + ii++; + } + } while (i < m); + + + a += n_active * lda; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/ztrsm_ltcopy_sve.c b/kernel/arm64/ztrsm_ltcopy_sve.c new file mode 100644 index 000000000..27cd1a941 --- /dev/null +++ b/kernel/arm64/ztrsm_ltcopy_sve.c @@ -0,0 +1,115 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + lda *= 2; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); + //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); + for (int k = j+1; k < n_active; k++) { + *(b + 2*j * n_active + 2*k) = *(ao + j * lda + 2*k); + *(b + 2*j * n_active + 2*k + 1) = *(ao + j * lda + 2*k + 1); + } + } + b += n_active * n_active * 2; + ao += lda * n_active * 2; + i += n_active; + ii += n_active; + } else { + if (ii < jj) { +#ifdef DOUBLE + svfloat64x2_t aj_vec = svld2(pn, ao); +#else + svfloat32x2_t aj_vec = svld2(pn, ao); +#endif + svst2(pn, b, aj_vec); + } + ao += lda; + b += n_active * 2; + i ++; + ii ++; + } + } while (i < m); + + + a += n_active * 2; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/ztrsm_uncopy_sve.c b/kernel/arm64/ztrsm_uncopy_sve.c new file mode 100644 index 000000000..92e086b75 --- /dev/null +++ b/kernel/arm64/ztrsm_uncopy_sve.c @@ -0,0 +1,119 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + lda *= 2; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); + //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); + for (int k = j+1; k < n_active; k++) { + *(b + 2*j * n_active + 2*k) = *(ao + k * lda + 2*j); + *(b + 2*j * n_active + 2*k + 1) = *(ao + k * lda + 2*j + 1); + } + } + ao += n_active * 2; + b += n_active * n_active * 2; + i += n_active; + ii += n_active; + } else { + if (ii < jj) { +#ifdef DOUBLE + svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#else + svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#endif + svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); + } + ao += 2; + b += n_active * 2; + i++; + ii++; + } + } while (i < m); + + + a += n_active * lda; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/ztrsm_utcopy_sve.c b/kernel/arm64/ztrsm_utcopy_sve.c new file mode 100644 index 000000000..d82a9d0c8 --- /dev/null +++ b/kernel/arm64/ztrsm_utcopy_sve.c @@ -0,0 +1,115 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + lda *= 2; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + for (int k = 0; k < j; k++) { + *(b + 2*j * n_active + 2*k) = *(ao + j * lda + 2*k); + *(b + 2*j * n_active + 2*k + 1) = *(ao + j * lda + 2*k + 1); + } + compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); + //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); + } + ao += lda * n_active * 2; + b += n_active * n_active * 2; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { +#ifdef DOUBLE + svfloat64x2_t aj_vec = svld2(pn, ao); +#else + svfloat32x2_t aj_vec = svld2(pn, ao); +#endif + svst2(pn, b, aj_vec); + } + ao += lda; + b += n_active * 2; + i ++; + ii ++; + } + } while (i < m); + + + a += n_active * 2; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} From 0fb6cc07bf9fdf0cbe7a7595e82379a0040d9e9a Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 16 Jan 2022 21:39:57 +0100 Subject: [PATCH 48/77] fix ztrsm lt/ut copy --- kernel/arm64/ztrsm_ltcopy_sve.c | 2 +- 
kernel/arm64/ztrsm_utcopy_sve.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm64/ztrsm_ltcopy_sve.c b/kernel/arm64/ztrsm_ltcopy_sve.c index 27cd1a941..34dbf8a30 100644 --- a/kernel/arm64/ztrsm_ltcopy_sve.c +++ b/kernel/arm64/ztrsm_ltcopy_sve.c @@ -77,7 +77,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT } } b += n_active * n_active * 2; - ao += lda * n_active * 2; + ao += lda * n_active; i += n_active; ii += n_active; } else { diff --git a/kernel/arm64/ztrsm_utcopy_sve.c b/kernel/arm64/ztrsm_utcopy_sve.c index d82a9d0c8..ccb942e1b 100644 --- a/kernel/arm64/ztrsm_utcopy_sve.c +++ b/kernel/arm64/ztrsm_utcopy_sve.c @@ -76,7 +76,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); } - ao += lda * n_active * 2; + ao += lda * n_active; b += n_active * n_active * 2; i += n_active; ii += n_active; From b6a445cfd88ab0bfa1687aeba7cc2d6705497f77 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 16 Jan 2022 21:40:56 +0100 Subject: [PATCH 49/77] adapt Makefile for SVE trsm --- kernel/Makefile.L3 | 128 +++++++++++++++++++++++++++++++++++++++++++++ param.h | 4 +- 2 files changed, 130 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 2a10ac980..2d5740183 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -2391,29 +2391,61 @@ $(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNR $(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ +ifdef TRSMCOPYUN_M +$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYLN_M +$(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYUT_M +$(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) 
$(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYLT_M +$(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)strsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -2439,29 +2471,61 @@ $(KDIR)strsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N $(KDIR)strsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef TRSMCOPYUN_M +$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYLN_M +$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYUT_M +$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : 
generic/trsm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYLT_M +$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -2535,29 +2599,61 @@ $(KDIR)qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N $(KDIR)qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef ZTRSMCOPYUN_M +$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYLN_M +$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYUT_M +$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ 
$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYLT_M +$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -2583,29 +2679,61 @@ $(KDIR)ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_ $(KDIR)ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef ZTRSMCOPYUN_M +$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYLN_M +$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYUT_M +$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) 
-DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYLT_M +$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ diff --git a/param.h b/param.h index 5d46991a2..ab6eab6eb 100644 --- a/param.h +++ b/param.h @@ -3327,11 +3327,11 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_MN 32 +#define CGEMM_DEFAULT_UNROLL_MN 16 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_MN 32 +#define ZGEMM_DEFAULT_UNROLL_MN 16 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 160 From 1b49ef8dcf6b01aecef30f804654d0efc97bc37a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Jan 2022 00:05:33 +0100 Subject: [PATCH 50/77] Fix pivot index for negative increments --- lapack/laswp/generic/laswp_k_1.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack/laswp/generic/laswp_k_1.c b/lapack/laswp/generic/laswp_k_1.c index 88648cf29..556889291 100644 --- a/lapack/laswp/generic/laswp_k_1.c +++ b/lapack/laswp/generic/laswp_k_1.c @@ -57,10 +57,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a--; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; From 0e9e9513067665ec0a505ed935c89752a60dbb81 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Jan 2022 00:06:41 +0100 Subject: [PATCH 51/77] Fix pivot offset calculation for negative incx --- lapack/laswp/generic/laswp_k_2.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack/laswp/generic/laswp_k_2.c b/lapack/laswp/generic/laswp_k_2.c index 93b9a2c01..f76cd078f 100644 --- a/lapack/laswp/generic/laswp_k_2.c +++ b/lapack/laswp/generic/laswp_k_2.c @@ -59,10 +59,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a--; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; From eca2f50b48a9941e2f3d2cd75fb699ace070f9cc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Jan 2022 00:07:33 +0100 Subject: [PATCH 52/77] Fix pivot offset calculation for negative incx --- lapack/laswp/generic/laswp_k_4.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack/laswp/generic/laswp_k_4.c b/lapack/laswp/generic/laswp_k_4.c index 191a229a9..6520ed799 100644 --- a/lapack/laswp/generic/laswp_k_4.c +++ b/lapack/laswp/generic/laswp_k_4.c @@ -65,10 +65,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, 
BLASLONG a--; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; From afa0cece5cbca7ce9c749b3101ac36b15518508e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Jan 2022 00:08:20 +0100 Subject: [PATCH 53/77] Fix pivot offset calculation for negative incx --- lapack/laswp/generic/laswp_k_8.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack/laswp/generic/laswp_k_8.c b/lapack/laswp/generic/laswp_k_8.c index 947941839..a7bf06817 100644 --- a/lapack/laswp/generic/laswp_k_8.c +++ b/lapack/laswp/generic/laswp_k_8.c @@ -78,10 +78,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a--; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; From 3b6293f5a0e4371d81074ee0ebc19d173ca696ed Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Jan 2022 00:09:14 +0100 Subject: [PATCH 54/77] Fix offset calculation for negative incx --- lapack/laswp/generic/zlaswp_k_1.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack/laswp/generic/zlaswp_k_1.c b/lapack/laswp/generic/zlaswp_k_1.c index d1204778a..42aaed528 100644 --- a/lapack/laswp/generic/zlaswp_k_1.c +++ b/lapack/laswp/generic/zlaswp_k_1.c @@ -59,10 +59,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, lda *= 2; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; From 57e2a72f40aaf008c48b7f0ec6e5216aee1499c4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Jan 2022 00:10:21 +0100 Subject: [PATCH 55/77] Fix pivot offset calculation for negative incx --- lapack/laswp/generic/zlaswp_k_2.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack/laswp/generic/zlaswp_k_2.c b/lapack/laswp/generic/zlaswp_k_2.c index c18ab4bee..1220870f8 100644 --- a/lapack/laswp/generic/zlaswp_k_2.c +++ b/lapack/laswp/generic/zlaswp_k_2.c @@ -60,10 +60,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, lda *= 2; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; From 40003f8edb9e5c529c1c12589f9e1b53f9ac8f2d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Jan 2022 00:11:18 +0100 Subject: [PATCH 56/77] Fix pivot offset calculation for negative incx --- lapack/laswp/generic/zlaswp_k_4.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack/laswp/generic/zlaswp_k_4.c b/lapack/laswp/generic/zlaswp_k_4.c index 45e1bf01e..cc7e296e1 100644 --- a/lapack/laswp/generic/zlaswp_k_4.c +++ b/lapack/laswp/generic/zlaswp_k_4.c @@ -69,10 +69,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, lda *= 2; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; From f158d59087c518fa924023d62a00eac176678dae Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Mon, 17 Jan 2022 22:36:48 +0100 Subject: [PATCH 57/77] adapt CMake --- kernel/CMakeLists.txt | 56 ++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 717c1ea72..8aa6728d5 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ 
-381,23 +381,35 @@ endif () GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) + +if (NOT DEFINED ZTRSMCOPYLN_M) + set(ZTRSMUNCOPY_M "generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c") + set(ZTRSMLNCOPY_M "generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c") + set(ZTRSMUTCOPY_M "generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(ZTRSMLTCOPY_M "generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(ZTRSMUNCOPY_M "${KERNELDIR}/${ZTRSMCOPYUN_M}") + set(ZTRSMLNCOPY_M "${KERNELDIR}/${ZTRSMCOPYLN_M}") + set(ZTRSMUTCOPY_M "${KERNELDIR}/${ZTRSMCOPYUT_M}") + set(ZTRSMLTCOPY_M "${KERNELDIR}/${ZTRSMCOPYLT_M}") +endif () + GenerateNamedObjects(${ZTRSMUNCOPY_M} "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMUNCOPY_M} "" "trsm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLNCOPY_M} "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLNCOPY_M} "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMUTCOPY_M} "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMUTCOPY_M} "" "trsm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLTCOPY_M} "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLTCOPY_M} "LOWER" "trsm_iltncopy" false "" "" false 
${float_type}) GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) @@ -491,23 +503,35 @@ endif () GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) + +if (NOT DEFINED TRSMCOPYLN_M) + set(TRSMUNCOPY_M "generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRSMLNCOPY_M "generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRSMUTCOPY_M "generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRSMLTCOPY_M "generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(TRSMUNCOPY_M "${KERNELDIR}/${TRSMCOPYUN_M}") + set(TRSMLNCOPY_M "${KERNELDIR}/${TRSMCOPYLN_M}") + set(TRSMUTCOPY_M "${KERNELDIR}/${TRSMCOPYUT_M}") + set(TRSMLTCOPY_M "${KERNELDIR}/${TRSMCOPYLT_M}") +endif () + GenerateNamedObjects(${TRSMUNCOPY_M} "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMUNCOPY_M} "" "trsm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLNCOPY_M} "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLNCOPY_M} "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMUTCOPY_M} "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMUTCOPY_M} "" "trsm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) - 
GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLTCOPY_M} "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLTCOPY_M} "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) From 19d435b1b3a5d0d5719189ba29b13e728a2bb41c Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Tue, 18 Jan 2022 08:28:31 +0100 Subject: [PATCH 58/77] update armv8sve + contributors --- CONTRIBUTORS.md | 1 + kernel/arm64/KERNEL.ARMV8SVE | 47 ++++++++++++++++++++++-------------- 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 879aaebe3..5378c79bf 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -203,3 +203,4 @@ In chronological order: * [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM * [2021-11-12] SVE kernels for SGEMM, STRMM and corresponding SVE copy functions * [2022-01-06] SVE kernels for CGEMM, ZGEMM, CTRMM, ZTRMM and corresponding SVE copy functions + * [2022-01-18] SVE kernels and copy functions for TRSM diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 66de642a5..bd25f7cd8 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -20,25 +20,36 @@ IDMAXKERNEL = ../arm/imax.c ISMINKERNEL = ../arm/imin.c IDMINKERNEL = ../arm/imin.c -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_sve.c +STRSMKERNEL_LT = trsm_kernel_LT_sve.c +STRSMKERNEL_RN = trsm_kernel_RN_sve.c +STRSMKERNEL_RT = trsm_kernel_RT_sve.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_LN = trsm_kernel_LN_sve.c +DTRSMKERNEL_LT = trsm_kernel_LT_sve.c +DTRSMKERNEL_RN = trsm_kernel_RN_sve.c +DTRSMKERNEL_RT = trsm_kernel_RT_sve.c -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +TRSMCOPYLN_M = trsm_lncopy_sve.c +TRSMCOPYLT_M = trsm_ltcopy_sve.c +TRSMCOPYUN_M = trsm_uncopy_sve.c +TRSMCOPYUT_M = trsm_utcopy_sve.c + +CTRSMKERNEL_LN = trsm_kernel_LN_sve.c +CTRSMKERNEL_LT = trsm_kernel_LT_sve.c +CTRSMKERNEL_RN = trsm_kernel_RN_sve.c +CTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c +ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c +ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c +ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c +ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c +ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c SAMAXKERNEL = amax.S DAMAXKERNEL = amax.S @@ -140,8 +151,8 @@ DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S DGEMMINCOPY = dgemm_ncopy_sve_v1.c DGEMMITCOPY = dgemm_tcopy_sve_v1.c -DGEMMONCOPY = 
../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c -DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) From 00f44bfff74e7173a881c4d6849deb75b9dfbd6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=BCtzel?= Date: Fri, 21 Jan 2022 13:27:17 +0100 Subject: [PATCH 59/77] cmake: Check if Fortran compiler is usable before enabling it. --- cmake/f_check.cmake | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index 0f5d0e15d..14683ed21 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -20,19 +20,16 @@ # NEEDBUNDERSCORE # NEED2UNDERSCORES -if (NOT NO_LAPACK) - include(CheckLanguage) - check_language(Fortran) - if(CMAKE_Fortran_COMPILER) - enable_language(Fortran) - else() - message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK") +include(CheckLanguage) +check_language(Fortran) +if(CMAKE_Fortran_COMPILER) + enable_language(Fortran) +else() + if (NOT NO_LAPACK) + message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK") + endif() set (NOFORTRAN 1) set (NO_LAPACK 1) - endif() -else() - include(CMakeForceCompiler) - CMAKE_FORCE_Fortran_COMPILER(gfortran GNU) endif() if (NOT ONLY_CBLAS) From 1937b4e435cce48dbf8d7d124800e03e1ba5d30d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Jan 2022 18:27:38 +0100 Subject: [PATCH 60/77] Add Elbrus e2k architecture detection --- c_check | 7 +++++++ ctest.c | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/c_check b/c_check index 030f5e632..999f5a7a7 100644 --- a/c_check +++ b/c_check @@ -84,6 +84,7 @@ $os = Haiku if ($data =~ /OS_HAIKU/); $architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/); +$architecture = e2k if ($data =~ /ARCH_E2K/); $architecture = power if ($data =~ /ARCH_POWER/); $architecture = mips if ($data =~ /ARCH_MIPS/); $architecture = mips64 if ($data =~ /ARCH_MIPS64/); @@ -124,6 +125,11 @@ if ($architecture eq "zarch") { $binary = 64; } +if ($architecture eq "e2k") { + $defined = 1; + $binary = 64; +} + if ($architecture eq "alpha") { $defined = 1; $binary = 64; @@ -223,6 +229,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { $architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/); +$architecture = e2k if ($data =~ /ARCH_E2K/); $architecture = power if ($data =~ /ARCH_POWER/); $architecture = mips if ($data =~ /ARCH_MIPS/); $architecture = mips64 if ($data =~ /ARCH_MIPS64/); diff --git a/ctest.c b/ctest.c index 2afd93f68..fc52b43a6 100644 --- a/ctest.c +++ b/ctest.c @@ -165,3 +165,7 @@ ARCH_LOONGARCH64 HAVE_C11 #endif +#if defined(__e2k__) +ARCH_E2K +#endif + From bc93f468ef98c7bb76bdcaf779e9dbe7231303b4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Jan 2022 18:53:38 +0100 Subject: [PATCH 61/77] Add Elbrus E2000 architecture as generic x86_64 compatible --- Makefile.e2k | 1 + TargetList.txt | 4 ++++ common.h | 4 ++++ common_e2k.h | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++ common_macro.h | 2 +- getarch.c | 11 +++++++++ 6 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 Makefile.e2k create mode 100644 common_e2k.h diff --git a/Makefile.e2k b/Makefile.e2k new file mode 100644 index 000000000..a5e50b1f0 --- /dev/null +++ b/Makefile.e2k @@ -0,0 +1 @@ +COPT = -Wall 
-O2 # -DGEMMTEST diff --git a/TargetList.txt b/TargetList.txt index 97c8a8f06..a5a07a661 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -115,3 +115,7 @@ C910V 11.LOONGARCH64: LOONGSON3R5 + +12. Elbrus E2000: +E2K + diff --git a/common.h b/common.h index ff5254a5c..00d1d0baf 100644 --- a/common.h +++ b/common.h @@ -474,6 +474,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_loongarch64.h" #endif +#ifdef ARCH_E2K +#include "common_e2k.h" +#endif + #ifndef ASSEMBLER #ifdef OS_WINDOWSSTORE typedef char env_var_t[MAX_PATH]; diff --git a/common_e2k.h b/common_e2k.h new file mode 100644 index 000000000..0739c9473 --- /dev/null +++ b/common_e2k.h @@ -0,0 +1,64 @@ +/***************************************************************************** +Copyright (c) 2011-2016, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#ifndef COMMON_E2K +#define COMMON_E2K + +#ifdef ASSEMBLER +#error +#endif + +#define MB do { __asm__ __volatile__("": : :"memory"); } while (0) +#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0) +#define RMB + +#define INLINE __attribute__((__always_inline__)) inline + +static inline int blas_quickdivide(blasint x, blasint y) { + return x / y; +} + +#ifndef PAGESIZE +#define PAGESIZE ( 4 << 10) +#endif +#define HUGE_PAGESIZE ( 2 << 20) + +#ifndef BUFFERSIZE +#define BUFFER_SIZE (32 << 20) +#else +#define BUFFER_SIZE (32 << BUFFERSIZE) +#endif + +#define SEEK_ADDRESS + +#endif + diff --git a/common_macro.h b/common_macro.h index cf2a3fd88..9826f1809 100644 --- a/common_macro.h +++ b/common_macro.h @@ -2611,7 +2611,7 @@ #ifndef ASSEMBLER #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\ -|| defined(ARCH_LOONGARCH64) +|| defined(ARCH_LOONGARCH64) || defined(ARCH_E2K) extern BLASLONG gemm_offset_a; extern BLASLONG gemm_offset_b; extern BLASLONG sbgemm_p; diff --git a/getarch.c b/getarch.c index 73bbf1892..00e544bc7 100644 --- a/getarch.c +++ b/getarch.c @@ -1536,6 +1536,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(FORCE_E2K) || defined(__e2k__) +#define FORCE +#define ARCHITECTURE "E2K" +#define ARCHCONFIG "-DGENERIC " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "generic" +#define CORENAME "generic" +#endif + #ifndef FORCE #ifdef USER_TARGET From 898cf5faf3fa3eaa6566c45276f7c6ba08082318 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Jan 2022 18:55:10 +0100 Subject: [PATCH 62/77] Add Elbrus e2k architecture support --- kernel/Makefile.L3 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 2d5740183..bea6cb048 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -617,6 +617,10 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ +ifeq ($(ARCH), E2K) +USE_TRMM = 1 +endif + ifeq ($(BUILD_BFLOAT16), 1) From 3492bea60225d795deb4e1b507914482133fc6a4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Jan 2022 18:57:28 +0100 Subject: [PATCH 63/77] Create Makefile --- kernel/e2k/Makefile | 1 + 1 file changed, 1 insertion(+) create mode 100644 kernel/e2k/Makefile diff --git a/kernel/e2k/Makefile b/kernel/e2k/Makefile new file mode 100644 index 000000000..520349bd6 --- /dev/null +++ b/kernel/e2k/Makefile @@ -0,0 +1 @@ +clean :: From 299d4d70a371c9fed9792daeb80329fd7961f841 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Jan 2022 18:59:36 +0100 Subject: [PATCH 64/77] Add default KERNEL file for Elbrus E2K arch --- kernel/e2k/KERNEL | 149 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 kernel/e2k/KERNEL diff --git a/kernel/e2k/KERNEL b/kernel/e2k/KERNEL new file mode 100644 index 000000000..afa8a0881 --- /dev/null +++ b/kernel/e2k/KERNEL @@ -0,0 +1,149 @@ +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c 
+ +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +DSDOTKERNEL = ../generic/dot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c 
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c +LSAME_KERNEL = ../generic/lsame.c + +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + + From 66a15e15a87a7e89d7341006edd013f3b2843468 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Jan 2022 19:02:57 +0100 Subject: [PATCH 65/77] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 5378c79bf..7e23dec8b 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -204,3 +204,6 @@ In chronological order: * [2021-11-12] SVE kernels for SGEMM, STRMM and corresponding SVE copy functions * [2022-01-06] SVE kernels for CGEMM, ZGEMM, CTRMM, ZTRMM and corresponding SVE copy functions * [2022-01-18] SVE kernels and copy functions for TRSM + +* Ilya Kurdyukov + * [2021-02-21] Add basic support for the Elbrus E2000 architecture From 5d24f3d2102270e0cdc00823a06a35c2993bc361 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Jan 2022 19:09:00 +0100 Subject: [PATCH 66/77] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 7e23dec8b..92be1fe42 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -205,5 +205,5 @@ In chronological order: * [2022-01-06] SVE kernels for CGEMM, ZGEMM, CTRMM, ZTRMM and corresponding SVE copy functions * [2022-01-18] SVE kernels and copy functions for TRSM -* Ilya Kurdyukov +* Ilya Kurdyukov * [2021-02-21] Add basic support for the Elbrus E2000 architecture From addc2a7aaa46eb1501a7c9c153951051eb82442d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Jan 2022 19:56:32 +0100 Subject: [PATCH 67/77] Add proper defaults for IMIN/IMAX --- kernel/sparc/KERNEL | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/sparc/KERNEL b/kernel/sparc/KERNEL index 594fd05e5..a8c958bb4 100644 --- a/kernel/sparc/KERNEL +++ b/kernel/sparc/KERNEL @@ -39,11 +39,19 @@ IZAMINKERNEL = izamax.S endif ifndef ISMINKERNEL -ISMINKERNEL = iamax.S +ISMINKERNEL = imax.S endif ifndef IDMINKERNEL -IDMINKERNEL = iamax.S +IDMINKERNEL = imax.S +endif + +ifndef ISMAXKERNEL +ISMAXKERNEL = imax.S +endif + +ifndef IDMAXKERNEL +IDMAXKERNEL = imax.S endif ifndef SNRM2KERNEL From 7f0b11fbc189e95c8ee2fd249980962b9f5a1125 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Jan 2022 22:00:39 +0100 Subject: [PATCH 68/77] Exclude some complex drivers when NO_LAPACK is set --- driver/level2/Makefile | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/driver/level2/Makefile b/driver/level2/Makefile index caecf4f97..9bef6e2a5 100644 --- a/driver/level2/Makefile +++ b/driver/level2/Makefile @@ -64,9 +64,9 @@ CBLASOBJS += \ chpmv_U.$(SUFFIX) chpmv_L.$(SUFFIX) chpmv_V.$(SUFFIX) chpmv_M.$(SUFFIX) \ chpr_U.$(SUFFIX) chpr_L.$(SUFFIX) chpr_V.$(SUFFIX) chpr_M.$(SUFFIX) \ chpr2_U.$(SUFFIX) chpr2_L.$(SUFFIX) chpr2_V.$(SUFFIX) chpr2_M.$(SUFFIX) \ - csbmv_U.$(SUFFIX) csbmv_L.$(SUFFIX) cspmv_U.$(SUFFIX) cspmv_L.$(SUFFIX) \ - cspr_U.$(SUFFIX) cspr_L.$(SUFFIX) cspr2_U.$(SUFFIX) cspr2_L.$(SUFFIX) \ - csyr_U.$(SUFFIX) csyr_L.$(SUFFIX) csyr2_U.$(SUFFIX) csyr2_L.$(SUFFIX) \ + csbmv_U.$(SUFFIX) csbmv_L.$(SUFFIX) \ + cspr2_U.$(SUFFIX) 
cspr2_L.$(SUFFIX) \ + csyr2_U.$(SUFFIX) csyr2_L.$(SUFFIX) \ ctbmv_NUU.$(SUFFIX) ctbmv_NUN.$(SUFFIX) ctbmv_NLU.$(SUFFIX) ctbmv_NLN.$(SUFFIX) \ ctbmv_TUU.$(SUFFIX) ctbmv_TUN.$(SUFFIX) ctbmv_TLU.$(SUFFIX) ctbmv_TLN.$(SUFFIX) \ ctbmv_RUU.$(SUFFIX) ctbmv_RUN.$(SUFFIX) ctbmv_RLU.$(SUFFIX) ctbmv_RLN.$(SUFFIX) \ @@ -92,6 +92,13 @@ CBLASOBJS += \ ctrsv_RUU.$(SUFFIX) ctrsv_RUN.$(SUFFIX) ctrsv_RLU.$(SUFFIX) ctrsv_RLN.$(SUFFIX) \ ctrsv_CUU.$(SUFFIX) ctrsv_CUN.$(SUFFIX) ctrsv_CLU.$(SUFFIX) ctrsv_CLN.$(SUFFIX) +ifndef NO_LAPACK +CBLASOBJS += \ + cspmv_U.$(SUFFIX) cspmv_L.$(SUFFIX) \ + cspr_U.$(SUFFIX) cspr_L.$(SUFFIX) \ + csyr_U.$(SUFFIX) csyr_L.$(SUFFIX) +endif + ZBLASOBJS += \ zgbmv_n.$(SUFFIX) zgbmv_t.$(SUFFIX) zgbmv_r.$(SUFFIX) zgbmv_c.$(SUFFIX) \ zgbmv_o.$(SUFFIX) zgbmv_u.$(SUFFIX) zgbmv_s.$(SUFFIX) zgbmv_d.$(SUFFIX) \ From d2b5fbf80f02539243cca20b496b0358d2829420 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Jan 2022 22:02:08 +0100 Subject: [PATCH 69/77] Exclude some complex (LAPACK) functions when NO_LAPACK is set --- interface/CMakeLists.txt | 19 +++++++++++++++---- interface/Makefile | 8 ++++++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index ccb5fce3f..0b2998237 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -28,14 +28,21 @@ set(BLAS1_MANGLED_SOURCES # these all have 'z' sources for complex versions set(BLAS2_SOURCES gemv.c ger.c - trsv.c trmv.c symv.c - syr.c syr2.c gbmv.c - sbmv.c spmv.c - spr.c spr2.c + trsv.c trmv.c + syr2.c gbmv.c + sbmv.c + spr2.c tbsv.c tbmv.c tpsv.c tpmv.c ) +set(BLAS2_REAL_ONLY_SOURCES + symv.c syr.c spmv.c spr.c +) +set(BLAS2_COMPLEX_LAPACK_SOURCES + symv.c syr.c spmv.c spr.c +) + set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES hemv.c hbmv.c her.c her2.c @@ -78,6 +85,10 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS}) GenerateNamedObjects("${BLAS1_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 1) GenerateNamedObjects("${BLAS1_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) GenerateNamedObjects("${BLAS2_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + GenerateNamedObjects("${BLAS2_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 1) + if (NOT DEFINED NO_LAPACK) + GenerateNamedObjects("${BLAS2_COMPLEX_LAPACK_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + endif () GenerateNamedObjects("${BLAS2_COMPLEX_ONLY_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 4) GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX}) GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) diff --git a/interface/Makefile b/interface/Makefile index 3252601d2..f57d0bda0 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -1016,11 +1016,13 @@ dsymv.$(SUFFIX) dsymv.$(PSUFFIX) : symv.c qsymv.$(SUFFIX) qsymv.$(PSUFFIX) : symv.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifndef NO_LAPACK csymv.$(SUFFIX) csymv.$(PSUFFIX) : zsymv.c $(CC) -c $(CFLAGS) $< -o $(@F) zsymv.$(SUFFIX) zsymv.$(PSUFFIX) : zsymv.c $(CC) -c $(CFLAGS) $< -o $(@F) +endif xsymv.$(SUFFIX) xsymv.$(PSUFFIX) : zsymv.c $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1034,11 +1036,13 @@ dsyr.$(SUFFIX) dsyr.$(PSUFFIX) : syr.c qsyr.$(SUFFIX) qsyr.$(PSUFFIX) : syr.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifndef NO_LAPACK csyr.$(SUFFIX) csyr.$(PSUFFIX) : zsyr.c $(CC) -c $(CFLAGS) $< -o $(@F) zsyr.$(SUFFIX) zsyr.$(PSUFFIX) : zsyr.c $(CC) -c $(CFLAGS) $< -o $(@F) +endif xsyr.$(SUFFIX) xsyr.$(PSUFFIX) : zsyr.c $(CC) -c $(CFLAGS) $< -o 
$(@F) @@ -1106,11 +1110,13 @@ dspmv.$(SUFFIX) dspmv.$(PSUFFIX) : spmv.c qspmv.$(SUFFIX) qspmv.$(PSUFFIX) : spmv.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifndef NO_LAPACK cspmv.$(SUFFIX) cspmv.$(PSUFFIX) : zspmv.c $(CC) -c $(CFLAGS) $< -o $(@F) zspmv.$(SUFFIX) zspmv.$(PSUFFIX) : zspmv.c $(CC) -c $(CFLAGS) $< -o $(@F) +endif xspmv.$(SUFFIX) xspmv.$(PSUFFIX) : zspmv.c $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1124,11 +1130,13 @@ dspr.$(SUFFIX) dspr.$(PSUFFIX) : spr.c qspr.$(SUFFIX) qspr.$(PSUFFIX) : spr.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifndef NO_LAPACK cspr.$(SUFFIX) cspr.$(PSUFFIX) : zspr.c $(CC) -c $(CFLAGS) $< -o $(@F) zspr.$(SUFFIX) zspr.$(PSUFFIX) : zspr.c $(CC) -c $(CFLAGS) $< -o $(@F) +endif xspr.$(SUFFIX) xspr.$(PSUFFIX) : zspr.c $(CC) -c $(CFLAGS) $< -o $(@F) From a3eea3e127fb9f3682e1e132c75b515d7b7d5241 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Feb 2022 11:43:17 +0100 Subject: [PATCH 70/77] Fix input argument check (LAPACK PR 646) --- lapack-netlib/SRC/cgeqrt2.f | 11 ++++------- lapack-netlib/SRC/dgeqrt2.f | 11 ++++------- lapack-netlib/SRC/sgeqrt2.f | 11 ++++------- lapack-netlib/SRC/zgeqrt2.f | 11 ++++------- 4 files changed, 16 insertions(+), 28 deletions(-) diff --git a/lapack-netlib/SRC/cgeqrt2.f b/lapack-netlib/SRC/cgeqrt2.f index 9ee3e4f79..11221636d 100644 --- a/lapack-netlib/SRC/cgeqrt2.f +++ b/lapack-netlib/SRC/cgeqrt2.f @@ -97,8 +97,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 -* *> \ingroup complexGEcomputational * *> \par Further Details: @@ -127,10 +125,9 @@ * ===================================================================== SUBROUTINE CGEQRT2( M, N, A, LDA, T, LDT, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. INTEGER INFO, LDA, LDT, M, N @@ -157,10 +154,10 @@ * Test the input arguments * INFO = 0 - IF( M.LT.0 ) THEN - INFO = -1 - ELSE IF( N.LT.0 ) THEN + IF( N.LT.0 ) THEN INFO = -2 + ELSE IF( M.LT.N ) THEN + INFO = -1 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN diff --git a/lapack-netlib/SRC/dgeqrt2.f b/lapack-netlib/SRC/dgeqrt2.f index 138dd4d9c..00f800d43 100644 --- a/lapack-netlib/SRC/dgeqrt2.f +++ b/lapack-netlib/SRC/dgeqrt2.f @@ -97,8 +97,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 -* *> \ingroup doubleGEcomputational * *> \par Further Details: @@ -127,10 +125,9 @@ * ===================================================================== SUBROUTINE DGEQRT2( M, N, A, LDA, T, LDT, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. INTEGER INFO, LDA, LDT, M, N @@ -157,10 +154,10 @@ * Test the input arguments * INFO = 0 - IF( M.LT.0 ) THEN - INFO = -1 - ELSE IF( N.LT.0 ) THEN + IF( N.LT.0 ) THEN INFO = -2 + ELSE IF( M.LT.N ) THEN + INFO = -1 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN diff --git a/lapack-netlib/SRC/sgeqrt2.f b/lapack-netlib/SRC/sgeqrt2.f index 349fd4b60..f6532f812 100644 --- a/lapack-netlib/SRC/sgeqrt2.f +++ b/lapack-netlib/SRC/sgeqrt2.f @@ -97,8 +97,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \date December 2016 -* *> \ingroup realGEcomputational * *> \par Further Details: @@ -127,10 +125,9 @@ * ===================================================================== SUBROUTINE SGEQRT2( M, N, A, LDA, T, LDT, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. INTEGER INFO, LDA, LDT, M, N @@ -157,10 +154,10 @@ * Test the input arguments * INFO = 0 - IF( M.LT.0 ) THEN - INFO = -1 - ELSE IF( N.LT.0 ) THEN + IF( N.LT.0 ) THEN INFO = -2 + ELSE IF( M.LT.N ) THEN + INFO = -1 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN diff --git a/lapack-netlib/SRC/zgeqrt2.f b/lapack-netlib/SRC/zgeqrt2.f index bad708498..34d9d544f 100644 --- a/lapack-netlib/SRC/zgeqrt2.f +++ b/lapack-netlib/SRC/zgeqrt2.f @@ -97,8 +97,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 -* *> \ingroup complex16GEcomputational * *> \par Further Details: @@ -127,10 +125,9 @@ * ===================================================================== SUBROUTINE ZGEQRT2( M, N, A, LDA, T, LDT, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. INTEGER INFO, LDA, LDT, M, N @@ -157,10 +154,10 @@ * Test the input arguments * INFO = 0 - IF( M.LT.0 ) THEN - INFO = -1 - ELSE IF( N.LT.0 ) THEN + IF( N.LT.0 ) THEN INFO = -2 + ELSE IF( M.LT.N ) THEN + INFO = -1 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN From aec32e5bd4cdc6d69a04000ae9530983eec0e756 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Feb 2022 22:39:03 +0100 Subject: [PATCH 71/77] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 710940924..04ed428de 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -224,7 +224,7 @@ jobs: - job: OSX_IOS_ARMV8 pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch arm64 -miphoneos-version-min=10.0 From f7e8f9ec57dcbe7c3a94a18575f0379dfe828dae Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 7 Feb 2022 00:00:15 +0100 Subject: [PATCH 72/77] Support AVX512-enabled AlderLake --- cpuid_x86.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 6466bd148..d7d85eb20 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1495,6 +1495,10 @@ int get_cpuname(void){ switch (model) { case 7: // Alder Lake desktop case 10: // Alder Lake mobile + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; + if(support_avx512()) + return CPUTYPE_SKYLAKEX; if(support_avx2()) return CPUTYPE_HASWELL; if(support_avx()) From fa3e9f25e633d5eb735e9183dfa72b6ed09fee0e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 7 Feb 2022 00:00:56 +0100 Subject: [PATCH 73/77] Support AVX512-enabled Alder Lake --- driver/others/dynamic.c | 5 ++++- 1 file changed, 4 
insertions(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index b12fb069a..52a7c6087 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -708,8 +708,11 @@ static gotoblas_t *get_coretype(void){ case 9: if (model == 7 || model == 10) { // Alder Lake + if(support_avx512_bf16()) + return &gotoblas_COOPERLAKE; + if (support_avx512()) + return &gotoblas_SKYLAKEX; if(support_avx2()){ - openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); return &gotoblas_HASWELL; } if(support_avx()) { From e2bf3f31a6e75223d864ffeb39c12bb3c68393e3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 9 Feb 2022 22:09:25 +0100 Subject: [PATCH 74/77] Add .NOTPARALLEL: as a workaround for builds on DFS --- lapack-netlib/TESTING/MATGEN/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/lapack-netlib/TESTING/MATGEN/Makefile b/lapack-netlib/TESTING/MATGEN/Makefile index e21ebd6c3..0b94e3aaa 100644 --- a/lapack-netlib/TESTING/MATGEN/Makefile +++ b/lapack-netlib/TESTING/MATGEN/Makefile @@ -66,6 +66,7 @@ ZMATGEN = zlatms.o zlatme.o zlatmr.o zlatmt.o \ endif .PHONY: all +.NOTPARALLEL: all: $(TMGLIB) ALLOBJ = $(SMATGEN) $(CMATGEN) $(SCATGEN) $(DMATGEN) $(ZMATGEN) \ From 0e04710099df5dd9369d49d435488c6f3705691a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Feb 2022 23:03:05 +0100 Subject: [PATCH 75/77] filter out libflangmain as well --- f_check | 1 + 1 file changed, 1 insertion(+) diff --git a/f_check b/f_check index 4825fb09a..71293b53f 100644 --- a/f_check +++ b/f_check @@ -361,6 +361,7 @@ if ($link ne "") { ($flags =~ /^\-l/) && ($flags !~ /ibrary/) && ($flags !~ /gfortranbegin/) + && ($flags !~ /flangmain/) && ($flags !~ /frtbegin/) && ($flags !~ /pathfstart/) && ($flags !~ /crt[0-9]/) From db7a03dd4c414c8053090bf5bcc18f0fc8e01095 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Feb 2022 23:04:45 +0100 Subject: [PATCH 76/77] keep flang-classic on MacOS from trying to create an executable instead of a library --- exports/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/exports/Makefile b/exports/Makefile index 903836dd6..baaa33623 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -142,10 +142,14 @@ ifneq (,$(filter 1 2,$(NOFORTRAN))) else ifeq ($(F_COMPILER), INTEL) $(FC) $(FFLAGS) $(LDFLAGS) -all-load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def +else +ifeq ($(F_COMPILER), FLANG) + $(FC) $(FFLAGS) $(LDFLAGS) -fno-fortran-main -Mnomain -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) else $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) endif endif +endif dllinit.$(SUFFIX) : dllinit.c $(CC) $(CFLAGS) -c -o $(@F) -s $< From c352ac0ae3593a30262b20d54f95c19f517b56a1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Feb 2022 22:16:04 +0100 Subject: [PATCH 77/77] Update with 0.3.20 changes --- Changelog.txt | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 180f7adec..97af4cbd9 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,39 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.20 + 20-Feb-2022 + +general: + - some code 
cleanup, with added casts etc. + - fixed obtaining the cpu count with OpenMP and OMP_PROC_BIND unset + - fixed pivot index calculation by ?LASWP for negative increments other than one + - fixed input argument check in LAPACK ? GEQRT2 + - improved the check for a Fortran compiler in CMAKE builds + - disabled building OpenBLAS' optimized versions of LAPACK complex SPMV,SPR,SYMV,SYR with NO_LAPACK=1 + - fixed building of LAPACK on certain distributed filesystems with parallel gmake + - fixed building the shared library on MacOS with classic flang + +x86_64: + - fixed cross-compilation with CMAKE for CORE2 target + - fixed miscompilation of AVX512 code in DYNAMIC_ARCH builds + - added support for the "incidental" AVX512 hardware in Alder Lake when enabled in BIOS + +E2K: + - add new architecture (Russian Elbrus E2000 family) + +SPARC: + - fix IMIN/IMAX + +ARMV8: + - added SVE-enabled CGEMM and ZGEMM kernels for ARMV8SVE and A64FX + - added support for Neoverse N2 and V1 cpus + +MIPS,MIPS64: + - fixed autodetection of MSA capability + +LOONGARCH64: + - added an optimized DGEMM kernel + ==================================================================== Version 0.3.19 19-Dec-2021
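
For reference, the Alder Lake patches above gate the COOPERLAKE and SKYLAKEX code paths on runtime checks such as support_avx512() and support_avx512_bf16(). The sketch below is a minimal, self-contained illustration of how that kind of CPUID-based feature detection works on x86-64 with GCC/Clang; the helper names are illustrative only (not the OpenBLAS internals), and a complete check, as done in cpuid_x86.c and dynamic.c, must additionally confirm OS support for the ZMM/opmask register state via XGETBV (XCR0 bits 1, 2, 5, 6, 7), which is omitted here for brevity.

    /* avx512_probe.c - illustrative sketch, x86-64 + GCC/Clang only.
     * Build: gcc -O2 avx512_probe.c -o avx512_probe
     * NOTE: does not perform the XGETBV/OS-state check that a real
     * dispatcher (e.g. OpenBLAS) also requires before using AVX512. */
    #include <cpuid.h>
    #include <stdio.h>

    static int cpu_has_avx512f(void) {
        unsigned int eax, ebx, ecx, edx;
        /* CPUID.(EAX=7,ECX=0):EBX bit 16 = AVX512F */
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) return 0;
        return (ebx >> 16) & 1;
    }

    static int cpu_has_avx512_bf16(void) {
        unsigned int eax, ebx, ecx, edx;
        /* CPUID.(EAX=7,ECX=1):EAX bit 5 = AVX512_BF16; CPUs without this
         * sub-leaf report zeroed registers, so the test is still safe. */
        if (!__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx)) return 0;
        return (eax >> 5) & 1;
    }

    int main(void) {
        printf("AVX512F:     %d\n", cpu_has_avx512f());
        printf("AVX512_BF16: %d\n", cpu_has_avx512_bf16());
        return 0;
    }

On an Alder Lake system with AVX512 enabled in the BIOS this reports both features, which is what lets the dynamic dispatcher select the COOPERLAKE (BF16) or SKYLAKEX kernels instead of falling back to HASWELL.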