Merge pull request #3824 from martin-frbg/issue3822

Make line endings consistently LF
Martin Kroeker, 2022-11-17 16:38:11 +01:00 (committed by GitHub)
commit 76ae221330
GPG Key ID: 4AEE18F83AFDEB23 (no known key found for this signature in database)
58 changed files with 93233 additions and 93233 deletions
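The pull request does not say how the conversion was carried out. As a minimal sketch only, one common way to enforce LF endings repository-wide is a .gitattributes policy plus a renormalize pass; this is an assumption about tooling, not necessarily the method used in this commit:

# .gitattributes -- normalize all text files to LF (assumed repository-wide policy)
* text=auto eol=lf

# re-apply the attributes to every tracked file, then commit the renormalized tree
git add --renormalize .
git commit -m "Make line endings consistently LF"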

@@ -1,133 +1,133 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "bench.h"
#undef AMAX
#ifdef COMPLEX
#ifdef DOUBLE
#define AMAX BLASFUNC(dzamax)
#else
#define AMAX BLASFUNC(scamax)
#endif
#else
#ifdef DOUBLE
#define AMAX BLASFUNC(damax)
#else
#define AMAX BLASFUNC(samax)
#endif
#endif
int main(int argc, char *argv[])
{
FLOAT *x;
blasint m, i;
blasint inc_x = 1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
double time1, timeg;
argc--;
argv++;
if (argc > 0)
{
from = atol(*argv);
argc--;
argv++;
}
if (argc > 0)
{
to = MAX(atol(*argv), from);
argc--;
argv++;
}
if (argc > 0)
{
step = atol(*argv);
argc--;
argv++;
}
if ((p = getenv("OPENBLAS_LOOPS")))
loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX")))
inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops);
if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL)
{
fprintf(stderr, "Out of Memory!!\n");
exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for (m = from; m <= to; m += step)
{
timeg = 0;
fprintf(stderr, " %6d : ", (int)m);
for (l = 0; l < loops; l++)
{
for (i = 0; i < m * COMPSIZE * abs(inc_x); i++)
{
x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5;
}
begin();
AMAX(&m, x, &inc_x);
end();
timeg += getsec();
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

@@ -1,137 +1,137 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "bench.h"
#undef AMIN
#ifdef COMPLEX
#ifdef DOUBLE
#define AMIN BLASFUNC(dzamin)
#else
#define AMIN BLASFUNC(scamin)
#endif
#else
#ifdef DOUBLE
#define AMIN BLASFUNC(damin)
#else
#define AMIN BLASFUNC(samin)
#endif
#endif
int main(int argc, char *argv[])
{
FLOAT *x;
blasint m, i;
blasint inc_x = 1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
double time1, timeg;
argc--;
argv++;
if (argc > 0)
{
from = atol(*argv);
argc--;
argv++;
}
if (argc > 0)
{
to = MAX(atol(*argv), from);
argc--;
argv++;
}
if (argc > 0)
{
step = atol(*argv);
argc--;
argv++;
}
if ((p = getenv("OPENBLAS_LOOPS")))
loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX")))
inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops);
if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL)
{
fprintf(stderr, "Out of Memory!!\n");
exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for (m = from; m <= to; m += step)
{
timeg = 0;
fprintf(stderr, " %6d : ", (int)m);
for (l = 0; l < loops; l++)
{
for (i = 0; i < m * COMPSIZE * abs(inc_x); i++)
{
x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5;
}
begin();
AMIN(&m, x, &inc_x);
end();
timeg += getsec();
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

@@ -1,134 +1,134 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "bench.h"
#undef HBMV
#ifdef DOUBLE
#define HBMV BLASFUNC(zhbmv)
#else
#define HBMV BLASFUNC(chbmv)
#endif
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {0.0, 0.0};
blasint k = 1;
char uplo='L';
blasint m, i, j;
blasint inc_x=1, inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
if ((p = getenv("OPENBLAS_K"))) k = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' k = %d Inc_x = %d Inc_y = %d Loops = %d\n",
from, to, step, uplo, k, inc_x, inc_y, loops);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) {
fprintf(stderr,"Out of Memory!!\n");
exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) {
fprintf(stderr,"Out of Memory!!\n");
exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) {
fprintf(stderr,"Out of Memory!!\n");
exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step) {
timeg=0;
fprintf(stderr, " %6dx%d : ", (int)m, (int)m);
for(j = 0; j < m; j++) {
for(i = 0; i < m * COMPSIZE; i++) {
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for (l = 0; l < loops; l++) {
for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) {
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) {
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
begin();
HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
end();
timeg += getsec();
}
timeg /= loops;
fprintf(stderr, " %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)(2 * k + 1) * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

@@ -1,133 +1,133 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "bench.h"
#undef HPMV
#ifdef DOUBLE
#define HPMV BLASFUNC(zhpmv)
#else
#define HPMV BLASFUNC(chpmv)
#endif
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
char uplo='L';
blasint m, i, j;
blasint inc_x=1, inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) {
fprintf(stderr,"Out of Memory!!\n");
exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) {
fprintf(stderr,"Out of Memory!!\n");
exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) {
fprintf(stderr,"Out of Memory!!\n");
exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step) {
timeg=0;
fprintf(stderr, " %6dx%d : ", (int)m, (int)m);
for(j = 0; j < m; j++) {
for(i = 0; i < m * COMPSIZE; i++) {
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for (l = 0; l < loops; l++) {
for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) {
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) {
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
begin();
HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y );
end();
time1 = getsec();
timeg += time1;
}
timeg /= loops;
fprintf(stderr, " %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

@@ -1,120 +1,120 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "bench.h"
#undef IAMIN
#ifdef COMPLEX
#ifdef DOUBLE
#define IAMIN BLASFUNC(izamin)
#else
#define IAMIN BLASFUNC(icamin)
#endif
#else
#ifdef DOUBLE
#define IAMIN BLASFUNC(idamin)
#else
#define IAMIN BLASFUNC(isamin)
#endif
#endif
int main(int argc, char *argv[]){
FLOAT *x;
blasint m, i;
blasint inc_x=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
begin();
IAMIN (&m, x, &inc_x);
end();
time1 = getsec();
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

@@ -1,114 +1,114 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "bench.h"
#undef IMAX
#ifndef COMPLEX
#ifdef DOUBLE
#define IMAX BLASFUNC(idmax)
#else
#define IMAX BLASFUNC(ismax)
#endif
#endif
int main(int argc, char *argv[]){
FLOAT *x;
blasint m, i;
blasint inc_x=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
begin();
IMAX (&m, x, &inc_x);
end();
time1 = getsec();
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

@@ -1,114 +1,114 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "bench.h"
#undef IMIN
#ifndef COMPLEX
#ifdef DOUBLE
#define IMIN BLASFUNC(idmin)
#else
#define IMIN BLASFUNC(ismin)
#endif
#endif
int main(int argc, char *argv[]){
FLOAT *x;
blasint m, i;
blasint inc_x=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
begin();
IMIN (&m, x, &inc_x);
end();
time1 = getsec();
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

@@ -1,113 +1,113 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "bench.h"
#undef NAMAX
#ifndef COMPLEX
#ifdef DOUBLE
#define NAMAX BLASFUNC(dmax)
#else
#define NAMAX BLASFUNC(smax)
#endif
#endif
int main(int argc, char *argv[]){
FLOAT *x;
blasint m, i;
blasint inc_x=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
begin();
NAMAX (&m, x, &inc_x);
end();
time1 = getsec();
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

@@ -1,113 +1,113 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "bench.h"
#undef NAMIN
#ifndef COMPLEX
#ifdef DOUBLE
#define NAMIN BLASFUNC(dmin)
#else
#define NAMIN BLASFUNC(smin)
#endif
#endif
int main(int argc, char *argv[]){
FLOAT *x;
blasint m, i;
blasint inc_x=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
begin();
NAMIN (&m, x, &inc_x);
end();
time1 = getsec();
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

@@ -1,138 +1,138 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "bench.h"
#undef ROTM
#ifdef DOUBLE
#define ROTM BLASFUNC(drotm)
#else
#define ROTM BLASFUNC(srotm)
#endif
int main(int argc, char *argv[])
{
FLOAT *x, *y;
// FLOAT result;
blasint m, i;
blasint inc_x = 1, inc_y = 1;
FLOAT param[5] = {1, 2.0, 3.0, 4.0, 5.0};
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
double time1, timeg;
argc--;
argv++;
if (argc > 0) {
from = atol(*argv);
argc--;
argv++;
}
if (argc > 0) {
to = MAX(atol(*argv), from);
argc--;
argv++;
}
if (argc > 0) {
step = atol(*argv);
argc--;
argv++;
}
if ((p = getenv("OPENBLAS_LOOPS")))
loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX")))
inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY")))
inc_y = atoi(p);
fprintf(
stderr,
"From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n",
from, to, step, inc_x, inc_y, loops);
if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) ==
NULL) {
fprintf(stderr, "Out of Memory!!\n");
exit(1);
}
if ((y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) ==
NULL) {
fprintf(stderr, "Out of Memory!!\n");
exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for (m = from; m <= to; m += step) {
timeg = 0;
fprintf(stderr, " %6d : ", (int)m);
for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) {
x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5;
}
for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) {
y[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5;
}
for (l = 0; l < loops; l++) {
begin();
ROTM(&m, x, &inc_x, y, &inc_y, param);
end();
time1 = getsec();
timeg += time1;
}
timeg /= loops;
fprintf(stderr, " %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg);
}
return 0;
}
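For reference, ROTM applies a 2x2 modified-Givens matrix H, selected by param[0], to every pair (x_i, y_i). A hedged scalar sketch for unit strides and the full-matrix case param[0] = -1 follows (the benchmark passes flag 1, which only fixes h12 = 1 and h21 = -1, so the structure is identical); rotm_ref is an illustrative name, not the library entry point.

#include <stdio.h>

/* Hedged scalar reference for the modified Givens rotation being timed,
 * written for the "full matrix" case param[0] == -1.0. */
static void rotm_ref(long n, float *x, float *y, const float param[5])
{
    const float h11 = param[1], h21 = param[2];
    const float h12 = param[3], h22 = param[4];
    for (long i = 0; i < n; i++) {
        float xi = x[i], yi = y[i];
        x[i] = h11 * xi + h12 * yi;    /* first row of H applied to (x_i, y_i)  */
        y[i] = h21 * xi + h22 * yi;    /* second row of H applied to (x_i, y_i) */
    }
}

int main(void)
{
    float x[2] = {1, 2}, y[2] = {3, 4};
    float param[5] = {-1.0f, 2.0f, 3.0f, 4.0f, 5.0f};   /* flag, h11, h21, h12, h22 */
    rotm_ref(2, x, y, param);
    printf("x = %g %g   y = %g %g\n", x[0], x[1], y[0], y[1]);
    return 0;
}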

View File

@@ -1,146 +1,146 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "bench.h"
#undef SPMV
#ifndef COMPLEX
#ifdef DOUBLE
#define SPMV BLASFUNC(dspmv)
#else
#define SPMV BLASFUNC(sspmv)
#endif
#else
#ifdef DOUBLE
#define SPMV BLASFUNC(zspmv)
#else
#define SPMV BLASFUNC(cspmv)
#endif
#endif
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
char uplo='L';
blasint m, i, j;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
double time1,timeg;
argc--;
argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6dx%d : ", (int)m,(int)m);
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
begin();
SPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y );
end();
time1 = getsec();
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
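SPMV computes y := alpha*A*x + beta*y for a symmetric A held in packed storage, which is why the benchmark counts roughly 2*m*m flops per call (it fills a dense m-by-m buffer, of which the routine reads only the packed triangle). A hedged reference for the real, lower-packed, column-major case follows; spmv_ref is an illustrative name, not the kernel interface.

#include <stdio.h>

/* Hedged reference: y := alpha*A*x + beta*y, A real symmetric, lower packed. */
static void spmv_ref(int n, float alpha, const float *ap, const float *x,
                     float beta, float *y)
{
    long k = 0;                            /* running index into the packed array */
    for (int i = 0; i < n; i++) y[i] *= beta;
    for (int j = 0; j < n; j++) {
        float tmp = alpha * x[j];
        float sum = 0.0f;
        y[j] += tmp * ap[k++];             /* diagonal element A(j,j)             */
        for (int i = j + 1; i < n; i++, k++) {
            y[i] += tmp * ap[k];           /* A(i,j) * x(j), i > j                */
            sum  += ap[k] * x[i];          /* symmetric part A(j,i) * x(i)        */
        }
        y[j] += alpha * sum;
    }
}

int main(void)
{
    /* 2x2 symmetric A = [[1,2],[2,3]] in lower packed order: 1, 2, 3 */
    float ap[3] = {1, 2, 3}, x[2] = {1, 1}, y[2] = {0, 0};
    spmv_ref(2, 1.0f, ap, x, 0.0f, y);
    printf("%g %g\n", y[0], y[1]);         /* prints 3 5 */
    return 0;
}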

File diff suppressed because it is too large Load Diff

View File

@@ -1,333 +1,333 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
#define M x0
#define N x1
#define A00 x2
#define LDA x3
#define B00 x4
#define A01 x5
#define A02 x6
#define A03 x7
#define A04 x8
#define I x9
#define J x10
#define TEMP1 x11
#define TEMP2 x12
#define A_PREFETCH 2560
/**************************************************************************************
* Macro definitions
**************************************************************************************/
.macro SAVE_REGS
add sp, sp, #-(11 * 16)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
.endm
.macro RESTORE_REGS
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
ldp d12, d13, [sp, #(2 * 16)]
ldp d14, d15, [sp, #(3 * 16)]
ldp d16, d17, [sp, #(4 * 16)]
ldp x18, x19, [sp, #(5 * 16)]
ldp x20, x21, [sp, #(6 * 16)]
ldp x22, x23, [sp, #(7 * 16)]
ldp x24, x25, [sp, #(8 * 16)]
ldp x26, x27, [sp, #(9 * 16)]
ldr x28, [sp, #(10 * 16)]
add sp, sp, #(11*16)
.endm
.macro COPY4x4
prfm PLDL1KEEP, [A01, #A_PREFETCH]
prfm PLDL1KEEP, [A02, #A_PREFETCH]
prfm PLDL1KEEP, [A03, #A_PREFETCH]
prfm PLDL1KEEP, [A04, #A_PREFETCH]
ldr q0, [A01], #16
ins v8.s[0], v0.s[0]
ins v9.s[0], v0.s[1]
ins v10.s[0], v0.s[2]
ins v11.s[0], v0.s[3]
ldr q1, [A02], #16
ins v8.s[1], v1.s[0]
ins v9.s[1], v1.s[1]
ins v10.s[1], v1.s[2]
ins v11.s[1], v1.s[3]
ldr q2, [A03], #16
ins v8.s[2], v2.s[0]
ins v9.s[2], v2.s[1]
ins v10.s[2], v2.s[2]
ins v11.s[2], v2.s[3]
ldr q3, [A04], #16
ins v8.s[3], v3.s[0]
ins v9.s[3], v3.s[1]
ins v10.s[3], v3.s[2]
ins v11.s[3], v3.s[3]
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00]
add B00, B00, #64
.endm
.macro COPY1x4
prfm PLDL1KEEP, [A01, #A_PREFETCH]
prfm PLDL1KEEP, [A02, #A_PREFETCH]
prfm PLDL1KEEP, [A03, #A_PREFETCH]
prfm PLDL1KEEP, [A04, #A_PREFETCH]
ldr s0, [A01], #4
ldr s1, [A02], #4
ldr s2, [A03], #4
ldr s3, [A04], #4
stp s0, s1, [B00]
add B00, B00, #8
stp s2, s3, [B00]
add B00, B00, #8
.endm
.macro COPY4x2
prfm PLDL1KEEP, [A01, #A_PREFETCH]
prfm PLDL1KEEP, [A02, #A_PREFETCH]
ldr q0, [A01], #16
ins v8.s[0], v0.s[0]
ins v9.s[0], v0.s[1]
ins v10.s[0], v0.s[2]
ins v11.s[0], v0.s[3]
ldr q1, [A02], #16
ins v8.s[1], v1.s[0]
ins v9.s[1], v1.s[1]
ins v10.s[1], v1.s[2]
ins v11.s[1], v1.s[3]
st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00]
add B00, B00, #32
.endm
.macro COPY1x2
prfm PLDL1KEEP, [A01, #A_PREFETCH]
prfm PLDL1KEEP, [A02, #A_PREFETCH]
ldr s0, [A01], #4
ldr s1, [A02], #4
stp s0, s1, [B00]
add B00, B00, #8
.endm
.macro COPY4x1
prfm PLDL1KEEP, [A01, #A_PREFETCH]
ldr q0, [A01], #16
str q0, [B00], #16
.endm
.macro COPY1x1
prfm PLDL1KEEP, [A01, #A_PREFETCH]
ldr s0, [A01], #4
str s0, [B00], #4
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
PROLOGUE
.align 5
SAVE_REGS
lsl LDA, LDA, #2 // LDA = LDA * SIZE
.Ldgemm_ncopy_L4_BEGIN:
asr J, N, #2 // J = N / 4
cmp J, #0
ble .Ldgemm_ncopy_L2_BEGIN
.align 5
.Ldgemm_ncopy_L4_M4_BEGIN:
mov A01, A00
add A02, A01, LDA
add A03, A02, LDA
add A04, A03, LDA
add A00, A04, LDA
asr I, M, #2 // I = M / 4
cmp I, #0
ble .Ldgemm_ncopy_L4_M4_40
.align 5
.Ldgemm_ncopy_L4_M4_20:
COPY4x4
subs I , I , #1
bne .Ldgemm_ncopy_L4_M4_20
.Ldgemm_ncopy_L4_M4_40:
and I, M , #3
cmp I, #0
ble .Ldgemm_ncopy_L4_M4_END
.align 5
.Ldgemm_ncopy_L4_M4_60:
COPY1x4
subs I , I , #1
bne .Ldgemm_ncopy_L4_M4_60
.Ldgemm_ncopy_L4_M4_END:
subs J , J, #1 // j--
bne .Ldgemm_ncopy_L4_M4_BEGIN
/*********************************************************************************************/
.Ldgemm_ncopy_L2_BEGIN:
tst N, #3
ble .Ldgemm_ncopy_L999
tst N, #2
ble .Ldgemm_ncopy_L1_BEGIN
.Ldgemm_ncopy_L2_M4_BEGIN:
mov A01, A00
add A02, A01, LDA
add A00, A02, LDA
asr I, M, #2 // I = M / 4
cmp I, #0
ble .Ldgemm_ncopy_L2_M4_40
.align 5
.Ldgemm_ncopy_L2_M4_20:
COPY4x2
subs I , I , #1
bne .Ldgemm_ncopy_L2_M4_20
.Ldgemm_ncopy_L2_M4_40:
and I, M , #3
cmp I, #0
ble .Ldgemm_ncopy_L2_M4_END
.align 5
.Ldgemm_ncopy_L2_M4_60:
COPY1x2
subs I , I , #1
bne .Ldgemm_ncopy_L2_M4_60
.Ldgemm_ncopy_L2_M4_END:
/*********************************************************************************************/
.Ldgemm_ncopy_L1_BEGIN:
tst N, #1
ble .Ldgemm_ncopy_L999
.Ldgemm_ncopy_L1_M4_BEGIN:
mov A01, A00
asr I, M, #2 // I = M / 4
cmp I, #0
ble .Ldgemm_ncopy_L1_M4_40
.align 5
.Ldgemm_ncopy_L1_M4_20:
COPY4x1
subs I , I , #1
bne .Ldgemm_ncopy_L1_M4_20
.Ldgemm_ncopy_L1_M4_40:
and I, M , #3
cmp I, #0
ble .Ldgemm_ncopy_L1_M4_END
.align 5
.Ldgemm_ncopy_L1_M4_60:
COPY1x1
subs I , I , #1
bne .Ldgemm_ncopy_L1_M4_60
.Ldgemm_ncopy_L1_M4_END:
.Ldgemm_ncopy_L999:
mov x0, #0
RESTORE_REGS
ret
EPILOGUE
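In C terms, the macros above pack a column-major panel of A into the contiguous buffer the GEMM kernel consumes: four columns are interleaved row by row, with two- and one-column tails handled separately. A hedged sketch of the same layout for single precision follows; the function name and signature are illustrative, not the kernel's ABI.

#include <stdio.h>

/* Hedged C sketch of the 4-wide ncopy packing implemented above, assuming
 * column-major A with leading dimension lda (in elements). */
static void sgemm_ncopy_4_ref(long m, long n, const float *a, long lda, float *b)
{
    long j = 0;
    for (; j + 4 <= n; j += 4) {               /* full 4-column panels (COPY4x4/1x4) */
        const float *a0 = a + (j + 0) * lda;
        const float *a1 = a + (j + 1) * lda;
        const float *a2 = a + (j + 2) * lda;
        const float *a3 = a + (j + 3) * lda;
        for (long i = 0; i < m; i++) {         /* one row of the panel per step      */
            *b++ = a0[i]; *b++ = a1[i]; *b++ = a2[i]; *b++ = a3[i];
        }
    }
    if (n & 2) {                               /* two-column tail (COPY4x2/1x2)      */
        const float *a0 = a + j * lda, *a1 = a + (j + 1) * lda;
        for (long i = 0; i < m; i++) { *b++ = a0[i]; *b++ = a1[i]; }
        j += 2;
    }
    if (n & 1) {                               /* final single column (COPY4x1/1x1)  */
        const float *a0 = a + j * lda;
        for (long i = 0; i < m; i++) *b++ = a0[i];
    }
}

int main(void)
{
    float a[2 * 5], b[2 * 5];                  /* 2 x 5 matrix, lda = 2 */
    for (int i = 0; i < 10; i++) a[i] = (float)i;
    sgemm_ncopy_4_ref(2, 5, a, 2, b);
    for (int i = 0; i < 10; i++) printf("%g ", b[i]);   /* 0 2 4 6 1 3 5 7 8 9 */
    printf("\n");
    return 0;
}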

File diff suppressed because it is too large Load Diff

View File

@@ -1,293 +1,293 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* Abdelrauf(quickwritereader@gmail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
#define LOAD ld
#define STACKSIZE (512 )
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
#define M r3
#define N r4
#define K r5
#define A r8
#define B r9
#define C r10
#define LDC r6
#define OFFSET r7
#define alpha_r vs19
#define alpha_i vs20
#define save_permute_1 vs21
#define permute_mask vs22
#define o0 0
#define T1 r11
#define T2 r12
#define T3 r14
#define T4 r15
#define T5 r16
#define T6 r17
#define L r18
#define T7 r19
#define T8 r20
#define TEMP_REG r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define T9 r27
#define T10 r28
#define PRE r29
#define T12 r30
#define T13 r31
#include "cgemm_macros_power9.S"
.equ perm_const1, 0x0405060700010203
.equ perm_const2, 0x0c0d0e0f08090a0b
.equ save_permute_12, 0x0c0d0e0f1c1d1e1f
.equ save_permute_11, 0x0405060714151617
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
mflr r0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
stxv vs52, 288(SP)
stxv vs53, 304(SP)
stxv vs54, 320(SP)
stxv vs55, 336(SP)
stxv vs56, 352(SP)
stxv vs57, 368(SP)
stxv vs58, 384(SP)
stxv vs59, 400(SP)
stxv vs60, 416(SP)
stxv vs61, 432(SP)
stxv vs62, 448(SP)
stxv vs63, 464(SP)
std r0, FLINK_SAVE(SP)
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#ifdef TRMMKERNEL
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
slwi LDC, LDC, ZBASE_SHIFT
/*alpha is stored in f1. convert to single and splat*/
xscvdpspn alpha_r,vs1
xscvdpspn alpha_i,vs2
xxspltw alpha_r,alpha_r,0
xxspltw alpha_i,alpha_i,0
/*load reverse permute mask for big endian
uint128 = 0x0c0d0e0f08090a0b0405060700010203
*/
lis T2, perm_const2@highest
lis T1, perm_const1@highest
lis T3, save_permute_12@highest
lis T4, save_permute_11@highest
ori T2, T2, perm_const2@higher
ori T1, T1, perm_const1@higher
ori T3, T3, save_permute_12@higher
ori T4, T4, save_permute_11@higher
rldicr T2, T2, 32, 31
rldicr T1, T1, 32, 31
rldicr T3, T3, 32, 31
rldicr T4, T4, 32, 31
oris T2, T2, perm_const2@h
oris T1, T1, perm_const1@h
oris T3, T3, save_permute_12@h
oris T4, T4, save_permute_11@h
ori T2, T2, perm_const2@l
ori T1, T1, perm_const1@l
ori T3, T3, save_permute_12@l
ori T4, T4, save_permute_11@l
li r0,0
li PRE,512
#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
/*negate for this case as we will use addition -1*(a+b) */
xvnegsp alpha_r,alpha_r
xvnegsp alpha_i,alpha_i
#endif
mtvsrdd permute_mask,T2,T1
mtvsrdd save_permute_1,T3,T4
/*mask is reverse permute so we have to make it inner permute */
xxpermdi permute_mask, permute_mask, permute_mask,2
#include "cgemm_logic_power9.S"
.L999:
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
ld r0, FLINK_SAVE(SP)
lxv vs52, 288(SP)
lxv vs53, 304(SP)
lxv vs54, 320(SP)
lxv vs55, 336(SP)
lxv vs56, 352(SP)
lxv vs57, 368(SP)
lxv vs58, 384(SP)
lxv vs59, 400(SP)
mtlr r0
lxv vs60, 416(SP)
lxv vs61, 432(SP)
lxv vs62, 448(SP)
lxv vs63, 464(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif
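The lis/ori/rldicr/oris/ori run in the prologue is the usual POWER idiom for materializing a 64-bit constant 16 bits at a time before mtvsrdd moves two such constants into a VSX register. A hedged C model of what the five instructions compute, using perm_const1 from this file as the test value:

#include <stdint.h>
#include <stdio.h>

/* Hedged model of the five-instruction constant build used above. */
static uint64_t build64(uint64_t k)
{
    uint64_t r;
    r  = ((k >> 48) & 0xffff) << 16;   /* lis    rT, k@highest    */
    r |=  (k >> 32) & 0xffff;          /* ori    rT, rT, k@higher */
    r <<= 32;                          /* rldicr rT, rT, 32, 31   */
    r |= ((k >> 16) & 0xffff) << 16;   /* oris   rT, rT, k@h      */
    r |=   k        & 0xffff;          /* ori    rT, rT, k@l      */
    return r;                          /* equals k for any input  */
}

int main(void)
{
    uint64_t k = 0x0405060700010203ULL;            /* perm_const1 */
    printf("%016llx\n", (unsigned long long)build64(k));
    return 0;
}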

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,233 +1,233 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
{
__vector float t0;
__vector float t1;
__vector float t2;
__vector float t3;
__vector float t4;
__vector float t5;
__vector float t6;
__vector float t7;
__asm__
(
"xscvdpspn 36, %x[cos] \n\t" // load c to all words
"xxspltw 36, 36, 0 \n\t"
"xscvdpspn 37, %x[sin] \n\t" // load s to all words
"xxspltw 37, 37, 0 \n\t"
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x
"lxvd2x 33, %[i16], %[x_ptr] \n\t"
"lxvd2x 34, %[i32], %[x_ptr] \n\t"
"lxvd2x 35, %[i48], %[x_ptr] \n\t"
"lxvd2x 48, 0, %[y_ptr] \n\t" // load y
"lxvd2x 49, %[i16], %[y_ptr] \n\t"
"lxvd2x 50, %[i32], %[y_ptr] \n\t"
"lxvd2x 51, %[i48], %[y_ptr] \n\t"
"addi %[x_ptr], %[x_ptr], 64 \n\t"
"addi %[y_ptr], %[y_ptr], 64 \n\t"
"addic. %[temp_n], %[temp_n], -8 \n\t"
"ble two%= \n\t"
".align 5 \n\t"
"one%=: \n\t"
"xvmulsp 40, 32, 36 \n\t" // c * x
"xvmulsp 41, 33, 36 \n\t"
"xvmulsp 42, 34, 36 \n\t"
"xvmulsp 43, 35, 36 \n\t"
"xvmulsp %x[x0], 48, 36 \n\t" // c * y
"xvmulsp %x[x2], 49, 36 \n\t"
"xvmulsp %x[x1], 50, 36 \n\t"
"xvmulsp %x[x3], 51, 36 \n\t"
"xvmulsp 44, 32, 37 \n\t" // s * x
"xvmulsp 45, 33, 37 \n\t"
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x
"lxvd2x 33, %[i16], %[x_ptr] \n\t"
"xvmulsp 46, 34, 37 \n\t"
"xvmulsp 47, 35, 37 \n\t"
"lxvd2x 34, %[i32], %[x_ptr] \n\t"
"lxvd2x 35, %[i48], %[x_ptr] \n\t"
"xvmulsp %x[x4], 48, 37 \n\t" // s * y
"xvmulsp %x[x5], 49, 37 \n\t"
"lxvd2x 48, 0, %[y_ptr] \n\t" // load y
"lxvd2x 49, %[i16], %[y_ptr] \n\t"
"xvmulsp %x[x6], 50, 37 \n\t"
"xvmulsp %x[x7], 51, 37 \n\t"
"lxvd2x 50, %[i32], %[y_ptr] \n\t"
"lxvd2x 51, %[i48], %[y_ptr] \n\t"
"xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y
"xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y
"addi %[x_ptr], %[x_ptr], -64 \n\t"
"addi %[y_ptr], %[y_ptr], -64 \n\t"
"xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y
"xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y
"xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x
"xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x
"xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x
"xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x
"stxvd2x 40, 0, %[x_ptr] \n\t" // store x
"stxvd2x 41, %[i16], %[x_ptr] \n\t"
"stxvd2x 42, %[i32], %[x_ptr] \n\t"
"stxvd2x 43, %[i48], %[x_ptr] \n\t"
"stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y
"stxvd2x %x[x2], %[i16], %[y_ptr] \n\t"
"stxvd2x %x[x1], %[i32], %[y_ptr] \n\t"
"stxvd2x %x[x3], %[i48], %[y_ptr] \n\t"
"addi %[x_ptr], %[x_ptr], 128 \n\t"
"addi %[y_ptr], %[y_ptr], 128 \n\t"
"addic. %[temp_n], %[temp_n], -8 \n\t"
"bgt one%= \n\t"
"two%=: \n\t"
"xvmulsp 40, 32, 36 \n\t" // c * x
"xvmulsp 41, 33, 36 \n\t"
"xvmulsp 42, 34, 36 \n\t"
"xvmulsp 43, 35, 36 \n\t"
"xvmulsp %x[x0], 48, 36 \n\t" // c * y
"xvmulsp %x[x2], 49, 36 \n\t"
"xvmulsp %x[x1], 50, 36 \n\t"
"xvmulsp %x[x3], 51, 36 \n\t"
"xvmulsp 44, 32, 37 \n\t" // s * x
"xvmulsp 45, 33, 37 \n\t"
"xvmulsp 46, 34, 37 \n\t"
"xvmulsp 47, 35, 37 \n\t"
"xvmulsp %x[x4], 48, 37 \n\t" // s * y
"xvmulsp %x[x5], 49, 37 \n\t"
"xvmulsp %x[x6], 50, 37 \n\t"
"xvmulsp %x[x7], 51, 37 \n\t"
"addi %[x_ptr], %[x_ptr], -64 \n\t"
"addi %[y_ptr], %[y_ptr], -64 \n\t"
"xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y
"xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y
"xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y
"xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y
"xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x
"xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x
"xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x
"xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x
"stxvd2x 40, 0, %[x_ptr] \n\t" // store x
"stxvd2x 41, %[i16], %[x_ptr] \n\t"
"stxvd2x 42, %[i32], %[x_ptr] \n\t"
"stxvd2x 43, %[i48], %[x_ptr] \n\t"
"stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y
"stxvd2x %x[x2], %[i16], %[y_ptr] \n\t"
"stxvd2x %x[x1], %[i32], %[y_ptr] \n\t"
"stxvd2x %x[x3], %[i48], %[y_ptr] "
:
[mem_x] "+m" (*(float (*)[2*n])x),
[mem_y] "+m" (*(float (*)[2*n])y),
[temp_n] "+r" (n),
[x_ptr] "+&b" (x),
[y_ptr] "+&b" (y),
[x0] "=wa" (t0),
[x1] "=wa" (t2),
[x2] "=wa" (t1),
[x3] "=wa" (t3),
[x4] "=wa" (t4),
[x5] "=wa" (t5),
[x6] "=wa" (t6),
[x7] "=wa" (t7)
:
[cos] "f" (c),
[sin] "f" (s),
[i16] "b" (16),
[i32] "b" (32),
[i48] "b" (48)
:
"cr0",
"vs32","vs33","vs34","vs35","vs36","vs37",
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"vs48","vs49","vs50","vs51"
);
}
#endif
#endif
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
FLOAT temp[2];
BLASLONG inc_x2;
BLASLONG inc_y2;
if ( n <= 0 ) return(0);
if ( (inc_x == 1) && (inc_y == 1) )
{
#if defined(__VEC__) || defined(__ALTIVEC__)
BLASLONG n1 = n & -8;
if ( n1 > 0 )
{
crot_kernel_8(n1, x, y, c, s);
i=n1;
ix=2*n1;
}
#endif
while(i < n)
{
temp[0] = c*x[ix] + s*y[ix] ;
temp[1] = c*x[ix+1] + s*y[ix+1] ;
y[ix] = c*y[ix] - s*x[ix] ;
y[ix+1] = c*y[ix+1] - s*x[ix+1] ;
x[ix] = temp[0] ;
x[ix+1] = temp[1] ;
ix += 2 ;
i++ ;
}
}
else
{
inc_x2 = 2 * inc_x ;
inc_y2 = 2 * inc_y ;
while(i < n)
{
temp[0] = c*x[ix] + s*y[iy] ;
temp[1] = c*x[ix+1] + s*y[iy+1] ;
y[iy] = c*y[iy] - s*x[ix] ;
y[iy+1] = c*y[iy+1] - s*x[ix+1] ;
x[ix] = temp[0] ;
x[ix+1] = temp[1] ;
ix += inc_x2 ;
iy += inc_y2 ;
i++ ;
}
}
return(0);
}
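As a quick sanity check on the rotation implemented above (the VSX kernel and the scalar fallback compute the same update), c*c + s*s = 1 implies that |x_i|^2 + |y_i|^2 is preserved element-wise. A small standalone sketch follows; it mirrors the scalar path rather than calling the kernel.

#include <math.h>
#include <stdio.h>

int main(void)
{
    float c = 0.6f, s = 0.8f;                 /* c^2 + s^2 = 1            */
    float x[2] = {1.0f, 2.0f};                /* one complex element of x */
    float y[2] = {-3.0f, 0.5f};               /* one complex element of y */
    float before = x[0]*x[0] + x[1]*x[1] + y[0]*y[0] + y[1]*y[1];
    float t0 = c*x[0] + s*y[0];               /* same update as the tail loop above */
    float t1 = c*x[1] + s*y[1];
    y[0] = c*y[0] - s*x[0];
    y[1] = c*y[1] - s*x[1];
    x[0] = t0;
    x[1] = t1;
    float after = x[0]*x[0] + x[1]*x[1] + y[0]*y[0] + y[1]*y[1];
    printf("norm before %.6f, after %.6f\n", before, after);
    return fabsf(before - after) > 1e-5f;     /* 0 on success */
}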

View File

@@ -1,249 +1,249 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
#define LOAD ld
#define STACKSIZE (512 )
#define ALPHA_SP (296+192)(SP)
#define FZERO (304+192)(SP)
#define M r3
#define N r4
#define K r5
#define A r7
#define B r8
#define C r9
#define LDC r10
#define OFFSET r6
#define alpha_r vs18
#define o0 0
#define T4 r12
#define T3 r11
#define C4 r14
#define o8 r15
#define o24 r16
#define C2 r17
#define L r18
#define T1 r19
#define C3 r20
#define TEMP_REG r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define o16 r27
#define o32 r28
#define o48 r29
#define PRE r30
#define T2 r31
#include "dgemm_macros_power9.S"
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
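/* prologue: allocate the stack frame and spill the callee-saved FPRs
   (f14-f31), GPRs (r14-r31) and vector-scalar registers (vs52-vs63) that
   this kernel clobbers, following the usual POWER calling convention */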
addi SP, SP, -STACKSIZE
li r0, 0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
stxv vs52, 288(SP)
stxv vs53, 304(SP)
stxv vs54, 320(SP)
stxv vs55, 336(SP)
stxv vs56, 352(SP)
stxv vs57, 368(SP)
stxv vs58, 384(SP)
stxv vs59, 400(SP)
stxv vs60, 416(SP)
stxv vs61, 432(SP)
stxv vs62, 448(SP)
stxv vs63, 464(SP)
stfd f1, ALPHA_SP
stw r0, FZERO
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
cmpwi cr0, M, 0
ble .L999_H1
cmpwi cr0, N, 0
ble .L999_H1
cmpwi cr0, K, 0
ble .L999_H1
addi T1, SP, 296+192
li PRE, 384
li o8 , 8
li o16, 16
li o24, 24
li o32, 32
li o48, 48
lxvdsx alpha_r, 0, T1
#include "dgemm_logic_power9.S"
.L999:
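/* epilogue: return 0 in r3 and restore every register saved in the prologue */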
addi r3, 0, 0
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
lxv vs52, 288(SP)
lxv vs53, 304(SP)
lxv vs54, 320(SP)
lxv vs55, 336(SP)
lxv vs56, 352(SP)
lxv vs57, 368(SP)
lxv vs58, 384(SP)
lxv vs59, 400(SP)
lxv vs60, 416(SP)
lxv vs61, 432(SP)
lxv vs62, 448(SP)
lxv vs63, 464(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,328 +1,328 @@
/***************************************************************************
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#include <altivec.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc outputs slightly faster code
#if !defined(USE_MASK_PERMUTATIONS)
static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){
__vector float result;
__asm__ (
"vmrgew %0,%1,%2;\n"
: "=v" (result)
: "v" (a),
"v" (b)
: );
return result;
}
static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){
__vector float result;
__asm__ (
"vmrgow %0,%1,%2;\n"
: "=v" (result)
: "v" (a),
"v" (b)
: );
return result;
}
#endif
/**
 * Find the index of the complex element with the largest |re| + |im|.
 * Warning: requires n > 0 and n % 32 == 0
 * @param n number of complex elements
 * @param x pointer to the vector
 * @param maxf (out) maximum absolute value (output only)
 * @return index
 */
static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
BLASLONG index;
BLASLONG i=0;
#if defined(USE_MASK_PERMUTATIONS)
register __vector unsigned int static_index0 = {0,1,2,3};
#else
register __vector unsigned int static_index0 = {2,0,3,1};
#endif
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}
register __vector unsigned int static_index1=static_index0 +temp0;
register __vector unsigned int static_index2=static_index0 +temp1;
register __vector unsigned int static_index3=static_index1 +temp1;
temp0=vec_xor(temp0,temp0);
temp1=temp1 <<1 ; //{16,16,16,16}
register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32}
register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0}
register __vector float quadruple_values={0,0,0,0};
register __vector float * v_ptrx=(__vector float *)x;
#if defined(USE_MASK_PERMUTATIONS)
register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27};
register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
#endif
for(; i<n; i+=32 ){
//absolute temporary complex vectors
register __vector float v0=vec_abs(v_ptrx[0]);
register __vector float v1=vec_abs(v_ptrx[1]);
register __vector float v2=vec_abs(v_ptrx[2]);
register __vector float v3=vec_abs(v_ptrx[3]);
register __vector float v4=vec_abs(v_ptrx[4]);
register __vector float v5=vec_abs(v_ptrx[5]);
register __vector float v6=vec_abs(v_ptrx[6]);
register __vector float v7=vec_abs(v_ptrx[7]);
//pack real and imaginary magnitudes together so they can be summed to |re|+|im|
#if defined(USE_MASK_PERMUTATIONS)
register __vector float t1=vec_perm(v0,v1,real_pack_mask);
register __vector float ti=vec_perm(v0,v1,image_pack_mask);
v0=t1+ti; //sum quadruple real with quadruple imaginary
register __vector float t2=vec_perm(v2,v3,real_pack_mask);
register __vector float ti2=vec_perm(v2,v3,image_pack_mask);
v1=t2+ti2;
t1=vec_perm(v4,v5,real_pack_mask);
ti=vec_perm(v4,v5,image_pack_mask);
v2=t1+ti; //sum
t2=vec_perm(v6,v7,real_pack_mask);
ti2=vec_perm(v6,v7,image_pack_mask);
v3=t2+ti2;
#else
register __vector float t1=mvec_mergee(v0,v1);
register __vector float ti=mvec_mergeo(v0,v1);
v0=t1+ti; //sum quadruple real with quadruple imaginary
register __vector float t2= mvec_mergee(v2,v3);
register __vector float ti2=mvec_mergeo(v2,v3);
v1=t2+ti2;
t1=mvec_mergee(v4,v5);
ti=mvec_mergeo(v4,v5);
v2=t1+ti; //sum
t2=mvec_mergee(v6,v7);
ti2=mvec_mergeo(v6,v7);
v3=t2+ti2;
#endif
// now we have 16 summed elements; let's compare them
v_ptrx+=8;
register __vector bool int r1=vec_cmpgt(v1,v0);
register __vector bool int r2=vec_cmpgt(v3,v2);
register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r1);
v0=vec_sel(v0,v1,r1);
register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r2);
v1=vec_sel(v2,v3,r2);
//final cmp and select index and value for first 16 values
r1=vec_cmpgt(v1,v0);
register __vector unsigned int indf0 = vec_sel(ind2,ind3,r1);
register __vector float vf0= vec_sel(v0,v1,r1);
//absolute temporary complex vectors
v0=vec_abs(v_ptrx[0]);
v1=vec_abs(v_ptrx[1]);
v2=vec_abs(v_ptrx[2]);
v3=vec_abs(v_ptrx[3]);
v4=vec_abs(v_ptrx[4]);
v5=vec_abs(v_ptrx[5]);
v6=vec_abs(v_ptrx[6]);
v7=vec_abs(v_ptrx[7]);
//pack real and imaginary magnitudes together so they can be summed to |re|+|im|
#if defined(USE_MASK_PERMUTATIONS)
t1=vec_perm(v0,v1,real_pack_mask);
ti=vec_perm(v0,v1,image_pack_mask);
v0=t1+ti; //sum quadruple real with quadruple imaginary
t2=vec_perm(v2,v3,real_pack_mask);
ti2=vec_perm(v2,v3,image_pack_mask);
v1=t2+ti2;
t1=vec_perm(v4,v5,real_pack_mask);
ti=vec_perm(v4,v5,image_pack_mask);
v2=t1+ti; //sum
t2=vec_perm(v6,v7,real_pack_mask);
ti2=vec_perm(v6,v7,image_pack_mask);
v3=t2+ti2;
#else
t1=mvec_mergee(v0,v1);
ti=mvec_mergeo(v0,v1);
v0=t1+ti; //sum quadruple real with quadruple imaginary
t2=mvec_mergee(v2,v3);
ti2=mvec_mergeo(v2,v3);
v1=t2+ti2;
t1=mvec_mergee(v4,v5);
ti=mvec_mergeo(v4,v5);
v2=t1+ti; //sum
t2=mvec_mergee(v6,v7);
ti2=mvec_mergeo(v6,v7);
v3=t2+ti2;
#endif
// now we have 16 summed elements {from 16 to 31}; let's compare them
v_ptrx+=8;
r1=vec_cmpgt(v1,v0);
r2=vec_cmpgt(v3,v2);
ind2= vec_sel(static_index0,static_index1,r1);
v0=vec_sel(v0,v1,r1);
ind3= vec_sel(static_index2,static_index3,r2);
v1=vec_sel(v2,v3,r2);
//final cmp and select index and value for the second 16 values
r1=vec_cmpgt(v1,v0);
register __vector unsigned int indv0 = vec_sel(ind2,ind3,r1);
register __vector float vv0= vec_sel(v0,v1,r1);
indv0+=temp1; //make index from 16->31
//find final quadruple from 32 elements
r2=vec_cmpgt(vv0,vf0);
ind2 = vec_sel( indf0,indv0,r2);
vv0= vec_sel(vf0,vv0,r2);
//get absolute index
ind2+=temp0;
//compare with old quadruple and update
r1=vec_cmpgt(vv0,quadruple_values);
quadruple_indices = vec_sel( quadruple_indices,ind2,r1);
quadruple_values= vec_sel(quadruple_values,vv0,r1);
temp0+=temp_add;
}
//now we have to choose from 4 values and 4 different indices
// compare pairwise: if the two values are exactly equal, keep the smaller index;
// otherwise keep the index of the larger value
float a1,a2,a3,a4;
unsigned int i1,i2,i3,i4;
a1=vec_extract(quadruple_values,0);
a2=vec_extract(quadruple_values,1);
a3=vec_extract(quadruple_values,2);
a4=vec_extract(quadruple_values,3);
i1=vec_extract(quadruple_indices,0);
i2=vec_extract(quadruple_indices,1);
i3=vec_extract(quadruple_indices,2);
i4=vec_extract(quadruple_indices,3);
if(a1==a2){
index=i1>i2?i2:i1;
}else if(a2>a1){
index=i2;
a1=a2;
}else{
index= i1;
}
if(a4==a3){
i1=i3>i4?i4:i3;
}else if(a4>a3){
i1=i4;
a3=a4;
}else{
i1= i3;
}
if(a1==a3){
index=i1>index?index:i1;
*maxf=a1;
}else if(a3>a1){
index=i1;
*maxf=a3;
}else{
*maxf=a1;
}
return index;
}
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT maxf = 0;
BLASLONG max = 0;
BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return(max);
if (inc_x == 1) {
BLASLONG n1 = n & -32;
if (n1 > 0) {
max = ciamax_kernel_32(n1, x, &maxf);
i = n1;
ix = n1 << 1;
}
while(i < n)
{
if( CABS1(x,ix) > maxf )
{
max = i;
maxf = CABS1(x,ix);
}
ix += 2;
i++;
}
return (max + 1);
} else {
inc_x2 = 2 * inc_x;
maxf = CABS1(x,0);
ix += inc_x2;
i++;
while(i < n)
{
if( CABS1(x,ix) > maxf )
{
max = i;
maxf = CABS1(x,ix);
}
ix += inc_x2;
i++;
}
return (max + 1);
}
}
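/*
 * Editorial sketch, not part of the original file: a plain scalar reference
 * for the reduction implemented above.  It returns the 1-based index of the
 * complex element with the largest |re| + |im|, using a strict '>' so that
 * the first (lowest-index) maximum wins, which is what the tail loop above
 * does and what the vector kernel's final lane selection aims for.  The
 * function name icamax_ref is illustrative only.
 */
#include <stdio.h>
#include <math.h>

static int icamax_ref(int n, const float *x)
{
    if (n <= 0) return 0;
    int best = 0;
    float bestv = fabsf(x[0]) + fabsf(x[1]);
    for (int i = 1; i < n; i++) {
        float v = fabsf(x[2 * i]) + fabsf(x[2 * i + 1]);
        if (v > bestv) {             /* strict '>' keeps the first maximum */
            bestv = v;
            best = i;
        }
    }
    return best + 1;                 /* BLAS index results are 1-based */
}

int main(void)
{
    float x[8] = {1.0f, -1.0f, 3.0f, 0.5f, -2.0f, 2.0f, 3.0f, 0.5f};
    printf("icamax = %d\n", icamax_ref(4, x));   /* prints 3 */
    return 0;
}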

View File

@ -1,266 +1,266 @@
/***************************************************************************
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#include <altivec.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
/**
 * Find the index of the complex element with the smallest |re| + |im|.
 * Warning: requires n > 0 and n % 32 == 0
 * @param n number of complex elements
 * @param x pointer to the vector
 * @param minf (out) minimum absolute value (output only)
 * @return index
 */
static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
BLASLONG index;
BLASLONG i=0;
register __vector unsigned int static_index0 = {0,1,2,3};
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}
register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7};
register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11};
register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15};
temp0=vec_xor(temp0,temp0);
temp1=temp1 <<1 ; //{16,16,16,16}
register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32}
register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0}
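/* a minimum search cannot start from a zero accumulator (nothing would ever
   beat it), so the running minima are seeded with the first element's |re|+|im| */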
float first_min=CABS1(x,0);
register __vector float quadruple_values={first_min,first_min,first_min,first_min};
register __vector float * v_ptrx=(__vector float *)x;
register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27};
register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
for(; i<n; i+=32){
//absolute temporary complex vectors
register __vector float v0=vec_abs(v_ptrx[0]);
register __vector float v1=vec_abs(v_ptrx[1]);
register __vector float v2=vec_abs(v_ptrx[2]);
register __vector float v3=vec_abs(v_ptrx[3]);
register __vector float v4=vec_abs(v_ptrx[4]);
register __vector float v5=vec_abs(v_ptrx[5]);
register __vector float v6=vec_abs(v_ptrx[6]);
register __vector float v7=vec_abs(v_ptrx[7]);
//pack real and imaginary magnitudes together so they can be summed to |re|+|im|
register __vector float t1=vec_perm(v0,v1,real_pack_mask);
register __vector float ti=vec_perm(v0,v1,image_pack_mask);
v0=t1+ti; //sum quadruple real with quadruple imaginary
register __vector float t2=vec_perm(v2,v3,real_pack_mask);
register __vector float ti2=vec_perm(v2,v3,image_pack_mask);
v1=t2+ti2;
t1=vec_perm(v4,v5,real_pack_mask);
ti=vec_perm(v4,v5,image_pack_mask);
v2=t1+ti; //sum
t2=vec_perm(v6,v7,real_pack_mask);
ti2=vec_perm(v6,v7,image_pack_mask);
v3=t2+ti2;
// now we have 16 summed elements; let's compare them
v_ptrx+=8;
register __vector bool int r1=vec_cmpgt(v0,v1);
register __vector bool int r2=vec_cmpgt(v2,v3);
register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r1);
v0=vec_sel(v0,v1,r1);
register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r2);
v1=vec_sel(v2,v3,r2);
//final cmp and select index and value for first 16 values
r1=vec_cmpgt(v0,v1);
register __vector unsigned int indf0 = vec_sel(ind2,ind3,r1);
register __vector float vf0= vec_sel(v0,v1,r1);
//absolute temporary complex vectors
v0=vec_abs(v_ptrx[0]);
v1=vec_abs(v_ptrx[1]);
v2=vec_abs(v_ptrx[2]);
v3=vec_abs(v_ptrx[3]);
v4=vec_abs(v_ptrx[4]);
v5=vec_abs(v_ptrx[5]);
v6=vec_abs(v_ptrx[6]);
v7=vec_abs(v_ptrx[7]);
//pack real and imaginary magnitudes together so they can be summed to |re|+|im|
t1=vec_perm(v0,v1,real_pack_mask);
ti=vec_perm(v0,v1,image_pack_mask);
v0=t1+ti; //sum quadruple real with quadruple imaginary
t2=vec_perm(v2,v3,real_pack_mask);
ti2=vec_perm(v2,v3,image_pack_mask);
v1=t2+ti2;
t1=vec_perm(v4,v5,real_pack_mask);
ti=vec_perm(v4,v5,image_pack_mask);
v2=t1+ti; //sum
t2=vec_perm(v6,v7,real_pack_mask);
ti2=vec_perm(v6,v7,image_pack_mask);
v3=t2+ti2;
// now we have 16 summed elements {from 16 to 31}; let's compare them
v_ptrx+=8;
r1=vec_cmpgt(v0,v1);
r2=vec_cmpgt(v2,v3);
ind2= vec_sel(static_index0,static_index1,r1);
v0=vec_sel(v0,v1,r1);
ind3= vec_sel(static_index2,static_index3,r2);
v1=vec_sel(v2,v3,r2);
//final cmp and select index and value for the second 16 values
r1=vec_cmpgt(v0,v1);
register __vector unsigned int indv0 = vec_sel(ind2,ind3,r1);
register __vector float vv0= vec_sel(v0,v1,r1);
indv0+=temp1; //make index from 16->31
//find final quadruple from 32 elements
r2=vec_cmpgt(vf0,vv0);
ind2 = vec_sel( indf0,indv0,r2);
vv0= vec_sel(vf0,vv0,r2);
//get absolute index
ind2+=temp0;
//compare with old quadruple and update
r1=vec_cmpgt(quadruple_values,vv0);
quadruple_indices = vec_sel( quadruple_indices,ind2,r1);
quadruple_values= vec_sel(quadruple_values,vv0,r1);
temp0+=temp_add;
}
//now we have to choose from 4 values and 4 different indices
// compare pairwise: if the two values are exactly equal, keep the smaller index;
// otherwise keep the index of the smaller value
float a1,a2,a3,a4;
unsigned int i1,i2,i3,i4;
a1=vec_extract(quadruple_values,0);
a2=vec_extract(quadruple_values,1);
a3=vec_extract(quadruple_values,2);
a4=vec_extract(quadruple_values,3);
i1=vec_extract(quadruple_indices,0);
i2=vec_extract(quadruple_indices,1);
i3=vec_extract(quadruple_indices,2);
i4=vec_extract(quadruple_indices,3);
if(a1==a2){
index=i1>i2?i2:i1;
}else if(a2<a1){
index=i2;
a1=a2;
}else{
index= i1;
}
if(a4==a3){
i1=i3>i4?i4:i3;
}else if(a4<a3){
i1=i4;
a3=a4;
}else{
i1= i3;
}
if(a1==a3){
index=i1>index?index:i1;
*minf=a1;
}else if(a3<a1){
index=i1;
*minf=a3;
}else{
*minf=a1;
}
return index;
}
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0;
BLASLONG ix=0;
FLOAT minf;
BLASLONG min=0;
BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return(min);
if (inc_x == 1) {
minf = CABS1(x,0); //index will not be incremented
BLASLONG n1 = n & -32;
if (n1 > 0) {
min = ciamin_kernel_32(n1, x, &minf);
i = n1;
ix = n1 << 1;
}
while(i < n)
{
if( CABS1(x,ix) < minf )
{
min = i;
minf = CABS1(x,ix);
}
ix += 2;
i++;
}
return (min + 1);
} else {
inc_x2 = 2 * inc_x;
minf = CABS1(x,0);
ix += inc_x2;
i++;
while(i < n)
{
if( CABS1(x,ix) < minf )
{
min = i;
minf = CABS1(x,ix);
}
ix += inc_x2;
i++;
}
return (min + 1);
}
}
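/*
 * Editorial sketch, not part of the original files: the last stage of the
 * vector kernels above reduces four per-lane (value, index) candidates to a
 * single winner, preferring the smaller index when two lane values are
 * exactly equal.  The kernels do this pairwise; a straightforward linear
 * form of the same rule, shown for a minimum search with illustrative names,
 * looks like this.
 */
#include <stdio.h>

static unsigned int reduce4_min(const float v[4], const unsigned int idx[4],
                                float *out_value)
{
    float best = v[0];
    unsigned int best_idx = idx[0];
    for (int k = 1; k < 4; k++) {
        /* strictly smaller value wins; on an exact tie the smaller index wins */
        if (v[k] < best || (v[k] == best && idx[k] < best_idx)) {
            best = v[k];
            best_idx = idx[k];
        }
    }
    *out_value = best;
    return best_idx;
}

int main(void)
{
    float v[4] = {2.0f, 1.5f, 1.5f, 3.0f};
    unsigned int idx[4] = {12, 33, 7, 20};
    float value;
    unsigned int i = reduce4_min(v, idx, &value);
    printf("min %f at index %u\n", value, i);   /* prints: min 1.500000 at index 7 */
    return 0;
}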

View File

@ -1,288 +1,288 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#include <altivec.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/**
 * Find the index of the element with the largest absolute value.
 * Warning: requires n > 0 and n % 64 == 0
 * @param n number of elements
 * @param x pointer to the vector
 * @param maxf (out) maximum absolute value (output only)
 * @return index
 */
static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) {
BLASLONG index;
BLASLONG i=0;
register __vector unsigned int static_index0 = {0,1,2,3};
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}
register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7};
register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11};
register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15};
temp0=vec_xor(temp0,temp0);
temp1=temp1 <<1 ; //{16,16,16,16}
register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0}
register __vector float quadruple_values={0,0,0,0};
register __vector float * v_ptrx=(__vector float *)x;
for(; i<n; i+=64){
//absolute temporary vectors
register __vector float v0=vec_abs(v_ptrx[0]);
register __vector float v1=vec_abs(v_ptrx[1]);
register __vector float v2=vec_abs(v_ptrx[2]);
register __vector float v3=vec_abs(v_ptrx[3]);
register __vector float v4=vec_abs(v_ptrx[4]);
register __vector float v5=vec_abs(v_ptrx[5]);
register __vector float v6=vec_abs(v_ptrx[6]);
register __vector float v7=vec_abs(v_ptrx[7]);
//cmp quadruple pairs
register __vector bool int r1=vec_cmpgt(v1,v0);
register __vector bool int r2=vec_cmpgt(v3,v2);
register __vector bool int r3=vec_cmpgt(v5,v4);
register __vector bool int r4=vec_cmpgt(v7,v6);
//select
register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1);
register __vector float vf0= vec_sel(v0,v1,r1);
register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2);
register __vector float vf1= vec_sel(v2,v3,r2);
register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3);
v0=vec_sel(v4,v5,r3);
register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4);
v1=vec_sel(v6,v7,r4);
// cmp selected
r1=vec_cmpgt(vf1,vf0);
r2=vec_cmpgt(v1,v0);
v_ptrx+=8;
//select from above
ind0_first= vec_sel(ind0_first,ind1,r1);
vf0= vec_sel(vf0,vf1,r1) ;
ind2= vec_sel(ind2,ind3,r2);
vf1= vec_sel(v0,v1,r2);
//the second set of indices lies in [16,31], so add 16 to ind2
ind2 +=temp1;
//final cmp and select index and value for the first 32 values
r1=vec_cmpgt(vf1,vf0);
ind0_first = vec_sel(ind0_first,ind2,r1);
vf0= vec_sel(vf0,vf1,r1);
ind0_first+=temp0; //get absolute index
temp0+=temp1;
temp0+=temp1; //temp0+32
//second part of 32
// absolute temporary vectors
v0=vec_abs(v_ptrx[0]);
v1=vec_abs(v_ptrx[1]);
v2=vec_abs(v_ptrx[2]);
v3=vec_abs(v_ptrx[3]);
v4=vec_abs(v_ptrx[4]);
v5=vec_abs(v_ptrx[5]);
v6=vec_abs(v_ptrx[6]);
v7=vec_abs(v_ptrx[7]);
//cmp quadruple pairs
r1=vec_cmpgt(v1,v0);
r2=vec_cmpgt(v3,v2);
r3=vec_cmpgt(v5,v4);
r4=vec_cmpgt(v7,v6);
//select
register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1);
register __vector float vv0= vec_sel(v0,v1,r1);
ind1= vec_sel(static_index2,static_index3,r2);
register __vector float vv1= vec_sel(v2,v3,r2);
ind2= vec_sel(static_index0,static_index1,r3);
v0=vec_sel(v4,v5,r3);
ind3= vec_sel(static_index2,static_index3,r4);
v1=vec_sel(v6,v7,r4);
// cmp selected
r1=vec_cmpgt(vv1,vv0);
r2=vec_cmpgt(v1,v0);
v_ptrx+=8;
//select from above
ind0_second= vec_sel(ind0_second,ind1,r1);
vv0= vec_sel(vv0,vv1,r1) ;
ind2= vec_sel(ind2,ind3,r2);
vv1= vec_sel(v0,v1,r2) ;
//the second set of indices lies in [16,31], so add 16 to ind2
ind2 +=temp1;
//final cmp and select index and value for the second 32 values
r1=vec_cmpgt(vv1,vv0);
ind0_second = vec_sel(ind0_second,ind2,r1);
vv0= vec_sel(vv0,vv1,r1);
ind0_second+=temp0; //get absolute index
//find final quadruple from 64 elements
r2=vec_cmpgt(vv0,vf0);
ind2 = vec_sel( ind0_first,ind0_second,r2);
vv0= vec_sel(vf0,vv0,r2);
//compare with old quadruple and update
r3=vec_cmpgt(vv0,quadruple_values);
quadruple_indices = vec_sel( quadruple_indices,ind2,r3);
quadruple_values= vec_sel(quadruple_values,vv0,r3);
temp0+=temp1;
temp0+=temp1; //temp0+32
}
//now we have to choose from 4 values and 4 different indices
// compare pairwise: if the two values are exactly equal, keep the smaller index;
// otherwise keep the index of the larger value
float a1,a2,a3,a4;
unsigned int i1,i2,i3,i4;
a1=vec_extract(quadruple_values,0);
a2=vec_extract(quadruple_values,1);
a3=vec_extract(quadruple_values,2);
a4=vec_extract(quadruple_values,3);
i1=vec_extract(quadruple_indices,0);
i2=vec_extract(quadruple_indices,1);
i3=vec_extract(quadruple_indices,2);
i4=vec_extract(quadruple_indices,3);
if(a1==a2){
index=i1>i2?i2:i1;
}else if(a2>a1){
index=i2;
a1=a2;
}else{
index= i1;
}
if(a4==a3){
i1=i3>i4?i4:i3;
}else if(a4>a3){
i1=i4;
a3=a4;
}else{
i1= i3;
}
if(a1==a3){
index=i1>index?index:i1;
*maxf=a1;
}else if(a3>a1){
index=i1;
*maxf=a3;
}else{
*maxf=a1;
}
return index;
}
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
FLOAT maxf = 0.0;
BLASLONG max = 0;
if (n <= 0 || inc_x <= 0) return (max);
if (inc_x == 1) {
BLASLONG n1 = n & -64;
if (n1 > 0) {
max = siamax_kernel_64(n1, x, &maxf);
i = n1;
}
while (i < n) {
if (ABS(x[i]) > maxf) {
max = i;
maxf = ABS(x[i]);
}
i++;
}
return (max + 1);
} else {
BLASLONG n1 = n & -4;
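/* non-unit stride: plain scalar search, unrolled by four, followed by a remainder loop */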
while (j < n1) {
if (ABS(x[i]) > maxf) {
max = j;
maxf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) > maxf) {
max = j + 1;
maxf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) > maxf) {
max = j + 2;
maxf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) > maxf) {
max = j + 3;
maxf = ABS(x[i + 3 * inc_x]);
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (ABS(x[i]) > maxf) {
max = j;
maxf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (max + 1);
}
}

View File

@ -1,288 +1,288 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#include <altivec.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/**
* Find minimum index
* Warning: requires n > 0 and n % 64 == 0
* @param n
* @param x pointer to the vector
* @param minf (out) minimum absolute value (output only)
* @return index
*/
static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) {
BLASLONG index;
BLASLONG i=0;
register __vector unsigned int static_index0 = {0,1,2,3};
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}
register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7};
register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11};
register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15};
temp0=vec_xor(temp0,temp0);
temp1=temp1 <<1 ; //{16,16,16,16}
register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3};
register __vector float * v_ptrx=(__vector float *)x;
register __vector float quadruple_values=vec_abs(v_ptrx[0]);
for(; i<n; i+=64){
//absolute temporary vectors
register __vector float v0=vec_abs(v_ptrx[0]);
register __vector float v1=vec_abs(v_ptrx[1]);
register __vector float v2=vec_abs(v_ptrx[2]);
register __vector float v3=vec_abs(v_ptrx[3]);
register __vector float v4=vec_abs(v_ptrx[4]);
register __vector float v5=vec_abs(v_ptrx[5]);
register __vector float v6=vec_abs(v_ptrx[6]);
register __vector float v7=vec_abs(v_ptrx[7]);
//cmp quadruple pairs
register __vector bool int r1=vec_cmpgt(v0,v1);
register __vector bool int r2=vec_cmpgt(v2,v3);
register __vector bool int r3=vec_cmpgt(v4,v5);
register __vector bool int r4=vec_cmpgt(v6,v7);
//select
register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1);
register __vector float vf0= vec_sel(v0,v1,r1);
register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2);
register __vector float vf1= vec_sel(v2,v3,r2);
register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3);
v0=vec_sel(v4,v5,r3);
register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4);
v1=vec_sel(v6,v7,r4);
// cmp selected
r1=vec_cmpgt(vf0,vf1);
r2=vec_cmpgt(v0,v1);
v_ptrx+=8;
//select from above
ind0_first= vec_sel(ind0_first,ind1,r1);
vf0= vec_sel(vf0,vf1,r1) ;
ind2= vec_sel(ind2,ind3,r2);
vf1= vec_sel(v0,v1,r2);
//second indices actually should be within [16,31] so ind2+16
ind2 +=temp1;
//final cmp and select index and value for the first 32 values
r1=vec_cmpgt(vf0,vf1);
ind0_first = vec_sel(ind0_first,ind2,r1);
vf0= vec_sel(vf0,vf1,r1);
ind0_first+=temp0; //get absolute index
temp0+=temp1;
temp0+=temp1; //temp0+32
//second part of 32
// absolute temporary vectors
v0=vec_abs(v_ptrx[0]);
v1=vec_abs(v_ptrx[1]);
v2=vec_abs(v_ptrx[2]);
v3=vec_abs(v_ptrx[3]);
v4=vec_abs(v_ptrx[4]);
v5=vec_abs(v_ptrx[5]);
v6=vec_abs(v_ptrx[6]);
v7=vec_abs(v_ptrx[7]);
//cmp quadruple pairs
r1=vec_cmpgt(v0,v1);
r2=vec_cmpgt(v2,v3);
r3=vec_cmpgt(v4,v5);
r4=vec_cmpgt(v6,v7);
//select
register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1);
register __vector float vv0= vec_sel(v0,v1,r1);
ind1= vec_sel(static_index2,static_index3,r2);
register __vector float vv1= vec_sel(v2,v3,r2);
ind2= vec_sel(static_index0,static_index1,r3);
v0=vec_sel(v4,v5,r3);
ind3= vec_sel(static_index2,static_index3,r4);
v1=vec_sel(v6,v7,r4);
// cmp selected
r1=vec_cmpgt(vv0,vv1);
r2=vec_cmpgt(v0,v1);
v_ptrx+=8;
//select from above
ind0_second= vec_sel(ind0_second,ind1,r1);
vv0= vec_sel(vv0,vv1,r1) ;
ind2= vec_sel(ind2,ind3,r2);
vv1= vec_sel(v0,v1,r2) ;
//second indices actually should be within [16,31] so ind2+16
ind2 +=temp1;
//final cmp and select index and value for the second 32 values
r1=vec_cmpgt(vv0,vv1);
ind0_second = vec_sel(ind0_second,ind2,r1);
vv0= vec_sel(vv0,vv1,r1);
ind0_second+=temp0; //get absolute index
//find final quadruple from 64 elements
r2=vec_cmpgt(vf0,vv0);
ind2 = vec_sel( ind0_first,ind0_second,r2);
vv0= vec_sel(vf0,vv0,r2);
//compare with old quadruple and update
r3=vec_cmpgt( quadruple_values,vv0);
quadruple_indices = vec_sel( quadruple_indices,ind2,r3);
quadruple_values= vec_sel(quadruple_values,vv0,r3);
temp0+=temp1;
temp0+=temp1; //temp0+32
}
//now we have to choose among 4 values and 4 different indices
// we compare pairwise; if a pair holds exactly the same value we keep the smaller index,
// otherwise we take the index of the smaller value
float a1,a2,a3,a4;
unsigned int i1,i2,i3,i4;
a1=vec_extract(quadruple_values,0);
a2=vec_extract(quadruple_values,1);
a3=vec_extract(quadruple_values,2);
a4=vec_extract(quadruple_values,3);
i1=vec_extract(quadruple_indices,0);
i2=vec_extract(quadruple_indices,1);
i3=vec_extract(quadruple_indices,2);
i4=vec_extract(quadruple_indices,3);
if(a1==a2){
index=i1>i2?i2:i1;
}else if(a2<a1){
index=i2;
a1=a2;
}else{
index= i1;
}
if(a4==a3){
i1=i3>i4?i4:i3;
}else if(a4<a3){
i1=i4;
a3=a4;
}else{
i1= i3;
}
if(a1==a3){
index=i1>index?index:i1;
*minf=a1;
}else if(a3<a1){
index=i1;
*minf=a3;
}else{
*minf=a1;
}
return index;
}
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
BLASLONG min = 0;
FLOAT minf = 0.0;
if (n <= 0 || inc_x <= 0) return (min);
minf = ABS(x[0]); //index is not incremented here
if (inc_x == 1) {
BLASLONG n1 = n & -64;
if (n1 > 0) {
min = siamin_kernel_64(n1, x, &minf);
i = n1;
}
while (i < n) {
if (ABS(x[i]) < minf) {
min = i;
minf = ABS(x[i]);
}
i++;
}
return (min + 1);
} else {
BLASLONG n1 = n & -4;
while (j < n1) {
if (ABS(x[i]) < minf) {
min = j;
minf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) < minf) {
min = j + 1;
minf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) < minf) {
min = j + 2;
minf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) < minf) {
min = j + 3;
minf = ABS(x[i + 3 * inc_x]);
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (ABS(x[i]) < minf) {
min = j;
minf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (min + 1);
}
}
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#include <altivec.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/**
* Find minimum index
* Warning: requires n > 0 and n % 64 == 0
* @param n
* @param x pointer to the vector
* @param minf (out) minimum absolute value (output only)
* @return index
*/
static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) {
BLASLONG index;
BLASLONG i=0;
register __vector unsigned int static_index0 = {0,1,2,3};
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}
register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7};
register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11};
register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15};
temp0=vec_xor(temp0,temp0);
temp1=temp1 <<1 ; //{16,16,16,16}
register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3};
register __vector float * v_ptrx=(__vector float *)x;
register __vector float quadruple_values=vec_abs(v_ptrx[0]);
for(; i<n; i+=64){
//absolute temporary vectors
register __vector float v0=vec_abs(v_ptrx[0]);
register __vector float v1=vec_abs(v_ptrx[1]);
register __vector float v2=vec_abs(v_ptrx[2]);
register __vector float v3=vec_abs(v_ptrx[3]);
register __vector float v4=vec_abs(v_ptrx[4]);
register __vector float v5=vec_abs(v_ptrx[5]);
register __vector float v6=vec_abs(v_ptrx[6]);
register __vector float v7=vec_abs(v_ptrx[7]);
//cmp quadruple pairs
register __vector bool int r1=vec_cmpgt(v0,v1);
register __vector bool int r2=vec_cmpgt(v2,v3);
register __vector bool int r3=vec_cmpgt(v4,v5);
register __vector bool int r4=vec_cmpgt(v6,v7);
//select
register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1);
register __vector float vf0= vec_sel(v0,v1,r1);
register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2);
register __vector float vf1= vec_sel(v2,v3,r2);
register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3);
v0=vec_sel(v4,v5,r3);
register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4);
v1=vec_sel(v6,v7,r4);
// cmp selected
r1=vec_cmpgt(vf0,vf1);
r2=vec_cmpgt(v0,v1);
v_ptrx+=8;
//select from above
ind0_first= vec_sel(ind0_first,ind1,r1);
vf0= vec_sel(vf0,vf1,r1) ;
ind2= vec_sel(ind2,ind3,r2);
vf1= vec_sel(v0,v1,r2);
//second indices actually should be within [16,31] so ind2+16
ind2 +=temp1;
//final cmp and select index and value for the first 32 values
r1=vec_cmpgt(vf0,vf1);
ind0_first = vec_sel(ind0_first,ind2,r1);
vf0= vec_sel(vf0,vf1,r1);
ind0_first+=temp0; //get absolute index
temp0+=temp1;
temp0+=temp1; //temp0+32
//second part of 32
// absolute temporary vectors
v0=vec_abs(v_ptrx[0]);
v1=vec_abs(v_ptrx[1]);
v2=vec_abs(v_ptrx[2]);
v3=vec_abs(v_ptrx[3]);
v4=vec_abs(v_ptrx[4]);
v5=vec_abs(v_ptrx[5]);
v6=vec_abs(v_ptrx[6]);
v7=vec_abs(v_ptrx[7]);
//cmp quadruple pairs
r1=vec_cmpgt(v0,v1);
r2=vec_cmpgt(v2,v3);
r3=vec_cmpgt(v4,v5);
r4=vec_cmpgt(v6,v7);
//select
register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1);
register __vector float vv0= vec_sel(v0,v1,r1);
ind1= vec_sel(static_index2,static_index3,r2);
register __vector float vv1= vec_sel(v2,v3,r2);
ind2= vec_sel(static_index0,static_index1,r3);
v0=vec_sel(v4,v5,r3);
ind3= vec_sel(static_index2,static_index3,r4);
v1=vec_sel(v6,v7,r4);
// cmp selected
r1=vec_cmpgt(vv0,vv1);
r2=vec_cmpgt(v0,v1);
v_ptrx+=8;
//select from above
ind0_second= vec_sel(ind0_second,ind1,r1);
vv0= vec_sel(vv0,vv1,r1) ;
ind2= vec_sel(ind2,ind3,r2);
vv1= vec_sel(v0,v1,r2) ;
//second indices actually should be within [16,31] so ind2+16
ind2 +=temp1;
//final cmp and select index and value for the second 32 values
r1=vec_cmpgt(vv0,vv1);
ind0_second = vec_sel(ind0_second,ind2,r1);
vv0= vec_sel(vv0,vv1,r1);
ind0_second+=temp0; //get absolute index
//find final quadruple from 64 elements
r2=vec_cmpgt(vf0,vv0);
ind2 = vec_sel( ind0_first,ind0_second,r2);
vv0= vec_sel(vf0,vv0,r2);
//compare with old quadruple and update
r3=vec_cmpgt( quadruple_values,vv0);
quadruple_indices = vec_sel( quadruple_indices,ind2,r3);
quadruple_values= vec_sel(quadruple_values,vv0,r3);
temp0+=temp1;
temp0+=temp1; //temp0+32
}
//now we have to choose among 4 values and 4 different indices
// we compare pairwise; if a pair holds exactly the same value we keep the smaller index,
// otherwise we take the index of the smaller value
float a1,a2,a3,a4;
unsigned int i1,i2,i3,i4;
a1=vec_extract(quadruple_values,0);
a2=vec_extract(quadruple_values,1);
a3=vec_extract(quadruple_values,2);
a4=vec_extract(quadruple_values,3);
i1=vec_extract(quadruple_indices,0);
i2=vec_extract(quadruple_indices,1);
i3=vec_extract(quadruple_indices,2);
i4=vec_extract(quadruple_indices,3);
if(a1==a2){
index=i1>i2?i2:i1;
}else if(a2<a1){
index=i2;
a1=a2;
}else{
index= i1;
}
if(a4==a3){
i1=i3>i4?i4:i3;
}else if(a4<a3){
i1=i4;
a3=a4;
}else{
i1= i3;
}
if(a1==a3){
index=i1>index?index:i1;
*minf=a1;
}else if(a3<a1){
index=i1;
*minf=a3;
}else{
*minf=a1;
}
return index;
}
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
BLASLONG min = 0;
FLOAT minf = 0.0;
if (n <= 0 || inc_x <= 0) return (min);
minf = ABS(x[0]); //index is not incremented here
if (inc_x == 1) {
BLASLONG n1 = n & -64;
if (n1 > 0) {
min = siamin_kernel_64(n1, x, &minf);
i = n1;
}
while (i < n) {
if (ABS(x[i]) < minf) {
min = i;
minf = ABS(x[i]);
}
i++;
}
return (min + 1);
} else {
BLASLONG n1 = n & -4;
while (j < n1) {
if (ABS(x[i]) < minf) {
min = j;
minf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) < minf) {
min = j + 1;
minf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) < minf) {
min = j + 2;
minf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) < minf) {
min = j + 3;
minf = ABS(x[i + 3 * inc_x]);
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (ABS(x[i]) < minf) {
min = j;
minf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (min + 1);
}
}

View File

@ -1,272 +1,272 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
#define LOAD ld
#define STACKSIZE (512 )
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
#define M r3
#define N r4
#define K r5
#define A r7
#define B r8
#define C r9
#define LDC r10
#define OFFSET r6
#define alpha_r vs20
#define save_permute_1 vs21
#define save_permute_2 vs22
#define permute_mask vs23
#define o0 0
#define T1 r11
#define T2 r12
#define T3 r14
#define T4 r15
#define T5 r16
#define T6 r17
#define L r18
#define T7 r19
#define T8 r20
#define TEMP_REG r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define T9 r27
#define T10 r28
#define T11 r29
#define T12 r30
#define T13 r31
#include "sgemm_macros_power9.S"
.equ perm_const1, 0x0405060700010203
.equ perm_const2, 0x0c0d0e0f08090a0b
.equ save_permute_11, 0x1415161718191a1b
.equ save_permute_12, 0x0405060708090a0b
.equ save_permute_21, 0x101112131c1d1e1f
.equ save_permute_22, 0x000102030c0d0e0f
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
mflr r0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
stxv vs52, 288(SP)
stxv vs53, 304(SP)
stxv vs54, 320(SP)
stxv vs55, 336(SP)
stxv vs56, 352(SP)
stxv vs57, 368(SP)
stxv vs58, 384(SP)
stxv vs59, 400(SP)
stxv vs60, 416(SP)
stxv vs61, 432(SP)
stxv vs62, 448(SP)
stxv vs63, 464(SP)
std r0, FLINK_SAVE(SP)
#if defined(TRMMKERNEL)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
slwi LDC, LDC, 2
/*alpha is stored in f1. convert to single and splat*/
xscvdpspn alpha_r,vs1
xxspltw alpha_r,alpha_r,0
/*load reverse permute mask for big endian
uint128 = 0x0c0d0e0f08090a0b0405060700010203
*/
lis T2, perm_const2@highest
lis T1, perm_const1@highest
lis T3, save_permute_12@highest
lis T4, save_permute_11@highest
lis T5, save_permute_22@highest
lis T6, save_permute_21@highest
ori T2, T2, perm_const2@higher
ori T1, T1, perm_const1@higher
ori T3, T3, save_permute_12@higher
ori T4, T4, save_permute_11@higher
ori T5, T5, save_permute_22@higher
ori T6, T6, save_permute_21@higher
rldicr T2, T2, 32, 31
rldicr T1, T1, 32, 31
rldicr T3, T3, 32, 31
rldicr T4, T4, 32, 31
rldicr T5, T5, 32, 31
rldicr T6, T6, 32, 31
oris T2, T2, perm_const2@h
oris T1, T1, perm_const1@h
oris T3, T3, save_permute_12@h
oris T4, T4, save_permute_11@h
oris T5, T5, save_permute_22@h
oris T6, T6, save_permute_21@h
ori T2, T2, perm_const2@l
ori T1, T1, perm_const1@l
ori T3, T3, save_permute_12@l
ori T4, T4, save_permute_11@l
ori T5, T5, save_permute_22@l
ori T6, T6, save_permute_21@l
li r0,0
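/* mtvsrdd packs two GPRs into one VSX register (doubleword 0 = first operand,
   doubleword 1 = second), so each pair of 64-bit constants built above becomes
   a single 128-bit permute/save mask */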
mtvsrdd permute_mask,T2,T1
mtvsrdd save_permute_1,T3,T4
mtvsrdd save_permute_2,T5,T6
#include "sgemm_logic_power9.S"
.L999:
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
ld r0, FLINK_SAVE(SP)
lxv vs52, 288(SP)
lxv vs53, 304(SP)
lxv vs54, 320(SP)
lxv vs55, 336(SP)
lxv vs56, 352(SP)
lxv vs57, 368(SP)
lxv vs58, 384(SP)
lxv vs59, 400(SP)
mtlr r0
lxv vs60, 416(SP)
lxv vs61, 432(SP)
lxv vs62, 448(SP)
lxv vs63, 464(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif
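The prologue above materializes each 64-bit permute constant with a five-instruction
lis/ori/rldicr/oris/ori sequence. A minimal C sketch of the same construction (the
helper name build_imm64 and its parameters are illustrative only, not part of the
source) is:

#include <stdint.h>

/* highest/higher/high/low are the four 16-bit chunks of the constant, most
   significant first -- e.g. 0x0405060700010203 splits into 0x0405, 0x0607,
   0x0001, 0x0203 */
static uint64_t build_imm64(uint16_t highest, uint16_t higher,
                            uint16_t high, uint16_t low) {
    uint64_t r = (uint64_t)highest << 16;  /* lis    rT, const@highest   */
    r |= higher;                           /* ori    rT, rT, const@higher */
    r <<= 32;                              /* rldicr rT, rT, 32, 31       */
    r |= (uint64_t)high << 16;             /* oris   rT, rT, const@h      */
    r |= low;                              /* ori    rT, rT, const@l      */
    return r;                              /* == 0x0405060700010203 here  */
}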
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
#define LOAD ld
#define STACKSIZE (512 )
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
#define M r3
#define N r4
#define K r5
#define A r7
#define B r8
#define C r9
#define LDC r10
#define OFFSET r6
#define alpha_r vs20
#define save_permute_1 vs21
#define save_permute_2 vs22
#define permute_mask vs23
#define o0 0
#define T1 r11
#define T2 r12
#define T3 r14
#define T4 r15
#define T5 r16
#define T6 r17
#define L r18
#define T7 r19
#define T8 r20
#define TEMP_REG r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define T9 r27
#define T10 r28
#define T11 r29
#define T12 r30
#define T13 r31
#include "sgemm_macros_power9.S"
.equ perm_const1, 0x0405060700010203
.equ perm_const2, 0x0c0d0e0f08090a0b
.equ save_permute_11, 0x1415161718191a1b
.equ save_permute_12, 0x0405060708090a0b
.equ save_permute_21, 0x101112131c1d1e1f
.equ save_permute_22, 0x000102030c0d0e0f
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
mflr r0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
stxv vs52, 288(SP)
stxv vs53, 304(SP)
stxv vs54, 320(SP)
stxv vs55, 336(SP)
stxv vs56, 352(SP)
stxv vs57, 368(SP)
stxv vs58, 384(SP)
stxv vs59, 400(SP)
stxv vs60, 416(SP)
stxv vs61, 432(SP)
stxv vs62, 448(SP)
stxv vs63, 464(SP)
std r0, FLINK_SAVE(SP)
#if defined(TRMMKERNEL)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
slwi LDC, LDC, 2
/*alpha is stored in f1. convert to single and splat*/
xscvdpspn alpha_r,vs1
xxspltw alpha_r,alpha_r,0
/*load reverse permute mask for big endian
uint128 = 0x0c0d0e0f08090a0b0405060700010203
*/
lis T2, perm_const2@highest
lis T1, perm_const1@highest
lis T3, save_permute_12@highest
lis T4, save_permute_11@highest
lis T5, save_permute_22@highest
lis T6, save_permute_21@highest
ori T2, T2, perm_const2@higher
ori T1, T1, perm_const1@higher
ori T3, T3, save_permute_12@higher
ori T4, T4, save_permute_11@higher
ori T5, T5, save_permute_22@higher
ori T6, T6, save_permute_21@higher
rldicr T2, T2, 32, 31
rldicr T1, T1, 32, 31
rldicr T3, T3, 32, 31
rldicr T4, T4, 32, 31
rldicr T5, T5, 32, 31
rldicr T6, T6, 32, 31
oris T2, T2, perm_const2@h
oris T1, T1, perm_const1@h
oris T3, T3, save_permute_12@h
oris T4, T4, save_permute_11@h
oris T5, T5, save_permute_22@h
oris T6, T6, save_permute_21@h
ori T2, T2, perm_const2@l
ori T1, T1, perm_const1@l
ori T3, T3, save_permute_12@l
ori T4, T4, save_permute_11@l
ori T5, T5, save_permute_22@l
ori T6, T6, save_permute_21@l
li r0,0
mtvsrdd permute_mask,T2,T1
mtvsrdd save_permute_1,T3,T4
mtvsrdd save_permute_2,T5,T6
#include "sgemm_logic_power9.S"
.L999:
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
ld r0, FLINK_SAVE(SP)
lxv vs52, 288(SP)
lxv vs53, 304(SP)
lxv vs54, 320(SP)
lxv vs55, 336(SP)
lxv vs56, 352(SP)
lxv vs57, 368(SP)
lxv vs58, 384(SP)
lxv vs59, 400(SP)
mtlr r0
lxv vs60, 416(SP)
lxv vs61, 432(SP)
lxv vs62, 448(SP)
lxv vs63, 464(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,470 +1,470 @@
/***************************************************************************
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#if !defined(__VEC__) || !defined(__ALTIVEC__)
#include "../arm/gemv_n.c"
#else
#include "common.h"
#define NBMAX 4096
static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3;
FLOAT x0,x1,x2,x3,x4,x5,x6,x7;
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
b0 = a0 + lda4 ;
b1 = a1 + lda4 ;
b2 = a2 + lda4 ;
b3 = a3 + lda4 ;
x0 = xo[0] * *alpha;
x1 = xo[1] * *alpha;
x2 = xo[2] * *alpha;
x3 = xo[3] * *alpha;
x4 = xo[4] * *alpha;
x5 = xo[5] * *alpha;
x6 = xo[6] * *alpha;
x7 = xo[7] * *alpha;
__vector float* va0 = (__vector float*)a0;
__vector float* va1 = (__vector float*)a1;
__vector float* va2 = (__vector float*)a2;
__vector float* va3 = (__vector float*)a3;
__vector float* vb0 = (__vector float*)b0;
__vector float* vb1 = (__vector float*)b1;
__vector float* vb2 = (__vector float*)b2;
__vector float* vb3 = (__vector float*)b3;
__vector float v_x0 = {x0,x0,x0,x0};
__vector float v_x1 = {x1,x1,x1,x1};
__vector float v_x2 = {x2,x2,x2,x2};
__vector float v_x3 = {x3,x3,x3,x3};
__vector float v_x4 = {x4,x4,x4,x4};
__vector float v_x5 = {x5,x5,x5,x5};
__vector float v_x6 = {x6,x6,x6,x6};
__vector float v_x7 = {x7,x7,x7,x7};
__vector float* v_y =(__vector float*)y;
for ( i=0; i< n/4; i++)
{
register __vector float vy=v_y[i];
vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ;
vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ;
v_y[i] =vy;
}
}
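/* Scalar sketch of what the 4x8 kernel above computes, assuming n is a multiple of 4
   (illustrative only, not part of the original file):
       for (i = 0; i < n; i++)
           y[i] += alpha * ( x[0]*a0[i] + x[1]*a1[i] + x[2]*a2[i] + x[3]*a3[i]
                           + x[4]*b0[i] + x[5]*b1[i] + x[6]*b2[i] + x[7]*b3[i] );
   i.e. a panel of 8 matrix columns is applied to a block of y in one pass. */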
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
BLASLONG i;
FLOAT x0,x1,x2,x3;
x0 = xo[0] * *alpha;
x1 = xo[1] * *alpha;
x2 = xo[2] * *alpha;
x3 = xo[3] * *alpha;
__vector float v_x0 = {x0,x0,x0,x0};
__vector float v_x1 = {x1,x1,x1,x1};
__vector float v_x2 = {x2,x2,x2,x2};
__vector float v_x3 = {x3,x3,x3,x3};
__vector float* v_y =(__vector float*)y;
__vector float* va0 = (__vector float*)ap[0];
__vector float* va1 = (__vector float*)ap[1];
__vector float* va2 = (__vector float*)ap[2];
__vector float* va3 = (__vector float*)ap[3];
for ( i=0; i< n/4; i++ )
{
register __vector float vy=v_y[i];
vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ;
v_y[i] =vy;
}
}
static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG i;
FLOAT x0,x1;
x0 = x[0] * *alpha;
x1 = x[1] * *alpha;
__vector float v_x0 = {x0,x0,x0,x0};
__vector float v_x1 = {x1,x1,x1,x1};
__vector float* v_y =(__vector float*)y;
__vector float* va0 = (__vector float*)ap[0];
__vector float* va1 = (__vector float*)ap[1];
for ( i=0; i< n/4; i++ )
{
v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ;
}
}
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG i;
FLOAT x0 ;
x0 = x[0] * *alpha;
__vector float v_x0 = {x0,x0,x0,x0};
__vector float* v_y =(__vector float*)y;
__vector float* va0 = (__vector float*)ap;
for ( i=0; i< n/4; i++ )
{
v_y[i] += v_x0 * va0[i] ;
}
}
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
BLASLONG i;
for ( i=0; i<n; i++ ){
*dest += *src;
src++;
dest += inc_dest;
}
return;
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
FLOAT *ap[4];
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
BLASLONG lda4 = lda << 2;
BLASLONG lda8 = lda << 3;
FLOAT xbuffer[8] __attribute__((aligned(16)));
FLOAT *ybuffer;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
ybuffer = buffer;
if ( inc_x == 1 )
{
n1 = n >> 3 ;
n2 = n & 7 ;
}
else
{
n1 = n >> 2 ;
n2 = n & 3 ;
}
m3 = m & 3 ;
m1 = m & -4 ;
m2 = (m & (NBMAX-1)) - m3 ;
y_ptr = y;
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
a_ptr = a;
x_ptr = x;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
if ( inc_y != 1 )
memset(ybuffer,0,NB*4);
else
ybuffer = y_ptr;
if ( inc_x == 1 )
{
for( i = 0; i < n1 ; i++)
{
sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha);
ap[0] += lda8;
ap[1] += lda8;
ap[2] += lda8;
ap[3] += lda8;
a_ptr += lda8;
x_ptr += 8;
}
if ( n2 & 4 )
{
sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
x_ptr += 4;
}
if ( n2 & 2 )
{
sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
a_ptr += lda*2;
x_ptr += 2;
}
if ( n2 & 1 )
{
sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
a_ptr += lda;
x_ptr += 1;
}
}
else
{
for( i = 0; i < n1 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
xbuffer[1] = x_ptr[0];
x_ptr += inc_x;
xbuffer[2] = x_ptr[0];
x_ptr += inc_x;
xbuffer[3] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
}
for( i = 0; i < n2 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
a_ptr += lda;
}
}
a += NB;
if ( inc_y != 1 )
{
add_y(NB,ybuffer,y_ptr,inc_y);
y_ptr += NB * inc_y;
}
else
y_ptr += NB ;
}
if ( m3 == 0 ) return(0);
if ( m3 == 3 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
if ( lda == 3 && inc_x ==1 )
{
for( i = 0; i < ( n & -4 ); i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];
temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];
a_ptr += 12;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += 3;
x_ptr ++;
}
}
else
{
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
y_ptr += inc_y;
y_ptr[0] += alpha * temp2;
return(0);
}
if ( m3 == 2 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
if ( lda == 2 && inc_x ==1 )
{
for( i = 0; i < (n & -4) ; i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
a_ptr += 8;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += 2;
x_ptr ++;
}
}
else
{
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
return(0);
}
if ( m3 == 1 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp = 0.0;
if ( lda == 1 && inc_x ==1 )
{
for( i = 0; i < (n & -4); i+=4 )
{
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
}
for( ; i < n; i++ )
{
temp += a_ptr[i] * x_ptr[i];
}
}
else
{
for( i = 0; i < n; i++ )
{
temp += a_ptr[0] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp;
return(0);
}
return(0);
}
#endif
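Taken together, the blocked kernels above implement the non-transposed single-precision
GEMV update y := y + alpha * A * x for column-major A. A minimal, unblocked reference
sketch (sgemv_n_ref is an illustrative name, not part of the source) is:

static void sgemv_n_ref(long m, long n, float alpha, const float *a, long lda,
                        const float *x, long inc_x, float *y, long inc_y) {
    long i, j;
    for (j = 0; j < n; j++) {                 /* walk the columns of A   */
        float xj = alpha * x[j * inc_x];
        for (i = 0; i < m; i++)               /* axpy column j into y    */
            y[i * inc_y] += xj * a[j * lda + i];
    }
}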
/***************************************************************************
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#if !defined(__VEC__) || !defined(__ALTIVEC__)
#include "../arm/gemv_n.c"
#else
#include "common.h"
#define NBMAX 4096
static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3;
FLOAT x0,x1,x2,x3,x4,x5,x6,x7;
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
b0 = a0 + lda4 ;
b1 = a1 + lda4 ;
b2 = a2 + lda4 ;
b3 = a3 + lda4 ;
x0 = xo[0] * *alpha;
x1 = xo[1] * *alpha;
x2 = xo[2] * *alpha;
x3 = xo[3] * *alpha;
x4 = xo[4] * *alpha;
x5 = xo[5] * *alpha;
x6 = xo[6] * *alpha;
x7 = xo[7] * *alpha;
__vector float* va0 = (__vector float*)a0;
__vector float* va1 = (__vector float*)a1;
__vector float* va2 = (__vector float*)a2;
__vector float* va3 = (__vector float*)a3;
__vector float* vb0 = (__vector float*)b0;
__vector float* vb1 = (__vector float*)b1;
__vector float* vb2 = (__vector float*)b2;
__vector float* vb3 = (__vector float*)b3;
__vector float v_x0 = {x0,x0,x0,x0};
__vector float v_x1 = {x1,x1,x1,x1};
__vector float v_x2 = {x2,x2,x2,x2};
__vector float v_x3 = {x3,x3,x3,x3};
__vector float v_x4 = {x4,x4,x4,x4};
__vector float v_x5 = {x5,x5,x5,x5};
__vector float v_x6 = {x6,x6,x6,x6};
__vector float v_x7 = {x7,x7,x7,x7};
__vector float* v_y =(__vector float*)y;
for ( i=0; i< n/4; i++)
{
register __vector float vy=v_y[i];
vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ;
vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ;
v_y[i] =vy;
}
}
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
BLASLONG i;
FLOAT x0,x1,x2,x3;
x0 = xo[0] * *alpha;
x1 = xo[1] * *alpha;
x2 = xo[2] * *alpha;
x3 = xo[3] * *alpha;
__vector float v_x0 = {x0,x0,x0,x0};
__vector float v_x1 = {x1,x1,x1,x1};
__vector float v_x2 = {x2,x2,x2,x2};
__vector float v_x3 = {x3,x3,x3,x3};
__vector float* v_y =(__vector float*)y;
__vector float* va0 = (__vector float*)ap[0];
__vector float* va1 = (__vector float*)ap[1];
__vector float* va2 = (__vector float*)ap[2];
__vector float* va3 = (__vector float*)ap[3];
for ( i=0; i< n/4; i++ )
{
register __vector float vy=v_y[i];
vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ;
v_y[i] =vy;
}
}
static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG i;
FLOAT x0,x1;
x0 = x[0] * *alpha;
x1 = x[1] * *alpha;
__vector float v_x0 = {x0,x0,x0,x0};
__vector float v_x1 = {x1,x1,x1,x1};
__vector float* v_y =(__vector float*)y;
__vector float* va0 = (__vector float*)ap[0];
__vector float* va1 = (__vector float*)ap[1];
for ( i=0; i< n/4; i++ )
{
v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ;
}
}
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG i;
FLOAT x0 ;
x0 = x[0] * *alpha;
__vector float v_x0 = {x0,x0,x0,x0};
__vector float* v_y =(__vector float*)y;
__vector float* va0 = (__vector float*)ap;
for ( i=0; i< n/4; i++ )
{
v_y[i] += v_x0 * va0[i] ;
}
}
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
BLASLONG i;
for ( i=0; i<n; i++ ){
*dest += *src;
src++;
dest += inc_dest;
}
return;
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
FLOAT *ap[4];
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
BLASLONG lda4 = lda << 2;
BLASLONG lda8 = lda << 3;
FLOAT xbuffer[8] __attribute__((aligned(16)));
FLOAT *ybuffer;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
ybuffer = buffer;
if ( inc_x == 1 )
{
n1 = n >> 3 ;
n2 = n & 7 ;
}
else
{
n1 = n >> 2 ;
n2 = n & 3 ;
}
m3 = m & 3 ;
m1 = m & -4 ;
m2 = (m & (NBMAX-1)) - m3 ;
y_ptr = y;
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
a_ptr = a;
x_ptr = x;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
if ( inc_y != 1 )
memset(ybuffer,0,NB*4);
else
ybuffer = y_ptr;
if ( inc_x == 1 )
{
for( i = 0; i < n1 ; i++)
{
sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha);
ap[0] += lda8;
ap[1] += lda8;
ap[2] += lda8;
ap[3] += lda8;
a_ptr += lda8;
x_ptr += 8;
}
if ( n2 & 4 )
{
sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
x_ptr += 4;
}
if ( n2 & 2 )
{
sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
a_ptr += lda*2;
x_ptr += 2;
}
if ( n2 & 1 )
{
sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
a_ptr += lda;
x_ptr += 1;
}
}
else
{
for( i = 0; i < n1 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
xbuffer[1] = x_ptr[0];
x_ptr += inc_x;
xbuffer[2] = x_ptr[0];
x_ptr += inc_x;
xbuffer[3] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
}
for( i = 0; i < n2 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
a_ptr += lda;
}
}
a += NB;
if ( inc_y != 1 )
{
add_y(NB,ybuffer,y_ptr,inc_y);
y_ptr += NB * inc_y;
}
else
y_ptr += NB ;
}
if ( m3 == 0 ) return(0);
if ( m3 == 3 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
if ( lda == 3 && inc_x ==1 )
{
for( i = 0; i < ( n & -4 ); i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];
temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];
a_ptr += 12;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += 3;
x_ptr ++;
}
}
else
{
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
y_ptr += inc_y;
y_ptr[0] += alpha * temp2;
return(0);
}
if ( m3 == 2 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
if ( lda == 2 && inc_x ==1 )
{
for( i = 0; i < (n & -4) ; i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
a_ptr += 8;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += 2;
x_ptr ++;
}
}
else
{
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
return(0);
}
if ( m3 == 1 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp = 0.0;
if ( lda == 1 && inc_x ==1 )
{
for( i = 0; i < (n & -4); i+=4 )
{
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
}
for( ; i < n; i++ )
{
temp += a_ptr[i] * x_ptr[i];
}
}
else
{
for( i = 0; i < n; i++ )
{
temp += a_ptr[0] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp;
return(0);
}
return(0);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@ -1,484 +1,484 @@
/***************************************************************************
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#if !defined(__VEC__) || !defined(__ALTIVEC__)
#include "../arm/gemv_t.c"
#else
#include "common.h"
#define NBMAX 2048
#include <altivec.h>
static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
BLASLONG i;
FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;
__vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x;
register __vector float temp0 = {0,0,0,0};
register __vector float temp1 = {0,0,0,0};
register __vector float temp2 = {0,0,0,0};
register __vector float temp3 = {0,0,0,0};
register __vector float temp4 = {0,0,0,0};
register __vector float temp5 = {0,0,0,0};
register __vector float temp6 = {0,0,0,0};
register __vector float temp7 = {0,0,0,0};
a0 = ap;
a1 = ap + lda;
a2 = a1 + lda;
a3 = a2 + lda;
a4 = a3 + lda;
a5 = a4 + lda;
a6 = a5 + lda;
a7 = a6 + lda;
va0 = (__vector float*) a0;
va1 = (__vector float*) a1;
va2 = (__vector float*) a2;
va3 = (__vector float*) a3;
va4 = (__vector float*) a4;
va5 = (__vector float*) a5;
va6 = (__vector float*) a6;
va7 = (__vector float*) a7;
v_x = (__vector float*) x;
for (i = 0; i < n/4; i ++) {
temp0 += v_x[i] * va0[i];
temp1 += v_x[i] * va1[i];
temp2 += v_x[i] * va2[i];
temp3 += v_x[i] * va3[i];
temp4 += v_x[i] * va4[i];
temp5 += v_x[i] * va5[i];
temp6 += v_x[i] * va6[i];
temp7 += v_x[i] * va7[i];
}
y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]);
y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]);
y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]);
y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]);
y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]);
y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]);
}
static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
BLASLONG i = 0;
FLOAT *a0, *a1, *a2, *a3;
a0 = ap;
a1 = ap + lda;
a2 = a1 + lda;
a3 = a2 + lda;
__vector float* va0 = (__vector float*) a0;
__vector float* va1 = (__vector float*) a1;
__vector float* va2 = (__vector float*) a2;
__vector float* va3 = (__vector float*) a3;
__vector float* v_x = (__vector float*) x;
register __vector float temp0 = {0,0,0,0};
register __vector float temp1 = {0,0,0,0};
register __vector float temp2 = {0,0,0,0};
register __vector float temp3 = {0,0,0,0};
for (i = 0; i < n / 4; i ++) {
temp0 += v_x[i] * va0[i];
temp1 += v_x[i] * va1[i];
temp2 += v_x[i] * va2[i];
temp3 += v_x[i] * va3[i];
}
y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]);
y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]);
}
static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) {
BLASLONG i;
FLOAT *a0, *a1;
a0 = ap;
a1 = ap + lda;
__vector float* va0 = (__vector float*) a0;
__vector float* va1 = (__vector float*) a1;
__vector float* v_x = (__vector float*) x;
__vector float temp0 = {0,0,0,0};
__vector float temp1 = {0,0,0,0};
for (i = 0; i < n / 4; i ++) {
temp0 += v_x[i] * va0[i];
temp1 += v_x[i] * va1[i];
}
y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
}
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
BLASLONG i;
FLOAT *a0;
a0 = ap;
__vector float* va0 = (__vector float*) a0;
__vector float* v_x = (__vector float*) x;
__vector float temp0 = {0,0,0,0};
for (i = 0; i < n / 4; i ++) {
temp0 += v_x[i] * va0[i] ;
}
y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
}
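/* In each kernel above the per-lane accumulator is reduced with a horizontal add,
   e.g. temp0[0] + temp0[1] + temp0[2] + temp0[3], which collapses the four partial
   dot products of one column into the single scalar that is added to y. */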
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
BLASLONG i;
for (i = 0; i < n; i++) {
*dest++ = *src;
src += inc_src;
}
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
FLOAT ybuffer[8] __attribute__((aligned(16)));
FLOAT *xbuffer;
if (m < 1) return (0);
if (n < 1) return (0);
xbuffer = buffer;
n1 = n >> 3;
n2 = n & 7;
m3 = m & 3;
m1 = m - m3;
m2 = (m & (NBMAX - 1)) - m3;
BLASLONG NB = NBMAX;
while (NB == NBMAX) {
m1 -= NB;
if (m1 < 0) {
if (m2 == 0) break;
NB = m2;
}
y_ptr = y;
a_ptr = a;
x_ptr = x;
if (inc_x != 1)
copy_x(NB, x_ptr, xbuffer, inc_x);
else
xbuffer = x_ptr;
BLASLONG lda8 = lda << 3;
if (inc_y == 1) {
for (i = 0; i < n1; i++) {
sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha);
y_ptr += 8;
a_ptr += lda8;
}
} else {
for (i = 0; i < n1; i++) {
ybuffer[0] = 0;
ybuffer[1] = 0;
ybuffer[2] = 0;
ybuffer[3] = 0;
ybuffer[4] = 0;
ybuffer[5] = 0;
ybuffer[6] = 0;
ybuffer[7] = 0;
sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
*y_ptr += ybuffer[0];
y_ptr += inc_y;
*y_ptr += ybuffer[1];
y_ptr += inc_y;
*y_ptr += ybuffer[2];
y_ptr += inc_y;
*y_ptr += ybuffer[3];
y_ptr += inc_y;
*y_ptr += ybuffer[4];
y_ptr += inc_y;
*y_ptr += ybuffer[5];
y_ptr += inc_y;
*y_ptr += ybuffer[6];
y_ptr += inc_y;
*y_ptr += ybuffer[7];
y_ptr += inc_y;
a_ptr += lda8;
}
}
if (n2 & 4) {
ybuffer[0] = 0;
ybuffer[1] = 0;
ybuffer[2] = 0;
ybuffer[3] = 0;
sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
a_ptr += lda<<2;
*y_ptr += ybuffer[0];
y_ptr += inc_y;
*y_ptr += ybuffer[1];
y_ptr += inc_y;
*y_ptr += ybuffer[2];
y_ptr += inc_y;
*y_ptr += ybuffer[3];
y_ptr += inc_y;
}
if (n2 & 2) {
sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y);
a_ptr += lda << 1;
y_ptr += 2 * inc_y;
}
if (n2 & 1) {
sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
a_ptr += lda;
y_ptr += inc_y;
}
a += NB;
x += NB * inc_x;
}
if (m3 == 0) return (0);
x_ptr = x;
a_ptr = a;
if (m3 == 3) {
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp2 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if (lda == 3 && inc_y == 1) {
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
aj += 12;
}
for (; j < n; j++) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
aj += 3;
}
} else {
if (inc_y == 1) {
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2;
y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2;
y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2;
aj += lda4;
}
for (; j < n; j++) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
aj += lda;
}
} else {
for (j = 0; j < n; j++) {
*y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
y_ptr += inc_y;
aj += lda;
}
}
}
return (0);
}
if (m3 == 2) {
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if (lda == 2 && inc_y == 1) {
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1;
y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1;
y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1;
aj += 8;
}
for (; j < n; j++) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
aj += 2;
}
} else {
if (inc_y == 1) {
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1;
y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1;
y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1;
aj += lda4;
}
for (; j < n; j++) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
aj += lda;
}
} else {
for (j = 0; j < n; j++) {
*y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1;
y_ptr += inc_y;
aj += lda;
}
}
}
return (0);
}
FLOAT xtemp = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if (lda == 1 && inc_y == 1) {
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += aj[j] * xtemp;
y_ptr[j + 1] += aj[j + 1] * xtemp;
y_ptr[j + 2] += aj[j + 2] * xtemp;
y_ptr[j + 3] += aj[j + 3] * xtemp;
}
for (; j < n; j++) {
y_ptr[j] += aj[j] * xtemp;
}
} else {
if (inc_y == 1) {
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += *aj * xtemp;
y_ptr[j + 1] += *(aj + lda) * xtemp;
y_ptr[j + 2] += *(aj + lda2) * xtemp;
y_ptr[j + 3] += *(aj + lda3) * xtemp;
aj += lda4;
}
for (; j < n; j++) {
y_ptr[j] += *aj * xtemp;
aj += lda;
}
} else {
for (j = 0; j < n; j++) {
*y_ptr += *aj * xtemp;
y_ptr += inc_y;
aj += lda;
}
}
}
return (0);
}
#endif
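As with the non-transposed case, the kernels above amount to the transposed GEMV
update y := y + alpha * A^T * x for column-major A, computed as one dot product per
column. A minimal reference sketch (sgemv_t_ref is an illustrative name, not part of
the source) is:

static void sgemv_t_ref(long m, long n, float alpha, const float *a, long lda,
                        const float *x, long inc_x, float *y, long inc_y) {
    long i, j;
    for (j = 0; j < n; j++) {                  /* one output element per column */
        float temp = 0.0f;
        for (i = 0; i < m; i++)
            temp += a[j * lda + i] * x[i * inc_x];
        y[j * inc_y] += alpha * temp;
    }
}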
/***************************************************************************
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#if !defined(__VEC__) || !defined(__ALTIVEC__)
#include "../arm/gemv_t.c"
#else
#include "common.h"
#define NBMAX 2048
#include <altivec.h>
static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
BLASLONG i;
FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;
__vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x;
register __vector float temp0 = {0,0,0,0};
register __vector float temp1 = {0,0,0,0};
register __vector float temp2 = {0,0,0,0};
register __vector float temp3 = {0,0,0,0};
register __vector float temp4 = {0,0,0,0};
register __vector float temp5 = {0,0,0,0};
register __vector float temp6 = {0,0,0,0};
register __vector float temp7 = {0,0,0,0};
a0 = ap;
a1 = ap + lda;
a2 = a1 + lda;
a3 = a2 + lda;
a4 = a3 + lda;
a5 = a4 + lda;
a6 = a5 + lda;
a7 = a6 + lda;
va0 = (__vector float*) a0;
va1 = (__vector float*) a1;
va2 = (__vector float*) a2;
va3 = (__vector float*) a3;
va4 = (__vector float*) a4;
va5 = (__vector float*) a5;
va6 = (__vector float*) a6;
va7 = (__vector float*) a7;
v_x = (__vector float*) x;
for (i = 0; i < n/4; i ++) {
temp0 += v_x[i] * va0[i];
temp1 += v_x[i] * va1[i];
temp2 += v_x[i] * va2[i];
temp3 += v_x[i] * va3[i];
temp4 += v_x[i] * va4[i];
temp5 += v_x[i] * va5[i];
temp6 += v_x[i] * va6[i];
temp7 += v_x[i] * va7[i];
}
y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]);
y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]);
y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]);
y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]);
y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]);
y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]);
}
static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
BLASLONG i = 0;
FLOAT *a0, *a1, *a2, *a3;
a0 = ap;
a1 = ap + lda;
a2 = a1 + lda;
a3 = a2 + lda;
__vector float* va0 = (__vector float*) a0;
__vector float* va1 = (__vector float*) a1;
__vector float* va2 = (__vector float*) a2;
__vector float* va3 = (__vector float*) a3;
__vector float* v_x = (__vector float*) x;
register __vector float temp0 = {0,0,0,0};
register __vector float temp1 = {0,0,0,0};
register __vector float temp2 = {0,0,0,0};
register __vector float temp3 = {0,0,0,0};
for (i = 0; i < n / 4; i ++) {
temp0 += v_x[i] * va0[i];
temp1 += v_x[i] * va1[i];
temp2 += v_x[i] * va2[i];
temp3 += v_x[i] * va3[i];
}
y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]);
y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]);
}
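/* Two-column variant; inc_y is passed so the two results can be stored straight
   into a strided y. */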
static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) {
BLASLONG i;
FLOAT *a0, *a1;
a0 = ap;
a1 = ap + lda;
__vector float* va0 = (__vector float*) a0;
__vector float* va1 = (__vector float*) a1;
__vector float* v_x = (__vector float*) x;
__vector float temp0 = {0,0,0,0};
__vector float temp1 = {0,0,0,0};
for (i = 0; i < n / 4; i ++) {
temp0 += v_x[i] * va0[i];
temp1 += v_x[i] * va1[i];
}
y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
}
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
BLASLONG i;
FLOAT *a0;
a0 = ap;
__vector float* va0 = (__vector float*) a0;
__vector float* v_x = (__vector float*) x;
__vector float temp0 = {0,0,0,0};
for (i = 0; i < n / 4; i ++) {
temp0 += v_x[i] * va0[i];
}
y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
}
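/* Pack n elements of a strided source vector (stride inc_src) into a contiguous
   buffer so the vector kernels above can use unit-stride loads. */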
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
BLASLONG i;
for (i = 0; i < n; i++) {
*dest++ = *src;
src += inc_src;
}
}
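/* Driver: blocks the dot-product length (m) into chunks of at most NBMAX, packs x
   into the contiguous buffer when inc_x != 1, and walks the n outputs in groups of
   8, 4, 2 and 1 columns using the vector kernels above. The final m & 3 elements
   of x (rows of A) are folded in by the scalar tail code at the end. */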
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
FLOAT ybuffer[8] __attribute__((aligned(16)));
FLOAT *xbuffer;
if (m < 1) return (0);
if (n < 1) return (0);
xbuffer = buffer;
n1 = n >> 3;
n2 = n & 7;
m3 = m & 3;
m1 = m - m3;
m2 = (m & (NBMAX - 1)) - m3;
BLASLONG NB = NBMAX;
while (NB == NBMAX) {
m1 -= NB;
if (m1 < 0) {
if (m2 == 0) break;
NB = m2;
}
y_ptr = y;
a_ptr = a;
x_ptr = x;
if (inc_x != 1)
copy_x(NB, x_ptr, xbuffer, inc_x);
else
xbuffer = x_ptr;
BLASLONG lda8 = lda << 3;
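/* With inc_y == 1 the 8-column kernel can accumulate directly into y; otherwise it
   writes into the aligned ybuffer, whose entries are then scattered into y with
   stride inc_y. */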
if (inc_y == 1) {
for (i = 0; i < n1; i++) {
sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha);
y_ptr += 8;
a_ptr += lda8;
}
} else {
for (i = 0; i < n1; i++) {
ybuffer[0] = 0;
ybuffer[1] = 0;
ybuffer[2] = 0;
ybuffer[3] = 0;
ybuffer[4] = 0;
ybuffer[5] = 0;
ybuffer[6] = 0;
ybuffer[7] = 0;
sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
*y_ptr += ybuffer[0];
y_ptr += inc_y;
*y_ptr += ybuffer[1];
y_ptr += inc_y;
*y_ptr += ybuffer[2];
y_ptr += inc_y;
*y_ptr += ybuffer[3];
y_ptr += inc_y;
*y_ptr += ybuffer[4];
y_ptr += inc_y;
*y_ptr += ybuffer[5];
y_ptr += inc_y;
*y_ptr += ybuffer[6];
y_ptr += inc_y;
*y_ptr += ybuffer[7];
y_ptr += inc_y;
a_ptr += lda8;
}
}
if (n2 & 4) {
ybuffer[0] = 0;
ybuffer[1] = 0;
ybuffer[2] = 0;
ybuffer[3] = 0;
sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
a_ptr += lda<<2;
*y_ptr += ybuffer[0];
y_ptr += inc_y;
*y_ptr += ybuffer[1];
y_ptr += inc_y;
*y_ptr += ybuffer[2];
y_ptr += inc_y;
*y_ptr += ybuffer[3];
y_ptr += inc_y;
}
if (n2 & 2) {
sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y);
a_ptr += lda << 1;
y_ptr += 2 * inc_y;
}
if (n2 & 1) {
sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
a_ptr += lda;
y_ptr += inc_y;
}
a += NB;
x += NB * inc_x;
}
if (m3 == 0) return (0);
x_ptr = x;
a_ptr = a;
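/* Fewer than four rows of A (m3 = m & 3) remain: finish them with scalar code,
   specialised for m3 == 3, 2 and 1, with fast paths when lda == m3 and inc_y == 1. */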
if (m3 == 3) {
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp2 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if (lda == 3 && inc_y == 1) {
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
aj += 12;
}
for (; j < n; j++) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
aj += 3;
}
} else {
if (inc_y == 1) {
register BLASLONG lda2 = lda << 1;
register BLASLONG lda4 = lda << 2;
register BLASLONG lda3 = lda2 + lda;
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2;
y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2;
y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2;
aj += lda4;
}
for (; j < n; j++) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
aj += lda;
}
} else {
for (j = 0; j < n; j++) {
*y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
y_ptr += inc_y;
aj += lda;
}
}
}
return (0);
}
if (m3 == 2) {
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if (lda == 2 && inc_y == 1) {
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1;
y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1;
y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1;
aj += 8;
}
for (; j < n; j++) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
aj += 2;
}
} else {
if (inc_y == 1) {
register BLASLONG lda2 = lda << 1;
register BLASLONG lda4 = lda << 2;
register BLASLONG lda3 = lda2 + lda;
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1;
y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1;
y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1;
aj += lda4;
}
for (; j < n; j++) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
aj += lda;
}
} else {
for (j = 0; j < n; j++) {
*y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1;
y_ptr += inc_y;
aj += lda;
}
}
}
return (0);
}
FLOAT xtemp = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if (lda == 1 && inc_y == 1) {
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += aj[j] * xtemp;
y_ptr[j + 1] += aj[j + 1] * xtemp;
y_ptr[j + 2] += aj[j + 2] * xtemp;
y_ptr[j + 3] += aj[j + 3] * xtemp;
}
for (; j < n; j++) {
y_ptr[j] += aj[j] * xtemp;
}
} else {
if (inc_y == 1) {
register BLASLONG lda2 = lda << 1;
register BLASLONG lda4 = lda << 2;
register BLASLONG lda3 = lda2 + lda;
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += *aj * xtemp;
y_ptr[j + 1] += *(aj + lda) * xtemp;
y_ptr[j + 2] += *(aj + lda2) * xtemp;
y_ptr[j + 3] += *(aj + lda3) * xtemp;
aj += lda4;
}
for (; j < n; j++) {
y_ptr[j] += *aj * xtemp;
aj += lda;
}
} else {
for (j = 0; j < n; j++) {
*y_ptr += *aj * xtemp;
y_ptr += inc_y;
aj += lda;
}
}
}
return (0);
}
#endif
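For reference, the operation this file implements is, in effect, the transposed
matrix-vector update y := alpha * A**T * x + y on a column-major A with leading
dimension lda. A minimal scalar sketch of that update follows (illustrative only:
the name gemv_t_ref and its exact signature are not part of the source, and
FLOAT/BLASLONG are the types supplied by common.h):

static void gemv_t_ref(BLASLONG m, BLASLONG n, FLOAT alpha, FLOAT *a, BLASLONG lda,
                       FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
    BLASLONG i, j;
    for (j = 0; j < n; j++) {
        /* y[j] accumulates alpha times the dot product of x with column j of A */
        FLOAT sum = 0.0;
        for (i = 0; i < m; i++)
            sum += a[j * lda + i] * x[i * inc_x];
        y[j * inc_y] += alpha * sum;
    }
}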

File diff suppressed because it is too large Load Diff

View File

@ -1,245 +1,245 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
#define LOAD ld
#define STACKSIZE 512
#define FZERO 312+192(SP)
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
#define M r3
#define N r4
#define K r5
#define A r8
#define B r9
#define C r10
#define LDC r6
#define OFFSET r7
#define o0 0
#define alpha_r vs30
#define alpha_i vs31
#define VECSAVE r11
#define FRAMEPOINTER r12
#define T10 r14
#define L r15
#define T8 r16
#define T5 r17
#define T2 r19
#define TEMP_REG r20
#define T6 r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define T7 r27
#define T3 r28
#define T4 r29
#define PRE r30
#define T1 r31
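/* The prologue below allocates a STACKSIZE-byte frame and spills the callee-saved
   registers the kernel uses: f14-f31, r14-r31 and vs52-vs63, plus the link register
   at FLINK_SAVE; the epilogue at L999 restores them before returning. */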
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
mr FRAMEPOINTER, SP
addi SP, SP, -STACKSIZE
mflr r0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
xxspltd alpha_r,vs1,0 /*copy from register f1 */
xxspltd alpha_i,vs2,0 /*copy from register f2 */
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
stxv vs52, 288(SP)
stxv vs53, 304(SP)
stxv vs54, 320(SP)
stxv vs55, 336(SP)
stxv vs56, 352(SP)
stxv vs57, 368(SP)
stxv vs58, 384(SP)
stxv vs59, 400(SP)
stxv vs60, 416(SP)
stxv vs61, 432(SP)
stxv vs62, 448(SP)
stxv vs63, 464(SP)
std r0, FLINK_SAVE(SP)
#if defined(linux) || defined(__FreeBSD__)
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#ifdef TRMMKERNEL
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif
#endif
#include "zgemm_macros_power9.S"
slwi LDC, LDC, ZBASE_SHIFT
li PRE, 512
li r0, 0
#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
/* negate for this case as we will use addition of -1*(a+b) */
xvnegdp alpha_r,alpha_r
xvnegdp alpha_i,alpha_i
#endif
.align 4
#include "zgemm_logic_power9.S"
L999:
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
ld r0, FLINK_SAVE(SP)
lxv vs52, 288(SP)
lxv vs53, 304(SP)
lxv vs54, 320(SP)
lxv vs55, 336(SP)
lxv vs56, 352(SP)
lxv vs57, 368(SP)
lxv vs58, 384(SP)
lxv vs59, 400(SP)
mtlr r0
lxv vs60, 416(SP)
lxv vs61, 432(SP)
lxv vs62, 448(SP)
lxv vs63, 464(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
#define LOAD ld
#define STACKSIZE 512
#define FZERO 312+192(SP)
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
#define M r3
#define N r4
#define K r5
#define A r8
#define B r9
#define C r10
#define LDC r6
#define OFFSET r7
#define o0 0
#define alpha_r vs30
#define alpha_i vs31
#define VECSAVE r11
#define FRAMEPOINTER r12
#define T10 r14
#define L r15
#define T8 r16
#define T5 r17
#define T2 r19
#define TEMP_REG r20
#define T6 r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define T7 r27
#define T3 r28
#define T4 r29
#define PRE r30
#define T1 r31
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
mr FRAMEPOINTER, SP
addi SP, SP, -STACKSIZE
mflr r0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
xxspltd alpha_r,vs1,0 /*copy from register f1 */
xxspltd alpha_i,vs2,0 /*copy from register f2 */
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
stxv vs52, 288(SP)
stxv vs53, 304(SP)
stxv vs54, 320(SP)
stxv vs55, 336(SP)
stxv vs56, 352(SP)
stxv vs57, 368(SP)
stxv vs58, 384(SP)
stxv vs59, 400(SP)
stxv vs60, 416(SP)
stxv vs61, 432(SP)
stxv vs62, 448(SP)
stxv vs63, 464(SP)
std r0, FLINK_SAVE(SP)
#if defined(linux) || defined(__FreeBSD__)
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#ifdef TRMMKERNEL
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif
#endif
#include "zgemm_macros_power9.S"
slwi LDC, LDC, ZBASE_SHIFT
li PRE, 512
li r0, 0
#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
/* negate for this case as we will use addition of -1*(a+b) */
xvnegdp alpha_r,alpha_r
xvnegdp alpha_i,alpha_i
#endif
.align 4
#include "zgemm_logic_power9.S"
L999:
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
ld r0, FLINK_SAVE(SP)
lxv vs52, 288(SP)
lxv vs53, 304(SP)
lxv vs54, 320(SP)
lxv vs55, 336(SP)
lxv vs56, 352(SP)
lxv vs57, 368(SP)
lxv vs58, 384(SP)
lxv vs59, 400(SP)
mtlr r0
lxv vs60, 416(SP)
lxv vs61, 432(SP)
lxv vs62, 448(SP)
lxv vs63, 464(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif

15 more file diffs suppressed because they are too large

View File

@ -1,279 +1,279 @@
#include "common.h"
#include <stdint.h>
#include "strsm_kernel_8x4_haswell_R_common.h"
#define SOLVE_RN_m8n4 \
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "movq %2,%3; addq $32,%2;"\
SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1)\
SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1)\
SAVE_SOLUTION_m8n2(4,5,0)\
SOLVE_leri_m8n2(40,6,7,%1)\
SOLVE_ri_m8n2(56,6,7,%1)\
SAVE_SOLUTION_m8n2(6,7,64)
#define SOLVE_RN_m8n8 \
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "movq %2,%3; addq $32,%2;"\
SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4)\
SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4)\
SAVE_SOLUTION_m8n2(4,5,0)\
SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4)\
SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4)\
SAVE_SOLUTION_m8n2(6,7,64)\
SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4)\
SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4)\
SAVE_SOLUTION_m8n2(8,9,128)\
SOLVE_leri_m8n2(104,10,11,%1,%%r12,4)\
SOLVE_ri_m8n2(120,10,11,%1,%%r12,4)\
SAVE_SOLUTION_m8n2(10,11,192)
#define SOLVE_RN_m8n12 \
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "movq %2,%3; addq $32,%2;"\
SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4) SUBTRACT_m8n2(0,12,13,%1,%%r12,8) SUBTRACT_m8n2(8,14,15,%1,%%r12,8)\
SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4) SUBTRACT_m8n2(16,12,13,%1,%%r12,8) SUBTRACT_m8n2(24,14,15,%1,%%r12,8)\
SAVE_SOLUTION_m8n2(4,5,0)\
SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4) SUBTRACT_m8n2(32,12,13,%1,%%r12,8) SUBTRACT_m8n2(40,14,15,%1,%%r12,8)\
SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4) SUBTRACT_m8n2(48,12,13,%1,%%r12,8) SUBTRACT_m8n2(56,14,15,%1,%%r12,8)\
SAVE_SOLUTION_m8n2(6,7,64)\
SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4) SUBTRACT_m8n2(64,12,13,%1,%%r12,8) SUBTRACT_m8n2(72,14,15,%1,%%r12,8)\
SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4) SUBTRACT_m8n2(80,12,13,%1,%%r12,8) SUBTRACT_m8n2(88,14,15,%1,%%r12,8)\
SAVE_SOLUTION_m8n2(8,9,128)\
SOLVE_leri_m8n2(104,10,11,%1,%%r12,4) SUBTRACT_m8n2(96,12,13,%1,%%r12,8) SUBTRACT_m8n2(104,14,15,%1,%%r12,8)\
SOLVE_ri_m8n2(120,10,11,%1,%%r12,4) SUBTRACT_m8n2(112,12,13,%1,%%r12,8) SUBTRACT_m8n2(120,14,15,%1,%%r12,8)\
SAVE_SOLUTION_m8n2(10,11,192)\
SOLVE_leri_m8n2(128,12,13,%1,%%r12,8) SUBTRACT_m8n2(136,14,15,%1,%%r12,8)\
SOLVE_ri_m8n2(144,12,13,%1,%%r12,8) SUBTRACT_m8n2(152,14,15,%1,%%r12,8)\
SAVE_SOLUTION_m8n2(12,13,256)\
SOLVE_leri_m8n2(168,14,15,%1,%%r12,8)\
SOLVE_ri_m8n2(184,14,15,%1,%%r12,8)\
SAVE_SOLUTION_m8n2(14,15,320)
#define SOLVE_RN_m4n4 \
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "movq %2,%3; addq $16,%2;"\
SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1)\
SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1)\
SAVE_SOLUTION_m4n2(4,0)\
SOLVE_leri_m4n2(40,5,%1)\
SOLVE_ri_m4n2(56,5,%1)\
SAVE_SOLUTION_m4n2(5,32)
#define SOLVE_RN_m4n8 \
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "movq %2,%3; addq $16,%2;"\
SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4)\
SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4)\
SAVE_SOLUTION_m4n2(4,0)\
SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4)\
SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4)\
SAVE_SOLUTION_m4n2(5,32)\
SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4)\
SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4)\
SAVE_SOLUTION_m4n2(6,64)\
SOLVE_leri_m4n2(104,7,%1,%%r12,4)\
SOLVE_ri_m4n2(120,7,%1,%%r12,4)\
SAVE_SOLUTION_m4n2(7,96)
#define SOLVE_RN_m4n12 \
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "movq %2,%3; addq $16,%2;"\
SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4) SUBTRACT_m4n2(0,8,%1,%%r12,8) SUBTRACT_m4n2(8,9,%1,%%r12,8)\
SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4) SUBTRACT_m4n2(16,8,%1,%%r12,8) SUBTRACT_m4n2(24,9,%1,%%r12,8)\
SAVE_SOLUTION_m4n2(4,0)\
SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4) SUBTRACT_m4n2(32,8,%1,%%r12,8) SUBTRACT_m4n2(40,9,%1,%%r12,8)\
SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4) SUBTRACT_m4n2(48,8,%1,%%r12,8) SUBTRACT_m4n2(56,9,%1,%%r12,8)\
SAVE_SOLUTION_m4n2(5,32)\
SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4) SUBTRACT_m4n2(64,8,%1,%%r12,8) SUBTRACT_m4n2(72,9,%1,%%r12,8)\
SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4) SUBTRACT_m4n2(80,8,%1,%%r12,8) SUBTRACT_m4n2(88,9,%1,%%r12,8)\
SAVE_SOLUTION_m4n2(6,64)\
SOLVE_leri_m4n2(104,7,%1,%%r12,4) SUBTRACT_m4n2(96,8,%1,%%r12,8) SUBTRACT_m4n2(104,9,%1,%%r12,8)\
SOLVE_ri_m4n2(120,7,%1,%%r12,4) SUBTRACT_m4n2(112,8,%1,%%r12,8) SUBTRACT_m4n2(120,9,%1,%%r12,8)\
SAVE_SOLUTION_m4n2(7,96)\
SOLVE_leri_m4n2(128,8,%1,%%r12,8) SUBTRACT_m4n2(136,9,%1,%%r12,8)\
SOLVE_ri_m4n2(144,8,%1,%%r12,8) SUBTRACT_m4n2(152,9,%1,%%r12,8)\
SAVE_SOLUTION_m4n2(8,128)\
SOLVE_leri_m4n2(168,9,%1,%%r12,8)\
SOLVE_ri_m4n2(184,9,%1,%%r12,8)\
SAVE_SOLUTION_m4n2(9,160)
#define SOLVE_RN_m2n4 \
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "movq %2,%3; addq $8,%2;"\
SOLVE_col1_ltor_m2n4(0,4,5,%1)\
SOLVE_col2_ltor_m2n4(16,4,5,%1)\
SOLVE_col3_ltor_m2n4(32,4,5,%1)\
SOLVE_col4_ltor_m2n4(48,4,5,%1)\
SAVE_SOLUTION_m2n4(4,5,0)
#define SOLVE_RN_m2n8 \
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "movq %2,%3; addq $8,%2;"\
SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4)\
SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4)\
SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4)\
SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4)\
SAVE_SOLUTION_m2n4(4,5,0)\
SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4)\
SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4)\
SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4)\
SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4)\
SAVE_SOLUTION_m2n4(6,7,32)
#define SOLVE_RN_m2n12 \
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "movq %2,%3; addq $8,%2;"\
SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4) SUBTRACT_m2n4(0,8,9,%1,%%r12,8)\
SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4) SUBTRACT_m2n4(16,8,9,%1,%%r12,8)\
SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4) SUBTRACT_m2n4(32,8,9,%1,%%r12,8)\
SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4) SUBTRACT_m2n4(48,8,9,%1,%%r12,8)\
SAVE_SOLUTION_m2n4(4,5,0)\
SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4) SUBTRACT_m2n4(64,8,9,%1,%%r12,8)\
SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4) SUBTRACT_m2n4(80,8,9,%1,%%r12,8)\
SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4) SUBTRACT_m2n4(96,8,9,%1,%%r12,8)\
SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4) SUBTRACT_m2n4(112,8,9,%1,%%r12,8)\
SAVE_SOLUTION_m2n4(6,7,32)\
SOLVE_col1_ltor_m2n4(128,8,9,%1,%%r12,8)\
SOLVE_col2_ltor_m2n4(144,8,9,%1,%%r12,8)\
SOLVE_col3_ltor_m2n4(160,8,9,%1,%%r12,8)\
SOLVE_col4_ltor_m2n4(176,8,9,%1,%%r12,8)\
SAVE_SOLUTION_m2n4(8,9,64)
#define SOLVE_RN_m1n4 \
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "movq %2,%3; addq $4,%2;"\
SOLVE_col1_ltor_m1n4(0,4,%1)\
SOLVE_col2_ltor_m1n4(16,4,%1)\
SOLVE_col3_ltor_m1n4(32,4,%1)\
SOLVE_col4_ltor_m1n4(48,4,%1)\
SAVE_SOLUTION_m1n4(4,0)
#define SOLVE_RN_m1n8 \
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "movq %2,%3; addq $4,%2;"\
SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4)\
SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4)\
SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4)\
SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4)\
SAVE_SOLUTION_m1n4(4,0)\
SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4)\
SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4)\
SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4)\
SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4)\
SAVE_SOLUTION_m1n4(5,16)
#define SOLVE_RN_m1n12 \
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "movq %2,%3; addq $4,%2;"\
SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4) SUBTRACT_m1n4(0,6,%1,%%r12,8)\
SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4) SUBTRACT_m1n4(16,6,%1,%%r12,8)\
SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4) SUBTRACT_m1n4(32,6,%1,%%r12,8)\
SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4) SUBTRACT_m1n4(48,6,%1,%%r12,8)\
SAVE_SOLUTION_m1n4(4,0)\
SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4) SUBTRACT_m1n4(64,6,%1,%%r12,8)\
SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4) SUBTRACT_m1n4(80,6,%1,%%r12,8)\
SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4) SUBTRACT_m1n4(96,6,%1,%%r12,8)\
SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4) SUBTRACT_m1n4(112,6,%1,%%r12,8)\
SAVE_SOLUTION_m1n4(5,16)\
SOLVE_col1_ltor_m1n4(128,6,%1,%%r12,8)\
SOLVE_col2_ltor_m1n4(144,6,%1,%%r12,8)\
SOLVE_col3_ltor_m1n4(160,6,%1,%%r12,8)\
SOLVE_col4_ltor_m1n4(176,6,%1,%%r12,8)\
SAVE_SOLUTION_m1n4(6,32)
#define GEMM_RN_SIMPLE(mdim,ndim) \
"movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\
"testq %5,%5; jz 1"#mdim""#ndim"2f;"\
"1"#mdim""#ndim"1:\n\t"\
GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\
"1"#mdim""#ndim"2:\n\t"
#define GEMM_RN_m8n4 GEMM_RN_SIMPLE(8,4)
#define GEMM_RN_m8n8 GEMM_RN_SIMPLE(8,8)
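/* The m8n12 case gets a hand-unrolled k loop: eight rank-1 updates per iteration,
   with a prefetcht0 of the packed A panel every other update, plus a remainder loop. */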
#define GEMM_RN_m8n12 \
"movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\
"cmpq $8,%5; jb 18122f;"\
"18121:\n\t"\
GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\
"subq $8,%5; cmpq $8,%5; jnb 18121b;"\
"18122:\n\t"\
"testq %5,%5; jz 18124f;"\
"18123:\n\t"\
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\
"18124:\n\t"
#define GEMM_RN_m4n4 GEMM_RN_SIMPLE(4,4)
#define GEMM_RN_m4n8 GEMM_RN_SIMPLE(4,8)
#define GEMM_RN_m4n12 GEMM_RN_SIMPLE(4,12)
#define GEMM_RN_m2n4 GEMM_RN_SIMPLE(2,4)
#define GEMM_RN_m2n8 GEMM_RN_SIMPLE(2,8)
#define GEMM_RN_m2n12 GEMM_RN_SIMPLE(2,12)
#define GEMM_RN_m1n4 GEMM_RN_SIMPLE(1,4)
#define GEMM_RN_m1n8 GEMM_RN_SIMPLE(1,8)
#define GEMM_RN_m1n12 GEMM_RN_SIMPLE(1,12)
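/* COMPUTE(ndim): one sweep over the m dimension for an ndim-column panel. Operand
   mapping in the asm below: %0 a_ptr, %1 b_ptr, %2 c_ptr, %3 c_tmp, %4 ldc_bytes,
   %5 k_cnt, %6 K, %7 OFF, %10 M. r15/r14 cache the A/B panel cursors, r12 holds
   4*K bytes (so (%1,%%r12,4) reaches the packed B panel 4*K floats further on),
   r11 counts down m, and r13 = OFF gives the number of GEMM update steps applied
   before each triangular solve (the role usually called kk). */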
#define COMPUTE(ndim) {\
__asm__ __volatile__(\
"movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\
"cmpq $8,%%r11; jb "#ndim"772f;"\
#ndim"771:\n\t"\
GEMM_RN_m8n##ndim SOLVE_RN_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\
#ndim"772:\n\t"\
"testq $4,%%r11; jz "#ndim"773f;"\
GEMM_RN_m4n##ndim SOLVE_RN_m4n##ndim "subq $4,%%r11;"\
#ndim"773:\n\t"\
"testq $2,%%r11; jz "#ndim"774f;"\
GEMM_RN_m2n##ndim SOLVE_RN_m2n##ndim "subq $2,%%r11;"\
#ndim"774:\n\t"\
"testq $1,%%r11; jz "#ndim"775f;"\
GEMM_RN_m1n##ndim SOLVE_RN_m1n##ndim "subq $1,%%r11;"\
#ndim"775:\n\t"\
"movq %%r15,%0; movq %%r14,%1; vzeroupper;"\
:"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\
:"r11","r12","r13","r14","r15","cc","memory",\
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M; OFF += ndim;\
}
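/* Scalar right-side forward solve on an m x n block: column i of C is scaled by the
   packed diagonal entry b[i*n+i] (which the packing stage is expected to store
   pre-inverted, so the multiply implements a divide), the result is written back to
   both C and the packed A, and its contribution is subtracted from the later
   columns k > i. */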
static void solve_RN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT a0, b0;
int i, j, k;
for (i=0; i<n; i++) {
b0 = b[i*n+i];
for (j=0; j<m; j++) {
a0 = c[i*ldc+j] * b0;
a[i*m+j] = c[i*ldc+j] = a0;
for (k=i+1; k<n; k++) c[k*ldc+j] -= a0 * b[i*n+k];
}
}
}
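/* Edge path for one- and two-column remainders of n: for each 8/4/2/1-row strip of
   C it first applies the kk accumulated GEMM updates with GEMM_KERNEL_N and then
   solves the small triangular system with solve_RN. */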
static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) {
BLASLONG m_count = m, kk = offset; FLOAT *a_ptr = sa, *c_ptr = C;
for(;m_count>7;m_count-=8){
if(kk>0) GEMM_KERNEL_N(8,n,kk,-1.0,a_ptr,sb,c_ptr,ldc);
solve_RN(8,n,a_ptr+kk*8,sb+kk*n,c_ptr,ldc);
a_ptr += k * 8; c_ptr += 8;
}
for(;m_count>3;m_count-=4){
if(kk>0) GEMM_KERNEL_N(4,n,kk,-1.0,a_ptr,sb,c_ptr,ldc);
solve_RN(4,n,a_ptr+kk*4,sb+kk*n,c_ptr,ldc);
a_ptr += k * 4; c_ptr += 4;
}
for(;m_count>1;m_count-=2){
if(kk>0) GEMM_KERNEL_N(2,n,kk,-1.0,a_ptr,sb,c_ptr,ldc);
solve_RN(2,n,a_ptr+kk*2,sb+kk*n,c_ptr,ldc);
a_ptr += k * 2; c_ptr += 2;
}
if(m_count>0){
if(kk>0) GEMM_KERNEL_N(1,n,kk,-1.0,a_ptr,sb,c_ptr,ldc);
solve_RN(1,n,a_ptr+kk*1,sb+kk*n,c_ptr,ldc);
a_ptr += k * 1; c_ptr += 1;
}
}
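/* Driver: the n dimension is processed in panels of 12, 8 and 4 columns through the
   inline-assembly COMPUTE path, with COMPUTE_EDGE_1_nchunk handling a final one or
   two columns; OFF tracks the TRSM offset as the sweep advances. */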
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){
float *a_ptr = sa, *b_ptr = sb, *c_ptr = C, *c_tmp = C;
float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0};
uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)-offset, k_cnt = 0;
BLASLONG n_count = n;
for(;n_count>11;n_count-=12) COMPUTE(12)
for(;n_count>7;n_count-=8) COMPUTE(8)
for(;n_count>3;n_count-=4) COMPUTE(4)
for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); b_ptr += 2*k; c_ptr += ldc*2; OFF+=2;}
if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF);
return 0;
}
#include "common.h"
#include <stdint.h>
#include "strsm_kernel_8x4_haswell_R_common.h"
#define SOLVE_RN_m8n4 \
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "movq %2,%3; addq $32,%2;"\
SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1)\
SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1)\
SAVE_SOLUTION_m8n2(4,5,0)\
SOLVE_leri_m8n2(40,6,7,%1)\
SOLVE_ri_m8n2(56,6,7,%1)\
SAVE_SOLUTION_m8n2(6,7,64)
#define SOLVE_RN_m8n8 \
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "movq %2,%3; addq $32,%2;"\
SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4)\
SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4)\
SAVE_SOLUTION_m8n2(4,5,0)\
SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4)\
SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4)\
SAVE_SOLUTION_m8n2(6,7,64)\
SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4)\
SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4)\
SAVE_SOLUTION_m8n2(8,9,128)\
SOLVE_leri_m8n2(104,10,11,%1,%%r12,4)\
SOLVE_ri_m8n2(120,10,11,%1,%%r12,4)\
SAVE_SOLUTION_m8n2(10,11,192)
#define SOLVE_RN_m8n12 \
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "movq %2,%3; addq $32,%2;"\
SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4) SUBTRACT_m8n2(0,12,13,%1,%%r12,8) SUBTRACT_m8n2(8,14,15,%1,%%r12,8)\
SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4) SUBTRACT_m8n2(16,12,13,%1,%%r12,8) SUBTRACT_m8n2(24,14,15,%1,%%r12,8)\
SAVE_SOLUTION_m8n2(4,5,0)\
SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4) SUBTRACT_m8n2(32,12,13,%1,%%r12,8) SUBTRACT_m8n2(40,14,15,%1,%%r12,8)\
SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4) SUBTRACT_m8n2(48,12,13,%1,%%r12,8) SUBTRACT_m8n2(56,14,15,%1,%%r12,8)\
SAVE_SOLUTION_m8n2(6,7,64)\
SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4) SUBTRACT_m8n2(64,12,13,%1,%%r12,8) SUBTRACT_m8n2(72,14,15,%1,%%r12,8)\
SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4) SUBTRACT_m8n2(80,12,13,%1,%%r12,8) SUBTRACT_m8n2(88,14,15,%1,%%r12,8)\
SAVE_SOLUTION_m8n2(8,9,128)\
SOLVE_leri_m8n2(104,10,11,%1,%%r12,4) SUBTRACT_m8n2(96,12,13,%1,%%r12,8) SUBTRACT_m8n2(104,14,15,%1,%%r12,8)\
SOLVE_ri_m8n2(120,10,11,%1,%%r12,4) SUBTRACT_m8n2(112,12,13,%1,%%r12,8) SUBTRACT_m8n2(120,14,15,%1,%%r12,8)\
SAVE_SOLUTION_m8n2(10,11,192)\
SOLVE_leri_m8n2(128,12,13,%1,%%r12,8) SUBTRACT_m8n2(136,14,15,%1,%%r12,8)\
SOLVE_ri_m8n2(144,12,13,%1,%%r12,8) SUBTRACT_m8n2(152,14,15,%1,%%r12,8)\
SAVE_SOLUTION_m8n2(12,13,256)\
SOLVE_leri_m8n2(168,14,15,%1,%%r12,8)\
SOLVE_ri_m8n2(184,14,15,%1,%%r12,8)\
SAVE_SOLUTION_m8n2(14,15,320)
#define SOLVE_RN_m4n4 \
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "movq %2,%3; addq $16,%2;"\
SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1)\
SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1)\
SAVE_SOLUTION_m4n2(4,0)\
SOLVE_leri_m4n2(40,5,%1)\
SOLVE_ri_m4n2(56,5,%1)\
SAVE_SOLUTION_m4n2(5,32)
#define SOLVE_RN_m4n8 \
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "movq %2,%3; addq $16,%2;"\
SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4)\
SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4)\
SAVE_SOLUTION_m4n2(4,0)\
SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4)\
SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4)\
SAVE_SOLUTION_m4n2(5,32)\
SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4)\
SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4)\
SAVE_SOLUTION_m4n2(6,64)\
SOLVE_leri_m4n2(104,7,%1,%%r12,4)\
SOLVE_ri_m4n2(120,7,%1,%%r12,4)\
SAVE_SOLUTION_m4n2(7,96)
#define SOLVE_RN_m4n12 \
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "movq %2,%3; addq $16,%2;"\
SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4) SUBTRACT_m4n2(0,8,%1,%%r12,8) SUBTRACT_m4n2(8,9,%1,%%r12,8)\
SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4) SUBTRACT_m4n2(16,8,%1,%%r12,8) SUBTRACT_m4n2(24,9,%1,%%r12,8)\
SAVE_SOLUTION_m4n2(4,0)\
SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4) SUBTRACT_m4n2(32,8,%1,%%r12,8) SUBTRACT_m4n2(40,9,%1,%%r12,8)\
SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4) SUBTRACT_m4n2(48,8,%1,%%r12,8) SUBTRACT_m4n2(56,9,%1,%%r12,8)\
SAVE_SOLUTION_m4n2(5,32)\
SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4) SUBTRACT_m4n2(64,8,%1,%%r12,8) SUBTRACT_m4n2(72,9,%1,%%r12,8)\
SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4) SUBTRACT_m4n2(80,8,%1,%%r12,8) SUBTRACT_m4n2(88,9,%1,%%r12,8)\
SAVE_SOLUTION_m4n2(6,64)\
SOLVE_leri_m4n2(104,7,%1,%%r12,4) SUBTRACT_m4n2(96,8,%1,%%r12,8) SUBTRACT_m4n2(104,9,%1,%%r12,8)\
SOLVE_ri_m4n2(120,7,%1,%%r12,4) SUBTRACT_m4n2(112,8,%1,%%r12,8) SUBTRACT_m4n2(120,9,%1,%%r12,8)\
SAVE_SOLUTION_m4n2(7,96)\
SOLVE_leri_m4n2(128,8,%1,%%r12,8) SUBTRACT_m4n2(136,9,%1,%%r12,8)\
SOLVE_ri_m4n2(144,8,%1,%%r12,8) SUBTRACT_m4n2(152,9,%1,%%r12,8)\
SAVE_SOLUTION_m4n2(8,128)\
SOLVE_leri_m4n2(168,9,%1,%%r12,8)\
SOLVE_ri_m4n2(184,9,%1,%%r12,8)\
SAVE_SOLUTION_m4n2(9,160)
#define SOLVE_RN_m2n4 \
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "movq %2,%3; addq $8,%2;"\
SOLVE_col1_ltor_m2n4(0,4,5,%1)\
SOLVE_col2_ltor_m2n4(16,4,5,%1)\
SOLVE_col3_ltor_m2n4(32,4,5,%1)\
SOLVE_col4_ltor_m2n4(48,4,5,%1)\
SAVE_SOLUTION_m2n4(4,5,0)
#define SOLVE_RN_m2n8 \
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "movq %2,%3; addq $8,%2;"\
SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4)\
SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4)\
SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4)\
SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4)\
SAVE_SOLUTION_m2n4(4,5,0)\
SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4)\
SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4)\
SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4)\
SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4)\
SAVE_SOLUTION_m2n4(6,7,32)
#define SOLVE_RN_m2n12 \
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "movq %2,%3; addq $8,%2;"\
SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4) SUBTRACT_m2n4(0,8,9,%1,%%r12,8)\
SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4) SUBTRACT_m2n4(16,8,9,%1,%%r12,8)\
SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4) SUBTRACT_m2n4(32,8,9,%1,%%r12,8)\
SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4) SUBTRACT_m2n4(48,8,9,%1,%%r12,8)\
SAVE_SOLUTION_m2n4(4,5,0)\
SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4) SUBTRACT_m2n4(64,8,9,%1,%%r12,8)\
SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4) SUBTRACT_m2n4(80,8,9,%1,%%r12,8)\
SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4) SUBTRACT_m2n4(96,8,9,%1,%%r12,8)\
SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4) SUBTRACT_m2n4(112,8,9,%1,%%r12,8)\
SAVE_SOLUTION_m2n4(6,7,32)\
SOLVE_col1_ltor_m2n4(128,8,9,%1,%%r12,8)\
SOLVE_col2_ltor_m2n4(144,8,9,%1,%%r12,8)\
SOLVE_col3_ltor_m2n4(160,8,9,%1,%%r12,8)\
SOLVE_col4_ltor_m2n4(176,8,9,%1,%%r12,8)\
SAVE_SOLUTION_m2n4(8,9,64)
#define SOLVE_RN_m1n4 \
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "movq %2,%3; addq $4,%2;"\
SOLVE_col1_ltor_m1n4(0,4,%1)\
SOLVE_col2_ltor_m1n4(16,4,%1)\
SOLVE_col3_ltor_m1n4(32,4,%1)\
SOLVE_col4_ltor_m1n4(48,4,%1)\
SAVE_SOLUTION_m1n4(4,0)
#define SOLVE_RN_m1n8 \
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "movq %2,%3; addq $4,%2;"\
SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4)\
SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4)\
SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4)\
SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4)\
SAVE_SOLUTION_m1n4(4,0)\
SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4)\
SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4)\
SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4)\
SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4)\
SAVE_SOLUTION_m1n4(5,16)
#define SOLVE_RN_m1n12 \
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "movq %2,%3; addq $4,%2;"\
SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4) SUBTRACT_m1n4(0,6,%1,%%r12,8)\
SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4) SUBTRACT_m1n4(16,6,%1,%%r12,8)\
SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4) SUBTRACT_m1n4(32,6,%1,%%r12,8)\
SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4) SUBTRACT_m1n4(48,6,%1,%%r12,8)\
SAVE_SOLUTION_m1n4(4,0)\
SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4) SUBTRACT_m1n4(64,6,%1,%%r12,8)\
SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4) SUBTRACT_m1n4(80,6,%1,%%r12,8)\
SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4) SUBTRACT_m1n4(96,6,%1,%%r12,8)\
SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4) SUBTRACT_m1n4(112,6,%1,%%r12,8)\
SAVE_SOLUTION_m1n4(5,16)\
SOLVE_col1_ltor_m1n4(128,6,%1,%%r12,8)\
SOLVE_col2_ltor_m1n4(144,6,%1,%%r12,8)\
SOLVE_col3_ltor_m1n4(160,6,%1,%%r12,8)\
SOLVE_col4_ltor_m1n4(176,6,%1,%%r12,8)\
SAVE_SOLUTION_m1n4(6,32)
#define GEMM_RN_SIMPLE(mdim,ndim) \
"movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\
"testq %5,%5; jz 1"#mdim""#ndim"2f;"\
"1"#mdim""#ndim"1:\n\t"\
GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\
"1"#mdim""#ndim"2:\n\t"
#define GEMM_RN_m8n4 GEMM_RN_SIMPLE(8,4)
#define GEMM_RN_m8n8 GEMM_RN_SIMPLE(8,8)
#define GEMM_RN_m8n12 \
"movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\
"cmpq $8,%5; jb 18122f;"\
"18121:\n\t"\
GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\
"subq $8,%5; cmpq $8,%5; jnb 18121b;"\
"18122:\n\t"\
"testq %5,%5; jz 18124f;"\
"18123:\n\t"\
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\
"18124:\n\t"
#define GEMM_RN_m4n4 GEMM_RN_SIMPLE(4,4)
#define GEMM_RN_m4n8 GEMM_RN_SIMPLE(4,8)
#define GEMM_RN_m4n12 GEMM_RN_SIMPLE(4,12)
#define GEMM_RN_m2n4 GEMM_RN_SIMPLE(2,4)
#define GEMM_RN_m2n8 GEMM_RN_SIMPLE(2,8)
#define GEMM_RN_m2n12 GEMM_RN_SIMPLE(2,12)
#define GEMM_RN_m1n4 GEMM_RN_SIMPLE(1,4)
#define GEMM_RN_m1n8 GEMM_RN_SIMPLE(1,8)
#define GEMM_RN_m1n12 GEMM_RN_SIMPLE(1,12)
#define COMPUTE(ndim) {\
__asm__ __volatile__(\
"movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\
"cmpq $8,%%r11; jb "#ndim"772f;"\
#ndim"771:\n\t"\
GEMM_RN_m8n##ndim SOLVE_RN_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\
#ndim"772:\n\t"\
"testq $4,%%r11; jz "#ndim"773f;"\
GEMM_RN_m4n##ndim SOLVE_RN_m4n##ndim "subq $4,%%r11;"\
#ndim"773:\n\t"\
"testq $2,%%r11; jz "#ndim"774f;"\
GEMM_RN_m2n##ndim SOLVE_RN_m2n##ndim "subq $2,%%r11;"\
#ndim"774:\n\t"\
"testq $1,%%r11; jz "#ndim"775f;"\
GEMM_RN_m1n##ndim SOLVE_RN_m1n##ndim "subq $1,%%r11;"\
#ndim"775:\n\t"\
"movq %%r15,%0; movq %%r14,%1; vzeroupper;"\
:"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\
:"r11","r12","r13","r14","r15","cc","memory",\
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M; OFF += ndim;\
}
static void solve_RN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT a0, b0;
int i, j, k;
for (i=0; i<n; i++) {
b0 = b[i*n+i];
for (j=0; j<m; j++) {
a0 = c[i*ldc+j] * b0;
a[i*m+j] = c[i*ldc+j] = a0;
for (k=i+1; k<n; k++) c[k*ldc+j] -= a0 * b[i*n+k];
}
}
}
static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) {
BLASLONG m_count = m, kk = offset; FLOAT *a_ptr = sa, *c_ptr = C;
for(;m_count>7;m_count-=8){
if(kk>0) GEMM_KERNEL_N(8,n,kk,-1.0,a_ptr,sb,c_ptr,ldc);
solve_RN(8,n,a_ptr+kk*8,sb+kk*n,c_ptr,ldc);
a_ptr += k * 8; c_ptr += 8;
}
for(;m_count>3;m_count-=4){
if(kk>0) GEMM_KERNEL_N(4,n,kk,-1.0,a_ptr,sb,c_ptr,ldc);
solve_RN(4,n,a_ptr+kk*4,sb+kk*n,c_ptr,ldc);
a_ptr += k * 4; c_ptr += 4;
}
for(;m_count>1;m_count-=2){
if(kk>0) GEMM_KERNEL_N(2,n,kk,-1.0,a_ptr,sb,c_ptr,ldc);
solve_RN(2,n,a_ptr+kk*2,sb+kk*n,c_ptr,ldc);
a_ptr += k * 2; c_ptr += 2;
}
if(m_count>0){
if(kk>0) GEMM_KERNEL_N(1,n,kk,-1.0,a_ptr,sb,c_ptr,ldc);
solve_RN(1,n,a_ptr+kk*1,sb+kk*n,c_ptr,ldc);
a_ptr += k * 1; c_ptr += 1;
}
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){
float *a_ptr = sa, *b_ptr = sb, *c_ptr = C, *c_tmp = C;
float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0};
uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)-offset, k_cnt = 0;
BLASLONG n_count = n;
for(;n_count>11;n_count-=12) COMPUTE(12)
for(;n_count>7;n_count-=8) COMPUTE(8)
for(;n_count>3;n_count-=4) COMPUTE(4)
for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); b_ptr += 2*k; c_ptr += ldc*2; OFF+=2;}
if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF);
return 0;
}

View File

@ -1,281 +1,281 @@
#include "common.h"
#include <stdint.h>
#include "strsm_kernel_8x4_haswell_R_common.h"
#define SOLVE_RT_m8n4 \
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\
SOLVE_rile_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\
SOLVE_le_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\
SAVE_SOLUTION_m8n2(6,7,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-48,4,5,%1)\
SOLVE_le_m8n2(-64,4,5,%1)\
SAVE_SOLUTION_m8n2(4,5,-128)
#define SOLVE_RT_m8n8 \
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\
SOLVE_rile_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\
SOLVE_le_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\
SAVE_SOLUTION_m8n2(10,11,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\
SOLVE_le_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\
SAVE_SOLUTION_m8n2(8,9,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\
SOLVE_le_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\
SAVE_SOLUTION_m8n2(6,7,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-112,4,5,%1)\
SOLVE_le_m8n2(-128,4,5,%1)\
SAVE_SOLUTION_m8n2(4,5,-256)
#define SOLVE_RT_m8n12 \
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\
SOLVE_rile_m8n2(-8,14,15,%1,%%r12,8) SUBTRACT_m8n2(-16,12,13,%1,%%r12,8) SUBTRACT_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\
SOLVE_le_m8n2(-24,14,15,%1,%%r12,8) SUBTRACT_m8n2(-32,12,13,%1,%%r12,8) SUBTRACT_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\
SAVE_SOLUTION_m8n2(14,15,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-48,12,13,%1,%%r12,8) SUBTRACT_m8n2(-40,10,11,%1,%%r12,4) SUBTRACT_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\
SOLVE_le_m8n2(-64,12,13,%1,%%r12,8) SUBTRACT_m8n2(-56,10,11,%1,%%r12,4) SUBTRACT_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\
SAVE_SOLUTION_m8n2(12,13,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-72,10,11,%1,%%r12,4) SUBTRACT_m8n2(-80,8,9,%1,%%r12,4) SUBTRACT_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\
SOLVE_le_m8n2(-88,10,11,%1,%%r12,4) SUBTRACT_m8n2(-96,8,9,%1,%%r12,4) SUBTRACT_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\
SAVE_SOLUTION_m8n2(10,11,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-112,8,9,%1,%%r12,4) SUBTRACT_m8n2(-104,6,7,%1) SUBTRACT_m8n2(-112,4,5,%1)\
SOLVE_le_m8n2(-128,8,9,%1,%%r12,4) SUBTRACT_m8n2(-120,6,7,%1) SUBTRACT_m8n2(-128,4,5,%1)\
SAVE_SOLUTION_m8n2(8,9,-256) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-136,6,7,%1) SUBTRACT_m8n2(-144,4,5,%1)\
SOLVE_le_m8n2(-152,6,7,%1) SUBTRACT_m8n2(-160,4,5,%1)\
SAVE_SOLUTION_m8n2(6,7,-320) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-176,4,5,%1)\
SOLVE_le_m8n2(-192,4,5,%1)\
SAVE_SOLUTION_m8n2(4,5,-384)
#define SOLVE_RT_m4n4 \
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\
SOLVE_rile_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\
SOLVE_le_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\
SAVE_SOLUTION_m4n2(5,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-48,4,%1)\
SOLVE_le_m4n2(-64,4,%1)\
SAVE_SOLUTION_m4n2(4,-64)
#define SOLVE_RT_m4n8 \
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\
SOLVE_rile_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\
SOLVE_le_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\
SAVE_SOLUTION_m4n2(7,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\
SOLVE_le_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\
SAVE_SOLUTION_m4n2(6,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\
SOLVE_le_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\
SAVE_SOLUTION_m4n2(5,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-112,4,%1)\
SOLVE_le_m4n2(-128,4,%1)\
SAVE_SOLUTION_m4n2(4,-128)
#define SOLVE_RT_m4n12 \
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\
SOLVE_rile_m4n2(-8,9,%1,%%r12,8) SUBTRACT_m4n2(-16,8,%1,%%r12,8) SUBTRACT_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\
SOLVE_le_m4n2(-24,9,%1,%%r12,8) SUBTRACT_m4n2(-32,8,%1,%%r12,8) SUBTRACT_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\
SAVE_SOLUTION_m4n2(9,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-48,8,%1,%%r12,8) SUBTRACT_m4n2(-40,7,%1,%%r12,4) SUBTRACT_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\
SOLVE_le_m4n2(-64,8,%1,%%r12,8) SUBTRACT_m4n2(-56,7,%1,%%r12,4) SUBTRACT_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\
SAVE_SOLUTION_m4n2(8,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-72,7,%1,%%r12,4) SUBTRACT_m4n2(-80,6,%1,%%r12,4) SUBTRACT_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\
SOLVE_le_m4n2(-88,7,%1,%%r12,4) SUBTRACT_m4n2(-96,6,%1,%%r12,4) SUBTRACT_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\
SAVE_SOLUTION_m4n2(7,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-112,6,%1,%%r12,4) SUBTRACT_m4n2(-104,5,%1) SUBTRACT_m4n2(-112,4,%1)\
SOLVE_le_m4n2(-128,6,%1,%%r12,4) SUBTRACT_m4n2(-120,5,%1) SUBTRACT_m4n2(-128,4,%1)\
SAVE_SOLUTION_m4n2(6,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-136,5,%1) SUBTRACT_m4n2(-144,4,%1)\
SOLVE_le_m4n2(-152,5,%1) SUBTRACT_m4n2(-160,4,%1)\
SAVE_SOLUTION_m4n2(5,-160) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-176,4,%1)\
SOLVE_le_m4n2(-192,4,%1)\
SAVE_SOLUTION_m4n2(4,-192)
#define SOLVE_RT_m2n4 \
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\
SOLVE_col4_rtol_m2n4(-16,4,5,%1)\
SOLVE_col3_rtol_m2n4(-32,4,5,%1)\
SOLVE_col2_rtol_m2n4(-48,4,5,%1)\
SOLVE_col1_rtol_m2n4(-64,4,5,%1)\
SAVE_SOLUTION_m2n4(4,5,-32)
#define SOLVE_RT_m2n8 \
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\
SOLVE_col4_rtol_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\
SOLVE_col3_rtol_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\
SOLVE_col2_rtol_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\
SOLVE_col1_rtol_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\
SAVE_SOLUTION_m2n4(6,7,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\
SOLVE_col4_rtol_m2n4(-80,4,5,%1)\
SOLVE_col3_rtol_m2n4(-96,4,5,%1)\
SOLVE_col2_rtol_m2n4(-112,4,5,%1)\
SOLVE_col1_rtol_m2n4(-128,4,5,%1)\
SAVE_SOLUTION_m2n4(4,5,-64)
#define SOLVE_RT_m2n12 \
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\
SOLVE_col4_rtol_m2n4(-16,8,9,%1,%%r12,8) SUBTRACT_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\
SOLVE_col3_rtol_m2n4(-32,8,9,%1,%%r12,8) SUBTRACT_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\
SOLVE_col2_rtol_m2n4(-48,8,9,%1,%%r12,8) SUBTRACT_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\
SOLVE_col1_rtol_m2n4(-64,8,9,%1,%%r12,8) SUBTRACT_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\
SAVE_SOLUTION_m2n4(8,9,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\
SOLVE_col4_rtol_m2n4(-80,6,7,%1,%%r12,4) SUBTRACT_m2n4(-80,4,5,%1)\
SOLVE_col3_rtol_m2n4(-96,6,7,%1,%%r12,4) SUBTRACT_m2n4(-96,4,5,%1)\
SOLVE_col2_rtol_m2n4(-112,6,7,%1,%%r12,4) SUBTRACT_m2n4(-112,4,5,%1)\
SOLVE_col1_rtol_m2n4(-128,6,7,%1,%%r12,4) SUBTRACT_m2n4(-128,4,5,%1)\
SAVE_SOLUTION_m2n4(6,7,-64) "negq %4; leaq (%3,%4,8),%3; negq %4;"\
SOLVE_col4_rtol_m2n4(-144,4,5,%1)\
SOLVE_col3_rtol_m2n4(-160,4,5,%1)\
SOLVE_col2_rtol_m2n4(-176,4,5,%1)\
SOLVE_col1_rtol_m2n4(-192,4,5,%1)\
SAVE_SOLUTION_m2n4(4,5,-96)
#define SOLVE_RT_m1n4 \
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\
SOLVE_col4_rtol_m1n4(-16,4,%1)\
SOLVE_col3_rtol_m1n4(-32,4,%1)\
SOLVE_col2_rtol_m1n4(-48,4,%1)\
SOLVE_col1_rtol_m1n4(-64,4,%1)\
SAVE_SOLUTION_m1n4(4,-16)
#define SOLVE_RT_m1n8 \
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\
SOLVE_col4_rtol_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\
SOLVE_col3_rtol_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\
SOLVE_col2_rtol_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\
SOLVE_col1_rtol_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\
SAVE_SOLUTION_m1n4(5,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\
SOLVE_col4_rtol_m1n4(-80,4,%1)\
SOLVE_col3_rtol_m1n4(-96,4,%1)\
SOLVE_col2_rtol_m1n4(-112,4,%1)\
SOLVE_col1_rtol_m1n4(-128,4,%1)\
SAVE_SOLUTION_m1n4(4,-32)
#define SOLVE_RT_m1n12 \
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\
SOLVE_col4_rtol_m1n4(-16,6,%1,%%r12,8) SUBTRACT_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\
SOLVE_col3_rtol_m1n4(-32,6,%1,%%r12,8) SUBTRACT_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\
SOLVE_col2_rtol_m1n4(-48,6,%1,%%r12,8) SUBTRACT_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\
SOLVE_col1_rtol_m1n4(-64,6,%1,%%r12,8) SUBTRACT_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\
SAVE_SOLUTION_m1n4(6,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\
SOLVE_col4_rtol_m1n4(-80,5,%1,%%r12,4) SUBTRACT_m1n4(-80,4,%1)\
SOLVE_col3_rtol_m1n4(-96,5,%1,%%r12,4) SUBTRACT_m1n4(-96,4,%1)\
SOLVE_col2_rtol_m1n4(-112,5,%1,%%r12,4) SUBTRACT_m1n4(-112,4,%1)\
SOLVE_col1_rtol_m1n4(-128,5,%1,%%r12,4) SUBTRACT_m1n4(-128,4,%1)\
SAVE_SOLUTION_m1n4(5,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\
SOLVE_col4_rtol_m1n4(-144,4,%1)\
SOLVE_col3_rtol_m1n4(-160,4,%1)\
SOLVE_col2_rtol_m1n4(-176,4,%1)\
SOLVE_col1_rtol_m1n4(-192,4,%1)\
SAVE_SOLUTION_m1n4(4,-48)
/* r14 = b_tail, r15 = a_tail, r13 = k-kk */
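/* GEMM_RT_*: accumulate -(A*B) for the k-kk trailing updates of one m-chunk, stepping
   backwards from the tails of the packed A and B panels (r15 / r14). The m8n12 variant
   unrolls the k loop by 8 and adds prefetches; the accumulators are combined with C
   later by the GEMM_SUM_REORDER_* macros. */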
#define GEMM_RT_SIMPLE(mdim,ndim) \
"leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\
"testq %5,%5; jz 1"#mdim""#ndim"2f;"\
"1"#mdim""#ndim"1:\n\t"\
"subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 1"#mdim""#ndim"1b;"\
"1"#mdim""#ndim"2:\n\t"
#define GEMM_RT_m8n4 GEMM_RT_SIMPLE(8,4)
#define GEMM_RT_m8n8 GEMM_RT_SIMPLE(8,8)
#define GEMM_RT_m8n12 \
"leaq (%%r15,%%r12,8),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\
"cmpq $8,%5; jb 18122f;"\
"18121:\n\t"\
"prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"subq $8,%5; cmpq $8,%5; jnb 18121b;"\
"18122:\n\t"\
"testq %5,%5; jz 18124f;"\
"18123:\n\t"\
"subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 18123b;"\
"18124:\n\t"
#define GEMM_RT_m4n4 GEMM_RT_SIMPLE(4,4)
#define GEMM_RT_m4n8 GEMM_RT_SIMPLE(4,8)
#define GEMM_RT_m4n12 GEMM_RT_SIMPLE(4,12)
#define GEMM_RT_m2n4 GEMM_RT_SIMPLE(2,4)
#define GEMM_RT_m2n8 GEMM_RT_SIMPLE(2,8)
#define GEMM_RT_m2n12 GEMM_RT_SIMPLE(2,12)
#define GEMM_RT_m1n4 GEMM_RT_SIMPLE(1,4)
#define GEMM_RT_m1n8 GEMM_RT_SIMPLE(1,8)
#define GEMM_RT_m1n12 GEMM_RT_SIMPLE(1,12)
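/* COMPUTE(ndim): inline-assembly driver for a block of ndim columns. r11 holds the m
   counter; rows are consumed in chunks of 8/4/2/1, each chunk running the trailing
   GEMM_RT_* update followed by the matching SOLVE_RT_* back-substitution. */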
#define COMPUTE(ndim) {\
b_ptr -= (ndim-4)*K; c_ptr -= ndim * ldc;\
__asm__ __volatile__(\
"movq %0,%%r15; movq %6,%%r13; subq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %1,%%r14; movq %10,%%r11;"\
"cmpq $8,%%r11; jb "#ndim"772f;"\
#ndim"771:\n\t"\
GEMM_RT_m8n##ndim SOLVE_RT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\
#ndim"772:\n\t"\
"testq $4,%%r11; jz "#ndim"773f;"\
GEMM_RT_m4n##ndim SOLVE_RT_m4n##ndim "subq $4,%%r11;"\
#ndim"773:\n\t"\
"testq $2,%%r11; jz "#ndim"774f;"\
GEMM_RT_m2n##ndim SOLVE_RT_m2n##ndim "subq $2,%%r11;"\
#ndim"774:\n\t"\
"testq $1,%%r11; jz "#ndim"775f;"\
GEMM_RT_m1n##ndim SOLVE_RT_m1n##ndim "subq $1,%%r11;"\
#ndim"775:\n\t"\
"movq %%r15,%0; movq %%r14,%1; vzeroupper;"\
:"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\
:"r11","r12","r13","r14","r15","cc","memory",\
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
a_ptr -= M * K; b_ptr -= 4 * K; c_ptr -= M; OFF -= ndim;\
}
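/* Scalar solver used by the edge path below: columns are processed from last to first,
   the packed diagonal entry b[i*n+i] is applied as a multiplier (it is expected to
   hold the reciprocal of the diagonal), the solved column is written back to both the
   packed A panel and C, and its contribution is subtracted from the remaining columns. */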
static void solve_RT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc){
  FLOAT a0, b0;
  int i, j, k;
  for (i=n-1;i>=0;i--) {
    b0 = b[i*n+i];
    for (j=0;j<m;j++) {
      a0 = c[i*ldc+j] * b0;
      a[i*m+j] = c[i*ldc+j] = a0;
      for (k=0;k<i;k++) c[k*ldc+j] -= a0 * b[i*n+k];
    }
  }
}
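/* Edge path for the n remainder (n = 1 or 2 here): walks m in chunks of 8/4/2/1,
   applies the trailing k-kk update with the generic GEMM_KERNEL_N, then calls the
   scalar solve_RT on each chunk. */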
static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) {
  BLASLONG m_count = m, kk = offset; FLOAT *a_ptr = sa, *c_ptr = C;
  for(;m_count>7;m_count-=8){
    if(k-kk>0) GEMM_KERNEL_N(8,n,k-kk,-1.0,a_ptr+kk*8,sb+kk*n,c_ptr,ldc);
    solve_RT(8,n,a_ptr+(kk-n)*8,sb+(kk-n)*n,c_ptr,ldc);
    a_ptr += k * 8; c_ptr += 8;
  }
  for(;m_count>3;m_count-=4){
    if(k-kk>0) GEMM_KERNEL_N(4,n,k-kk,-1.0,a_ptr+kk*4,sb+kk*n,c_ptr,ldc);
    solve_RT(4,n,a_ptr+(kk-n)*4,sb+(kk-n)*n,c_ptr,ldc);
    a_ptr += k * 4; c_ptr += 4;
  }
  for(;m_count>1;m_count-=2){
    if(k-kk>0) GEMM_KERNEL_N(2,n,k-kk,-1.0,a_ptr+kk*2,sb+kk*n,c_ptr,ldc);
    solve_RT(2,n,a_ptr+(kk-n)*2,sb+(kk-n)*n,c_ptr,ldc);
    a_ptr += k * 2; c_ptr += 2;
  }
  if(m_count>0){
    if(k-kk>0) GEMM_KERNEL_N(1,n,k-kk,-1.0,a_ptr+kk*1,sb+kk*n,c_ptr,ldc);
    solve_RT(1,n,a_ptr+(kk-n)*1,sb+(kk-n)*n,c_ptr,ldc);
    a_ptr += k * 1; c_ptr += 1;
  }
}
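/* Kernel entry: the n%4 remainder columns are handled first by the scalar edge path,
   then the remaining columns are processed right-to-left in blocks of 12/8/4 by the
   assembly COMPUTE macro (b_ptr and c_ptr start past the end and are stepped back). */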
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){
  float *a_ptr = sa, *b_ptr = sb+n*k, *c_ptr = C+n*ldc, *c_tmp = C;
  float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
  float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0};
  uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)(n-offset), k_cnt = 0;
  BLASLONG n_count = n;
  if(n&1){b_ptr-=k; c_ptr-=ldc; COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF--; n_count--;}
  if(n&2){b_ptr-=k*2; c_ptr-=ldc*2; COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF-=2; n_count-=2;}
  for(;n_count>11;n_count-=12) COMPUTE(12)
  for(;n_count>7;n_count-=8) COMPUTE(8)
  for(;n_count>3;n_count-=4) COMPUTE(4)
  return 0;
}
#include "common.h"
#include <stdint.h>
#include "strsm_kernel_8x4_haswell_R_common.h"
#define SOLVE_RT_m8n4 \
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\
SOLVE_rile_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\
SOLVE_le_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\
SAVE_SOLUTION_m8n2(6,7,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-48,4,5,%1)\
SOLVE_le_m8n2(-64,4,5,%1)\
SAVE_SOLUTION_m8n2(4,5,-128)
#define SOLVE_RT_m8n8 \
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\
SOLVE_rile_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\
SOLVE_le_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\
SAVE_SOLUTION_m8n2(10,11,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\
SOLVE_le_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\
SAVE_SOLUTION_m8n2(8,9,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\
SOLVE_le_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\
SAVE_SOLUTION_m8n2(6,7,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-112,4,5,%1)\
SOLVE_le_m8n2(-128,4,5,%1)\
SAVE_SOLUTION_m8n2(4,5,-256)
#define SOLVE_RT_m8n12 \
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\
SOLVE_rile_m8n2(-8,14,15,%1,%%r12,8) SUBTRACT_m8n2(-16,12,13,%1,%%r12,8) SUBTRACT_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\
SOLVE_le_m8n2(-24,14,15,%1,%%r12,8) SUBTRACT_m8n2(-32,12,13,%1,%%r12,8) SUBTRACT_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\
SAVE_SOLUTION_m8n2(14,15,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-48,12,13,%1,%%r12,8) SUBTRACT_m8n2(-40,10,11,%1,%%r12,4) SUBTRACT_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\
SOLVE_le_m8n2(-64,12,13,%1,%%r12,8) SUBTRACT_m8n2(-56,10,11,%1,%%r12,4) SUBTRACT_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\
SAVE_SOLUTION_m8n2(12,13,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-72,10,11,%1,%%r12,4) SUBTRACT_m8n2(-80,8,9,%1,%%r12,4) SUBTRACT_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\
SOLVE_le_m8n2(-88,10,11,%1,%%r12,4) SUBTRACT_m8n2(-96,8,9,%1,%%r12,4) SUBTRACT_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\
SAVE_SOLUTION_m8n2(10,11,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-112,8,9,%1,%%r12,4) SUBTRACT_m8n2(-104,6,7,%1) SUBTRACT_m8n2(-112,4,5,%1)\
SOLVE_le_m8n2(-128,8,9,%1,%%r12,4) SUBTRACT_m8n2(-120,6,7,%1) SUBTRACT_m8n2(-128,4,5,%1)\
SAVE_SOLUTION_m8n2(8,9,-256) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-136,6,7,%1) SUBTRACT_m8n2(-144,4,5,%1)\
SOLVE_le_m8n2(-152,6,7,%1) SUBTRACT_m8n2(-160,4,5,%1)\
SAVE_SOLUTION_m8n2(6,7,-320) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-176,4,5,%1)\
SOLVE_le_m8n2(-192,4,5,%1)\
SAVE_SOLUTION_m8n2(4,5,-384)
#define SOLVE_RT_m4n4 \
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\
SOLVE_rile_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\
SOLVE_le_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\
SAVE_SOLUTION_m4n2(5,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-48,4,%1)\
SOLVE_le_m4n2(-64,4,%1)\
SAVE_SOLUTION_m4n2(4,-64)
#define SOLVE_RT_m4n8 \
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\
SOLVE_rile_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\
SOLVE_le_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\
SAVE_SOLUTION_m4n2(7,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\
SOLVE_le_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\
SAVE_SOLUTION_m4n2(6,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\
SOLVE_le_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\
SAVE_SOLUTION_m4n2(5,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-112,4,%1)\
SOLVE_le_m4n2(-128,4,%1)\
SAVE_SOLUTION_m4n2(4,-128)
#define SOLVE_RT_m4n12 \
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\
SOLVE_rile_m4n2(-8,9,%1,%%r12,8) SUBTRACT_m4n2(-16,8,%1,%%r12,8) SUBTRACT_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\
SOLVE_le_m4n2(-24,9,%1,%%r12,8) SUBTRACT_m4n2(-32,8,%1,%%r12,8) SUBTRACT_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\
SAVE_SOLUTION_m4n2(9,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-48,8,%1,%%r12,8) SUBTRACT_m4n2(-40,7,%1,%%r12,4) SUBTRACT_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\
SOLVE_le_m4n2(-64,8,%1,%%r12,8) SUBTRACT_m4n2(-56,7,%1,%%r12,4) SUBTRACT_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\
SAVE_SOLUTION_m4n2(8,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-72,7,%1,%%r12,4) SUBTRACT_m4n2(-80,6,%1,%%r12,4) SUBTRACT_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\
SOLVE_le_m4n2(-88,7,%1,%%r12,4) SUBTRACT_m4n2(-96,6,%1,%%r12,4) SUBTRACT_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\
SAVE_SOLUTION_m4n2(7,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-112,6,%1,%%r12,4) SUBTRACT_m4n2(-104,5,%1) SUBTRACT_m4n2(-112,4,%1)\
SOLVE_le_m4n2(-128,6,%1,%%r12,4) SUBTRACT_m4n2(-120,5,%1) SUBTRACT_m4n2(-128,4,%1)\
SAVE_SOLUTION_m4n2(6,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-136,5,%1) SUBTRACT_m4n2(-144,4,%1)\
SOLVE_le_m4n2(-152,5,%1) SUBTRACT_m4n2(-160,4,%1)\
SAVE_SOLUTION_m4n2(5,-160) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-176,4,%1)\
SOLVE_le_m4n2(-192,4,%1)\
SAVE_SOLUTION_m4n2(4,-192)
#define SOLVE_RT_m2n4 \
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\
SOLVE_col4_rtol_m2n4(-16,4,5,%1)\
SOLVE_col3_rtol_m2n4(-32,4,5,%1)\
SOLVE_col2_rtol_m2n4(-48,4,5,%1)\
SOLVE_col1_rtol_m2n4(-64,4,5,%1)\
SAVE_SOLUTION_m2n4(4,5,-32)
#define SOLVE_RT_m2n8 \
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\
SOLVE_col4_rtol_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\
SOLVE_col3_rtol_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\
SOLVE_col2_rtol_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\
SOLVE_col1_rtol_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\
SAVE_SOLUTION_m2n4(6,7,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\
SOLVE_col4_rtol_m2n4(-80,4,5,%1)\
SOLVE_col3_rtol_m2n4(-96,4,5,%1)\
SOLVE_col2_rtol_m2n4(-112,4,5,%1)\
SOLVE_col1_rtol_m2n4(-128,4,5,%1)\
SAVE_SOLUTION_m2n4(4,5,-64)
#define SOLVE_RT_m2n12 \
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\
SOLVE_col4_rtol_m2n4(-16,8,9,%1,%%r12,8) SUBTRACT_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\
SOLVE_col3_rtol_m2n4(-32,8,9,%1,%%r12,8) SUBTRACT_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\
SOLVE_col2_rtol_m2n4(-48,8,9,%1,%%r12,8) SUBTRACT_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\
SOLVE_col1_rtol_m2n4(-64,8,9,%1,%%r12,8) SUBTRACT_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\
SAVE_SOLUTION_m2n4(8,9,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\
SOLVE_col4_rtol_m2n4(-80,6,7,%1,%%r12,4) SUBTRACT_m2n4(-80,4,5,%1)\
SOLVE_col3_rtol_m2n4(-96,6,7,%1,%%r12,4) SUBTRACT_m2n4(-96,4,5,%1)\
SOLVE_col2_rtol_m2n4(-112,6,7,%1,%%r12,4) SUBTRACT_m2n4(-112,4,5,%1)\
SOLVE_col1_rtol_m2n4(-128,6,7,%1,%%r12,4) SUBTRACT_m2n4(-128,4,5,%1)\
SAVE_SOLUTION_m2n4(6,7,-64) "negq %4; leaq (%3,%4,8),%3; negq %4;"\
SOLVE_col4_rtol_m2n4(-144,4,5,%1)\
SOLVE_col3_rtol_m2n4(-160,4,5,%1)\
SOLVE_col2_rtol_m2n4(-176,4,5,%1)\
SOLVE_col1_rtol_m2n4(-192,4,5,%1)\
SAVE_SOLUTION_m2n4(4,5,-96)
#define SOLVE_RT_m1n4 \
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\
SOLVE_col4_rtol_m1n4(-16,4,%1)\
SOLVE_col3_rtol_m1n4(-32,4,%1)\
SOLVE_col2_rtol_m1n4(-48,4,%1)\
SOLVE_col1_rtol_m1n4(-64,4,%1)\
SAVE_SOLUTION_m1n4(4,-16)
#define SOLVE_RT_m1n8 \
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\
SOLVE_col4_rtol_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\
SOLVE_col3_rtol_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\
SOLVE_col2_rtol_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\
SOLVE_col1_rtol_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\
SAVE_SOLUTION_m1n4(5,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\
SOLVE_col4_rtol_m1n4(-80,4,%1)\
SOLVE_col3_rtol_m1n4(-96,4,%1)\
SOLVE_col2_rtol_m1n4(-112,4,%1)\
SOLVE_col1_rtol_m1n4(-128,4,%1)\
SAVE_SOLUTION_m1n4(4,-32)
#define SOLVE_RT_m1n12 \
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\
SOLVE_col4_rtol_m1n4(-16,6,%1,%%r12,8) SUBTRACT_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\
SOLVE_col3_rtol_m1n4(-32,6,%1,%%r12,8) SUBTRACT_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\
SOLVE_col2_rtol_m1n4(-48,6,%1,%%r12,8) SUBTRACT_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\
SOLVE_col1_rtol_m1n4(-64,6,%1,%%r12,8) SUBTRACT_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\
SAVE_SOLUTION_m1n4(6,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\
SOLVE_col4_rtol_m1n4(-80,5,%1,%%r12,4) SUBTRACT_m1n4(-80,4,%1)\
SOLVE_col3_rtol_m1n4(-96,5,%1,%%r12,4) SUBTRACT_m1n4(-96,4,%1)\
SOLVE_col2_rtol_m1n4(-112,5,%1,%%r12,4) SUBTRACT_m1n4(-112,4,%1)\
SOLVE_col1_rtol_m1n4(-128,5,%1,%%r12,4) SUBTRACT_m1n4(-128,4,%1)\
SAVE_SOLUTION_m1n4(5,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\
SOLVE_col4_rtol_m1n4(-144,4,%1)\
SOLVE_col3_rtol_m1n4(-160,4,%1)\
SOLVE_col2_rtol_m1n4(-176,4,%1)\
SOLVE_col1_rtol_m1n4(-192,4,%1)\
SAVE_SOLUTION_m1n4(4,-48)
/* r14 = b_tail, r15 = a_tail, r13 = k-kk */
#define GEMM_RT_SIMPLE(mdim,ndim) \
"leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\
"testq %5,%5; jz 1"#mdim""#ndim"2f;"\
"1"#mdim""#ndim"1:\n\t"\
"subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 1"#mdim""#ndim"1b;"\
"1"#mdim""#ndim"2:\n\t"
#define GEMM_RT_m8n4 GEMM_RT_SIMPLE(8,4)
#define GEMM_RT_m8n8 GEMM_RT_SIMPLE(8,8)
#define GEMM_RT_m8n12 \
"leaq (%%r15,%%r12,8),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\
"cmpq $8,%5; jb 18122f;"\
"18121:\n\t"\
"prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"subq $8,%5; cmpq $8,%5; jnb 18121b;"\
"18122:\n\t"\
"testq %5,%5; jz 18124f;"\
"18123:\n\t"\
"subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 18123b;"\
"18124:\n\t"
#define GEMM_RT_m4n4 GEMM_RT_SIMPLE(4,4)
#define GEMM_RT_m4n8 GEMM_RT_SIMPLE(4,8)
#define GEMM_RT_m4n12 GEMM_RT_SIMPLE(4,12)
#define GEMM_RT_m2n4 GEMM_RT_SIMPLE(2,4)
#define GEMM_RT_m2n8 GEMM_RT_SIMPLE(2,8)
#define GEMM_RT_m2n12 GEMM_RT_SIMPLE(2,12)
#define GEMM_RT_m1n4 GEMM_RT_SIMPLE(1,4)
#define GEMM_RT_m1n8 GEMM_RT_SIMPLE(1,8)
#define GEMM_RT_m1n12 GEMM_RT_SIMPLE(1,12)
#define COMPUTE(ndim) {\
b_ptr -= (ndim-4)*K; c_ptr -= ndim * ldc;\
__asm__ __volatile__(\
"movq %0,%%r15; movq %6,%%r13; subq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %1,%%r14; movq %10,%%r11;"\
"cmpq $8,%%r11; jb "#ndim"772f;"\
#ndim"771:\n\t"\
GEMM_RT_m8n##ndim SOLVE_RT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\
#ndim"772:\n\t"\
"testq $4,%%r11; jz "#ndim"773f;"\
GEMM_RT_m4n##ndim SOLVE_RT_m4n##ndim "subq $4,%%r11;"\
#ndim"773:\n\t"\
"testq $2,%%r11; jz "#ndim"774f;"\
GEMM_RT_m2n##ndim SOLVE_RT_m2n##ndim "subq $2,%%r11;"\
#ndim"774:\n\t"\
"testq $1,%%r11; jz "#ndim"775f;"\
GEMM_RT_m1n##ndim SOLVE_RT_m1n##ndim "subq $1,%%r11;"\
#ndim"775:\n\t"\
"movq %%r15,%0; movq %%r14,%1; vzeroupper;"\
:"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\
:"r11","r12","r13","r14","r15","cc","memory",\
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
a_ptr -= M * K; b_ptr -= 4 * K; c_ptr -= M; OFF -= ndim;\
}
static void solve_RT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc){
FLOAT a0, b0;
int i, j, k;
for (i=n-1;i>=0;i--) {
b0 = b[i*n+i];
for (j=0;j<m;j++) {
a0 = c[i*ldc+j] * b0;
a[i*m+j] = c[i*ldc+j] = a0;
for (k=0;k<i;k++) c[k*ldc+j] -= a0 * b[i*n+k];
}
}
}
static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) {
BLASLONG m_count = m, kk = offset; FLOAT *a_ptr = sa, *c_ptr = C;
for(;m_count>7;m_count-=8){
if(k-kk>0) GEMM_KERNEL_N(8,n,k-kk,-1.0,a_ptr+kk*8,sb+kk*n,c_ptr,ldc);
solve_RT(8,n,a_ptr+(kk-n)*8,sb+(kk-n)*n,c_ptr,ldc);
a_ptr += k * 8; c_ptr += 8;
}
for(;m_count>3;m_count-=4){
if(k-kk>0) GEMM_KERNEL_N(4,n,k-kk,-1.0,a_ptr+kk*4,sb+kk*n,c_ptr,ldc);
solve_RT(4,n,a_ptr+(kk-n)*4,sb+(kk-n)*n,c_ptr,ldc);
a_ptr += k * 4; c_ptr += 4;
}
for(;m_count>1;m_count-=2){
if(k-kk>0) GEMM_KERNEL_N(2,n,k-kk,-1.0,a_ptr+kk*2,sb+kk*n,c_ptr,ldc);
solve_RT(2,n,a_ptr+(kk-n)*2,sb+(kk-n)*n,c_ptr,ldc);
a_ptr += k * 2; c_ptr += 2;
}
if(m_count>0){
if(k-kk>0) GEMM_KERNEL_N(1,n,k-kk,-1.0,a_ptr+kk*1,sb+kk*n,c_ptr,ldc);
solve_RT(1,n,a_ptr+(kk-n)*1,sb+(kk-n)*n,c_ptr,ldc);
a_ptr += k * 1; c_ptr += 1;
}
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){
float *a_ptr = sa, *b_ptr = sb+n*k, *c_ptr = C+n*ldc, *c_tmp = C;
float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0};
uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)(n-offset), k_cnt = 0;
BLASLONG n_count = n;
if(n&1){b_ptr-=k; c_ptr-=ldc; COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF--; n_count--;}
if(n&2){b_ptr-=k*2; c_ptr-=ldc*2; COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF-=2; n_count-=2;}
for(;n_count>11;n_count-=12) COMPUTE(12)
for(;n_count>7;n_count-=8) COMPUTE(8)
for(;n_count>3;n_count-=4) COMPUTE(4)
return 0;
}


@@ -1,226 +1,226 @@
/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */
/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */
/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */
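/* Building blocks shared by the 8x4 Haswell single-precision TRSM kernels:
   INIT_*               zero the accumulator registers,
   GEMM_KERNEL_k1*      perform one k-step of the -(A*B) accumulation with FMA,
   GEMM_SUM_REORDER_*   load and transpose the C tile and add it to the accumulators,
   SOLVE_* / SUBTRACT_* implement the per-column back-substitution,
   SAVE_SOLUTION_*      write the solved tile back to the packed A buffer and to C. */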
#define init_m8n4(c1,c2,c3,c4)\
"vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\
"vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";"
#define INIT_m8n4 init_m8n4(4,5,6,7)
#define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11)
#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15)
#define init_m4n4(c1,c2,c3,c4)\
"vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\
"vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";"
#define INIT_m4n4 init_m4n4(4,5,6,7)
#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11)
#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15)
#define init_m2n4(c1,c2)\
"vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"
#define INIT_m2n4 init_m2n4(4,5)
#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7)
#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9)
#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";"
#define INIT_m1n4 init_m1n4(4)
#define INIT_m1n8 INIT_m1n4 init_m1n4(5)
#define INIT_m1n12 INIT_m1n8 init_m1n4(6)
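/* GEMM_KERNEL_k1mXnY: one k-iteration of C -= A*B for an X-by-Y register tile; A
   elements are loaded and duplicated from (%0), B elements are broadcast from (%1)
   (with r12-scaled offsets selecting the higher column groups), and the products are
   accumulated with vfnmadd231ps. */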
#define GEMM_KERNEL_k1m8n4 \
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\
"vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\
"vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;"
#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\
"vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\
"vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;"
#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\
"vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\
"vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;"
#define GEMM_KERNEL_k1m4n4 \
"vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\
"vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\
"vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;"
#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\
"vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\
"vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;"
#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\
"vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\
"vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;"
#define GEMM_KERNEL_k1m2n4 \
"vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\
"vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"
#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\
"vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;"
#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\
"vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"
#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;"
#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;"
#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;"
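/* GEMM_SUM_REORDER_*: read the current C tile through %3 (two rows per step, advancing
   by ldc each row), transpose it with unpack/permute moves, and add it to the -(A*B)
   accumulators so the registers hold C - A*B in the layout the SOLVE_* macros expect. */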
#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\
"vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\
"vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\
"vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\
"vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\
"vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\
"vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";"
#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\
"vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\
"vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\
"vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\
"vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\
"vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\
"vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\
"vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\
"vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\
"vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";"
#define GEMM_SUM_REORDER_2x4(c1,c2)\
"vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\
"vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\
"vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"\
#define GEMM_SUM_REORDER_1x4(c1)\
"vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\
"vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\
"vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";"
#define SOLVE_le_m4n2(b_off,c1,...)\
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\
"vmovsldup %%ymm"#c1",%%ymm1;"
#define SOLVE_le_m8n2(b_off,c1,c2,...)\
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\
"vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;"
#define SOLVE_leri_m4n2(b_off,c1,...) SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\
"vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"
#define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\
"vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";"
#define SOLVE_ri_m4n2(b_off,c1,...)\
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\
"vmovshdup %%ymm"#c1",%%ymm1;"
#define SOLVE_ri_m8n2(b_off,c1,c2,...)\
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\
"vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;"
#define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\
"vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"
#define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\
"vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";"
#define SOLVE_col1_rtol_m1n4(b_off,c1,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vpermilps $0,%%xmm"#c1",%%xmm1;"
#define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
"vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;"
#define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col2_mul_m1n4(b_off,c1,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vpermilps $85,%%xmm"#c1",%%xmm1;"
#define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
"vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;"
#define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col2_ltor_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col3_mul_m1n4(b_off,c1,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vpermilps $170,%%xmm"#c1",%%xmm1;"
#define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
"vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;"
#define SOLVE_col3_rtol_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col4_ltor_m1n4(b_off,c1,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vpermilps $255,%%xmm"#c1",%%xmm1;"
#define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
"vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;"
#define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"
#define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";"
#define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SUBTRACT_m2n4(b_off,c1,c2,...) SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
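/* SAVE_SOLUTION_*: reorder the solved tile back to memory layout, store it into the
   packed A buffer at the given offset from %0, and write the same values out to C
   through %3, advancing by two rows of ldc per pair of stores. */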
#define SAVE_SOLUTION_m8n2(c1,c2,a_off)\
"vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\
"vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\
"vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\
"vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;"
#define SAVE_SOLUTION_m4n2(c1,a_off)\
"vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\
"vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"
#define SAVE_SOLUTION_m2n4(c1,c2,a_off)\
"vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\
"vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"
#define SAVE_SOLUTION_m1n4(c1,a_off)\
"vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\
"vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,86 +1,86 @@
include_directories(${PROJECT_SOURCE_DIR})
include_directories(${PROJECT_BINARY_DIR})
include_directories(${PROJECT_SOURCE_DIR}/relapack)
set(RELAFILES
clauum.c
ctrsyl_rec2.c
dsytrf.c
spbtrf.c
strsyl_rec2.c
zhetrf_rook_rec2.c
ztrsyl.c
cgbtrf.c
cpbtrf.c
ctrtri.c
dsytrf_rec2.c
spotrf.c
strtri.c
zlauum.c
ztrsyl_rec2.c
cgemmt.c
cpotrf.c
dgbtrf.c
dsytrf_rook.c
lapack_wrappers.c
ssygst.c
zgbtrf.c
zpbtrf.c
ztrtri.c
cgetrf.c
csytrf.c
dgemmt.c
dsytrf_rook_rec2.c
ssytrf.c
zgemmt.c
zpotrf.c
chegst.c
csytrf_rec2.c
dgetrf.c
dtgsyl.c
ssytrf_rec2.c
zgetrf.c
zsytrf.c
chetrf.c
csytrf_rook.c
dlauum.c
dtrsyl.c
sgbtrf.c
ssytrf_rook.c
zhegst.c
zsytrf_rec2.c
chetrf_rec2.c
csytrf_rook_rec2.c
dpbtrf.c
dtrsyl_rec2.c
sgemmt.c
ssytrf_rook_rec2.c
zhetrf.c
zsytrf_rook.c
chetrf_rook.c
ctgsyl.c
dpotrf.c
dtrtri.c
sgetrf.c
stgsyl.c
zhetrf_rec2.c
zsytrf_rook_rec2.c
chetrf_rook_rec2.c
ctrsyl.c
dsygst.c
f2c.c
slauum.c
strsyl.c
zhetrf_rook.c
ztgsyl.c
)
# add relapack folder to the sources
set(RELA_SOURCES "")
foreach (RELA_FILE ${RELAFILES})
  list(APPEND RELA_SOURCES "${PROJECT_SOURCE_DIR}/relapack/src/${RELA_FILE}")
endforeach ()
add_library(relapack_src OBJECT ${RELA_SOURCES})
set_source_files_properties(${RELA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")