Merge pull request #91 from xianyi/develop

rebase
2020-09-28 22:48:53 +02:00 · 2020-09-28 22:48:53 +02:00 · 64629cb5c7
parent caf7a12295 0d98ce202c
commit 64629cb5c7
14 changed files with 764 additions and 27 deletions
--- a/README.md
+++ b/README.md
@ -46,7 +46,10 @@ Building OpenBLAS requires the following to be installed:

 Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically.
 To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`.
-The full target list is in the file `TargetList.txt`.
+The full target list is in the file `TargetList.txt`. For building with `cmake`, the
+usual conventions apply, i.e. create a build directory either underneath the toplevel
+OpenBLAS source directory or separate from it, and invoke `cmake` there with the path
+to the source tree and any build options you plan to set.

 ### Cross compile

@ -152,13 +155,17 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
 - **Falkor**: same as A57 (different cpu specifications)
 - **ThunderX**: Optimized some Level-1 functions
 - **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2
+- **ThunderX3T110**
 - **TSV110**: Optimized some Level-3 helper functions
 - **EMAG 8180**: preliminary support based on A57
+- **Neoverse N1**: (AWS Graviton2) preliminary support
+- **Apple Vortex**: preliminary support based on ARMV8

 #### PPC/PPC64

 - **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1`
 - **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only. 
+- **POWER10**:

 #### IBM zEnterprise System

@ -226,7 +233,8 @@ We provide the following functions to control the number of threads at runtime:
 void goto_set_num_threads(int num_threads);
 void openblas_set_num_threads(int num_threads);
 ```
-
+Note that these are only used once at library initialization, and are not available for
+fine-tuning thread numbers in individual BLAS calls. 
 If you compile this library with `USE_OPENMP=1`, you should use the above functions too.

 ## Reporting bugs
--- a/getarch.c
+++ b/getarch.c
@ -492,7 +492,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 		     "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \
 		     "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \
 		     "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \
-                     "-DHAVE_AVX -DHAVE_FMA4"
+                     "-DHAVE_AVX"
 #define LIBNAME   "bulldozer"
 #define CORENAME  "BULLDOZER"
 #endif
@ -508,7 +508,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
 		     "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
 		     "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
-                     "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
+                     "-DHAVE_AVX -DHAVE_FMA3"
 #define LIBNAME   "piledriver"
 #define CORENAME  "PILEDRIVER"
 #endif
@ -524,7 +524,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
 		     "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
 		     "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
-                     "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
+                     "-DHAVE_AVX -DHAVE_FMA3"
 #define LIBNAME   "steamroller"
 #define CORENAME  "STEAMROLLER"
 #endif
@ -540,7 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
 		     "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
 		     "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
-                     "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
+                     "-DHAVE_AVX -DHAVE_FMA3"
 #define LIBNAME   "excavator"
 #define CORENAME  "EXCAVATOR"
 #endif
--- a/kernel/power/KERNEL.POWER10
+++ b/kernel/power/KERNEL.POWER10
@ -151,9 +151,9 @@ endif
 ZAXPYKERNEL  = zaxpy_power10.c
 #
 SCOPYKERNEL  = scopy.c
-DCOPYKERNEL  = dcopy.c
+DCOPYKERNEL  = dcopy_power10.c
 CCOPYKERNEL  = ccopy.c
-ZCOPYKERNEL  = zcopy.c
+ZCOPYKERNEL  = zcopy_power10.c
 #
 SDOTKERNEL   =  sdot.c
 DDOTKERNEL   =  ddot.c
--- a/kernel/power/dcopy_microk_power10.c
+++ b/kernel/power/dcopy_microk_power10.c
@ -0,0 +1,134 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_64 1
+
+static void dcopy_kernel_64 (long n, double *x, double *y)
+{
+  __asm__
+    (
+       "lxvp		32, 0(%2)	\n\t"
+       "lxvp		34, 32(%2)	\n\t"
+       "lxvp		36, 64(%2)	\n\t"
+       "lxvp		38, 96(%2)	\n\t"
+       "lxvp		40, 128(%2)	\n\t"
+       "lxvp		42, 160(%2)	\n\t"
+       "lxvp		44, 192(%2)	\n\t"
+       "lxvp		46, 224(%2)	\n\t"
+
+       "lxvp		48, 256(%2)	\n\t"
+       "lxvp		50, 288(%2)	\n\t"
+       "lxvp		52, 320(%2)	\n\t"
+       "lxvp		54, 352(%2)	\n\t"
+       "lxvp		56, 384(%2)	\n\t"
+       "lxvp		58, 416(%2)	\n\t"
+       "lxvp		60, 448(%2)	\n\t"
+       "lxvp		62, 480(%2)	\n\t"
+       "addi		%2, %2, 512	\n\t"
+
+       "addic.		%1, %1, -64	\n\t"
+       "ble		two%=		\n\t"
+
+       ".align	5		\n"
+     "one%=:				\n\t"
+
+       "stxvp		32, 0(%3)	\n\t"
+       "lxvp		32, 0(%2)	\n\t"
+       "stxvp		34, 32(%3)	\n\t"
+       "lxvp		34, 32(%2)	\n\t"
+       "stxvp		36, 64(%3)	\n\t"
+       "lxvp		36, 64(%2)	\n\t"
+       "stxvp		38, 96(%3)	\n\t"
+       "lxvp		38, 96(%2)	\n\t"
+
+       "stxvp		40, 128(%3)	\n\t"
+       "lxvp		40, 128(%2)	\n\t"
+       "stxvp		42, 160(%3)	\n\t"
+       "lxvp		42, 160(%2)	\n\t"
+       "stxvp		44, 192(%3)	\n\t"
+       "lxvp		44, 192(%2)	\n\t"
+       "stxvp		46, 224(%3)	\n\t"
+       "lxvp		46, 224(%2)	\n\t"
+
+       "stxvp		48, 256(%3)	\n\t"
+       "lxvp		48, 256(%2)	\n\t"
+       "stxvp		50, 288(%3)	\n\t"
+       "lxvp		50, 288(%2)	\n\t"
+       "stxvp		52, 320(%3)	\n\t"
+       "lxvp		52, 320(%2)	\n\t"
+       "stxvp		54, 352(%3)	\n\t"
+       "lxvp		54, 352(%2)	\n\t"
+       "stxvp		56, 384(%3)	\n\t"
+       "lxvp		56, 384(%2)	\n\t"
+       "stxvp		58, 416(%3)	\n\t"
+       "lxvp		58, 416(%2)	\n\t"
+       "stxvp		60, 448(%3)	\n\t"
+       "lxvp		60, 448(%2)	\n\t"
+       "stxvp		62, 480(%3)	\n\t"
+       "lxvp		62, 480(%2)	\n\t"
+
+       "addi		%3, %3, 512	\n\t"
+       "addi		%2, %2, 512	\n\t"
+
+       "addic.		%1, %1, -64	\n\t"
+       "bgt		one%=		\n"
+
+     "two%=:				\n\t"
+
+       "stxvp		32, 0(%3)	\n\t"
+       "stxvp		34, 32(%3)	\n\t"
+       "stxvp		36, 64(%3)	\n\t"
+       "stxvp		38, 96(%3)	\n\t"
+       "stxvp		40, 128(%3)	\n\t"
+       "stxvp		42, 160(%3)	\n\t"
+       "stxvp		44, 192(%3)	\n\t"
+       "stxvp		46, 224(%3)	\n\t"
+       "stxvp		48, 256(%3)	\n\t"
+       "stxvp		50, 288(%3)	\n\t"
+       "stxvp		52, 320(%3)	\n\t"
+       "stxvp		54, 352(%3)	\n\t"
+       "stxvp		56, 384(%3)	\n\t"
+       "stxvp		58, 416(%3)	\n\t"
+       "stxvp		60, 448(%3)	\n\t"
+       "stxvp		62, 480(%3)	\n\t"
+
+     "#n=%1 x=%4=%2 y=%0=%3"
+     :
+       "=m" (*y),
+       "+r" (n),	// 1
+       "+b" (x),	// 2
+       "+b" (y)		// 3
+     :
+       "m" (*x)
+     :
+       "cr0",
+       "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+       "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+       "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
+       "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
+     );
+}
--- a/kernel/power/dcopy_power10.c
+++ b/kernel/power/dcopy_power10.c
@ -0,0 +1,123 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if defined(__VEC__) || defined(__ALTIVEC__)
+#include "dcopy_microk_power10.c"
+#endif
+
+#ifndef HAVE_KERNEL_64
+
+static void dcopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+
+	BLASLONG i=0;
+	FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
+	FLOAT *x1=x;
+	FLOAT *y1=y;
+
+	while ( i<n )
+	{
+
+		f0 = x1[0];
+		f1 = x1[1];
+		f2 = x1[2];
+		f3 = x1[3];
+		f4 = x1[4];
+		f5 = x1[5];
+		f6 = x1[6];
+		f7 = x1[7];
+
+		y1[0] = f0;
+		y1[1] = f1;
+		y1[2] = f2;
+		y1[3] = f3;
+		y1[4] = f4;
+		y1[5] = f5;
+		y1[6] = f6;
+		y1[7] = f7;
+
+		x1 += 8;
+		y1 += 8;
+
+		i+=8;
+	}
+	return;
+
+}
+
+
+#endif
+
+
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+
+	if ( n <= 0     )  return(0);
+
+	if ( (inc_x == 1) && (inc_y == 1 ))
+	{
+
+		BLASLONG n1 = n & -64;
+		if ( n1 > 0 )
+		{
+			dcopy_kernel_64(n1, x, y);
+			i=n1;
+		}
+
+		while(i < n)
+		{
+			y[i] = x[i] ;
+			i++ ;
+
+		}
+
+
+	}
+	else
+	{
+
+		while(i < n)
+		{
+			y[iy] = x[ix] ;
+			ix += inc_x ;
+			iy += inc_y ;
+			i++ ;
+
+		}
+
+	}
+	return(0);
+	
+
+}
+
+
--- a/kernel/power/zcopy_microk_power10.c
+++ b/kernel/power/zcopy_microk_power10.c
@ -0,0 +1,134 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_32 1
+
+static void zcopy_kernel_32 (long n, double *x, double *y)
+{
+  __asm__
+    (
+       "lxvp		32, 0(%2)	\n\t"
+       "lxvp		34, 32(%2)	\n\t"
+       "lxvp		36, 64(%2)	\n\t"
+       "lxvp		38, 96(%2)	\n\t"
+       "lxvp		40, 128(%2)	\n\t"
+       "lxvp		42, 160(%2)	\n\t"
+       "lxvp		44, 192(%2)	\n\t"
+       "lxvp		46, 224(%2)	\n\t"
+
+       "lxvp		48, 256(%2)	\n\t"
+       "lxvp		50, 288(%2)	\n\t"
+       "lxvp		52, 320(%2)	\n\t"
+       "lxvp		54, 352(%2)	\n\t"
+       "lxvp		56, 384(%2)	\n\t"
+       "lxvp		58, 416(%2)	\n\t"
+       "lxvp		60, 448(%2)	\n\t"
+       "lxvp		62, 480(%2)	\n\t"
+       "addi		%2, %2, 512	\n\t"
+
+       "addic.		%1, %1, -32	\n\t"
+       "ble		two%=		\n\t"
+
+       ".align	5		\n"
+     "one%=:				\n\t"
+
+       "stxvp		32, 0(%3)	\n\t"
+       "lxvp		32, 0(%2)	\n\t"
+       "stxvp		34, 32(%3)	\n\t"
+       "lxvp		34, 32(%2)	\n\t"
+       "stxvp		36, 64(%3)	\n\t"
+       "lxvp		36, 64(%2)	\n\t"
+       "stxvp		38, 96(%3)	\n\t"
+       "lxvp		38, 96(%2)	\n\t"
+
+       "stxvp		40, 128(%3)	\n\t"
+       "lxvp		40, 128(%2)	\n\t"
+       "stxvp		42, 160(%3)	\n\t"
+       "lxvp		42, 160(%2)	\n\t"
+       "stxvp		44, 192(%3)	\n\t"
+       "lxvp		44, 192(%2)	\n\t"
+       "stxvp		46, 224(%3)	\n\t"
+       "lxvp		46, 224(%2)	\n\t"
+
+       "stxvp		48, 256(%3)	\n\t"
+       "lxvp		48, 256(%2)	\n\t"
+       "stxvp		50, 288(%3)	\n\t"
+       "lxvp		50, 288(%2)	\n\t"
+       "stxvp		52, 320(%3)	\n\t"
+       "lxvp		52, 320(%2)	\n\t"
+       "stxvp		54, 352(%3)	\n\t"
+       "lxvp		54, 352(%2)	\n\t"
+       "stxvp		56, 384(%3)	\n\t"
+       "lxvp		56, 384(%2)	\n\t"
+       "stxvp		58, 416(%3)	\n\t"
+       "lxvp		58, 416(%2)	\n\t"
+       "stxvp		60, 448(%3)	\n\t"
+       "lxvp		60, 448(%2)	\n\t"
+       "stxvp		62, 480(%3)	\n\t"
+       "lxvp		62, 480(%2)	\n\t"
+
+       "addi		%3, %3, 512	\n\t"
+       "addi		%2, %2, 512	\n\t"
+
+       "addic.		%1, %1, -32	\n\t"
+       "bgt		one%=		\n"
+
+     "two%=:				\n\t"
+
+       "stxvp		32, 0(%3)	\n\t"
+       "stxvp		34, 32(%3)	\n\t"
+       "stxvp		36, 64(%3)	\n\t"
+       "stxvp		38, 96(%3)	\n\t"
+       "stxvp		40, 128(%3)	\n\t"
+       "stxvp		42, 160(%3)	\n\t"
+       "stxvp		44, 192(%3)	\n\t"
+       "stxvp		46, 224(%3)	\n\t"
+       "stxvp		48, 256(%3)	\n\t"
+       "stxvp		50, 288(%3)	\n\t"
+       "stxvp		52, 320(%3)	\n\t"
+       "stxvp		54, 352(%3)	\n\t"
+       "stxvp		56, 384(%3)	\n\t"
+       "stxvp		58, 416(%3)	\n\t"
+       "stxvp		60, 448(%3)	\n\t"
+       "stxvp		62, 480(%3)	\n\t"
+
+     "#n=%1 x=%4=%2 y=%0=%3"
+     :
+       "=m" (*y),
+       "+r" (n),	// 1
+       "+b" (x),	// 2
+       "+b" (y)		// 3
+     :
+       "m" (*x)
+     :
+       "cr0",
+       "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+       "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+       "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
+       "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
+     );
+}
--- a/kernel/power/zcopy_power10.c
+++ b/kernel/power/zcopy_power10.c
@ -0,0 +1,132 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if defined(__VEC__) || defined(__ALTIVEC__)
+#include "zcopy_microk_power10.c"
+#endif
+
+#ifndef HAVE_KERNEL_32
+
+static void zcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+
+	BLASLONG i=0;
+	FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
+	FLOAT *x1=x;
+	FLOAT *y1=y;
+
+	while ( i<n )
+	{
+
+		f0 = x1[0];
+		f1 = x1[1];
+		f2 = x1[2];
+		f3 = x1[3];
+		f4 = x1[4];
+		f5 = x1[5];
+		f6 = x1[6];
+		f7 = x1[7];
+
+		y1[0] = f0;
+		y1[1] = f1;
+		y1[2] = f2;
+		y1[3] = f3;
+		y1[4] = f4;
+		y1[5] = f5;
+		y1[6] = f6;
+		y1[7] = f7;
+
+		x1 += 8;
+		y1 += 8;
+
+		i+=4;
+	}
+	return;
+
+}
+
+
+#endif
+
+
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+
+	if ( n <= 0     )  return(0);
+
+	if ( (inc_x == 1) && (inc_y == 1 ))
+	{
+
+		BLASLONG n1 = n & -32;
+		if ( n1 > 0 )
+		{
+			zcopy_kernel_32(n1, x, y);
+			i=n1;
+			ix=n1*2;
+			iy=n1*2;
+		}
+
+		while(i < n)
+		{
+			y[iy] = x[iy] ;
+			y[iy+1] = x[ix+1] ;
+			ix+=2;
+			iy+=2;
+			i++ ;
+
+		}
+
+
+	}
+	else
+	{
+
+		BLASLONG inc_x2 = 2 * inc_x;
+		BLASLONG inc_y2 = 2 * inc_y;
+
+		while(i < n)
+		{
+			y[iy] = x[ix] ;
+			y[iy+1] = x[ix+1] ;
+			ix += inc_x2 ;
+			iy += inc_y2 ;
+			i++ ;
+
+		}
+
+	}
+	return(0);
+	
+
+}
+
+
--- a/kernel/simd/intrin.h
+++ b/kernel/simd/intrin.h
@ -0,0 +1,71 @@
+#ifndef _INTRIN_H_
+#define _INTRIN_H_
+
+#if defined(_MSC_VER)
+#define BLAS_INLINE __inline
+#elif defined(__GNUC__)
+#if defined(__STRICT_ANSI__)
+#define BLAS_INLINE __inline__
+#else
+#define BLAS_INLINE inline
+#endif
+#else
+#define BLAS_INLINE
+#endif
+
+#ifdef _MSC_VER
+#define BLAS_FINLINE static __forceinline
+#elif defined(__GNUC__)
+#define BLAS_FINLINE static BLAS_INLINE __attribute__((always_inline))
+#else
+#define BLAS_FINLINE static
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// include head
+/** SSE **/
+#ifdef HAVE_SSE
+#include <xmmintrin.h>
+#endif
+/** SSE2 **/
+#ifdef HAVE_SSE2
+#include <emmintrin.h>
+#endif
+/** SSE3 **/
+#ifdef HAVE_SSE3
+#include <pmmintrin.h>
+#endif
+/** SSSE3 **/
+#ifdef HAVE_SSSE3
+#include <tmmintrin.h>
+#endif
+/** SSE41 **/
+#ifdef HAVE_SSE4_1
+#include <smmintrin.h>
+#endif
+
+/** AVX **/
+#ifdef HAVE_AVX
+#include <immintrin.h>
+#endif
+
+// distribute
+#if defined(HAVE_AVX512VL) || defined(HAVE_AVX512BF16)
+#include "intrin_avx512.h"
+#elif defined(HAVE_AVX2)
+#include "intrin_avx.h"
+#elif defined(HAVE_SSE2)
+#include "intrin_sse.h"
+#endif
+
+#ifndef V_SIMD
+    #define V_SIMD 0
+    #define V_SIMD_F64 0
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif // _INTRIN_H_
--- a/kernel/simd/intrin_avx.h
+++ b/kernel/simd/intrin_avx.h
@ -0,0 +1,29 @@
+#define V_SIMD 256
+#define V_SIMD_F64 1
+/*
+Data Type
+*/
+typedef __m256  v_f32;
+#define v_nlanes_f32 8
+/*
+arithmetic
+*/
+#define v_add_f32 _mm256_add_ps
+#define v_mul_f32 _mm256_mul_ps
+
+#ifdef HAVE_FMA3
+    // multiply and add, a*b + c
+    #define v_muladd_f32 _mm256_fmadd_ps
+#else
+    // multiply and add, a*b + c
+    BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
+    { return v_add_f32(v_mul_f32(a, b), c); }
+#endif // !HAVE_FMA3
+
+/*
+memory
+*/
+// unaligned load
+#define v_loadu_f32 _mm256_loadu_ps
+#define v_storeu_f32 _mm256_storeu_ps
+#define v_setall_f32(VAL) _mm256_set1_ps(VAL)
--- a/kernel/simd/intrin_avx512.h
+++ b/kernel/simd/intrin_avx512.h
@ -0,0 +1,21 @@
+#define V_SIMD 512
+#define V_SIMD_F64 1
+/*
+Data Type
+*/
+typedef __m512  v_f32;
+#define v_nlanes_f32 16
+/*
+arithmetic
+*/
+#define v_add_f32 _mm512_add_ps
+#define v_mul_f32 _mm512_mul_ps
+// multiply and add, a*b + c
+#define v_muladd_f32 _mm512_fmadd_ps
+/*
+memory
+*/
+// unaligned load
+#define v_loadu_f32(PTR) _mm512_loadu_ps((const __m512*)(PTR))
+#define v_storeu_f32 _mm512_storeu_ps
+#define v_setall_f32(VAL) _mm512_set1_ps(VAL)
--- a/kernel/simd/intrin_sse.h
+++ b/kernel/simd/intrin_sse.h
@ -0,0 +1,30 @@
+#define V_SIMD 128
+#define V_SIMD_F64 1
+/*
+Data Type
+*/
+typedef __m128  v_f32;
+#define v_nlanes_f32 4
+/*
+arithmetic
+*/
+#define v_add_f32 _mm_add_ps
+#define v_mul_f32 _mm_mul_ps
+#ifdef HAVE_FMA3
+    // multiply and add, a*b + c
+    #define v_muladd_f32 _mm_fmadd_ps
+#elif defined(HAVE_FMA4)
+    // multiply and add, a*b + c
+    #define v_muladd_f32 _mm_macc_ps
+#else
+    // multiply and add, a*b + c
+    BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
+    { return v_add_f32(v_mul_f32(a, b), c); }
+#endif // HAVE_FMA3
+/*
+memory
+*/
+// unaligned load
+#define v_loadu_f32 _mm_loadu_ps
+#define v_storeu_f32 _mm_storeu_ps
+#define v_setall_f32(VAL) _mm_set1_ps(VAL)
--- a/kernel/x86_64/daxpy.c
+++ b/kernel/x86_64/daxpy.c
@ -45,14 +45,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "daxpy_microk_sandy-2.c"
 #endif

-
 #ifndef HAVE_KERNEL_8
+#include"../simd/intrin.h"

 static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 {
 	BLASLONG register i = 0;
 	FLOAT a = *alpha;
-
+#if V_SIMD
+	v_f32 __alpha, tmp;
+	__alpha =  v_setall_f32(*alpha);
+	const int vstep = v_nlanes_f32;
+	for (; i < n; i += vstep) {
+		tmp = v_muladd_f32(__alpha, v_loadu_f32( x + i ), v_loadu_f32(y + i));
+		v_storeu_f32(y + i, tmp);
+	}
+#else
 	while(i < n)
 	{
 		y[i]   += a * x[i];
@ -64,9 +72,8 @@ static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 		y[i+6] += a * x[i+6];
 		y[i+7] += a * x[i+7];
 		i+=8 ;
-
 	}
-
+#endif
 }

 #endif
--- a/lapack-netlib/SRC/dlanv2.f
+++ b/lapack-netlib/SRC/dlanv2.f
@ -140,13 +140,16 @@
 *
 *     .. Parameters ..
      DOUBLE PRECISION   ZERO, HALF, ONE
-      PARAMETER          ( ZERO = 0.0D+0, HALF = 0.5D+0, ONE = 1.0D+0 )
+      PARAMETER          ( ZERO = 0.0D+0, HALF = 0.5D+0, ONE = 1.0D+0,
+     $                     TWO = 2.0D0 )
      DOUBLE PRECISION   MULTPL
      PARAMETER          ( MULTPL = 4.0D+0 )
 *     ..
 *     .. Local Scalars ..
      DOUBLE PRECISION   AA, BB, BCMAX, BCMIS, CC, CS1, DD, EPS, P, SAB,
-     $                   SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z
+     $                   SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN, 
+     $                   SAFMN2, SAFMX2
+      INTEGER            COUNT
 *     ..
 *     .. External Functions ..
      DOUBLE PRECISION   DLAMCH, DLAPY2
@ -157,7 +160,11 @@
 *     ..
 *     .. Executable Statements ..
 *
+      SAFMIN = DLAMCH( 'S' )
      EPS = DLAMCH( 'P' )
+      SAFMN2 = DLAMCH( 'B' )**INT( LOG( SAFMIN / EPS ) /
+     $            LOG( DLAMCH( 'B' ) ) / TWO )
+      SAFMX2 = ONE / SAFMN2
      IF( C.EQ.ZERO ) THEN
         CS = ONE
         SN = ZERO
@ -212,7 +219,24 @@
 *           Complex eigenvalues, or real (almost) equal eigenvalues.
 *           Make diagonal elements equal.
 *
+            COUNT = 0
            SIGMA = B + C
+   10       CONTINUE
+            COUNT = COUNT + 1
+            SCALE = MAX( ABS(TEMP), ABS(SIGMA) )
+            IF( SCALE.GE.SAFMX2 ) THEN
+               SIGMA = SIGMA * SAFMN2
+               TEMP = TEMP * SAFMN2
+               IF (COUNT .LE. 20)
+     $            GOTO 10
+            END IF
+            IF( SCALE.LE.SAFMN2 ) THEN
+               SIGMA = SIGMA * SAFMX2
+               TEMP = TEMP * SAFMX2
+               IF (COUNT .LE. 20)
+     $            GOTO 10
+            END IF
+            P = HALF*TEMP
            TAU = DLAPY2( SIGMA, TEMP )
            CS = SQRT( HALF*( ONE+ABS( SIGMA ) / TAU ) )
            SN = -( P / ( TAU*CS ) )*SIGN( ONE, SIGMA )
--- a/lapack-netlib/SRC/slanv2.f
+++ b/lapack-netlib/SRC/slanv2.f
@ -140,13 +140,16 @@
 *
 *     .. Parameters ..
      REAL               ZERO, HALF, ONE
-      PARAMETER          ( ZERO = 0.0E+0, HALF = 0.5E+0, ONE = 1.0E+0 )
+      PARAMETER          ( ZERO = 0.0E+0, HALF = 0.5E+0, ONE = 1.0E+0,
+     $                     TWO = 2.0E+0 )
      REAL               MULTPL
      PARAMETER          ( MULTPL = 4.0E+0 )
 *     ..
 *     .. Local Scalars ..
      REAL               AA, BB, BCMAX, BCMIS, CC, CS1, DD, EPS, P, SAB,
-     $                   SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z
+     $                   SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN, 
+     $                   SAFMN2, SAFMX2
+      INTEGER            COUNT
 *     ..
 *     .. External Functions ..
      REAL               SLAMCH, SLAPY2
@ -157,7 +160,11 @@
 *     ..
 *     .. Executable Statements ..
 *
+      SAFMIN = SLAMCH( 'S' )
      EPS = SLAMCH( 'P' )
+      SAFMN2 = SLAMCH( 'B' )**INT( LOG( SAFMIN / EPS ) /
+     $            LOG( SLAMCH( 'B' ) ) / TWO )
+      SAFMX2 = ONE / SAFMN2
      IF( C.EQ.ZERO ) THEN
         CS = ONE
         SN = ZERO
@ -212,7 +219,24 @@
 *           Complex eigenvalues, or real (almost) equal eigenvalues.
 *           Make diagonal elements equal.
 *
+            COUNT = 0
            SIGMA = B + C
+   10       CONTINUE
+            COUNT = COUNT + 1
+            SCALE = MAX( ABS(TEMP), ABS(SIGMA) )
+            IF( SCALE.GE.SAFMX2 ) THEN
+               SIGMA = SIGMA * SAFMN2
+               TEMP = TEMP * SAFMN2
+               IF (COUNT .LE. 20)
+     $            GOTO 10
+            END IF
+            IF( SCALE.LE.SAFMN2 ) THEN
+               SIGMA = SIGMA * SAFMX2
+               TEMP = TEMP * SAFMX2
+               IF (COUNT .LE. 20)
+     $            GOTO 10
+            END IF
+            P = HALF*TEMP
            TAU = SLAPY2( SIGMA, TEMP )
            CS = SQRT( HALF*( ONE+ABS( SIGMA ) / TAU ) )
            SN = -( P / ( TAU*CS ) )*SIGN( ONE, SIGMA )