eeecd623d8 
								
							 
						 
						
							
							
								
								Update cgemm_kernel_8x2_haswell.c  
							
							
							
						 
						
							2019-12-24 00:40:16 +08:00  
				
					
						
							
							
								 
						
							
								2cd9306bb5 
								
							 
						 
						
							
							
								
								Update KERNEL.ZEN  
							
							
							
						 
						
							2019-12-23 23:42:30 +08:00  
				
					
						
							
							
								 
						
							
								c418c81224 
								
							 
						 
						
							
							
								
								Update KERNEL.HASWELL  
							
							
							
						 
						
							2019-12-23 23:41:44 +08:00  
				
					
						
							
							
								 
						
							
								025741f16a 
								
							 
						 
						
							
							
								
								Fast Haswell CGEMM kernel  
							
							
							
						 
						
							2019-12-23 23:40:03 +08:00  
				
					
						
							
							
								 
						
							
								f41d52665d 
								
							 
						 
						
							
							
								
								Fast Haswell ZGEMM kernel  
							
							
							
						 
						
							2019-12-21 14:37:06 +08:00  
				
					
						
							
							
								 
						
							
								d573d24de7 
								
							 
						 
						
							
							
								
								Fast Haswell ZGEMM kernel  
							
							
							
						 
						
							2019-12-21 14:35:15 +08:00  
				
					
						
							
							
								 
						
							
								b7cc69ee62 
								
							 
						 
						
							
							
								
								declare DGEMM_BETA in KERNEL.ARMV8 rather than the generic KERNEL  
							
							
							
						 
						
							2019-12-20 10:11:50 +08:00  
				
					
						
							
							
								 
						
							
								aeef942c4f 
								
							 
						 
						
							
							
								
								use arm neon instructions to optimize gemm beta operation  
							
							
							
						 
						
							2019-12-17 10:00:13 +08:00  
				
					
						
							
							
								 
						
							
								1a6ea8ee6d 
								
							 
						 
						
							
							
								
								Merge pull request  #2338  from kavanabhat/aix_mod  
							
							... 
							
							
							
							Changes to build on AIX in POWER8 mode 
							
						 
						
							2019-12-09 17:54:49 +01:00  
				
					
						
							
							
								 
						
							
								6baa9b07d7 
								
							 
						 
						
							
							
								
								AIX changes for Power8  
							
							
							
						 
						
							2019-12-06 04:33:32 -06:00  
				
					
						
							
							
								 
						
							
								3938e59569 
								
							 
						 
						
							
							
								
								AIX changes for Power8  
							
							
							
						 
						
							2019-12-04 00:23:46 -06:00  
				
					
						
							
							
								 
						
							
								b863b32ac5 
								
							 
						 
						
							
							
								
								Workaround an ICE in clang 9.0.0  
							
							... 
							
							
							
							This bug is not there in 8.x nor in the 9.0 daily snapshot. 
							
						 
						
							2019-12-01 12:59:46 -06:00  
				
					
						
							
							
								 
						
							
								dd04143d4a 
								
							 
						 
						
							
							
								
								Merge pull request  #2328  from martin-frbg/ppc9  
							
							... 
							
							
							
							Fix precompiled kernels on POWER9 and make their use conditional on (old) gcc version 
							
						 
						
							2019-11-30 12:23:57 +01:00  
				
					
						
							
							
								 
						
							
								f3a6164bff 
								
							 
						 
						
							
							
								
								Merge pull request  #2324  from antonblanchard/power9_segv  
							
							... 
							
							
							
							Fix SEGV in cdot_power9 
							
						 
						
							2019-11-30 00:03:42 +01:00  
				
					
						
							
							
								 
						
							
								dedd822d1a 
								
							 
						 
						
							
							
								
								Fix caxpy/caxpyc naming in localentry  
							
							
							
						 
						
							2019-11-29 23:56:57 +01:00  
				
					
						
							
							
								 
						
							
								2181fb7047 
								
							 
						 
						
							
							
								
								Fix caxpy/caxpyc naming in localentry  
							
							
							
						 
						
							2019-11-29 23:54:15 +01:00  
				
					
						
							
							
								 
						
							
								a9b62c03f8 
								
							 
						 
						
							
							
								
								Substitute precompiled gcc7 codes only when gcc is older than 9.x  
							
							
							
						 
						
							2019-11-29 23:49:50 +01:00  
				
					
						
							
							
								 
						
							
								97762234f9 
								
							 
						 
						
							
							
								
								Add variable for gcc >=9 test  
							
							... 
							
							
							
							used in KERNEL.POWER9 
							
						 
						
							2019-11-29 23:47:23 +01:00  
				
					
						
							
							
								 
						
							
								934e601e93 
								
							 
						 
						
							
							
								
								Update dgemm_kernel_4x8_skylakex_2.c  
							
							
							
						 
						
							2019-11-28 19:56:35 +08:00  
				
					
						
							
							
								 
						
							
								cf2a8e410c 
								
							 
						 
						
							
							
								
								Fix SEGV in cdot_power9  
							
							... 
							
							
							
							We were corrupting r2 because the local entry wasn't being
set up correctly. 
							
						 
						
							2019-11-26 21:55:04 -07:00  
				
					
						
							
							
								 
						
							
								eb1e9c8c92 
								
							 
						 
						
							
							
								
								some optimizations  
							
							
							
						 
						
							2019-11-26 14:12:20 +08:00  
				
					
						
							
							
								 
						
							
								d117dfd505 
								
							 
						 
						
							
							
								
								Change bad usage of "asum" to "sum" in ZARCH versions of ?sum  
							
							... 
							
							
							
							The ZARCH implementations of ?sum contain a cut & paste-error: An inline
assembly argument is named "sum", but the assembly references "asum"
instead.  The mismatch causes a build error.  This is fixed. 
							
						 
						
							2019-11-21 13:49:13 +01:00  
				
					
						
							
							
								 
						
							
								b09b5be0a4 
								
							 
						 
						
							
							
								
								Merge pull request  #2315  from ewanglong/develop  
							
							... 
							
							
							
							revised fix windows compatible for #2313  
							
						 
						
							2019-11-21 05:06:44 +01:00  
				
					
						
							
							
								 
						
							
								bfb5fbdb4d 
								
							 
						 
						
							
							
								
								revised fix windows compatible for  #2313  
							
							... 
							
							
							
							Signed-off-by: Wang, Long <long1.wang@intel.com> 
							
						 
						
							2019-11-21 10:22:58 +08:00  
				
					
						
							
							
								 
						
							
								08fa83aba2 
								
							 
						 
						
							
							
								
								Merge pull request  #2312  from martin-frbg/power8be  
							
							... 
							
							
							
							Further Power8 big-endian corrections 
							
						 
						
							2019-11-20 15:12:06 +01:00  
				
					
						
							
							
								 
						
							
								1191db1a49 
								
							 
						 
						
							
							
								
								For the sake of Windows compatibility, use "unsigned long long" to ensure 64-bit length  
							
							... 
							
							
							
							Signed-off-by: Wang, Long <long1.wang@intel.com> 
							
						 
						
							2019-11-20 21:30:47 +08:00  
				
					
						
							
							
								 
						
							
								0caf1434c9 
								
							 
						 
						
							
							
								
								Fix the integer overflow issue for large matrix size  
							
							... 
							
							
							
							For large matrices, e.g. M=N=K with M>1290, int mnk=M*N*K will overflow.
This leads to wrong branching into the single-threaded path, and performance
is degraded significantly.
Signed-off-by: Wang, Long <long1.wang@intel.com> 
							
						 
						
							2019-11-20 14:11:17 +08:00  
				
					
						
							
							
								 
						
							
								cad0d150db 
								
							 
						 
						
							
							
								
								Define alternate kernels for big-endian POWER8  
							
							
							
						 
						
							2019-11-17 23:12:10 +01:00  
				
					
						
							
							
								 
						
							
								eba0aeb7cd 
								
							 
						 
						
							
							
								
								Fix compilation for big-endian POWER8  
							
							
							
						 
						
							2019-11-17 22:58:32 +01:00  
				
					
						
							
							
								 
						
							
								0c07c356c1 
								
							 
						 
						
							
							
								
								Define alternate kernels for big-endian PPC440  
							
							
							
						 
						
							2019-11-17 19:25:08 +01:00  
				
					
						
							
							
								 
						
							
								3e67017ac8 
								
							 
						 
						
							
							
								
								Merge pull request  #2309  from martin-frbg/ppc970-be  
							
							... 
							
							
							
							Fix PPC970 big-endian support 
							
						 
						
							2019-11-17 18:22:24 +01:00  
				
					
						
							
							
								 
						
							
								b3ac6ee222 
								
							 
						 
						
							
							
								
								Define alternate kernels for big-endian PPC970  
							
							... 
							
							
							
							The altivec versions of SGEMM and CGEMM fail most tests in LAPACK-TESTING when compiled for big endian, STRSM/CTRSM even cause segfaults. The rot kernels either fail the corresponding utest or lead to failures in LAPACK-TESTING. 
							
						 
						
							2019-11-17 15:19:39 +01:00  
				
					
						
							
							
								 
						
							
								71e96163db 
								
							 
						 
						
							
							
								
								Merge pull request  #2305  from wjc404/develop  
							
							... 
							
							
							
							AVX512 CGEMM & ZGEMM kernels 
							
						 
						
							2019-11-12 07:38:37 +01:00  
				
					
						
							
							
								 
						
							
								819e852ae7 
								
							 
						 
						
							
							
								
								AVX512 CGEMM & ZGEMM kernels  
							
							... 
							
							
							
							96-99% 1-thread performance of MKL2018 
							
						 
						
							2019-11-11 20:04:52 +08:00  
				
					
						
							
							
								 
						
							
								4c6a457358 
								
							 
						 
						
							
							
								
								Merge pull request  #2300  from wjc404/develop  
							
							... 
							
							
							
							Optimize SGEMM on SKYLAKEX CPUs 
							
						 
						
							2019-11-06 07:27:33 +01:00  
				
					
						
							
							
								 
						
							
								836c414e22 
								
							 
						 
						
							
							
								
								optimizations of software prefetching  
							
							
							
						 
						
							2019-11-05 13:36:56 +08:00  
				
					
						
							
							
								 
						
							
								3cd97f1a80 
								
							 
						 
						
							
							
								
								Merge pull request  #2301  from martin-frbg/ppc8be  
							
							... 
							
							
							
							Disable IDAMIN/MAX and IZAMIN/MAX optimizations on big-endian POWER8 
							
						 
						
							2019-11-04 22:54:28 +01:00  
				
					
						
							
							
								 
						
							
								430c11e135 
								
							 
						 
						
							
							
								
								Add files via upload  
							
							
							
						 
						
							2019-11-04 20:10:12 +08:00  
				
					
						
							
							
								 
						
							
								fbacd2605d 
								
							 
						 
						
							
							
								
								optimizations via software prefetches  
							
							
							
						 
						
							2019-11-04 19:37:19 +08:00  
				
					
						
							
							
								 
						
							
								68597002ea 
								
							 
						 
						
							
							
								
								The assembly microkernel is not safe to use on ELFv1  
							
							
							
						 
						
							2019-11-03 22:42:46 +01:00  
				
					
						
							
							
								 
						
							
								d2a6285549 
								
							 
						 
						
							
							
								
								The assembly microkernel is not safe to use on ELFv1  
							
							
							
						 
						
							2019-11-03 22:41:19 +01:00  
				
					
						
							
							
								 
						
							
								d999688d1a 
								
							 
						 
						
							
							
								
								The assembly microkernel is not safe to use on ELFv1  
							
							
							
						 
						
							2019-11-03 22:39:06 +01:00  
				
					
						
							
							
								 
						
							
								928fe1b28e 
								
							 
						 
						
							
							
								
								The assembly microkernel is not safe to use on ELFv1  
							
							
							
						 
						
							2019-11-03 22:37:27 +01:00  
				
					
						
							
							
								 
						
							
								1df9a2013d 
								
							 
						 
						
							
							
								
								new sgemm kernel for skylakex  
							
							
							
						 
						
							2019-11-02 00:00:48 +08:00  
				
					
						
							
							
								 
						
							
								85ccdce8c4 
								
							 
						 
						
							
							
								
								Remove the IOS fallbacks to generic C kernels  
							
							
							
						 
						
							2019-10-25 23:02:37 +02:00  
				
					
						
							
							
								 
						
							
								6ff013bae0 
								
							 
						 
						
							
							
								
								native support for icopy_4  
							
							... 
							
							
							
							90% MKL 1-thread performance. 
							
						 
						
							2019-10-19 03:54:44 +08:00  
				
					
						
							
							
								 
						
							
								0d669e04bb 
								
							 
						 
						
							
							
								
								Update dgemm_kernel_8x8_skylakex.c  
							
							
							
						 
						
							2019-10-18 15:00:17 +08:00  
				
					
						
							
							
								 
						
							
								17cdd9f9e1 
								
							 
						 
						
							
							
								
								some correction  
							
							
							
						 
						
							2019-10-18 14:58:07 +08:00  
				
					
						
							
							
								 
						
							
								6bcb06fcb1 
								
							 
						 
						
							
							
								
								make further changes to icopy_8 easier  
							
							
							
						 
						
							2019-10-18 10:47:31 +08:00  
				
					
						
							
							
								 
						
							
								b7315f8401 
								
							 
						 
						
							
							
								
								Add files via upload  
							
							
							
						 
						
							2019-10-16 19:23:36 +08:00  
				
					
						
							
							
								 
						
							
								9b19e9e1b0 
								
							 
						 
						
							
							
								
								Update dgemm_kernel_8x8_skylakex.c  
							
							
							
						 
						
							2019-10-16 10:14:51 +08:00  
				
					
						
							
							
								 
						
							
								6bd67ddbab 
								
							 
						 
						
							
							
								
								Update dgemm_kernel_8x8_skylakex.c  
							
							
							
						 
						
							2019-10-16 03:20:08 +08:00  
				
					
						
							
							
								 
						
							
								844629af57 
								
							 
						 
						
							
							
								
								Add files via upload  
							
							
							
						 
						
							2019-10-16 02:00:34 +08:00  
				
					
						
							
							
								 
						
							
								a448884a63 
								
							 
						 
						
							
							
								
								Remove automatic label postfixes from macro included only once  
							
							
							
						 
						
							2019-10-08 08:37:50 +02:00  
				
					
						
							
							
								 
						
							
								3a2df19db6 
								
							 
						 
						
							
							
								
								Fix accidental duplication of jump instruction  
							
							
							
						 
						
							2019-10-08 08:09:26 +02:00  
				
					
						
							
							
								 
						
							
								d2093a40d3 
								
							 
						 
						
							
							
								
								Merge pull request  #2277  from martin-frbg/issue2275  
							
							... 
							
							
							
							Rewrite ARMV8 code to allow cross-compilation for IOS 
							
						 
						
							2019-10-06 23:01:54 +02:00  
				
					
						
							
							
								 
						
							
								56837e9d92 
								
							 
						 
						
							
							
								
								Make local labels in macro compatible with the xcode assembler  
							
							... 
							
							
							
							... which does not perform the automatic numbering on instantiation that the _@ suffix signifies 
							
						 
						
							2019-10-04 14:53:23 +02:00  
				
					
						
							
							
								 
						
							
								5e244d80f2 
								
							 
						 
						
							
							
								
								Merge pull request  #2271  from quickwritereader/strmm_fix  
							
							... 
							
							
							
							fixed bug power9 strmm . BLAS-TESTER passes 
							
						 
						
							2019-09-29 13:53:45 +02:00  
				
					
						
							
							
								 
						
							
								ede5efebab 
								
							 
						 
						
							
							
								
								trmm fix  
							
							
							
						 
						
							2019-09-29 02:28:34 +00:00  
				
					
						
							
							
								 
						
							
								596a22325a 
								
							 
						 
						
							
							
								
								Fix prologue of power9 assembly cdot(c) kernel to provide cdotc  
							
							
							
						 
						
							2019-09-27 00:47:18 +02:00  
				
					
						
							
							
								 
						
							
								7f58f3ad0e 
								
							 
						 
						
							
							
								
								Fix mis-edits in the gcc-derived power8 caxpy kernel  
							
							
							
						 
						
							2019-09-27 00:44:26 +02:00  
				
					
						
							
							
								 
						
							
								673e5a0495 
								
							 
						 
						
							
							
								
								Replace several POWER8/9 C kernels with their gcc7-generated assembly versions ( #2263 )  
							
							... 
							
							
							
							* Add gcc7-generated assembly files for POWER8/9 isa/ica-min/max and POWER9 caxpy
To work around internal compiler errors encountered when compiling the original C source with gcc 4 and 5, and wrong code generated by gcc 8.3.0
* Use gcc-generated assembly instead of original C sources
to work around internal compiler errors encountered with gcc 4.8/5.4 and wrong code generation by gcc 8.3
* Use gcc-generated assembly instead of the original C source
to work around internal compiler errors encountered with gcc 4.8 and 5.4, and wrong code generation by gcc 8.3
* Add gcc7-generated assembler version of caxpy for power8
to work around wrong code generated by gcc 8.3
* Handle CONJ define for caxpyc
* Handle CONJ define for caxpyc
* Add gcc7-generated assembly cdot for POWER9
* Use prebuilt assembly for POWER9 cdot
created with gcc 7.3.1 to work around ICE in older gcc versions
* Exclude POWER9 from DYNAMIC_ARCH when the gcc version is lower than 6
* Update Makefile.system
* Use PROLOGUE macro to ensure correct function name for DYNAMIC_ARCH
* Disable POWER9 with old gcc versions 
							
						 
						
							2019-09-22 22:35:22 +02:00  
				
					
						
							
							
								 
						
							
								e7c4d6705a 
								
							 
						 
						
							
							
								
								Revert  #2051  and replace with a better fix ( #2261 )  
							
							... 
							
							
							
							* Revert #2051  and add a better fix for TARGET=generic with DYNAMIC_ARCH
fixes  #2257  without breaking #2048  again 
							
						 
						
							2019-09-17 18:56:04 +02:00  
				
					
						
							
							
								 
						
							
								f3c314550c 
								
							 
						 
						
							
							
								
								Merge pull request  #2243  from quickwritereader/develop  
							
							... 
							
							
							
							possible cgemv,caxpy,cdot fix 
							
						 
						
							2019-08-30 23:06:23 +02:00  
				
					
						
							
							
								 
						
							
								847c20c9b7 
								
							 
						 
						
							
							
								
								fix uninitialized variable i  
							
							
							
						 
						
							2019-08-30 11:14:55 +00:00  
				
					
						
							
							
								 
						
							
								4c22828812 
								
							 
						 
						
							
							
								
								caxpy and cdot are using vec_vsx_ld  
							
							
							
						 
						
							2019-08-30 04:09:15 +00:00  
				
					
						
							
							
								 
						
							
								e79712d969 
								
							 
						 
						
							
							
								
								cgemv using vec_vsx_ld instead of letting gcc decide  
							
							
							
						 
						
							2019-08-30 02:52:04 +00:00  
				
					
						
							
							
								 
						
							
								be09551cdf 
								
							 
						 
						
							
							
								
								aligned  
							
							
							
						 
						
							2019-08-29 23:22:23 +00:00  
				
					
						
							
							
								 
						
							
								11c59acfb1 
								
							 
						 
						
							
							
								
								Keep both PGI/SUN and default code paths to avoid breaking Clang/Windows  
							
							
							
						 
						
							2019-08-28 18:07:44 +02:00  
				
					
						
							
							
								 
						
							
								3a55dca2dc 
								
							 
						 
						
							
							
								
								Make x86_64 zdot compile with PGI and Sun C again  
							
							... 
							
							
							
							broken by #2222  as CREAL,CIMAG do not expand to a valid lvalue with these compilers 
							
						 
						
							2019-08-28 11:35:31 +02:00  
				
					
						
							
							
								 
						
							
								3dc6b26eff 
								
							 
						 
						
							
							
								
								AIX changes for Power8  
							
							
							
						 
						
							2019-08-20 06:51:35 -05:00  
				
					
						
							
							
								 
						
							
								9ef96b32a6 
								
							 
						 
						
							
							
								
								Add multithreading support to the x86_64 zdot kernel ( #2222 )  
							
							... 
							
							
							
							* Add multithreading support
copied from the ThunderX2T99 kernel. For #2221  
							
						 
						
							2019-08-15 22:09:12 +02:00  
				
					
						
							
							
								 
						
							
								103b32fdb7 
								
							 
						 
						
							
							
								
								Merge pull request  #2216  from martin-frbg/issue2214  
							
							... 
							
							
							
							Remove case-sensitivity in x86 LSAME on (AMD) cpus without CMOV 
							
						 
						
							2019-08-13 13:59:33 +02:00  
				
					
						
							
							
								 
						
							
								aef9804089 
								
							 
						 
						
							
							
								
								Fix unwanted case-sensitivity in x86 LSAME for (AMD) processors without CMOV  
							
							... 
							
							
							
							Problem was already noticed some years ago in #238 , but back then the problem was only corrected in one of the #ifdef branches.
Fixes  #2214  
							
						 
						
							2019-08-13 10:19:10 +02:00  
				
					
						
							
							
								 
						
							
								dccff2e785 
								
							 
						 
						
							
							
								
								Merge pull request  #2206  from martin-frbg/zen-dtrmm  
							
							... 
							
							
							
							Replace vpermpd with vpermilpd in the Haswell DTRMM kernel 
							
						 
						
							2019-08-09 07:55:20 +02:00  
				
					
						
							
							
								 
						
							
								5c3458a6e7 
								
							 
						 
						
							
							
								
								Merge pull request  #2199  from martin-frbg/zen-dtrsm  
							
							... 
							
							
							
							Replace most vpermpd calls in the Haswell DTRSM_RN kernel 
							
						 
						
							2019-08-09 07:55:02 +02:00  
				
					
						
							
							
								 
						
							
								acf6002ab2 
								
							 
						 
						
							
							
								
								Replace most vpermpd calls in the Haswell DTRSM_RN kernel  
							
							
							
						 
						
							2019-08-03 12:40:13 +02:00  
				
					
						
							
							
								 
						
							
								2dfb804cb9 
								
							 
						 
						
							
							
								
								Replace vpermpd with vpermilpd in the Haswell DTRMM kernel  
							
							... 
							
							
							
							to improve performance on AMD Zen (#2180 ) applying wjc404's improvement of the DGEMM kernel from #2186  
							
						 
						
							2019-07-28 23:17:28 +02:00  
				
					
						
							
							
								 
						
							
								4c153ec9da 
								
							 
						 
						
							
							
								
								Merge pull request  #2196  from wjc404/develop  
							
							... 
							
							
							
							Add vbroadcastsd kernel to dgemm_kernel_4x8_haswell.S 
							
						 
						
							2019-07-28 23:11:40 +02:00  
				
					
						
							
							
								 
						
							
								7eecd8e39c 
								
							 
						 
						
							
							
								
								Add files via upload  
							
							
							
						 
						
							2019-07-28 07:39:09 +08:00  
				
					
						
							
							
								 
						
							
								7b0b7c11d2 
								
							 
						 
						
							
							
								
								Merge pull request  #2190  from martin-frbg/zdot-zen  
							
							... 
							
							
							
							Replace vpermpd with vpermilpd in the Haswell/Zen zdot microkernel 
							
						 
						
							2019-07-23 16:15:08 +02:00  
				
					
						
							
							
								 
						
							
								28e96458e5 
								
							 
						 
						
							
							
								
								Replace vpermpd with vpermilpd  
							
							... 
							
							
							
							to improve performance on Zen/Zen2 (as demonstrated by wjc404 in #2180 ) 
							
						 
						
							2019-07-22 08:28:16 +02:00  
				
					
						
							
							
								 
						
							
								95fb98f556 
								
							 
						 
						
							
							
								
								Update dgemm_kernel_4x8_haswell.S  
							
							
							
						 
						
							2019-07-21 01:10:32 +08:00  
				
					
						
							
							
								 
						
							
								4801c6d36b 
								
							 
						 
						
							
							
								
								Update dgemm_kernel_4x8_haswell.S  
							
							
							
						 
						
							2019-07-21 00:47:45 +08:00  
				
					
						
							
							
								 
						
							
								9440fa607d 
								
							 
						 
						
							
							
								
								Add files via upload  
							
							
							
						 
						
							2019-07-20 22:08:22 +08:00  
				
					
						
							
							
								 
						
							
								94db259e5b 
								
							 
						 
						
							
							
								
								Add files via upload  
							
							
							
						 
						
							2019-07-20 22:04:41 +08:00  
				
					
						
							
							
								 
						
							
								f49f8047ac 
								
							 
						 
						
							
							
								
								Add files via upload  
							
							
							
						 
						
							2019-07-20 14:33:37 +08:00  
				
					
						
							
							
								 
						
							
								825777faab 
								
							 
						 
						
							
							
								
								Update dgemm_kernel_4x8_haswell.S  
							
							
							
						 
						
							2019-07-19 23:58:24 +08:00  
				
					
						
							
							
								 
						
							
								9c89757562 
								
							 
						 
						
							
							
								
								Add files via upload  
							
							
							
						 
						
							2019-07-19 23:47:58 +08:00  
				
					
						
							
							
								 
						
							
								9b04baeaee 
								
							 
						 
						
							
							
								
								Update dgemm_kernel_4x8_haswell.S  
							
							
							
						 
						
							2019-07-17 23:50:03 +08:00  
				
					
						
							
							
								 
						
							
								8a074b3965 
								
							 
						 
						
							
							
								
								Update dgemm_kernel_4x8_haswell.S  
							
							
							
						 
						
							2019-07-17 23:47:30 +08:00  
				
					
						
							
							
								 
						
							
								211ab03b14 
								
							 
						 
						
							
							
								
								Update dgemm_kernel_4x8_haswell.S  
							
							
							
						 
						
							2019-07-17 22:39:15 +08:00  
				
					
						
							
							
								 
						
							
								1733f927e6 
								
							 
						 
						
							
							
								
								Update dgemm_kernel_4x8_haswell.S  
							
							
							
						 
						
							2019-07-17 21:27:41 +08:00  
				
					
						
							
							
								 
						
							
								182b06d6ad 
								
							 
						 
						
							
							
								
								Update dgemm_kernel_4x8_haswell.S  
							
							
							
						 
						
							2019-07-17 17:02:35 +08:00  
				
					
						
							
							
								 
						
							
								7a9050d681 
								
							 
						 
						
							
							
								
								Update dgemm_kernel_4x8_haswell.S  
							
							
							
						 
						
							2019-07-17 00:55:06 +08:00  
				
					
						
							
							
								 
						
							
								0ba29fd262 
								
							 
						 
						
							
							
								
								Update dgemm_kernel_4x8_haswell.S for zen2  
							
							... 
							
							
							
							replaced a bunch of vpermpd instructions with vpermilpd and vperm2f128 
							
						 
						
							2019-07-17 00:46:51 +08:00  
				
					
						
							
							
								 
						
							
								6b6c9b1441 
								
							 
						 
						
							
							
								
								Merge pull request  #2172  from quickwritereader/develop  
							
							... 
							
							
							
							power9 cgemm/ctrmm. new sgemm 8x16 
							
						 
						
							2019-07-01 21:06:02 +02:00  
				
					
						
							
							
								 
						
							
								a97b301aaa 
								
							 
						 
						
							
							
								
								cgemm/ctrmm power9  
							
							
							
						 
						
							2019-07-01 14:07:54 +00:00  
				
					
						
							
							
								 
						
							
								eebfeba768 
								
							 
						 
						
							
							
								
								Fix build on FreeBSD/powerpc64.  
							
							... 
							
							
							
							Signed-off-by: Piotr Kubaj <pkubaj@anongoth.pl> 
							
						 
						
							2019-06-25 10:58:56 +02:00  
				
					
						
							
							
								 
						
							
								a575f1e4c7 
								
							 
						 
						
							
							
								
								Update dtrmm_kernel_16x4_power8.S  
							
							
							
						 
						
							2019-06-19 15:27:14 +05:30  
				
					
						
							
							
								 
						
							
								cdbfb891da 
								
							 
						 
						
							
							
								
								new sgemm 8x16  
							
							
							
						 
						
							2019-06-17 15:33:38 +00:00  
				
					
						
							
							
								 
						
							
								a17cf36225 
								
							 
						 
						
							
							
								
								Merge pull request  #2153  from quickwritereader/develop  
							
							... 
							
							
							
							improved power9 zgemm,sgemm 
							
						 
						
							2019-06-06 07:42:56 +02:00  
				
					
						
							
							
								 
						
							
								148c4cc5fd 
								
							 
						 
						
							
							
								
								conflict resolve  
							
							
							
						 
						
							2019-06-05 20:50:50 +00:00  
				
					
						
							
							
								 
						
							
								d0c3543c3f 
								
							 
						 
						
							
							
								
								power9 zgemm ztrmm optimized  
							
							
							
						 
						
							2019-06-05 20:07:16 +00:00  
				
					
						
							
							
								 
						
							
								a469b32cf4 
								
							 
						 
						
							
							
								
								sgemm pipeline improved, zgemm rewritten without inner packs, ABI lxvx v20 fixed with vs52  
							
							
							
						 
						
							2019-06-04 07:11:30 +00:00  
				
					
						
							
							
								 
						
							
								8fe794f059 
								
							 
						 
						
							
							
								
								improved zgemm power9 based on power8  
							
							
							
						 
						
							2019-05-30 15:31:25 +00:00  
				
					
						
							
							
								 
						
							
								74c10b57c6 
								
							 
						 
						
							
							
								
								Use generic kernels for complex (I)AMAX to support softfp  
							
							
							
						 
						
							2019-05-30 11:38:11 +02:00  
				
					
						
							
							
								 
						
							
								c5495d2056 
								
							 
						 
						
							
							
								
								Ensure correct output for DAMAX with softfp  
							
							
							
						 
						
							2019-05-30 11:25:43 +02:00  
				
					
						
							
							
								 
						
							
								c70496b108 
								
							 
						 
						
							
							
								
								Separate implementations of AMAX and IAMAX on arm  
							
							... 
							
							
							
							As noted in #1912 and in a comment on #1942, the combined implementation happens to "do the right thing" on hardfp, but it cannot return both value and index on softfp, where they would have to share the return register 
							
						 
						
							2019-05-29 15:02:51 +02:00  
				
					
						
							
							
								 
						
							
								9ea30f3788 
								
							 
						 
						
							
							
								
								Replace ISMIN and ISAMIN kernels on all x86_64 platforms ( #2125 )  
							
							... 
							
							
							
							* Mark iamax_sse.S as unsuitable for MIN due to issue #2116 
* Use iamax.S rather than iamax_sse.S for ISMIN/ISAMIN on all x86_64 as workaround for #2116  
							
						 
						
							2019-05-09 14:42:36 +02:00  
				
					
						
							
							
								 
						
							
								6a8b4269b5 
								
							 
						 
						
							
							
								
								Merge pull request  #2111  from martin-frbg/issue1955  
							
							... 
							
							
							
							Disable the SkyLakeX DGEMMIxCOPY kernels as well 
							
						 
						
							2019-05-05 18:08:49 +02:00  
				
					
						
							
							
								 
						
							
								b1561ecc68 
								
							 
						 
						
							
							
								
								Disable DGEMMINCOPY as well for now  
							
							... 
							
							
							
							#1955  
						
							2019-05-05 15:52:01 +02:00  
				
					
						
							
							
								 
						
							
								7ed8431527 
								
							 
						 
						
							
							
								
								Disable the SkyLakeX DGEMMITCOPY kernel as well  
							
							... 
							
							
							
							as a stopgap measure for https://github.com/numpy/numpy/issues/13401  as mentioned in #1955  
							
						 
						
							2019-05-04 22:54:41 +02:00  
				
					
						
							
							
								 
						
							
								3f427c0cf9 
								
							 
						 
						
							
							
								
								Merge pull request  #2107  from quickwritereader/develop  
							
							... 
							
							
							
							sgemm/strmm kernel for power9 
							
						 
						
							2019-05-02 07:56:57 +02:00  
				
					
						
							
							
								 
						
							
								47f892198c 
								
							 
						 
						
							
							
								
								conflict resolve  
							
							
							
						 
						
							2019-05-01 19:36:22 +00:00  
				
					
						
							
							
								 
						
							
								628b335e83 
								
							 
						 
						
							
							
								
								Merge branch 'develop' of  https://github.com/quickwritereader/OpenBLAS  into develop  
							
							
							
						 
						
							2019-04-29 08:57:44 +00:00  
				
					
						
							
							
								 
						
							
								0f105dd8a5 
								
							 
						 
						
							
							
								
								sgemm/strmm  
							
							
							
						 
						
							2019-04-29 08:49:50 +00:00  
				
					
						
							
							
								 
						
							
								ccfb7ead15 
								
							 
						 
						
							
							
								
								Merge pull request  #2072  from martin-frbg/sum  
							
							... 
							
							
							
							Add (C)BLAS extension ?sum 
							
						 
						
							2019-04-23 20:11:36 +02:00  
				
					
						
							
							
								 
						
							
								bcdf1d4917 
								
							 
						 
						
							
							
								
								Add in runtime CPU detection for POWER.  
							
							
							
						 
						
							2019-04-09 14:20:16 +10:00  
				
					
						
							
							
								 
						
							
								c04a729081 
								
							 
						 
						
							
							
								
								Add ?sum definitions for generic kernel  
							
							
							
						 
						
							2019-03-31 13:55:49 +02:00  
				
					
						
							
							
								 
						
							
								100d94f94e 
								
							 
						 
						
							
							
								
								Add ?sum  
							
							
							
						 
						
							2019-03-31 13:55:05 +02:00  
				
					
						
							
							
								 
						
							
								246ca29679 
								
							 
						 
						
							
							
								
								Add ZARCH implementation of ?sum  
							
							... 
							
							
							
							as trivial copies of the respective ?asum kernels with the ABS and vflpsb calls removed 
							
						 
						
							2019-03-30 22:49:05 +01:00  
				
					
						
							
							
								 
						
							
								9d717cb5ee 
								
							 
						 
						
							
							
								
								Add x86_64 implementation of ?sum  
							
							... 
							
							
							
							as trivial copy of ?asum with the fabs calls removed 
							
						 
						
							2019-03-30 22:27:04 +01:00  
				
					
						
							
							
								 
						
							
								e3bc83f2a8 
								
							 
						 
						
							
							
								
								Add x86 implementation of ?sum  
							
							... 
							
							
							
							as trivial copy of ?asum with the fabs calls removed 
							
						 
						
							2019-03-30 22:26:10 +01:00  
				
					
						
							
							
								 
						
							
								70f2a4e0d7 
								
							 
						 
						
							
							
								
								Add SPARC implementation of ?sum  
							
							... 
							
							
							
							as trivial copy of ?asum with the fabs replaced by fmov to preserve code structure 
							
						 
						
							2019-03-30 22:25:06 +01:00  
				
					
						
							
							
								 
						
							
								706dfe263b 
								
							 
						 
						
							
							
								
								Add POWER implementation of ?sum  
							
							... 
							
							
							
							as trivial copy of ?asum with the fabs replaced by fmr to preserve code structure 
							
						 
						
							2019-03-30 22:23:42 +01:00  
				
					
						
							
							
								 
						
							
								688fa9201c 
								
							 
						 
						
							
							
								
								Add MIPS64 implementation of ?sum  
							
							... 
							
							
							
							as trivial copy of ?asum with the fabs replaced by mov to preserve code structure 
							
						 
						
							2019-03-30 22:22:15 +01:00  
				
					
						
							
							
								 
						
							
								cdbe0f0235 
								
							 
						 
						
							
							
								
								Add MIPS implementation of ?sum  
							
							... 
							
							
							
							as trivial copy of ?asum with the fabs calls removed 
							
						 
						
							2019-03-30 22:20:14 +01:00  
				
					
						
							
							
								 
						
							
								f8b82bc6dc 
								
							 
						 
						
							
							
								
								Add ia64 implementation of ?sum  
							
							... 
							
							
							
							as trivial copy of asum with the fabs calls removed 
							
						 
						
							2019-03-30 22:18:03 +01:00  
				
					
						
							
							
								 
						
							
								3e3ccb9011 
								
							 
						 
						
							
							
								
								Add ARM64 implementations of ?sum  
							
							... 
							
							
							
							as trivial copies of the respective ?asum kernels with the fabs calls removed 
							
						 
						
							2019-03-30 22:13:36 +01:00  
				
					
						
							
							
								 
						
							
								94ab4e6fb2 
								
							 
						 
						
							
							
								
								Add ARM implementations of ?sum  
							
							... 
							
							
							
							(trivial copies of the respective ?asum with the fabs calls removed) 
							
						 
						
							2019-03-30 22:11:38 +01:00  
				
					
						
							
							
								 
						
							
								c3cfc6986b 
								
							 
						 
						
							
							
								
								Add implementations of ssum/dsum and csum/zsum  
							
							... 
							
							
							
							as trivial copies of asum/zsasum with the fabs calls replaced by fmov to preserve code structure 
							
						 
						
							2019-03-30 22:05:11 +01:00  
				
					
						
							
							
								 
						
							
								b9f4943a14 
								
							 
						 
						
							
							
								
								Add ?sum  
							
							
							
						 
						
							2019-03-30 22:01:13 +01:00  
				
					
						
							
							
								 
						
							
								32c7063cb0 
								
							 
						 
						
							
							
								
								Merge pull request  #2061  from martin-frbg/martin-frbg-patch-1  
							
							... 
							
							
							
							Disable the AVX512 DGEMM kernel (again) 
							
						 
						
							2019-03-30 21:21:38 +01:00  
				
					
						
							
							
								 
						
							
								7c51cc8527 
								
							 
						 
						
							
							
								
								Merge branch 'develop' into develop  
							
							
							
						 
						
							2019-03-29 19:36:29 +01:00  
				
					
						
							
							
								 
						
							
								853a18bc17 
								
							 
						 
						
							
							
								
								power9 makefile. dgemm based on power8 kernel with following changes : 32x unrolled 16x4 kernel and 8x4 kernel using (lxv stxv butterfly rank1 update). improvement from 17 to 22-23gflops. dtrmm cases were added into dgemm itself  
							
							
							
						 
						
							2019-03-29 15:49:40 +00:00  
				
					
						
							
							
								 
						
							
								e608d4f7fe 
								
							 
						 
						
							
							
								
								Disable the AVX512 DGEMM kernel (again)  
							
							... 
							
							
							
							Due to as yet unresolved errors seen in #1955  and #2029  
							
						 
						
							2019-03-13 22:10:28 +01:00  
				
					
						
							
							
								 
						
							
								03d7110900 
								
							 
						 
						
							
							
								
								Merge pull request  #2042  from maomao194313/develop  
							
							... 
							
							
							
							add TARGET support for HiSilicon tsv110 CPUs 
							
						 
						
							2019-03-12 22:57:39 +01:00  
				
					
						
							
							
								 
						
							
								f18ab6c17b 
								
							 
						 
						
							
							
								
								Merge pull request  #2051  from martin-frbg/issue2048  
							
							... 
							
							
							
							Make TARGET=GENERIC compatible with DYNAMIC_ARCH=1 
							
						 
						
							2019-03-09 16:39:35 +01:00  
				
					
						
							
							
								 
						
							
								5b95534afc 
								
							 
						 
						
							
							
								
								Make TARGET=GENERIC compatible with DYNAMIC_ARCH=1  
							
							... 
							
							
							
							for issue #2048  
							
						 
						
							2019-03-09 11:21:16 +01:00  
				
					
						
							
							
								 
						
							
								b7f59da42d 
								
							 
						 
						
							
							
								
								Fix crash in sgemm SSE/nano kernel on x86_64  
							
							... 
							
							
							
							Fix bug #2047.
Signed-off-by: Celelibi <celelibi@gmail.com> 
							
						 
						
							2019-03-07 16:55:13 +01:00  
				
					
						
							
							
								 
						
							
								783ba8058f 
								
							 
						 
						
							
							
								
								HiSilicon tsv110 CPUs optimization branch  
							
							... 
							
							
							
							add HiSilicon tsv110 CPUs  optimization branch 
							
						 
						
							2019-03-04 16:30:50 +08:00  
				
					
						
							
							
								 
						
							
								6eee1beac5 
								
							 
						 
						
							
							
								
								move fix to right place  
							
							
							
						 
						
							2019-02-24 20:41:02 +02:00  
				
					
						
							
							
								 
						
							
								e12cdf58ef 
								
							 
						 
						
							
							
								
								Merge pull request  #2024  from martin-frbg/gcc9fixes4  
							
							... 
							
							
							
							Fix inline assembly constraints in Bulldozer TRSM kernels 
							
						 
						
							2019-02-17 11:49:15 +01:00  
				
					
						
							
							
								 
						
							
								1860c9456d 
								
							 
						 
						
							
							
								
								Merge pull request  #2023  from martin-frbg/gcc9fixes3  
							
							... 
							
							
							
							Fix inline assembly constraints in various x86_64 GEMVN kernels 
							
						 
						
							2019-02-17 11:48:57 +01:00  
				
					
						
							
							
								 
						
							
								f9bb76d29a 
								
							 
						 
						
							
							
								
								Fix inline assembly constraints in Bulldozer TRSM kernels  
							
							... 
							
							
							
							rework indices to allow marking i,as and bs as both input and output (marked operand n1 as well for simplicity). For #2009  
							
						 
						
							2019-02-16 20:06:48 +01:00  
				
					
						
							
							
								 
						
							
								efb9038f72 
								
							 
						 
						
							
							
								
								Fix inline assembly constraints  
							
							
							
						 
						
							2019-02-16 18:46:17 +01:00  
				
					
						
							
							
								 
						
							
								e976557d29 
								
							 
						 
						
							
							
								
								Fix inline assembly constraints  
							
							... 
							
							
							
							rework indices to allow marking argument lda as input and output. 
							
						 
						
							2019-02-16 18:36:39 +01:00  
				
					
						
							
							
								 
						
							
								9d8be15789 
								
							 
						 
						
							
							
								
								Fix inline assembly constraints  
							
							... 
							
							
							
							rework indices to allow marking argument lda4 as input and output. For #2009  
							
						 
						
							2019-02-16 18:24:11 +01:00  
				
					
						
							
							
								 
						
							
								d752799a0f 
								
							 
						 
						
							
							
								
								Merge pull request  #2021  from martin-frbg/gcc9fixes2  
							
							... 
							
							
							
							Fix wrong constraints in inline assembly of Haswell DTRSM kernel 
							
						 
						
							2019-02-16 18:05:40 +01:00  
				
					
						
							
							
								 
						
							
								c26c0b77a7 
								
							 
						 
						
							
							
								
								Fix wrong constraints in inline assembly  
							
							... 
							
							
							
							for #2009  
							
						 
						
							2019-02-15 15:08:16 +01:00  
				
					
						
							
							
								 
						
							
								1c6da2d03c 
								
							 
						 
						
							
							
								
								Merge pull request  #2019  from martin-frbg/gcc9fixes  
							
							... 
							
							
							
							Fix unannounced modification of input operand 8 (lda4) in Haswell GEMVN microkernel 
							
						 
						
							2019-02-15 15:02:54 +01:00  
				
					
						
							
							
								 
						
							
								4255a58cd2 
								
							 
						 
						
							
							
								
								Rename operands to put lda on the input/output constraint list  
							
							
							
						 
						
							2019-02-15 10:10:04 +01:00  
				
					
						
							
							
								 
						
							
								46e415b140 
								
							 
						 
						
							
							
								
								Save and restore input argument 8 (lda4)  
							
							... 
							
							
							
							Fixes miscompilation with gcc9 -ftree-vectorize (related to issue #2009 ) 
							
						 
						
							2019-02-14 22:43:18 +01:00  
				
					
						
							
							
								 
						
							
								69a97ca7b9 
								
							 
						 
						
							
							
								
								dgemv_kernel_4x4(Haswell): add missing clobbers for xmm0,xmm1,xmm2,xmm3  
							
							... 
							
							
							
							This fixes a crash in dblat2 when OpenBLAS is compiled using
-march=znver1 -ftree-vectorize -O2
See also:
https://github.com/easybuilders/easybuild-easyconfigs/issues/7180  
							
						 
						
							2019-02-14 16:27:58 +00:00  
				
					
						
							
							
								 
						
							
								056917d616 
								
							 
						 
						
							
							
								
								Merge pull request  #2013  from martin-frbg/issue2011  
							
							... 
							
							
							
							Fix invalid memory access in PPC gemm_beta 
							
						 
						
							2019-02-14 09:29:34 +01:00  
				
					
						
							
							
								 
						
							
								718efcec6f 
								
							 
						 
						
							
							
								
								Fix out-of-bounds memory access in gemm_beta  
							
							... 
							
							
							
							Fixes  #2011  (as suggested by davemq), assuming typo by K.Goto 
						
							2019-02-13 22:08:37 +01:00  
				
					
						
							
							
								 
						
							
								f9d67bb5e8 
								
							 
						 
						
							
							
								
								Fix out-of-bounds memory access in gemm_beta  
							
							... 
							
							
							
							Fixes  #2011  (as suggested by davemq) presuming typo by K.Goto 
						
							2019-02-13 22:06:41 +01:00  
				
					
						
							
							
								 
						
							
								76bb74fcd4 
								
							 
						 
						
							
							
								
								Merge pull request  #2012  from maamountki/z14  
							
							... 
							
							
							
							[ZARCH] Many improvements 
							
						 
						
							2019-02-13 20:15:56 +01:00  
				
					
						
							
							
								 
						
							
								0a54c98b9d 
								
							 
						 
						
							
							
								
								[ZARCH] Modify constraints  
							
							
							
						 
						
							2019-02-13 21:06:25 +02:00  
				
					
						
							
							
								 
						
							
								bec54ae366 
								
							 
						 
						
							
							
								
								[ZARCH] Fix caxpy  
							
							
							
						 
						
							2019-02-13 12:54:35 +02:00  
				
					
						
							
							
								 
						
							
								ab1630f9fa 
								
							 
						 
						
							
							
								
								Fix declaration of arguments in inline assembly  
							
							... 
							
							
							
							Argument 0 is modified so should be input and output 
							
						 
						
							2019-02-12 16:14:02 +01:00  
				
					
						
							
							
								 
						
							
								b824fa70eb 
								
							 
						 
						
							
							
								
								Fix declaration of assembly arguments in SSYMV and DSYMV microkernels  
							
							... 
							
							
							
							Arguments 0 and 1 are both input and output 
							
						 
						
							2019-02-12 16:00:18 +01:00  
				
					
						
							
							
								 
						
							
								91481a3e4e 
								
							 
						 
						
							
							
								
								Fix declaration of input arguments in inline assembly  
							
							... 
							
							
							
							Argument 0 is modified as it doubles as a counter 
							
						 
						
							2019-02-12 15:51:43 +01:00  
				
					
						
							
							
								 
						
							
								dc6ac9eab0 
								
							 
						 
						
							
							
								
								Fix declaration of input arguments in the x86_64 s/dGEMV_T and s/dGEMV_N kernels  
							
							... 
							
							
							
							Arguments 0 and 1 need to be tagged as both input and output 
							
						 
						
							2019-02-12 15:33:48 +01:00  
				
					
						
							
							
								 
						
							
								f583674109 
								
							 
						 
						
							
							
								
								[ZARCH] Fix cgemv_t_4  
							
							
							
						 
						
							2019-02-12 13:12:28 +02:00  
				
					
						
							
							
								 
						
							
								77fe70019f 
								
							 
						 
						
							
							
								
								[ZARCH] Fix constraints and source code formatting  
							
							
							
						 
						
							2019-02-11 16:01:13 +02:00  
				
					
						
							
							
								 
						
							
								7039770165 
								
							 
						 
						
							
							
								
								[ZARCH] Undo the last commit  
							
							
							
						 
						
							2019-02-06 20:11:44 +02:00  
				
					
						
							
							
								 
						
							
								11a43e8116 
								
							 
						 
						
							
							
								
								[ZARCH] Set alignment hint for vl/vst  
							
							
							
						 
						
							2019-02-05 19:17:08 +02:00  
				
					
						
							
							
								 
						
							
								61526480f9 
								
							 
						 
						
							
							
								
								[ZARCH] Fix copy constraint  
							
							
							
						 
						
							2019-02-05 07:51:19 +02:00  
				
					
						
							
							
								 
						
							
								81daf6bc38 
								
							 
						 
						
							
							
								
								[ZARCH] Format source code, Fix constraints  
							
							
							
						 
						
							2019-02-05 07:30:38 +02:00  
				
					
						
							
							
								 
						
							
								729e925174 
								
							 
						 
						
							
							
								
								Merge pull request  #1996  from quickwritereader/develop  
							
							... 
							
							
							
							NBMAX=4096 for gemvn, added sgemvn 8x8 for future 
							
						 
						
							2019-02-04 16:52:04 +01:00  
				
					
						
							
							
								 
						
							
								498ac98581 
								
							 
						 
						
							
							
								
								Note for unused kernels  
							
							
							
						 
						
							2019-02-04 15:41:56 +00:00  
				
					
						
							
							
								 
						
							
								cd9ea45463 
								
							 
						 
						
							
							
								
								NBMAX=4096 for gemvn, added sgemvn 8x8 for future  
							
							
							
						 
						
							2019-02-04 06:57:11 +00:00  
				
					
						
							
							
								 
						
							
								f9c5023e04 
								
							 
						 
						
							
							
								
								Merge pull request  #1994  from quickwritereader/develop  
							
							... 
							
							
							
							sgemv cgemv pairs 
							
						 
						
							2019-02-01 21:04:47 +01:00  
				
					
						
							
							
								 
						
							
								4abc375a91 
								
							 
						 
						
							
							
								
								sgemv cgemv pairs  
							
							
							
						 
						
							2019-02-01 13:45:00 +00:00  
				
					
						
							
							
								 
						
							
								874df65491 
								
							 
						 
						
							
							
								
								Fix incorrect sgemv results for IBM z14  
							
							... 
							
							
							
							part of PR #1993  that was inadvertently misplaced into the toplevel directory 
							
						 
						
							2019-02-01 12:58:59 +01:00  
				
					
						
							
							
								 
						
							
								877023e1e1 
								
							 
						 
						
							
							
								
								Fix precision of zarch DSDOT  
							
							... 
							
							
							
							from patch provided by aarnez in #991  
							
						 
						
							2019-01-31 21:22:26 +01:00  
				
					
						
							
							
								 
						
							
								265142edd5 
								
							 
						 
						
							
							
								
								Fix typo in the zarch min/max kernels  
							
							... 
							
							
							
							from patch provided by aarnez in #991  
							
						 
						
							2019-01-31 21:21:40 +01:00  
				
					
						
							
							
								 
						
							
								885a3c4350 
								
							 
						 
						
							
							
								
								USE_TRMM on Z14  
							
							... 
							
							
							
							from patch provided by aarnez in #991  
							
						 
						
							2019-01-31 21:18:09 +01:00  
				
					
						
							
							
								 
						
							
								82124729af 
								
							 
						 
						
							
							
								
								Merge branch 'develop' into z14  
							
							
							
						 
						
							2019-01-31 19:36:41 +02:00  
				
					
						
							
							
								 
						
							
								29416cb5a3 
								
							 
						 
						
							
							
								
								[ZARCH] Add Z13 version for max/min functions  
							
							
							
						 
						
							2019-01-31 19:11:11 +02:00  
				
					
						
							
							
								 
						
							
								48b9b94f7f 
								
							 
						 
						
							
							
								
								[ZARCH] Improve loading performance for camax/icamax  
							
							
							
						 
						
							2019-01-31 18:52:11 +02:00  
				
					
						
							
							
								 
						
							
								86a824c97f 
								
							 
						 
						
							
							
								
								Fix wrong comparison that made IMIN identical to IMAX  
							
							... 
							
							
							
							as reported by aarnez in #1990  
							
						 
						
							2019-01-31 15:27:21 +01:00  
				
					
						
							
							
								 
						
							
								808410c2c7 
								
							 
						 
						
							
							
								
								Fix wrong comparison that made IMIN identical to IMAX  
							
							... 
							
							
							
							as suggested in #1990  
							
						 
						
							2019-01-31 15:25:15 +01:00  
				
					
						
							
							
								 
						
							
								fcd814a8d2 
								
							 
						 
						
							
							
								
								[ZARCH] Fix bug in max/min functions  
							
							
							
						 
						
							2019-01-29 17:59:38 +02:00  
				
					
						
							
							
								 
						
							
								dc4d3bccd5 
								
							 
						 
						
							
							
								
								[ZARCH] Fix icamax/icamin  
							
							
							
						 
						
							2019-01-29 03:47:49 +02:00  
				
					
						
							
							
								 
						
							
								c7143c1019 
								
							 
						 
						
							
							
								
								[ZARCH] Fix iamax/imax single precision  
							
							
							
						 
						
							2019-01-28 17:52:23 +02:00  
				
					
						
							
							
								 
						
							
								04873bb174 
								
							 
						 
						
							
							
								
								[ZARCH] Undo the last commit  
							
							
							
						 
						
							2019-01-28 17:32:24 +02:00  
				
					
						
							
							
								 
						
							
								c8ef9fb220 
								
							 
						 
						
							
							
								
								[ZARCH] Fix bug in iamax/iamin/imax/imin  
							
							
							
						 
						
							2019-01-28 17:16:18 +02:00  
				
					
						
							
							
								 
						
							
								b111829226 
								
							 
						 
						
							
							
								
								[ZARCH] Update max/min functions  
							
							
							
						 
						
							2019-01-21 15:56:04 +02:00  
				
					
						
							
							
								 
						
							
								32b0f1168e 
								
							 
						 
						
							
							
								
								Fix declaration of input arguments in the Sandybridge GER microkernels ( #1967 )  
							
							... 
							
							
							
							* Tag arguments 0 and 1 as both input and output 
							
						 
						
							2019-01-18 08:11:39 +01:00  
				
					
						
							
							
								 
						
							
								b495e54310 
								
							 
						 
						
							
							
								
								Fix declaration of input arguments in the x86_64 SCAL microkernels ( #1966 )  
							
							... 
							
							
							
							* Tag arguments 0 and 1 as both input and output (see #1964 ) 
							
						 
						
							2019-01-18 08:11:07 +01:00  
				
					
						
							
							
								 
						
							
								d5e6940253 
								
							 
						 
						
							
							
								
								Fix declaration of input arguments in the x86_64 microkernels for DOT and AXPY ( #1965 )  
							
							... 
							
							
							
							* Tag operands 0 and 1 as both input and output
For #1964  (basically a continuation of coding problems first seen in #1292 ) 
							
						 
						
							2019-01-17 23:20:32 +01:00  
				
					
						
							
							
								 
						
							
								43a4572038 
								
							 
						 
						
							
							
								
								crot fix  
							
							
							
						 
						
							2019-01-17 14:45:31 +00:00  
				
					
						
							
							
								 
						
							
								a034e65512 
								
							 
						 
						
							
							
								
								Merge branch 'develop' into develop  
							
							
							
						 
						
							2019-01-16 19:25:13 +04:00  
				
					
						
							
							
								 
						
							
								8c3386be87 
								
							 
						 
						
							
							
								
								Added missing BLAS1 single fp {saxpy, caxpy, cdot, crot (refactored version of srot), isamax, isamin, icamax, icamin}  
							
							... 
							
							
							
							Fixed idamin, icamin choosing the first occurrence index of equal minima 
							
						 
						
							2019-01-16 15:16:21 +00:00  
				
					
						
							
							
								 
						
							
								b815a04c87 
								
							 
						 
						
							
							
								
								[ZARCH] fix a bug in max/min functions  
							
							
							
						 
						
							2019-01-15 21:04:22 +02:00  
				
					
						
							
							
								 
						
							
								1a7925b3a3 
								
							 
						 
						
							
							
								
								[ZARCH] Update dgemv_n_4.c  
							
							
							
						 
						
							2019-01-11 17:43:11 +02:00  
				
					
						
							
							
								 
						
							
								406f835f00 
								
							 
						 
						
							
							
								
								[ZARCH] update cgemv_n_4.c  
							
							
							
						 
						
							2019-01-11 17:39:17 +02:00  
				
					
						
							
							
								 
						
							
								621dedb37b 
								
							 
						 
						
							
							
								
								[ZARCH] Update cgemv_t_4.c  
							
							
							
						 
						
							2019-01-11 17:37:11 +02:00  
				
					
						
							
							
								 
						
							
								b731e8246f 
								
							 
						 
						
							
							
								
								Update sgemv_t_4.c  
							
							
							
						 
						
							2019-01-11 17:14:04 +02:00  
				
					
						
							
							
								 
						
							
								ecc31b743f 
								
							 
						 
						
							
							
								
								Update dgemv_t_4.c  
							
							
							
						 
						
							2019-01-11 17:13:02 +02:00  
				
					
						
							
							
								 
						
							
								5d89d6b143 
								
							 
						 
						
							
							
								
								[ZARCH] fix sgemv_n_4.c  
							
							
							
						 
						
							2019-01-11 17:08:24 +02:00  
				
					
						
							
							
								 
						
							
								67432b23c2 
								
							 
						 
						
							
							
								
								[ZARCH] fix cgemv_n_4.c  
							
							
							
						 
						
							2019-01-11 16:44:46 +02:00  
				
					
						
							
							
								 
						
							
								be66f5d5c2 
								
							 
						 
						
							
							
								
								[ZARCH] fix data prefetch type in sdot  
							
							
							
						 
						
							2019-01-09 16:50:07 +02:00  
				
					
						
							
							
								 
						
							
								c2ffef8156 
								
							 
						 
						
							
							
								
								[ZARCH] fix data prefetch type in ddot  
							
							
							
						 
						
							2019-01-09 16:49:44 +02:00  
				
					
						
							
							
								 
						
							
								e7455f500c 
								
							 
						 
						
							
							
								
								[ZARCH] fix dsdot.c  
							
							
							
						 
						
							2019-01-09 16:33:54 +02:00  
				
					
						
							
							
								 
						
							
								3eafcfa650 
								
							 
						 
						
							
							
								
								[ZARCH] fix cgemv_n_4.c  
							
							
							
						 
						
							2019-01-09 07:43:45 +02:00  
				
					
						
							
							
								 
						
							
								94cd946b96 
								
							 
						 
						
							
							
								
								[ZARCH] fix cgemv_n_4.c  
							
							
							
						 
						
							2019-01-04 17:45:56 +02:00  
				
					
						
							
							
								 
						
							
								1aa840a0a2 
								
							 
						 
						
							
							
								
								[ZARCH] fix sgemv_t_4.c  
							
							
							
						 
						
							2019-01-04 01:38:18 +02:00  
				
					
						
							
							
								 
						
							
								795285c587 
								
							 
						 
						
							
							
								
								Fix thinko in skylake beta handling  
							
							... 
							
							
							
							casting ints is cheaper but it has a rounding, not memory casting effect, resulting in
invalid outcome 
							
						 
						
							2018-12-24 18:49:50 +00:00  
				
					
						
							
							
								 
						
							
								d321448a63 
								
							 
						 
						
							
							
								
								dgemm: use dgemm_ncopy_8_skylakex.c also for Haswell  
							
							... 
							
							
							
							The dgemm_ncopy_8_skylakex.c code is not avx512 specific and gives
a nice performance boost for medium sized matrices 
							
						 
						
							2018-12-16 23:09:22 +00:00  
				
					
						
							
							
								 
						
							
								c43331ad0a 
								
							 
						 
						
							
							
								
								dgemm: Use the skylakex beta function also for haswell  
							
							... 
							
							
							
							it's more efficient for certain tall/skinny matrices 
							
						 
						
							2018-12-16 23:09:17 +00:00  
				
					
						
							
							
								 
						
							
								c4e23dd016 
								
							 
						 
						
							
							
								
								Update Makefile  
							
							
							
						 
						
							2018-12-16 18:14:40 +01:00  
				
					
						
							
							
								 
						
							
								cfc4acc221 
								
							 
						 
						
							
							
								
								typo  
							
							
							
						 
						
							2018-12-16 16:19:51 +01:00  
				
					
						
							
							
								 
						
							
								545c2b1bbb 
								
							 
						 
						
							
							
								
								Add -mavx2 on Haswell only if the compiler supports it  
							
							
							
						 
						
							2018-12-16 13:09:19 +01:00  
				
					
						
							
							
								 
						
							
								69d206440a 
								
							 
						 
						
							
							
								
								Make the skylakex/haswell sgemm code compile and run even with compilers without avx2 support  
							
							
							
						 
						
							2018-12-16 00:19:41 +00:00  
				
					
						
							
							
								 
						
							
								3843e3e017 
								
							 
						 
						
							
							
								
								use -mavx2 on haswell  
							
							
							
						 
						
							2018-12-15 23:30:31 +01:00  
				
					
						
							
							
								 
						
							
								fbcb14a74b 
								
							 
						 
						
							
							
								
								should be core-avx2  
							
							
							
						 
						
							2018-12-15 20:18:59 +01:00  
				
					
						
							
							
								 
						
							
								2a3190dc76 
								
							 
						 
						
							
							
								
								fix elseifeq and use older option core2-avx for compatibility  
							
							
							
						 
						
							2018-12-15 20:17:44 +01:00  
				
					
						
							
							
								 
						
							
								1ebe5c0f49 
								
							 
						 
						
							
							
								
								Add -march=haswell to HASWELL part of DYNAMIC_ARCH build  
							
							
							
						 
						
							2018-12-15 19:35:35 +01:00  
				
					
						
							
							
								 
						
							
								0586899a10 
								
							 
						 
						
							
							
								
								Use sgemm_ncopy_4_skylakex.c also for Haswell  
							
							... 
							
							
							
							sgemm_ncopy_4_skylakex.c uses SSE transpose operations where the
real perf win happens; this also works great for Haswell.
This gives double digit percentage gains on small and skinny matrices 
							
						 
						
							2018-12-15 13:49:19 +00:00  
				
					
						
							
							
								 
						
							
								00dc09ad19 
								
							 
						 
						
							
							
								
								Use the skylake sgemm beta code also for haswell  
							
							... 
							
							
							
							with a few small changes it's possible to use the skylake sgemm code
also for haswell, this gives a modest gain (10% range) for smallish
matrixes but does wonders for very skinny matrixes 
							
						 
						
							2018-12-15 13:49:13 +00:00  
				
					
						
							
							
								 
						
							
								cdc668d82b 
								
							 
						 
						
							
							
								
								Add a "sgemm direct" mode for small matrixes  
							
							... 
							
							
							
							OpenBLAS has a fancy algorithm for copying the input data while laying
it out in a more CPU friendly memory layout.
This is great for large matrixes; the cost of the copy is easily
amortized by the gains from the better memory layout.
But for small matrixes (on CPUs that can do efficient unaligned loads) this
copy can be a net loss.
This patch adds (for SKYLAKEX initially) a "sgemm direct" mode, that bypasses
the whole copy machinery for ALPHA=1/BETA=0/... standard arguments,
for small matrixes only.
What is small? For the non-threaded case this has been measured to be
in the M*N*K = 28 * 512 * 512 range, while in the threaded case it's
less, around M*N*K = 1 * 512 * 512 
							
						 
						
							2018-12-13 13:47:31 +00:00  
				
					
						
							
							
								 
						
							
								87718807f0 
								
							 
						 
						
							
							
								
								Merge pull request  #1910  from martin-frbg/issue1909  
							
							... 
							
							
							
							Fix for DYNAMIC_ARCH builds made on an AVX512-capable host 
							
						 
						
							2018-12-12 14:56:25 +01:00  
				
					
						
							
							
								 
						
							
								51aec8e96b 
								
							 
						 
						
							
							
								
								make sure the added march=skylake-avx512 does not cause problems on Windows  
							
							
							
						 
						
							2018-12-11 22:47:32 +01:00  
				
					
						
							
							
								 
						
							
								06f7d78d70 
								
							 
						 
						
							
							
								
								Add -march=skylake-avx512 to SkylakeX part of DYNAMIC_ARCH builds  
							
							
							
						 
						
							2018-12-11 21:10:38 +01:00  
				
					
						
							
							
								 
						
							
								7639f2e1f0 
								
							 
						 
						
							
							
								
								Rewrite the conditional for OSX to fix cmake parsing on others  
							
							... 
							
							
							
							The Makefile variable parser in utils.cmake currently does not handle conditionals. Having the definitions for non-OSX last will at least make cmake builds work again on non-OSX platforms. 
							
						 
						
							2018-12-06 14:04:27 +01:00  
				
					
						
							
							
								 
						
							
								2fc712469d 
								
							 
						 
						
							
							
								
								Avoid creating spurious non-suffixed c/zgemm_kernels  
							
							... 
							
							
							
							Plain cgemm_kernel and zgemm_kernel are not used anywhere, only cgemm_kernel_b etc.
Needlessly building them (without any define like NN, CN, etc.) just happened to work on most platforms, but not on arm64. See #1870  
							
						 
						
							2018-12-06 13:56:06 +01:00  
				
					
						
							
							
								 
						
							
								6ba30e270d 
								
							 
						 
						
							
							
								
								Fix  typo that broke CNRM2 on ARMV8 since 0.3.0  
							
							... 
							
							
							
							must have happened in my #1449  
							
						 
						
							2018-12-06 13:42:25 +01:00  
				
					
						
							
							
								 
						
							
								701ea88347 
								
							 
						 
						
							
							
								
								Use p2align instead of align for OSX compatibility  
							
							... 
							
							
							
							fixes  #1902  
						
							2018-12-03 13:06:43 +01:00  
				
					
						
							
							
								 
						
							
								6c7b691083 
								
							 
						 
						
							
							
								
								Really revert xDOT changes from 1832  
							
							... 
							
							
							
							neglected to rebase #1892  on merging 
							
						 
						
							2018-11-30 21:32:01 +01:00  
				
					
						
							
							
								 
						
							
								5f4c550c27 
								
							 
						 
						
							
							
								
								Merge pull request  #1892  from martin-frbg/mipsdot  
							
							... 
							
							
							
							revert MIPS64 xDOT kernel changes from #1832  
							
						 
						
							2018-11-30 21:28:21 +01:00  
				
					
						
							
							
								 
						
							
								95a5542e3c 
								
							 
						 
						
							
							
								
								Revert DOT kernel changes from  #1834  
							
							... 
							
							
							
							as the failures seen on Loongson3A appear to be limited to DSDOT/SDSDOT (i.e. my hackish "fix" from #1684 ) 
							
						 
						
							2018-11-30 11:16:24 +01:00  
				
					
						
							
							
								 
						
							
								7a2e1bc804 
								
							 
						 
						
							
							
								
								Use generic kernel for DSDOT/SDSDOT  
							
							... 
							
							
							
							as discussed in #1834  
							
						 
						
							2018-11-30 10:57:09 +01:00  
				
					
						
							
							
								 
						
							
								35653e38b3 
								
							 
						 
						
							
							
								
								Merge pull request  #1834  from fengrl/develop  
							
							... 
							
							
							
							register push/pop command change 
							
						 
						
							2018-11-30 10:48:46 +01:00  
				
					
						
							
							
								 
						
							
								19c4bdd8b3 
								
							 
						 
						
							
							
								
								Add return value so that freebsd system clang does not err out  
							
							
							
						 
						
							2018-11-25 21:35:01 +01:00  
				
					
						
							
							
								 
						
							
								310ea55f29 
								
							 
						 
						
							
							
								
								Simplifying ARMv8 build parameters  
							
							... 
							
							
							
							ARMv8 builds were a bit mixed up, with ThunderX2 code in ARMv8 mode
(which is not right because TX2 is ARMv8.1) as well as requiring a few
redundancies in the defines, making it harder to maintain and understand
what core has what. A few other minor issues were also fixed.
Tests were made on the following cores: A53, A57, A72, Falkor, ThunderX,
ThunderX2, and XGene.
Tests were: OpenBLAS/test, OpenBLAS/benchmark, BLAS-Tester.
A summary:
 * Removed TX2 code from ARMv8 build, to make sure it is compatible with
   all ARMv8 cores, not just v8.1. Also, the TX2 code has actually
   harmed performance on big cores.
 * Commoned up ARMv8 architectures' defines in params.h, to make sure
   that all will benefit from ARMv8 settings, in addition to their own.
 * Adding a few more cores, using ARMv8's include strategy, to benefit
   from compiler optimisations using mtune. Also updated cache
   information from the manuals, making sure we set good conservative
   values by default. Removed Vulcan, as it's an alias to TX2.
 * Auto-detecting most of those cores, but also updating the forced
   compilation in getarch.c, to make sure the parameters are the same
   whether compiled natively or forced arch.
Benefits:
 * ARMv8 build is now guaranteed to work on all ARMv8 cores
 * Improved performance for ARMv8 builds on some cores (A72, Falkor,
   ThunderX1 and 2: up to 11%) over current develop
 * Improved performance for *all* cores comparing to develop branch
   before TX2's patch (9% ~ 36%)
 * ThunderX1 builds are 14% faster than ARMv8 on TX1, 9% faster than
   current develop's branch and 8% faster than develop before tx2 patches
Issues:
 * Regression from current develop branch for A53 (-12%) and A57 (-3%)
   with ARMv8 builds, but still faster than before TX2's commit (+15%
   and +24% respectively). This can be improved with a simplification of
   TX2's code, to be done in future patches. At least the code is
   guaranteed to be ARMv8.0 now.
Comments:
 * CortexA57 builds are unchanged on A57 hardware from develop's branch,
   which makes sense, as it's untouched.
 * CortexA72 builds improve over A57 on A72 hardware, even if they're
   using the same includes due to new compiler tuning in the makefile. 
							
						 
						
							2018-11-19 16:41:49 +00:00  
				
					
						
							
							
								 
						
							
								43bb386b10 
								
							 
						 
						
							
							
								
								fix dot problem on 64bit mips  
							
							
							
						 
						
							2018-11-15 11:11:59 +08:00  
				
					
						
							
							
								 
						
							
								dcc5d6291e 
								
							 
						 
						
							
							
								
								skylakex: Make the sgemm/dgemm beta code robust for a N=0 or M=0 case  
							
							... 
							
							
							
							in the threading code there are cases where N or M can become 0,
and the optimized beta code did not handle this well, leading
to a crash
during the audit for the crash a few edge conditions on the if statements
were found and fixed as well 
							
						 
						
							2018-11-01 01:42:09 +00:00  
				
					
						
							
							
								 
						
							
								2d8064174c 
								
							 
						 
						
							
							
								
								register push/pop command change  
							
							... 
							
							
							
							64bit push/pop register command should be used. Otherwise, data will be lost. 
							
						 
						
							2018-10-26 17:55:15 +08:00  
				
					
						
							
							
								 
						
							
								d5aeff636f 
								
							 
						 
						
							
							
								
								ARM64: Enable DYNAMIC_ARCH  
							
							... 
							
							
							
							Enable DYNAMIC_ARCH feature on ARM64. This patch uses the cpuid
feature in linux kernel to detect the core type at runtime
(https://www.kernel.org/doc/Documentation/arm64/cpu-feature-registers.txt ).
If this feature is missing in kernel, then the user should use the
OPENBLAS_CORETYPE env variable to select the desired core type. 
							
						 
						
							2018-10-22 01:49:35 -07:00  
				
					
						
							
							
								 
						
							
								e7b66cd36e 
								
							 
						 
						
							
							
								
								ARM64: Fix DYNAMIC_ARCH compilation for cores which dont use GEMM3M  
							
							
							
						 
						
							2018-10-22 01:45:51 -07:00  
				
					
						
							
							
								 
						
							
								d50abc8903 
								
							 
						 
						
							
							
								
								ARM64: Move parameters from parameter.c to param.h  
							
							... 
							
							
							
							Remove the runtime setting of P, Q, R parameters for
targets ARMV8, THUNDERX2T99. Instead set them as constants
in param.h at compile time. 
							
						 
						
							2018-10-22 01:45:51 -07:00  
				
					
						
							
							
								 
						
							
								351a0c777c 
								
							 
						 
						
							
							
								
								ARM64: Remove XGENE1 references  
							
							... 
							
							
							
							Remove XGENE1 target as the implementation for the
same is incomplete. Moreover whoever wishes to use
on XGENE1 can use the generic ARMV8 target as there
are no XGENE1 specific optimizations in OpenBLAS. 
							
						 
						
							2018-10-22 01:45:51 -07:00  
				
					
						
							
							
								 
						
							
								21f46a1cf2 
								
							 
						 
						
							
							
								
								ARM64: Use THUNDERX2T99 Neon Kernels for ARMV8  
							
							... 
							
							
							
							Currently the generic ARMV8 target uses C implementations
for many routines. Replace these with the neon implementations
written for THUNDERX2T99 target which are up to 6x faster for
certain routines. 
							
						 
						
							2018-10-17 10:44:37 -07:00  
				
					
						
							
							
								 
						
							
								caf339412f 
								
							 
						 
						
							
							
								
								ARM64: Remove dependency of THUNDERX2T99 Makefile on CORTEXA57 Makefile  
							
							
							
						 
						
							2018-10-17 08:02:40 -07:00  
				
					
						
							
							
								 
						
							
								8001fdcd2a 
								
							 
						 
						
							
							
								
								ARM64: Remove dependency of THUNDERX Makefile on ARMV8 Makefile  
							
							
							
						 
						
							2018-10-17 08:02:16 -07:00  
				
					
						
							
							
								 
						
							
								162e312832 
								
							 
						 
						
							
							
								
								ARM64: Remove dependency of CORTEXA57 Makefile on ARMV8 Makefile  
							
							
							
						 
						
							2018-10-17 08:01:45 -07:00  
				
					
						
							
							
								 
						
							
								c3d93caa8d 
								
							 
						 
						
							
							
								
								ARM64: Remove dependency of XGENE1 Makefile on ARMV8 Makefile  
							
							
							
						 
						
							2018-10-17 08:01:27 -07:00  
				
					
						
							
							
								 
						
							
								55b244ca0d 
								
							 
						 
						
							
							
								
								enable the SGEMM/SKX C based kernel  
							
							... 
							
							
							
							In QA the final bug was found so now the skylakex sgemm C based kernel can
be activated.... 
							
						 
						
							2018-10-12 09:30:35 +00:00  
				
					
						
							
							
								 
						
							
								d4bad73834 
								
							 
						 
						
							
							
								
								Add a C+intrinsics version of the SGEMM/skylakex kernel  
							
							... 
							
							
							
							for most sizes this is 1.2x to 1.4x faster than the current code 
							
						 
						
							2018-10-10 01:49:22 +00:00  
				
					
						
							
							
								 
						
							
								582c589727 
								
							 
						 
						
							
							
								
								dgemm/skylakex: replace discrete mul/add with fma  
							
							... 
							
							
							
							very minor gains since it's not super hot code, but general principles 
							
						 
						
							2018-10-06 23:13:26 +00:00  
				
					
						
							
							
								 
						
							
								adbf6afa25 
								
							 
						 
						
							
							
								
								Add vector optimizations for ncopy as well for dgemm/skylakex  
							
							
							
						 
						
							2018-10-06 21:18:12 +00:00  
				
					
						
							
							
								 
						
							
								32bec8afbb 
								
							 
						 
						
							
							
								
								add a skylakex optimized dgemm beta function  
							
							
							
						 
						
							2018-10-06 16:36:26 +00:00  
				
					
						
							
							
								 
						
							
								20c5d668fe 
								
							 
						 
						
							
							
								
								dgemm/avx512 simplify and speed up the 4x4 kernel  
							
							
							
						 
						
							2018-10-06 14:12:32 +00:00  
				
					
						
							
							
								 
						
							
								6d43c51ccf 
								
							 
						 
						
							
							
								
								undo slow dgemm/skylake microoptimization  
							
							... 
							
							
							
							the compare is more costly than the work 
							
						 
						
							2018-10-06 14:00:37 +00:00  
				
					
						
							
							
								 
						
							
								d74dc39b0f 
								
							 
						 
						
							
							
								
								Add optimized *copy versions for skylakex  
							
							... 
							
							
							
							Add optimized n/t copy versions for skylakex; in the patch the
tcopy is also rewritten using intrinsics; the ncopy file
will be worked on in a future commit 
							
						 
						
							2018-10-06 13:51:44 +00:00  
				
					
						
							
							
								 
						
							
								66b43affbc 
								
							 
						 
						
							
							
								
								Add a 24x8 kernel to the skylakex dgemm implementation  
							
							... 
							
							
							
							Minor gains for small matrixes, but at 512x512 and above the gain
gets more significant. 
							
						 
						
							2018-10-05 13:22:21 +00:00  
				
					
						
							
							
								 
						
							
								1938819c25 
								
							 
						 
						
							
							
								
								skylake dgemm: Add a 16x8 kernel  
							
							... 
							
							
							
							The next step for the avx512 dgemm code is adding a 16x8 kernel.
In the 8x8 kernel, each FMA has a matching load (the broadcast);
in the 16x8 kernel we can reuse this load for 2 FMAs, which
in turn reduces pressure on the load ports of the CPU and gives
a nice performance boost (in the 25% range). 
							
						 
						
							2018-10-05 13:11:35 +00:00  
				
					
						
							
							
								 
						
							
								b7496c3638 
								
							 
						 
						
							
							
								
								Function name needs to be CNAME, set from outside to allow suffixing for dynamic_arch  
							
							
							
						 
						
							2018-10-04 19:14:59 +02:00  
				
					
						
							
							
								 
						
							
								45fe8cb0c5 
								
							 
						 
						
							
							
								
								Create a AVX512 enabled version of DGEMM  
							
							... 
							
							
							
							This patch adds dgemm_kernel_4x8_skylakex.c which is
* dgemm_kernel_4x8_haswell.s converted to C + intrinsics
* 8x8 support added
* 8x8 kernel implemented using AVX512
Performance is a work in progress, but already shows a 10% - 20%
increase for a wide range of matrix sizes. 
							
						 
						
							2018-10-03 14:45:25 +00:00  
				
					
						
							
							
								 
						
							
								544b069e85 
								
							 
						 
						
							
							
								
								Merge pull request  #1780  from martin-frbg/issue1774-2  
							
							... 
							
							
							
							Convert fldmia/fstmia instructions to UAL syntax for clang7 
							
						 
						
							2018-09-29 09:27:47 +02:00  
				
					
						
							
							
								 
						
							
								9b2a7ad40d 
								
							 
						 
						
							
							
								
								Convert fldmia/fstmia instructions to UAL syntax for clang7  
							
							... 
							
							
							
							second part of fix for #1774 , containing files missed in #1775  
							
						 
						
							2018-09-28 23:05:15 +02:00  
				
					
						
							
							
								 
						
							
								6fc85a6359 
								
							 
						 
						
							
							
								
								test_axpy work error on LOONGSON3A platform  #1777  
							
							
							
						 
						
							2018-09-26 15:14:04 +08:00  
				
					
						
							
							
								 
						
							
								7e5df34e6a 
								
							 
						 
						
							
							
								
								Convert fldmia/fstmia instructions to UAL syntax for clang7  
							
							... 
							
							
							
							fixes  #1774  
						
							2018-09-25 09:41:58 +02:00  
				
					
						
							
							
								 
						
							
								1e531701b7 
								
							 
						 
						
							
							
								
								fix small typo  
							
							
							
						 
						
							2018-09-09 16:52:25 +02:00  
				
					
						
							
							
								 
						
							
								ba4f433321 
								
							 
						 
						
							
							
								
								Merge pull request  #1749  from martin-frbg/issue1531  
							
							... 
							
							
							
							Fix ARMV8 cross-compilation for IOS 
							
						 
						
							2018-09-07 11:02:01 +02:00  
				
					
						
							
							
								 
						
							
								1cb7b9015e 
								
							 
						 
						
							
							
								
								Conditional compilation of assembly files that IOS does not like  
							
							
							
						 
						
							2018-09-04 11:06:51 +02:00  
				
					
						
							
							
								 
						
							
								a4bd41e9f2 
								
							 
						 
						
							
							
								
								Fix paths to C kernels for nrm2  
							
							
							
						 
						
							2018-09-04 10:51:19 +02:00  
				
					
						
							
							
								 
						
							
								e11126b26a 
								
							 
						 
						
							
							
								
								Merge pull request  #1745  from martin-frbg/issue1743  
							
							... 
							
							
							
							Set USE_TRMM for all ZARCH variants to fix TRMM faults with zarch-gen… 
							
						 
						
							2018-08-29 07:43:58 +02:00  
				
					
						
							
							
								 
						
							
								f3fd44a731 
								
							 
						 
						
							
							
								
								Set USE_TRMM for all ZARCH variants to fix TRMM faults with zarch-generic  
							
							... 
							
							
							
							fixes  #1743  
						
							2018-08-28 21:34:07 +02:00  
				
					
						
							
							
								 
						
							
								e6c0e39492 
								
							 
						 
						
							
							
								
								Optimize Zgemv  
							
							
							
						 
						
							2018-08-13 12:23:40 +03:00  
				
					
						
							
							
								 
						
							
								375dff54fc 
								
							 
						 
						
							
							
								
								Merge pull request  #1733  from fenrus75/dsymv  
							
							... 
							
							
							
							Add an AVX512 enabled DSYMV (L) function 
							
						 
						
							2018-08-12 18:18:36 +02:00  
				
					
						
							
							
								 
						
							
								a5f165275a 
								
							 
						 
						
							
							
								
								Merge pull request  #1732  from fenrus75/dgemv  
							
							... 
							
							
							
							Add an AVX512 enabled DGEMV (n)  function 
							
						 
						
							2018-08-12 18:17:42 +02:00  
				
					
						
							
							
								 
						
							
								8c13aa495a 
								
							 
						 
						
							
							
								
								Merge pull request  #1730  from fenrus75/fix-sdot  
							
							... 
							
							
							
							Fix typo in sdot function 
							
						 
						
							2018-08-12 18:17:01 +02:00  
				
					
						
							
							
								 
						
							
								9bec34cb67 
								
							 
						 
						
							
							
								
								Add an AVX512 enabled DSYMV (L) function  
							
							... 
							
							
							
							written in C intrinsics for best readability.
(the same C code works for Haswell as well)
For logistical reasons the code falls back to the existing
haswell AVX2 implementation if the GCC or LLVM compiler is not new enough 
							
						 
						
							2018-08-11 17:46:24 +00:00  
				
					
						
							
							
								 
						
							
								87bebdbd8a 
								
							 
						 
						
							
							
								
								Add an AVX512 enabled DGEMV (n)  function  
							
							... 
							
							
							
							written in C intrinsics for best readability.
(the same C code works for Haswell as well)
For logistical reasons the code falls back to the existing
haswell AVX2 implementation if the GCC or LLVM compiler is not new enough 
							
						 
						
							2018-08-11 17:38:12 +00:00  
				
					
						
							
							
								 
						
							
								36add7570a 
								
							 
						 
						
							
							
								
								Fix typo in sdot function  
							
							... 
							
							
							
							it looks like my previous pull request was short the final commit;
fix a typo in sdot 
							
						 
						
							2018-08-11 17:16:45 +00:00  
				
					
						
							
							
								 
						
							
								cacacc8007 
								
							 
						 
						
							
							
								
								Add an AVX512 enabled DSCAL function  
							
							... 
							
							
							
							written in C intrinsics for best readability.
(the same C code works for Haswell as well)
For logistical reasons the code falls back to the existing
haswell AVX2 implementation if the GCC or LLVM compiler is not new enough 
							
						 
						
							2018-08-11 17:14:57 +00:00  
				
					
						
							
							
								 
						
							
								1a00ef3d27 
								
							 
						 
						
							
							
								
								Merge pull request  #1725  from fenrus75/axpy  
							
							... 
							
							
							
							Add a AVX512 enabled SAXPY/DAXPY functions 
							
						 
						
							2018-08-11 11:01:20 +02:00  
				
					
						
							
							
								 
						
							
								2e99873ff7 
								
							 
						 
						
							
							
								
								Add a AVX512 enabled SAXPY/DAXPY functions  
							
							... 
							
							
							
							written in C intrinsics for best readability.
(the same C code works for Haswell as well)
For logistical reasons the code falls back to the existing
haswell AVX2 implementation if the GCC or LLVM compiler is not new enough 
							
						 
						
							2018-08-10 02:58:32 +00:00  
				
					
						
							
							
								 
						
							
								00abaa865b 
								
							 
						 
						
							
							
								
								Add an AVX512 enabled SDOT function  
							
							... 
							
							
							
							written in C intrinsics for best readability.
(the same C code works for Haswell as well)
For logistical reasons the code falls back to the existing
haswell AVX2 implementation if the GCC or LLVM compiler is not new enough 
							
						 
						
							2018-08-10 02:33:43 +00:00  
				
					
						
							
							
								 
						
							
								7932ff3ea9 
								
							 
						 
						
							
							
								
								Add an AVX512 enabled DDOT function  
							
							... 
							
							
							
							written in C intrinsics for best readability.
(the same C code works for Haswell as well)
For logistical reasons the code falls back to the existing
haswell AVX2 implementation if the GCC or LLVM compiler is not new enough 
							
						 
						
							2018-08-09 03:55:52 +00:00  
				
					
						
							
							
								 
						
							
								23229011db 
								
							 
						 
						
							
							
								
								[ZARCH] Z14 support, BLAS 1/2 single precision implementations, Some missing double precision implementations, Gemv optimization  
							
							
							
						 
						
							2018-08-06 18:20:40 +03:00  
				
					
						
							
							
								 
						
							
								4e103c822c 
								
							 
						 
						
							
							
								
								typo fix  
							
							
							
						 
						
							2018-07-16 12:56:39 +02:00  
				
					
						
							
							
								 
						
							
								d2142760e0 
								
							 
						 
						
							
							
								
								Fix precision problem in DSDOT  
							
							
							
						 
						
							2018-07-15 17:11:40 +02:00  
				
					
						
							
							
								 
						
							
								2fbfc64da8 
								
							 
						 
						
							
							
								
								Use C kernels for default c/zAXPY, xROT, c/zSWAP  
							
							
							
						 
						
							2018-07-15 17:09:55 +02:00  
				
					
						
							
							
								 
						
							
								ba8388cee0 
								
							 
						 
						
							
							
								
								Merge pull request  #1651  from martin-frbg/avx512-nodgemm  
							
							... 
							
							
							
							Disable the 16x2 DTRMM kernel on SkylakeX as well 
							
						 
						
							2018-06-30 17:48:03 +02:00  
				
					
						
							
							
								 
						
							
								6e54b0a027 
								
							 
						 
						
							
							
								
								Disable the 16x2 DTRMM kernel on SkylakeX as well  
							
							
							
						 
						
							2018-06-30 17:31:06 +02:00  
				
					
						
							
							
								 
						
							
								40c8cbc3bf 
								
							 
						 
						
							
							
								
								Merge pull request  #1650  from martin-frbg/avx512-nodgemm  
							
							... 
							
							
							
							Disable the AVX512 DGEMM kernel for now 
							
						 
						
							2018-06-30 13:05:46 +02:00  
				
					
						
							
							
								 
						
							
								f0a8dc2eec 
								
							 
						 
						
							
							
								
								Disable the AVX512 DGEMM kernel for now  
							
							... 
							
							
							
							due to #1643  
							
						 
						
							2018-06-30 11:34:48 +02:00  
				
					
						
							
							
								 
						
							
								b83e4c60c7 
								
							 
						 
						
							
							
								
								Remove premature exit for INC_X or INC_Y zero  
							
							
							
						 
						
							2018-06-26 20:46:42 +02:00  
				
					
						
							
							
								 
						
							
								e344db269b 
								
							 
						 
						
							
							
								
								Remove premature exit for INC_X or INC_Y zero  
							
							
							
						 
						
							2018-06-26 20:45:57 +02:00  
				
					
						
							
							
								 
						
							
								545b82efd3 
								
							 
						 
						
							
							
								
								Remove premature exit for INC_X or INC_Y zero  
							
							
							
						 
						
							2018-06-26 20:45:00 +02:00  
				
					
						
							
							
								 
						
							
								e322a951fe 
								
							 
						 
						
							
							
								
								Remove premature exit for INC_X or INC_Y zero  
							
							
							
						 
						
							2018-06-26 20:44:13 +02:00  
				
					
						
							
							
								 
						
							
								c628c6fa59 
								
							 
						 
						
							
							
								
								Merge pull request  #1612  from oon3m0oo/cpus  
							
							... 
							
							
							
							Fixed a few more unnecessary calls to num_cpu_avail. 
							
						 
						
							2018-06-14 16:51:31 +02:00  
				
					
						
							
							
								 
						
							
								6f71c0fce4 
								
							 
						 
						
							
							
								
								Return a somewhat sane default value for L2 cache size if cpuid retur… ( #1611 )  
							
							... 
							
							
							
							* Return a somewhat sane default value for L2 cache size if cpuid returned something unexpected
Fixes  #1610 , the KVM hypervisor on Google Chromebooks returning zero for CPUID  0x80000006, causing DYNAMIC_ARCH
builds of OpenBLAS to hang 
							
						 
						
							2018-06-11 13:26:19 +02:00  
				
					
						
							
							
								 
						
							
								c2545b0fd6 
								
							 
						 
						
							
							
								
								Fixed a few more unnecessary calls to num_cpu_avail.  
							
							... 
							
							
							
							I don't have as many benchmarks for these as for gemm, but it should still
make a difference for small matrices. 
							
						 
						
							2018-06-11 10:17:16 +01:00  
				
					
						
							
							
								 
						
							
								89372e0993 
								
							 
						 
						
							
							
								
								Use AVX512 also for DGEMM  
							
							... 
							
							
							
							this required switching to the generic gemm_beta code (which is faster anyway on SKX)
for both DGEMM and SGEMM
Performance for the not-retuned version is in the 30% range 
							
						 
						
							2018-06-03 22:17:27 +00:00  
				
					
						
							
							
								 
						
							
								0023515733 
								
							 
						 
						
							
							
								
								Typo fix (misplaced parenthesis)  
							
							
							
						 
						
							2018-06-03 13:22:59 +02:00  
				
					
						
							
							
								 
						
							
								99c7bba8e4 
								
							 
						 
						
							
							
								
								Initial support for SkylakeX / AVX512  
							
							... 
							
							
							
							This patch adds the basic infrastructure for adding the SkylakeX (Intel Skylake server)
target. The SkylakeX target will use the AVX512 (AVX512VL level) instruction set,
which brings 2 basic things:
1) 512 bit wide SIMD (2x width of AVX2)
2) 32 SIMD registers (2x the number on AVX2)
This initial patch only contains a trivial transformation of the Haswell SGEMM kernel
to AVX512VL; more will follow later but this patch aims to get the infrastructure
in place for this "later".
Full performance tuning has not been done yet; with more registers and wider SIMD
it's in theory possible to retune the kernels but even without that there's an
interesting enough performance increase (30-40% range) with just this change. 
							
						 
						
							2018-06-03 07:58:52 +00:00  
				
					
						
							
							
								 
						
							
								8562d5787a 
								
							 
						 
						
							
							
								
								Merge pull request  #1583  from martin-frbg/issue1575  
							
							... 
							
							
							
							Handle INCX=0,INCY=0 case 
							
						 
						
							2018-05-31 21:55:26 +02:00  
				
					
						
							
							
								 
						
							
								7df8c4f76f 
								
							 
						 
						
							
							
								
								typo fix  
							
							
							
						 
						
							2018-05-31 17:23:08 +02:00  
				
					
						
							
							
								 
						
							
								2fc748bf72 
								
							 
						 
						
							
							
								
								Restore optimized swap kernel now that we have a proper fix  
							
							
							
						 
						
							2018-05-31 13:41:12 +02:00  
				
					
						
							
							
								 
						
							
								d1b7be14aa 
								
							 
						 
						
							
							
								
								Handle INCX=0,INCY=0 case  
							
							... 
							
							
							
							Fixes  #1575  (sswap/dswap failing the swap utest on x86) as suggested by atsampson. 
						
							2018-05-31 12:52:04 +02:00  
				
					
						
							
							
								 
						
							
								961d25e9c7 
								
							 
						 
						
							
							
								
								Use the new zrot.c on POWER8 for crot as well  
							
							... 
							
							
							
							fixes  #1571  (the old zrot.S assembly does not handle incx=0 correctly) 
						
							2018-05-23 22:54:39 +02:00  
				
					
						
							
							
								 
						
							
								f5959f2543 
								
							 
						 
						
							
							
								
								Merge pull request  #1567  from martin-frbg/mipstrmm  
							
							... 
							
							
							
							Revert " Switch mips32 target to USE_TRMM to fix complex TRMM" 
							
						 
						
							2018-05-17 20:50:23 +02:00  
				
					
						
							
							
								 
						
							
								82012b960b 
								
							 
						 
						
							
							
								
								Revert " Switch mips32 target to USE_TRMM to fix complex TRMM"  
							
							... 
							
							
							
							... as it was just a silly workaround for the issue seen in #1563 , caused by #1419  
							
						 
						
							2018-05-17 20:30:03 +02:00  
				
					
						
							
							
								 
						
							
								8dd3515fa2 
								
							 
						 
						
							
							
								
								Merge pull request  #1565  from martin-frbg/mipstypo  
							
							... 
							
							
							
							Remove extraneous brace from previous commit of mips dsdot fix 
							
						 
						
							2018-05-17 20:22:58 +02:00  
				
					
						
							
							
								 
						
							
								95f7f0229c 
								
							 
						 
						
							
							
								
								Remove extraneous brace from previous commit  
							
							
							
						 
						
							2018-05-17 18:43:59 +02:00  
				
					
						
							
							
								 
						
							
								5082fe4306 
								
							 
						 
						
							
							
								
								Merge pull request  #1564  from martin-frbg/issue1563  
							
							... 
							
							
							
							Revert changes from PR#1419 
							
						 
						
							2018-05-17 14:04:13 +02:00  
				
					
						
							
							
								 
						
							
								7a7619af6d 
								
							 
						 
						
							
							
								
								Revert changes from PR#1419  
							
							... 
							
							
							
							at least one of these changes apparently is an oversimplification, leading to TRMM breakage on some platforms as observed in #1563  
							
						 
						
							2018-05-17 11:40:08 +02:00  
				
					
						
							
							
								 
						
							
								893b535540 
								
							 
						 
						
							
							
								
								Use correct data type for initializers of v2f64, v4f32  
							
							... 
							
							
							
							Fixes  #1561  
						
							2018-05-15 14:42:12 +02:00  
				
					
						
							
							
								 
						
							
								018f2dad27 
								
							 
						 
						
							
							
								
								Switch mips32 target to USE_TRMM to fix complex TRMM  
							
							
							
						 
						
							2018-05-02 20:25:32 +02:00  
				
					
						
							
							
								 
						
							
								9d5098dbc9 
								
							 
						 
						
							
							
								
								Add MIPS 1004K target (Mediatek MT7621 SOC)  
							
							
							
						 
						
							2018-05-02 20:20:44 +02:00  
				
					
						
							
							
								 
						
							
								954f1832de 
								
							 
						 
						
							
							
								
								Merge pull request  #1540  from martin-frbg/mips32-zasum  
							
							... 
							
							
							
							Fix typo in MIPS P5600 complex ASUM code selection 
							
						 
						
							2018-04-25 23:23:00 +02:00  
				
					
						
							
							
								 
						
							
								941ad280a8 
								
							 
						 
						
							
							
								
								Fix typo in MIPS P5600 complex ASUM code selection  
							
							
							
						 
						
							2018-04-25 22:50:10 +02:00  
				
					
						
							
							
								 
						
							
								1da365312a 
								
							 
						 
						
							
							
								
								Merge pull request  #1538  from martin-frbg/arm7utest  
							
							... 
							
							
							
							Fix handling of zero INCX, INCY in ArmV7 AXPY and ROT 
							
						 
						
							2018-04-25 08:38:58 +02:00  
				
					
						
							
							
								 
						
							
								2d0929fa7c 
								
							 
						 
						
							
							
								
								Move the test for zero incx,incy in ARMV7 ROT  
							
							... 
							
							
							
							to pass the related utest (see #1469 ) 
							
						 
						
							2018-04-24 22:43:00 +02:00  
				
					
						
							
							
								 
						
							
								125343cc88 
								
							 
						 
						
							
							
								
								Drop test for zero incx,incy in armv7 AXPY  
							
							... 
							
							
							
							...to pass the related utest (see #1469 ) 
							
						 
						
							2018-04-24 22:39:50 +02:00  
				
					
						
							
							
								 
						
							
								8a3b6fa108 
								
							 
						 
						
							
							
								
								Use generic zrot.c on ppc64/POWER6 to work around utest failure from … ( #1535 )  
							
							... 
							
							
							
							* Use generic C implementation of zrot on ppc64/POWER6 to work around utest failure from #1469  
							
						 
						
							2018-04-23 19:05:49 +02:00  
				
					
						
							
							
								 
						
							
								9c5518319a 
								
							 
						 
						
							
							
								
								Revert "Fix 32bit HASWELL builds"  
							
							
							
						 
						
							2018-04-22 20:20:04 +02:00  
				
					
						
							
							
								 
						
							
								0ee395db35 
								
							 
						 
						
							
							
								
								Fixed TRMM and SYMM for RISCV  
							
							
							
						 
						
							2018-04-18 18:03:32 -07:00  
				
					
						
							
							
								 
						
							
								c167a3d6f4 
								
							 
						 
						
							
							
								
								Added RISCV build  
							
							
							
						 
						
							2018-04-16 14:08:31 -07:00  
				
					
						
							
							
								 
						
							
								2ca0faf495 
								
							 
						 
						
							
							
								
								Merge pull request  #1515  from martin-frbg/mipsdot  
							
							... 
							
							
							
							Correct precision of mips dsdot 
							
						 
						
							2018-04-11 08:21:25 +02:00  
				
					
						
							
							
								 
						
							
								0fe434598b 
								
							 
						 
						
							
							
								
								Fix precision of mips dsdot  
							
							
							
						 
						
							2018-04-10 23:30:59 +02:00  
				
					
						
							
							
								 
						
							
								c7b55b6082 
								
							 
						 
						
							
							
								
								Merge pull request  #1499  from quickwritereader/develop  
							
							... 
							
							
							
							Implemented missing vsx simd  kernels for power8 blas1/2 double. z13 modifications 
							
						 
						
							2018-03-27 21:43:23 +02:00  
				
					
						
							
							
								 
						
							
								840e01061f 
								
							 
						 
						
							
							
								
								Merge pull request  #1491  from martin-frbg/ddot_mt  
							
							... 
							
							
							
							Add multithreading support for Haswell DDOT 
							
						 
						
							2018-03-27 21:43:05 +02:00  
				
					
						
							
							
								 
						
							
								28ca97015d 
								
							 
						 
						
							
							
								
								power8:Added initial zgemv_(t|n) ,i(d|z)amax,i(d|z)amin,dgemv_t(transposed),zrot  
							
							... 
							
							
							
							z13: improved zgemv_(t|n)_4,zscal,zaxpy 
							
						 
						
							2018-03-27 14:54:41 +00:00  
				
					
						
							
							
								 
						
							
								6a6ffaff1e 
								
							 
						 
						
							
							
								
								Merge pull request  #1494  from martin-frbg/x86_dsdot  
							
							... 
							
							
							
							Use generic/dot.c instead of the inferior arm/dot.c for x86 DSDOT 
							
						 
						
							2018-03-17 15:26:47 +01:00  
				
					
						
							
							
								 
						
							
								28ac9ea5a6 
								
							 
						 
						
							
							
								
								Use generic/dot.c instead of the inferior arm/dot.c for x86 DSDOT  
							
							... 
							
							
							
							to resolve dsdot utest failure seen in #1492  
							
						 
						
							2018-03-17 13:49:15 +01:00  
				
					
						
							
							
								 
						
							
								a55694dd5b 
								
							 
						 
						
							
							
								
								Declare dot_compute static to avoid conflicts in multiarch builds  
							
							
							
						 
						
							2018-03-16 22:23:36 +01:00  
				
					
						
							
							
								 
						
							
								85a41e9cdb 
								
							 
						 
						
							
							
								
								Add multithreading support for Haswell DDOT  
							
							... 
							
							
							
							copied from ashwinyes' implementation in dot_thunderx2t99.c 
							
						 
						
							2018-03-16 16:58:47 +01:00  
				
					
						
							
							
								 
						
							
								81215711a2 
								
							 
						 
						
							
							
								
								Re-enable DAXPY microkernels  for x86_64  
							
							... 
							
							
							
							as the inaccuracies seen in the original testcase for #1332  appear to be due to an artefact that amplifies the very small rounding differences between FMA and discrete multiply+add 
							
						 
						
							2018-03-04 19:37:03 +01:00  
				
					
						
							
							
								 
						
							
								22167170b3 
								
							 
						 
						
							
							
								
								Merge pull request  #1477  from quickwritereader/develop  
							
							... 
							
							
							
							Power8 blas3 copy-pack routines 
							
						 
						
							2018-02-28 18:46:54 +01:00  
				
					
						
							
							
								 
						
							
								fa9ca65c0e 
								
							 
						 
						
							
							
								
								ARM64: Fix utest dsdot errors  
							
							
							
						 
						
							2018-02-27 10:47:55 +00:00  
				
					
						
							
							
								 
						
							
								719b68f077 
								
							 
						 
						
							
							
								
								Merge pull request  #1473  from martin-frbg/p2align  
							
							... 
							
							
							
							Replace .align with .p2align in dscal.c and the Nehalem microkernels as well 
							
						 
						
							2018-02-27 08:28:20 +01:00  
				
					
						
							
							
								 
						
							
								fe9f15f2d8 
								
							 
						 
						
							
							
								
								Merge pull request  #1472  from martin-frbg/utest-fixes  
							
							... 
							
							
							
							Fix limited DSDOT precision on arm,aarch64 and zarch 
							
						 
						
							2018-02-26 22:48:07 +01:00  
				
					
						
							
							
								 
						
							
								497f0c3d8a 
								
							 
						 
						
							
							
								
								Replace .align with .p2align in the Nehalem microkernels  
							
							
							
						 
						
							2018-02-26 20:58:33 +01:00  
				
					
						
							
							
								 
						
							
								ea37db828e 
								
							 
						 
						
							
							
								
								Convert .align to .p2align for OSX compatibility  
							
							
							
						 
						
							2018-02-26 20:48:03 +01:00  
				
					
						
							
							
								 
						
							
								6e70287776 
								
							 
						 
						
							
							
								
								Use generic/dot.c for DSDOT on ARMV5 and above  
							
							... 
							
							
							
							The default arm/dot.c is less precise when used for DSDOT, as shown by utest 
							
						 
						
							2018-02-25 19:57:23 +01:00  
				
					
						
							
							
								 
						
							
								58f236ad73 
								
							 
						 
						
							
							
								
								Use generic/dot.c for DSDOT on zarch  
							
							
							
						 
						
							2018-02-25 19:52:14 +01:00  
				
					
						
							
							
								 
						
							
								e207107150 
								
							 
						 
						
							
							
								
								Use generic/dot.c for DSDOT on z13  
							
							... 
							
							
							
							The implementation in arm/dot.c has lower precision, as shown by the utest for dsdot. 
							
						 
						
							2018-02-25 19:51:25 +01:00  
				
					
						
							
							
								 
						
							
								c9d408064a 
								
							 
						 
						
							
							
								
								Use dot.S also for DSDOT on CORTEXA57  
							
							
							
						 
						
							2018-02-25 19:48:09 +01:00  
				
					
						
							
							
								 
						
							
								288d1a3f6e 
								
							 
						 
						
							
							
								
								Use dot.S also for DSDOT on ARMV8  
							
							
							
						 
						
							2018-02-25 19:45:16 +01:00  
				
					
						
							
							
								 
						
							
								7c1925acec 
								
							 
						 
						
							
							
								
								Use .p2align instead of .align for compatibility on Sandybridge as well  
							
							
							
						 
						
							2018-02-24 19:43:15 +01:00  
				
					
						
							
							
								 
						
							
								2359c7c1a9 
								
							 
						 
						
							
							
								
								Use .p2align instead of .align for portability  
							
							... 
							
							
							
							The OSX assembler apparently mishandles the argument to decimal .align, leading to a significant loss of performance 
as observed in #730 , #901  and most recently #1470  
							
						 
						
							2018-02-24 17:50:13 +01:00  
				
					
						
							
							
								 
						
							
								e7366a4161 
								
							 
						 
						
							
							
								
								Restore the remaining utests ( #1462 )  
							
							... 
							
							
							
							* Restore the remaining utests
* Try fork test on Cygwin and Linux only, it hangs on at least ARMv8/Android as well
* Use generic sswap/dswap kernels for NEHALEM 32bit to fix fault found by the restored swap utest
* Disable zdotu test for MS cl to work around runtime error -1073741819 on AppVeyor for now
(probably coding error in the initialization of the complex numbers or wrong choice of zdotu API) 
							
						 
						
							2018-02-20 10:07:17 +01:00  
				
					
						
							
							
								 
						
							
								2c0a008281 
								
							 
						 
						
							
							
								
								dgemm_ncopy_4_ save/restore  
							
							
							
						 
						
							2018-02-18 01:30:17 +00:00  
				
					
						
							
							
								 
						
							
								c5425daa6b 
								
							 
						 
						
							
							
								
								power8 ?gemm_tcopy save/restore  
							
							
							
						 
						
							2018-02-16 23:36:46 +00:00  
				
					
						
							
							
								 
						
							
								b47e6822aa 
								
							 
						 
						
							
							
								
								Enable most assembly kernels in the generic ARMV8 target  
							
							... 
							
							
							
							ref #1439  
							
						 
						
							2018-02-06 11:42:58 +01:00  
				
					
						
							
							
								 
						
							
								60596a1abc 
								
							 
						 
						
							
							
								
								Merge branch 'develop' into develop  
							
							
							
						 
						
							2018-01-31 16:17:04 -08:00  
				
					
						
							
							
								 
						
							
								afd514c25d 
								
							 
						 
						
							
							
								
								small fix inside ifdef z13mvc . (z13mvc code is not used in production)  
							
							
							
						 
						
							2018-01-31 18:30:59 -05:00  
				
					
						
							
							
								 
						
							
								f45776ec1f 
								
							 
						 
						
							
							
								
								Merge pull request  #1440  from quickwritereader/develop  
							
							... 
							
							
							
							small corrections 
							
						 
						
							2018-01-31 23:48:47 +01:00  
				
					
						
							
							
								 
						
							
								e388459a27 
								
							 
						 
						
							
							
								
								Merge pull request  #1419  from brada4/develop  
							
							... 
							
							
							
							Initialize uninitialized values for repeated calls 
							
						 
						
							2018-01-31 23:48:34 +01:00  
				
					
						
							
							
								 
						
							
								f653e7a18d 
								
							 
						 
						
							
							
								
								small fix  
							
							... 
							
							
							
							small fix inside ifdef z13mvc . (z13mvc code is not used in production) 
							
						 
						
							2018-01-31 07:49:38 -08:00  
				
					
						
							
							
								 
						
							
								f946a89432 
								
							 
						 
						
							
							
								
								zscal (case: real alpha=0 ) mikrokernel shift&mem fix , da_i as input reg. small typo fixes  
							
							
							
						 
						
							2018-01-26 19:25:27 -08:00  
				
					
						
							
							
								 
						
							
								485df77612 
								
							 
						 
						
							
							
								
								Make USE_TRMM depend on TARGET_CORE not TARGET  
							
							... 
							
							
							
							Fixes  #1432  (and possibly other DTRMM-related failures on Haswell and related architectures when built with cmake) 
						
							2018-01-26 23:20:00 +01:00  
				
					
						
							
							
								 
						
							
								e4c71a799a 
								
							 
						 
						
							
							
								
								Merge pull request  #1426  from quickwritereader/develop  
							
							... 
							
							
							
							(Z13 ) Blas1 mikrokernels can be inlined by gcc. Refactoring,fixes,tunings 
							
						 
						
							2018-01-20 17:34:54 +01:00  
				
					
						
							
							
								 
						
							
								2619ad7ea5 
								
							 
						 
						
							
							
								
								Blas1 mikrokernels can be inlined by gcc. Refactoring ( symbolic operan  
							
							... 
							
							
							
							names). Some fixes and tunings 
							
						 
						
							2018-01-19 19:24:35 -08:00  
				
					
						
							
							
								 
						
							
								e5cc3d72c0 
								
							 
						 
						
							
							
								
								core.IdenticalExpr clang501 checker  
							
							
							
						 
						
							2018-01-19 23:17:43 +01:00  
				
					
						
							
							
								 
						
							
								4938faa822 
								
							 
						 
						
							
							
								
								core.IdenticalExpr clang501 checker  
							
							
							
						 
						
							2018-01-19 23:15:58 +01:00  
				
					
						
							
							
								 
						
							
								9fa986337d 
								
							 
						 
						
							
							
								
								add missing brackets to silence indentation warnings gcc721  
							
							
							
						 
						
							2018-01-19 23:11:12 +01:00  
				
					
						
							
							
								 
						
							
								3eed97f6b9 
								
							 
						 
						
							
							
								
								Initialize values to silence cppcheck  
							
							
							
						 
						
							2018-01-12 22:35:00 +01:00  
				
					
						
							
							
								 
						
							
								13e137fbc9 
								
							 
						 
						
							
							
								
								Initialize uninitialized variables (cppcheck)  
							
							
							
						 
						
							2018-01-12 22:33:41 +01:00  
				
					
						
							
							
								 
						
							
								3d23f45107 
								
							 
						 
						
							
							
								
								Merge pull request  #1415  from quickwritereader/develop  
							
							... 
							
							
							
							(Z systems Z13) small fixes, some (i(dz)amin,i(dz)amax,(dz)dot,(dz)asum) mikrokernels… 
							
						 
						
							2018-01-11 08:35:02 +01:00  
				
					
						
							
							
								 
						
							
								87669d1c0a 
								
							 
						 
						
							
							
								
								small fixes, some (i(dz)amin,i(dz)amax,(dz)dot,(dz)asum) mikrokernels can be inlined  
							
							
							
						 
						
							2018-01-10 20:36:53 -05:00  
				
					
						
							
							
								 
						
							
								42285d8e70 
								
							 
						 
						
							
							
								
								Merge pull request  #1410  from brada4/develop  
							
							... 
							
							
							
							Address warnings #1357  
							
						 
						
							2018-01-06 20:02:46 +01:00  
				
					
						
							
							
								 
						
							
								d602b99386 
								
							 
						 
						
							
							
								
								LAPACK helpers in C that need care too  
							
							
							
						 
						
							2018-01-02 14:38:50 +01:00  
				
					
						
							
							
								 
						
							
								4d0b005e5b 
								
							 
						 
						
							
							
								
								Eliminate remaining unused results in kernels (clang5 analyzer)  
							
							
							
						 
						
							2018-01-01 20:54:39 +01:00  
				
					
						
							
							
								 
						
							
								b81656936f 
								
							 
						 
						
							
							
								
								Merge pull request  #1409  from martin-frbg/issue1292-2  
							
							... 
							
							
							
							Tag %1 and %2 as both input and output operands 
							
						 
						
							2017-12-31 20:18:48 +01:00  
				
					
						
							
							
								 
						
							
								b973990df2 
								
							 
						 
						
							
							
								
								Tag %1 and %2 as both input and output operands  
							
							... 
							
							
							
							fix from #1292  extended to the other gemv microkernels 
							
						 
						
							2017-12-31 18:03:36 +01:00  
				
					
						
							
							
								 
						
							
								1e31124eb0 
								
							 
						 
						
							
							
								
								Merge pull request  #1406  from martin-frbg/issue1292  
							
							... 
							
							
							
							Tag %1 and %2 as both input and output 
							
						 
						
							2017-12-30 14:52:03 +01:00  
				
					
						
							
							
								 
						
							
								cc9500db41 
								
							 
						 
						
							
							
								
								Merge pull request  #1403  from brada4/develop  
							
							... 
							
							
							
							Address few more warnings 
							
						 
						
							2017-12-30 14:51:34 +01:00  
				
					
						
							
							
								 
						
							
								723f396a20 
								
							 
						 
						
							
							
								
								Tag %1 and %2 as both input and output  
							
							... 
							
							
							
							The inline assembly modifies its input operands, so mark them as output to avoid surprises with optimization. Fixes  #1292  
							
						 
						
							2017-12-29 23:56:41 +01:00  
				
					
						
							
							
								 
						
							
								03e5ff0687 
								
							 
						 
						
							
							
								
								initialize potentially uninitialized variables (clang5)  
							
							
							
						 
						
							2017-12-26 09:24:24 +01:00  
				
					
						
							
							
								 
						
							
								47deec2c1a 
								
							 
						 
						
							
							
								
								fix couple of dead assignment warnings  
							
							
							
						 
						
							2017-12-22 00:56:35 +01:00  
				
					
						
							
							
								 
						
							
								43c0622e7b 
								
							 
						 
						
							
							
								
								Retire Piledriver/Steamroller/Excavator daxpy microkernels as well  
							
							... 
							
							
							
							related to issue #1332  
							
						 
						
							2017-12-13 18:40:39 +01:00  
				
					
						
							
							
								 
						
							
								0623636c98 
								
							 
						 
						
							
							
								
								Use Sandybridge daxpy kernel on Haswell and Zen for now  
							
							... 
							
							
							
							The testcase from #1332  exposes a problem in daxpy_microk_haswell-2.c that is not seen with
any of the other Intel x86_64 microkernels. 
							
						 
						
							2017-12-10 19:24:31 +01:00  
				
					
						
							
							
								 
						
							
								281a2b952f 
								
							 
						 
						
							
							
								
								warning cleanup ( #1380 )  
							
							... 
							
							
							
							* dead increments in driver/level2
* dead increments in kernel/generic
* part dead increments in kernel/x86_64 
							
						 
						
							2017-12-05 19:54:10 +01:00  
				
					
						
							
							
								 
						
							
								8213385ab8 
								
							 
						 
						
							
							
								
								Work around compiler warnings for unused variables in the generic zgemm3m_Xcopy kernels  
							
							
							
						 
						
							2017-12-02 22:51:58 +01:00  
				
					
						
							
							
								 
						
							
								db00a51e6b 
								
							 
						 
						
							
							
								
								Merge pull request  #1371  from martin-frbg/develop  
							
							... 
							
							
							
							Add trivially optimized DSDOT for POWER8 
							
						 
						
							2017-11-29 19:55:21 +01:00  
				
					
						
							
							
								 
						
							
								7a4b3cfbf8 
								
							 
						 
						
							
							
								
								Add trivially optimized DSDOT for POWER8  
							
							
							
						 
						
							2017-11-28 18:38:07 +01:00  
				
					
						
							
							
								 
						
							
								6c77b5f267 
								
							 
						 
						
							
							
								
								Merge pull request  #1369  from martin-frbg/dsdot  
							
							... 
							
							
							
							Add optimized dsdot to all other x86_64 kernels that use sdot.c 
							
						 
						
							2017-11-28 18:15:31 +01:00  
				
					
						
							
							
								 
						
							
								441a9c8385 
								
							 
						 
						
							
							
								
								more dead increments clang4 scan-build deadcode.deadstores  
							
							
							
						 
						
							2017-11-26 17:24:08 +01:00  
				
					
						
							
							
								 
						
							
								1236dbe5a6 
								
							 
						 
						
							
							
								
								Eliminate 2-8 dead increments code  
							
							
							
						 
						
							2017-11-26 13:26:11 +01:00  
				
					
						
							
							
								 
						
							
								c92cd6d162 
								
							 
						 
						
							
							
								
								Add trivially optimized dsdot based on sdot  
							
							
							
						 
						
							2017-11-24 20:05:27 +01:00  
				
					
						
							
							
								 
						
							
								cae5d9a20b 
								
							 
						 
						
							
							
								
								Add trivially optimized dsdot based on sdot  
							
							
							
						 
						
							2017-11-24 20:04:29 +01:00  
				
					
						
							
							
								 
						
							
								3d891c3106 
								
							 
						 
						
							
							
								
								Add trivially optimized dsdot based on sdot  
							
							
							
						 
						
							2017-11-24 20:03:40 +01:00  
				
					
						
							
							
								 
						
							
								4fbdcfa823 
								
							 
						 
						
							
							
								
								Add trivially optimized dsdot based on sdot  
							
							
							
						 
						
							2017-11-24 20:02:28 +01:00  
				
					
						
							
							
								 
						
							
								1bb6a96ebc 
								
							 
						 
						
							
							
								
								Add trivially optimized dsdot based on sdot  
							
							
							
						 
						
							2017-11-24 20:01:42 +01:00  
				
					
						
							
							
								 
						
							
								6bd163f37a 
								
							 
						 
						
							
							
								
								Add trivially optimized dsdot based on sdot  
							
							
							
						 
						
							2017-11-24 20:00:23 +01:00  
				
					
						
							
							
								 
						
							
								f0333333d1 
								
							 
						 
						
							
							
								
								Add trivially optimized dsdot based on sdot  
							
							
							
						 
						
							2017-11-24 19:59:28 +01:00  
				
					
						
							
							
								 
						
							
								e89b979b2c 
								
							 
						 
						
							
							
								
								fix spurious compiler warning fix (no code change)  
							
							
							
						 
						
							2017-11-24 18:39:04 +01:00  
				
					
						
							
							
								 
						
							
								7e9b29b9b8 
								
							 
						 
						
							
							
								
								fix spurious compiler warning (no code change)  
							
							
							
						 
						
							2017-11-24 18:36:37 +01:00  
				
					
						
							
							
								 
						
							
								6157d0902a 
								
							 
						 
						
							
							
								
								Merge pull request  #1358  from martin-frbg/unused_vars  
							
							... 
							
							
							
							Clean up spurious unused variables in the kernels 
							
						 
						
							2017-11-15 11:31:43 +01:00  
				
					
						
							
							
								 
						
							
								3fea849bbf 
								
							 
						 
						
							
							
								
								Remove unused variables from Haswell dtrmm and Bulldozer dtrsm  
							
							
							
						 
						
							2017-11-14 23:35:10 +01:00  
				
					
						
							
							
								 
						
							
								8f177621bc 
								
							 
						 
						
							
							
								
								Remove unused variables at0...at3 from ?symv_U  
							
							
							
						 
						
							2017-11-14 23:32:25 +01:00