ce329ab686 
								
							 
						 
						
							
							
								
								add sve zhemm copy routines  
							
							
							
						 
						
							2022-01-03 15:56:05 +01:00  
				
					
						
							
							
								 
						
							
								0140373802 
								
							 
						 
						
							
							
								
								add sve ztrmm  
							
							
							
						 
						
							2022-01-02 19:15:33 +01:00  
				
					
						
							
							
								 
						
							
								f7b6912868 
								
							 
						 
						
							
							
								
								ztrmm sve copy kernels  
							
							
							
						 
						
							2021-12-30 21:00:16 +01:00  
				
					
						
							
							
								 
						
							
								40b14e4957 
								
							 
						 
						
							
							
								
								fix zgemm kernel  
							
							
							
						 
						
							2021-12-29 11:42:04 +01:00  
				
					
						
							
							
								 
						
							
								6ec4aab875 
								
							 
						 
						
							
							
								
								zgemm sve copy routines  
							
							
							
						 
						
							2021-12-26 17:05:46 +01:00  
				
					
						
							
							
								 
						
							
								878064f394 
								
							 
						 
						
							
							
								
								sve zgemm kernel  
							
							
							
						 
						
							2021-12-26 08:44:05 +01:00  
				
					
						
							
							
								 
						
							
								683a7548bf 
								
							 
						 
						
							
							
								
								added macros for sve zgemm kernels  
							
							
							
						 
						
							2021-12-25 11:46:41 +01:00  
				
					
						
							
							
								 
						
							
								7b146e590c 
								
							 
						 
						
							
							
								
								fix function typecast  
							
							
							
						 
						
							2021-12-24 20:01:52 +01:00  
				
					
						
							
							
								 
						
							
								e9a0e52201 
								
							 
						 
						
							
							
								
								fix function typecast  
							
							
							
						 
						
							2021-12-24 20:00:50 +01:00  
				
					
						
							
							
								 
						
							
								d1ee6ff73f 
								
							 
						 
						
							
							
								
								fix function typecasts  
							
							
							
						 
						
							2021-12-21 18:45:28 +01:00  
				
					
						
							
							
								 
						
							
								e3c9947c0f 
								
							 
						 
						
							
							
								
								prepare kernel for sve zgemm  
							
							
							
						 
						
							2021-12-21 11:19:27 +01:00  
				
					
						
							
							
								 
						
							
								8d9b9c6b2a 
								
							 
						 
						
							
							
								
								loongarch64: Optimize dgemm_kernel  
							
							
							
						 
						
							2021-12-21 09:33:06 +08:00  
				
					
						
							
							
								 
						
							
								92b7b949dd 
								
							 
						 
						
							
							
								
								fix bug in zscal function  
							
							... 
							
							
							
							memset can not be used in zscal because of
the stride parameters.
Signed-off-by: Wu Zhigang <zhigang.wu@starfivetech.com> 
							
						 
						
							2021-12-15 01:23:30 -08:00  
				
					
						
							
							
								 
						
							
								b0a590f4fe 
								
							 
						 
						
							
							
								
								Merge pull request  #3475  from wjc404/optimize-A53-dgemm  
							
							... 
							
							
							
							optimize cgemm on ARM cortex A53 & cortex A55 
							
						 
						
							2021-12-12 19:09:08 +01:00  
				
					
						
							
							
								 
						
							
								f4d1f0333b 
								
							 
						 
						
							
							
								
								Merge pull request  #3474  from rafaelcfsousa/rafael/cmake_power  
							
							... 
							
							
							
							Add CMake support for Power 
							
						 
						
							2021-12-12 19:08:27 +01:00  
				
					
						
							
							
								 
						
							
								b610d2de37 
								
							 
						 
						
							
							
								
								optimize cgemm on ARM cortex A53 & cortex A55  
							
							
							
						 
						
							2021-12-12 17:22:52 +08:00  
				
					
						
							
							
								 
						
							
								697e2752d7 
								
							 
						 
						
							
							
								
								Merge pull request  #3464  from binebrank/arm_sve_sgemm  
							
							... 
							
							
							
							Add sgemm part for Arm SVE 
							
						 
						
							2021-12-11 20:35:22 +01:00  
				
					
						
							
							
								 
						
							
								a8f62a347b 
								
							 
						 
						
							
							
								
								fix UNROLL_MN and add to targets for SVE  
							
							
							
						 
						
							2021-12-11 16:37:23 +01:00  
				
					
						
							
							
								 
						
							
								774267fdac 
								
							 
						 
						
							
							
								
								adjust Makefile.L3 for SVE  
							
							
							
						 
						
							2021-12-11 16:35:08 +01:00  
				
					
						
							
							
								 
						
							
								23a7561353 
								
							 
						 
						
							
							
								
								Fix error cmake (small kernels)  
							
							
							
						 
						
							2021-12-09 09:57:39 -06:00  
				
					
						
							
							
								 
						
							
								5378046abd 
								
							 
						 
						
							
							
								
								roll back DGEMM kernels to 4x8 when compiling for DYNAMIC_ARCH  
							
							
							
						 
						
							2021-12-06 19:43:54 +01:00  
				
					
						
							
							
								 
						
							
								a1fea1fe2a 
								
							 
						 
						
							
							
								
								sgemm v2x8 SVE kernel  
							
							
							
						 
						
							2021-12-05 18:47:29 +01:00  
				
					
						
							
							
								 
						
							
								abe1ce3434 
								
							 
						 
						
							
							
								
								strmm sve v1x8 kernel  
							
							
							
						 
						
							2021-12-05 14:03:08 +01:00  
				
					
						
							
							
								 
						
							
								54d321d742 
								
							 
						 
						
							
							
								
								Merge pull request  #3466  from rafaelcfsousa/rafael/small_matrix_p10  
							
							... 
							
							
							
							[POWER] Add small matrix for sgemm/dgemm on Power10 
							
						 
						
							2021-12-03 12:12:20 +01:00  
				
					
						
							
							
								 
						
							
								0882db30a2 
								
							 
						 
						
							
							
								
								Merge pull request  #3455  from cenewcombe/develop  
							
							... 
							
							
							
							Fix unsafe read during final iteration of zsymv_L_sse2.S 
							
						 
						
							2021-12-03 10:01:20 +01:00  
				
					
						
							
							
								 
						
							
								0de36f7b5c 
								
							 
						 
						
							
							
								
								trmm sve copy functions for single precision  
							
							
							
						 
						
							2021-11-29 21:25:05 +01:00  
				
					
						
							
							
								 
						
							
								c78fdcc80d 
								
							 
						 
						
							
							
								
								[POWER] Add support for SMALL_MATRIX_OPT  
							
							
							
						 
						
							2021-11-28 12:41:16 -06:00  
				
					
						
							
							
								 
						
							
								86ae89bf33 
								
							 
						 
						
							
							
								
								add sgemm kernel and copy functions for sgemm and ssymm  
							
							
							
						 
						
							2021-11-28 18:12:47 +01:00  
				
					
						
							
							
								 
						
							
								454edd741c 
								
							 
						 
						
							
							
								
								Merge pull request  #3425  from binebrank/arm_sve_dgemm  
							
							... 
							
							
							
							Add dgemm kernel for arm64 SVE 
							
						 
						
							2021-11-26 16:14:55 +01:00  
				
					
						
							
							
								 
						
							
								bcfbdc81b2 
								
							 
						 
						
							
							
								
								Merge pull request  #3459  from rafaelcfsousa/fix_cmake  
							
							... 
							
							
							
							Fix issues when building OpenBLAS with cmake 
							
						 
						
							2021-11-26 15:19:24 +01:00  
				
					
						
							
							
								 
						
							
								1af73ce38e 
								
							 
						 
						
							
							
								
								Adapt CMake for SVE  
							
							
							
						 
						
							2021-11-26 10:35:01 +01:00  
				
					
						
							
							
								 
						
							
								e7fca060db 
								
							 
						 
						
							
							
								
								Merge pull request  #3457  from wjc404/optimize-A53-dgemm  
							
							... 
							
							
							
							MOD: optimize zgemm on cortex-A53/cortex-A55 
							
						 
						
							2021-11-26 10:30:47 +01:00  
				
					
						
							
							
								 
						
							
								5c1cd5e0c2 
								
							 
						 
						
							
							
								
								MOD: add comments to a53 zgemm kernel  
							
							
							
						 
						
							2021-11-25 22:48:48 +08:00  
				
					
						
							
							
								 
						
							
								d5c9353f1b 
								
							 
						 
						
							
							
								
								Modify the order that cmake set the KERNEL variables (generic now is fallback)  
							
							
							
						 
						
							2021-11-24 20:08:35 -06:00  
				
					
						
							
							
								 
						
							
								9f59b19fcd 
								
							 
						 
						
							
							
								
								MOD: optimize zgemm on cortex-A53/cortex-A55  
							
							
							
						 
						
							2021-11-24 21:51:45 +08:00  
				
					
						
							
							
								 
						
							
								531a28b6a0 
								
							 
						 
						
							
							
								
								removed unused code (compiler warnings)  
							
							
							
						 
						
							2021-11-22 10:12:34 +01:00  
				
					
						
							
							
								 
						
							
								9b9cb90bb1 
								
							 
						 
						
							
							
								
								modify Makefile for SVE copy  
							
							
							
						 
						
							2021-11-22 09:54:20 +01:00  
				
					
						
							
							
								 
						
							
								9388f05a3c 
								
							 
						 
						
							
							
								
								configure SVE Makefile  
							
							
							
						 
						
							2021-11-21 18:33:43 +01:00  
				
					
						
							
							
								 
						
							
								b58d4f31ab 
								
							 
						 
						
							
							
								
								some clean-up & commentary  
							
							
							
						 
						
							2021-11-21 14:56:27 +01:00  
				
					
						
							
							
								 
						
							
								b7df500106 
								
							 
						 
						
							
							
								
								Add generic mips32 target  
							
							
							
						 
						
							2021-11-20 17:31:51 +01:00  
				
					
						
							
							
								 
						
							
								e6ed4be02e 
								
							 
						 
						
							
							
								
								symm SVE copy routines  
							
							
							
						 
						
							2021-11-20 16:35:29 +01:00  
				
					
						
							
							
								 
						
							
								feeb8283a5 
								
							 
						 
						
							
							
								
								Fix unsafe read during final iteration of zsymv_L_sse2.S  
							
							
							
						 
						
							2021-11-19 14:29:32 -06:00  
				
					
						
							
							
								 
						
							
								302f22693a 
								
							 
						 
						
							
							
								
								MOD: optimize normal DGEMM on ARMV8 cortex-A53 & cortex-A55  
							
							
							
						 
						
							2021-11-18 21:14:43 +08:00  
				
					
						
							
							
								 
						
							
								3c7eed0e53 
								
							 
						 
						
							
							
								
								add remaining trmm copy routines for SVE  
							
							
							
						 
						
							2021-11-14 16:00:10 +01:00  
				
					
						
							
							
								 
						
							
								7d996b1c36 
								
							 
						 
						
							
							
								
								dtrmm_utcopy sve function  
							
							
							
						 
						
							2021-11-13 18:48:53 +01:00  
				
					
						
							
							
								 
						
							
								ab7917910d 
								
							 
						 
						
							
							
								
								add v2x8 kernel + fix sve dtrmm  
							
							
							
						 
						
							2021-11-07 20:37:51 +01:00  
				
					
						
							
							
								 
						
							
								7093372e32 
								
							 
						 
						
							
							
								
								add ARMV8SVE target  
							
							
							
						 
						
							2021-11-01 22:53:21 +01:00  
				
					
						
							
							
								 
						
							
								a8fbdbac34 
								
							 
						 
						
							
							
								
								fix sve dgemm kernel + sve dtrmm  
							
							
							
						 
						
							2021-10-31 10:24:25 +01:00  
				
					
						
							
							
								 
						
							
								746b4f0f17 
								
							 
						 
						
							
							
								
								added SVE ncopy and tcopy  
							
							
							
						 
						
							2021-10-30 12:11:44 +02:00  
				
					
						
							
							
								 
						
							
								1a10d3e09d 
								
							 
						 
						
							
							
								
								add sve dgemm prototype  
							
							
							
						 
						
							2021-10-27 16:37:18 +02:00  
				
					
						
							
							
								 
						
							
								22bf5c27ba 
								
							 
						 
						
							
							
								
								Add basic support for the Fujitsu A64FX ( #3415 )  
							
							... 
							
							
							
							* Add initial support for Fujitsu A64FX as generic ARMV8 
							
						 
						
							2021-10-18 15:00:19 +02:00  
				
					
						
							
							
								 
						
							
								63a103ba6e 
								
							 
						 
						
							
							
								
								sbgemm: spr: disable small matrix path by default  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								82194ea9d2 
								
							 
						 
						
							
							
								
								sbgemm: spr: implement otcopy_16  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								8632380a96 
								
							 
						 
						
							
							
								
								sbgemm: spr: reuse ncopy_16 from cooperlake as incopy  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								6bc8204ce5 
								
							 
						 
						
							
							
								
								sbgemm: spr: optimization for tmp_c buffer  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								f018aa342a 
								
							 
						 
						
							
							
								
								sbgemm: spr: kernel handle alpha != 1.0  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								a52456b168 
								
							 
						 
						
							
							
								
								sbgemm: spr: oncopy: use tile load/store instead  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								f2485352a6 
								
							 
						 
						
							
							
								
								sbgemm: spr: only load A once in tail_k handling  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								9ab33228bb 
								
							 
						 
						
							
							
								
								sbgemm: spr: process k2 and odd k at the same time  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								10d52646e2 
								
							 
						 
						
							
							
								
								sbgemm: spr: oncopy: avoid handling too much pointer at a time  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								88154ed02d 
								
							 
						 
						
							
							
								
								sbgemm: spr: reduce tile conf loading by separate tail k handling  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								a70bfb52d5 
								
							 
						 
						
							
							
								
								sbgemm: spr: kernel works for NN case when alpha is 1.0  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								6051c86741 
								
							 
						 
						
							
							
								
								sbgemm: spr: kernel works for m32 in NN case  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								d0b253ac6e 
								
							 
						 
						
							
							
								
								sbgemm: spr: implement oncopy_16  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								1d48b7cb16 
								
							 
						 
						
							
							
								
								sbgemm: spr: add dummy source files  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								3dc6052c7e 
								
							 
						 
						
							
							
								
								initial support for Sapphire Rapids platform  
							
							
							
						 
						
							2021-10-12 01:30:40 -07:00  
				
					
						
							
							
								 
						
							
								8c20ca345a 
								
							 
						 
						
							
							
								
								Use Neoverse's current mix of ThunderX2 kernels for Vortex as well  
							
							
							
						 
						
							2021-10-06 11:06:43 +02:00  
				
					
						
							
							
								 
						
							
								8e4c209002 
								
							 
						 
						
							
							
								
								Merge pull request  #3398  from kavanabhat/aix_p10_gnuas  
							
							... 
							
							
							
							Big Endian Changes for Power10 kernels 
							
						 
						
							2021-10-05 18:59:47 +02:00  
				
					
						
							
							
								 
						
							
								9cc95e5657 
								
							 
						 
						
							
							
								
								AIX changes for P10 with GNU Compiler  
							
							
							
						 
						
							2021-10-01 05:18:35 -05:00  
				
					
						
							
							
								 
						
							
								fe3c778c51 
								
							 
						 
						
							
							
								
								AIX changes for P10 with GNU Compiler  
							
							
							
						 
						
							2021-09-30 06:06:27 -05:00  
				
					
						
							
							
								 
						
							
								ee5ca8a328 
								
							 
						 
						
							
							
								
								x86_64: BFLOAT16: fix build warning  
							
							
							
						 
						
							2021-09-28 18:30:06 +08:00  
				
					
						
							
							
								 
						
							
								90cc944625 
								
							 
						 
						
							
							
								
								Move alphaI to x22 to leave x18 unused (reserved on OSX)  
							
							
							
						 
						
							2021-09-17 09:53:18 +02:00  
				
					
						
							
							
								 
						
							
								590fbff06e 
								
							 
						 
						
							
							
								
								move alpha to x19/x20 to leave x18 unused for OSX  
							
							
							
						 
						
							2021-09-17 09:42:17 +02:00  
				
					
						
							
							
								 
						
							
								380940271b 
								
							 
						 
						
							
							
								
								Move temp to x21 to leave x18 unused (reserved on OSX)  
							
							
							
						 
						
							2021-09-17 09:28:19 +02:00  
				
					
						
							
							
								 
						
							
								7d75177446 
								
							 
						 
						
							
							
								
								Move temp to x21 to leave x18 unused (reserved on OSX)  
							
							
							
						 
						
							2021-09-17 09:24:11 +02:00  
				
					
						
							
							
								 
						
							
								0a4ac4b585 
								
							 
						 
						
							
							
								
								Use x21 for I to leave x18 unused (reserved on OSX)  
							
							
							
						 
						
							2021-09-17 09:19:51 +02:00  
				
					
						
							
							
								 
						
							
								7d4a221579 
								
							 
						 
						
							
							
								
								Remove unused TEMP2 and reshuffle to leave x18 unused (reserved on OSX)  
							
							
							
						 
						
							2021-09-17 09:18:25 +02:00  
				
					
						
							
							
								 
						
							
								d3a9c7ef7f 
								
							 
						 
						
							
							
								
								Merge pull request  #3382  from rafaelcfsousa/rafael/cwarnings  
							
							... 
							
							
							
							[POWER] Remove unused variable warnings. 
							
						 
						
							2021-09-17 09:15:16 +02:00  
				
					
						
							
							
								 
						
							
								8dfa61a61c 
								
							 
						 
						
							
							
								
								Initialize abs_mask1 with itself to silence a gcc warning  
							
							
							
						 
						
							2021-09-15 22:11:35 +02:00  
				
					
						
							
							
								 
						
							
								99aa10b3ff 
								
							 
						 
						
							
							
								
								Initialize abs_mask1 with itself to silence a gcc warning  
							
							... 
							
							
							
							actual initialization is via the _mm_cmpeq_epi8, which I've seen claimed to be the fastest way to set an xmm register to all 1s 
							
						 
						
							2021-09-15 22:10:43 +02:00  
				
					
						
							
							
								 
						
							
								b751edf624 
								
							 
						 
						
							
							
								
								Fix unused variable warnings on Power  
							
							
							
						 
						
							2021-09-15 13:36:07 -05:00  
				
					
						
							
							
								 
						
							
								80346b8813 
								
							 
						 
						
							
							
								
								Merge pull request  #3379  from martin-frbg/issue3369-2  
							
							... 
							
							
							
							Add casts to fix compiler warnings for SkylakeX sasum/dasum 
							
						 
						
							2021-09-15 07:18:57 +02:00  
				
					
						
							
							
								 
						
							
								ce036a2fc0 
								
							 
						 
						
							
							
								
								Add casts  
							
							
							
						 
						
							2021-09-14 21:41:53 +02:00  
				
					
						
							
							
								 
						
							
								ddf106f769 
								
							 
						 
						
							
							
								
								Add dedicated entries for BFLOAT16 kernels  
							
							
							
						 
						
							2021-09-14 16:17:18 +02:00  
				
					
						
							
							
								 
						
							
								af8843875a 
								
							 
						 
						
							
							
								
								Merge pull request  #3376  from martin-frbg/issue3370  
							
							... 
							
							
							
							Fix a few harmless compiler warnings 
							
						 
						
							2021-09-12 00:01:31 +02:00  
				
					
						
							
							
								 
						
							
								0925dfe2c9 
								
							 
						 
						
							
							
								
								One instance of kernel_4x1 is used even on SKX  
							
							
							
						 
						
							2021-09-11 15:30:19 +02:00  
				
					
						
							
							
								 
						
							
								7d873a329f 
								
							 
						 
						
							
							
								
								Add ifdefs around conditionally used functions  
							
							
							
						 
						
							2021-09-11 14:38:47 +02:00  
				
					
						
							
							
								 
						
							
								ef24712030 
								
							 
						 
						
							
							
								
								Move a conditionally used variable  
							
							
							
						 
						
							2021-09-11 14:37:44 +02:00  
				
					
						
							
							
								 
						
							
								d17238599b 
								
							 
						 
						
							
							
								
								Add casts  
							
							
							
						 
						
							2021-09-11 13:38:28 +02:00  
				
					
						
							
							
								 
						
							
								59a1114d03 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: tuning for small matrix  
							
							
							
						 
						
							2021-09-07 21:30:46 +08:00  
				
					
						
							
							
								 
						
							
								682d66555d 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: implement ncopy_16  
							
							
							
						 
						
							2021-09-07 21:30:46 +08:00  
				
					
						
							
							
								 
						
							
								beccb83b16 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: add n24 kernel for tcopy_4  
							
							
							
						 
						
							2021-09-07 21:30:46 +08:00  
				
					
						
							
							
								 
						
							
								5fcacad32b 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: implement tcopy_4  
							
							
							
						 
						
							2021-09-07 21:30:46 +08:00  
				
					
						
							
							
								 
						
							
								bb1c4fa5bd 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: prefetch A & B  
							
							
							
						 
						
							2021-09-07 21:30:46 +08:00  
				
					
						
							
							
								 
						
							
								7a2d1601ec 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: unroll core loop by 2  
							
							
							
						 
						
							2021-09-07 21:30:46 +08:00  
				
					
						
							
							
								 
						
							
								45fdf951b6 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: reorder ptr increase for performance  
							
							
							
						 
						
							2021-09-07 21:30:46 +08:00  
				
					
						
							
							
								 
						
							
								cece3541ab 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: fix bug in m64n12  
							
							
							
						 
						
							2021-09-07 21:30:46 +08:00  
				
					
						
							
							
								 
						
							
								9df0953cde 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: kernel works for NN  
							
							
							
						 
						
							2021-09-07 21:30:45 +08:00  
				
					
						
							
							
								 
						
							
								2ec9f3a8aa 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: change kernel size to 16x4  
							
							
							
						 
						
							2021-09-07 21:30:45 +08:00  
				
					
						
							
							
								 
						
							
								ef8f5fecc8 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: implement sbgemm_tcopy_32  
							
							
							
						 
						
							2021-09-07 21:30:45 +08:00  
				
					
						
							
							
								 
						
							
								4c294336e6 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: add dummy source files  
							
							
							
						 
						
							2021-09-07 21:30:45 +08:00  
				
					
						
							
							
								 
						
							
								f1e3305974 
								
							 
						 
						
							
							
								
								Add workaround for Windows10 macro name clash  
							
							
							
						 
						
							2021-09-01 21:36:50 +02:00  
				
					
						
							
							
								 
						
							
								619588fbab 
								
							 
						 
						
							
							
								
								sbgemm: remove unnecessary b0 files  
							
							
							
						 
						
							2021-08-30 17:55:01 +08:00  
				
					
						
							
							
								 
						
							
								f39301935c 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: make sure hot buffer aligned to 64  
							
							
							
						 
						
							2021-08-30 17:40:30 +08:00  
				
					
						
							
							
								 
						
							
								7d27b182fc 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: enable SBGEMM by small matrix path  
							
							
							
						 
						
							2021-08-30 17:40:30 +08:00  
				
					
						
							
							
								 
						
							
								1d83ca4bca 
								
							 
						 
						
							
							
								
								Small Matrix: support BFLOAT16 data type  
							
							
							
						 
						
							2021-08-30 17:40:20 +08:00  
				
					
						
							
							
								 
						
							
								bec9d9f63d 
								
							 
						 
						
							
							
								
								Merge pull request  #3335  from guowangy/small-matrix-latest  
							
							... 
							
							
							
							Add GEMM optimization for small matrix and single/double kernel for skylakex 
							
						 
						
							2021-08-29 22:33:33 +02:00  
				
					
						
							
							
								 
						
							
								dbbb39199f 
								
							 
						 
						
							
							
								
								sgemv: skylakex: fix build warning  
							
							
							
						 
						
							2021-08-25 07:13:00 +00:00  
				
					
						
							
							
								 
						
							
								e9acb46431 
								
							 
						 
						
							
							
								
								sgemv: skylakex: bug fix for sgemv_t kernel in corner case  
							
							
							
						 
						
							2021-08-25 07:07:27 +00:00  
				
					
						
							
							
								 
						
							
								f9dba63c28 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: remove unnecessary b0 source files  
							
							
							
						 
						
							2021-08-13 03:28:44 +00:00  
				
					
						
							
							
								 
						
							
								989e6bbdd3 
								
							 
						 
						
							
							
								
								Small Matrix: reduce generic kernel source files  
							
							
							
						 
						
							2021-08-13 03:17:38 +00:00  
				
					
						
							
							
								 
						
							
								04255be948 
								
							 
						 
						
							
							
								
								Merge pull request  #3344  from gxw-loongson/develop  
							
							... 
							
							
							
							Delete the macro instruction "li" and use "li.d" instead 
							
						 
						
							2021-08-12 15:16:46 +02:00  
				
					
						
							
							
								 
						
							
								a7bc8ec1f1 
								
							 
						 
						
							
							
								
								Delete the macro instruction "li" and use "li.d" instead  
							
							... 
							
							
							
							Change-Id: Icff7981e2eb7df29ba5af1f8eb5be8443c67450f 
							
						 
						
							2021-08-12 17:02:54 +08:00  
				
					
						
							
							
								 
						
							
								b06880c2cd 
								
							 
						 
						
							
							
								
								POWER10: Improving dasum performance  
							
							... 
							
							
							
							Unrolling a loop in dasum micro code to help in improving
POWER10 performance. 
							
						 
						
							2021-08-10 22:06:04 -05:00  
				
					
						
							
							
								 
						
							
								44d0032f3b 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: fix build error in old compiler  
							
							
							
						 
						
							2021-08-05 04:43:47 +00:00  
				
					
						
							
							
								 
						
							
								5d86becdae 
								
							 
						 
						
							
							
								
								Add all SBGEMM kernels for IA AVX512-BF16 based platforms  
							
							... 
							
							
							
							Added all SBGEMM kernels including NN/NT/TN/TT for both ColMajor and
RowMajor, based on AVX512-BF16 ISA set on IA.
Signed-off-by: Chen, Guobing <guobing.chen@intel.com> 
							
						 
						
							2021-08-05 11:11:29 +08:00  
				
					
						
							
							
								 
						
							
								fee5abd84b 
								
							 
						 
						
							
							
								
								Small Matrix: support cmake build  
							
							
							
						 
						
							2021-08-04 08:50:15 +00:00  
				
					
						
							
							
								 
						
							
								478d1086c1 
								
							 
						 
						
							
							
								
								Small Matrix: support DYNAMIC_ARCH build  
							
							
							
						 
						
							2021-08-04 03:12:41 +00:00  
				
					
						
							
							
								 
						
							
								6b58bca18b 
								
							 
						 
						
							
							
								
								Small Matrix: disable low performance default kernel  
							
							
							
						 
						
							2021-08-03 06:49:03 +00:00  
				
					
						
							
							
								 
						
							
								fa777f5517 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: add DGEMM_SMALL_M_PERMIT and tune for TN kernel  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								8592c21af4 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: dgemm nn: fix typo in idx load  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								3e79f6d89a 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: add dgemm tn kernel  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								323d7da4f7 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: add dgemm tt kernel  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								f57fc932ac 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: add dgemm nt kernel  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								91ec21202b 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: add dgemm nn kernel  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								72e070539c 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: add sgemm tt kernel  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								02c6e764f2 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: add SGEMM_SMALL_M_PERMIT and tune for TN kernel  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								5dc7c3c8e5 
								
							 
						 
						
							
							
								
								Small Matrix: add GEMM_SMALL_MATRIX_PERMIT to tune small matrix case  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								642c393879 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: add sgemm tn kernel  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								ae3f5c737c 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: sgemm nt: optimize for M < 12  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								0d72d75bf9 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: add sgemm nt kernel  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								ca7682e3a3 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: sgemm nn: fix n6 conflicts with n4  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								9967e61abb 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: sgemm nn: fix error when beta not zero  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								a87736346f 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: sgemm nn: add n6 to improve performance  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								4c9d9940fd 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: sgemm nn: reduce store 4 N at a time  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								13b32f69b7 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: sgemm nn: reduce store 4 M at a time  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								3d8c6d9607 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: sgemm nn: clean up unused code  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								49b61a3f30 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: sgemm_nn: optimize for M <= 8  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								f88470323b 
								
							 
						 
						
							
							
								
								Optimize M < 16 using AVX512 mask  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								9186456a12 
								
							 
						 
						
							
							
								
								small matrix: SkylakeX: add SGEMM NN kernel  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								6022e5629c 
								
							 
						 
						
							
							
								
								Refs  #2587  fix small matrix c/zgemm bug.  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								57ed58cefe 
								
							 
						 
						
							
							
								
								Refs  #2587  Add small matrix optimization reference kernel for c/zgemm.  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								17d32a4a82 
								
							 
						 
						
							
							
								
								Change a1b0 gemm to b0 gemm.  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								59cb5de46b 
								
							 
						 
						
							
							
								
								Refs  #2587  Fix typos.  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								be3349405d 
								
							 
						 
						
							
							
								
								Add alpha=1.0 beta=0.0 for small gemm.  
							
							
							
						 
						
							2021-08-02 07:01:47 +00:00  
				
					
						
							
							
								 
						
							
								0a2077901c 
								
							 
						 
						
							
							
								
								Add small matrix optimization kernel interface.  
							
							... 
							
							
							
							make SMALL_MATRIX_OPT=1 
							
						 
						
							2021-08-02 07:01:47 +00:00  
				
					
						
							
							
								 
						
							
								0b8f7c8c10 
								
							 
						 
						
							
							
								
								Add cmake support for LOONGARCH64  
							
							
							
						 
						
							2021-08-02 10:00:41 +08:00  
				
					
						
							
							
								 
						
							
								af0a69f355 
								
							 
						 
						
							
							
								
								Add support for LOONGARCH64  
							
							
							
						 
						
							2021-07-27 15:29:12 +08:00  
				
					
						
							
							
								 
						
							
								49bbf330ca 
								
							 
						 
						
							
							
								
								Empirical workaround for numpy SVD NaN problem from issue 3318  
							
							
							
						 
						
							2021-07-18 22:19:19 +02:00  
				
					
						
							
							
								 
						
							
								5b4b385ecf 
								
							 
						 
						
							
							
								
								Temporarily disable the SkylakeX sgemv_t microkernel due to LAPACK testsuite failures  
							
							
							
						 
						
							2021-07-14 20:50:14 +02:00  
				
					
						
							
							
								 
						
							
								39ef0880ae 
								
							 
						 
						
							
							
								
								copy conf  
							
							
							
						 
						
							2021-06-19 21:49:58 +02:00  
				
					
						
							
							
								 
						
							
								c4b464cac6 
								
							 
						 
						
							
							
								
								Merge pull request  #3273  from austinpagan/sbgemm_gcc10_fix  
							
							... 
							
							
							
							Power10: Fix for SBGEMM 
							
						 
						
							2021-06-15 22:58:48 +02:00  
				
					
						
							
							
								 
						
							
								e6dd44d989 
								
							 
						 
						
							
							
								
								Power10: Fix for SBGEMM  
							
							... 
							
							
							
							While testing bfloat16 sbgemm kernel, there are some failures for odd value inputs due to updating result for
additional bytes. 
							
						 
						
							2021-06-15 13:07:47 -05:00  
				
					
						
							
							
								 
						
							
								9d292d37b2 
								
							 
						 
						
							
							
								
								arm64: add the missing d9 register to the clobber list  
							
							... 
							
							
							
							Refs. numpy/numpy#18422 
Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp> 
							
						 
						
							2021-06-14 17:01:28 +09:00  
				
					
						
							
							
								 
						
							
								2e8ff4a781 
								
							 
						 
						
							
							
								
								Merge pull request  #3266  from martin-frbg/powerparam  
							
							... 
							
							
							
							Remove spurious casts from PPC parameters and fix compilation for older targets 
							
						 
						
							2021-06-10 18:05:47 +02:00  
				
					
						
							
							
								 
						
							
								dbba381dc3 
								
							 
						 
						
							
							
								
								Merge pull request  #3260  from intelmy/sgemv_t_opt  
							
							... 
							
							
							
							Optimized sgemv_t for small N based on AVX512 
							
						 
						
							2021-06-10 16:08:24 +02:00  
				
					
						
							
							
								 
						
							
								efdbdd8f82 
								
							 
						 
						
							
							
								
								Add prefetch values for power3  
							
							
							
						 
						
							2021-06-10 11:20:29 +02:00  
				
					
						
							
							
								 
						
							
								3906ef3b0f 
								
							 
						 
						
							
							
								
								Add prefetch values for power3  
							
							
							
						 
						
							2021-06-10 11:19:40 +02:00  
				
					
						
							
							
								 
						
							
								8adf0971d8 
								
							 
						 
						
							
							
								
								Add prefetch values for power3  
							
							
							
						 
						
							2021-06-10 11:18:22 +02:00  
				
					
						
							
							
								 
						
							
								08e2e60762 
								
							 
						 
						
							
							
								
								Add prefetch values for power3  
							
							
							
						 
						
							2021-06-10 11:17:33 +02:00  
				
					
						
							
							
								 
						
							
								fb9e678235 
								
							 
						 
						
							
							
								
								Fix caxpy/zaxpy for big-endian  
							
							
							
						 
						
							2021-06-10 11:15:48 +02:00  
				
					
						
							
							
								 
						
							
								dc4fcb48df 
								
							 
						 
						
							
							
								
								Fix inverted conditional for caxpy/zaxpy  
							
							
							
						 
						
							2021-06-10 11:14:03 +02:00  
				
					
						
							
							
								 
						
							
								7a48247761 
								
							 
						 
						
							
							
								
								fix c/zrot and sgemv for POWER5  
							
							
							
						 
						
							2021-06-10 11:11:56 +02:00  
				
					
						
							
							
								 
						
							
								cbb70438df 
								
							 
						 
						
							
							
								
								POWER10: Fixes for sbgemm kernel  
							
							... 
							
							
							
							While testing bfloat16 sbgemm kernel, there are some failures
for odd value inputs due to array access beyond the boundary. 
							
						 
						
							2021-06-09 12:20:09 -05:00  
				
					
						
							
							
								 
						
							
								706a08d4a0 
								
							 
						 
						
							
							
								
								Optimized sgemv_t for small N based on AVX512  
							
							
							
						 
						
							2021-06-08 15:08:28 -04:00  
				
					
						
							
							
								 
						
							
								590be3fae3 
								
							 
						 
						
							
							
								
								riscv64: Add Makefile  
							
							
							
						 
						
							2021-06-07 22:55:56 +00:00  
				
					
						
							
							
								 
						
							
								3521cd48cb 
								
							 
						 
						
							
							
								
								RISCV64_GENERIC: Use generic kernel for DSDOT for better precision  
							
							... 
							
							
							
							The implementation in `riscv64/dot.c` fails the `test_dsdot` test, and
the generic kernel seems to have better precision. Tested on SiFive
FU740 (HiFive Unmatched) and QEMU.
Also see #1469 . 
							
						 
						
							2021-06-07 22:50:23 +00:00  
				
					
						
							
							
								 
						
							
								1e0192a5cc 
								
							 
						 
						
							
							
								
								riscv64/imin: Fix wrong comparison  
							
							... 
							
							
							
							Same as #1990 . 
							
						 
						
							2021-06-07 22:49:39 +00:00  
				
					
						
							
							
								 
						
							
								5f677e782e 
								
							 
						 
						
							
							
								
								Merge pull request  #3196  from guowangy/skylakex-gemm-batch-k  
							
							... 
							
							
							
							GEMM: skylake: improve the performance when m is small 
							
						 
						
							2021-05-22 19:25:28 +02:00  
				
					
						
							
							
								 
						
							
								02087a62e7 
								
							 
						 
						
							
							
								
								Merge pull request  #3205  from intelmy/sgemv_n_opt  
							
							... 
							
							
							
							optimize on sgemv_n for small n 
							
						 
						
							2021-05-17 17:49:01 +02:00  
				
					
						
							
							
								 
						
							
								4ecf631f95 
								
							 
						 
						
							
							
								
								Merge pull request  #3228  from martin-frbg/issue3226  
							
							... 
							
							
							
							filter out -mavx flag on Sandybridge zgemm/ztrmm kernels 
							
						 
						
							2021-05-15 09:06:12 +02:00  
				
					
						
							
							
								 
						
							
								310b76aad7 
								
							 
						 
						
							
							
								
								Merge pull request  #3231  from martin-frbg/issue3227  
							
							... 
							
							
							
							Support compilation with pre-C99 versions of MSVC 
							
						 
						
							2021-05-14 23:28:06 +02:00  
				
					
						
							
							
								 
						
							
								c4da892ba0 
								
							 
						 
						
							
							
								
								Only filter out -mavx on Sandybridge ZGEMM/ZTRMM kernels  
							
							
							
						 
						
							2021-05-14 23:19:10 +02:00  
				
					
						
							
							
								 
						
							
								8b90e5f202 
								
							 
						 
						
							
							
								
								Drop redundant inclusion of complex.h  
							
							
							
						 
						
							2021-05-14 15:06:44 +02:00  
				
					
						
							
							
								 
						
							
								bd60fb6ffc 
								
							 
						 
						
							
							
								
								filter out -mavx flag on zgemm kernels as it can cause problems with older gcc  
							
							
							
						 
						
							2021-05-13 23:05:00 +02:00  
				
					
						
							
							
								 
						
							
								37ea8702ee 
								
							 
						 
						
							
							
								
								Merge pull request  #3192  from damonyu1989/develop  
							
							... 
							
							
							
							Update the intrinsic api to the official name. 
							
						 
						
							2021-05-11 16:00:45 +02:00  
				
					
						
							
							
								 
						
							
								c0ca63ea46 
								
							 
						 
						
							
							
								
								Fix missing conditionals for non-SKX kernels  
							
							
							
						 
						
							2021-05-05 14:55:36 +02:00  
				
					
						
							
							
								 
						
							
								3d4ccd2a13 
								
							 
						 
						
							
							
								
								fix for build error  
							
							
							
						 
						
							2021-04-30 12:25:33 -04:00  
				
					
						
							
							
								 
						
							
								c59652f0ce 
								
							 
						 
						
							
							
								
								optimize on sgemv_n for small n  
							
							
							
						 
						
							2021-04-30 12:14:58 -04:00  
				
					
						
							
							
								 
						
							
								aa7b3dc3db 
								
							 
						 
						
							
							
								
								GEMM: skylake: improve the performance when m is small  
							
							
							
						 
						
							2021-04-28 13:56:06 +00:00  
				
					
						
							
							
								 
						
							
								ceb44bef14 
								
							 
						 
						
							
							
								
								update the intrinsic api to the official name.  
							
							
							
						 
						
							2021-04-27 11:12:29 +08:00  
				
					
						
							
							
								 
						
							
								3d511f0e66 
								
							 
						 
						
							
							
								
								replace spurious avx512 requirement with fma check  
							
							
							
						 
						
							2021-04-26 21:55:30 +02:00  
				
					
						
							
							
								 
						
							
								2379abaa5e 
								
							 
						 
						
							
							
								
								POWER10: Improve dgemm performance  
							
							... 
							
							
							
							This patch uses vector pair pointer for input load operation
which helps to generate power10 lxvp instructions. 
							
						 
						
							2021-04-13 22:30:06 -05:00  
				
					
						
							
							
								 
						
							
								55bb9f639a 
								
							 
						 
						
							
							
								
								POWER10: Optimized zgemv  
							
							... 
							
							
							
							This patch makes use of Matrix-Multiply Assist (MMA)
feature introduced in POWER ISA v3.1 for zgemv_n and zgemv_t. 
							
						 
						
							2021-04-10 19:00:24 -05:00  
				
					
						
							
							
								 
						
							
								2dfb24730d 
								
							 
						 
						
							
							
								
								Use "old" compute(24) function with clang due to register limitations  
							
							
							
						 
						
							2021-04-06 19:58:32 +02:00  
				
					
						
							
							
								 
						
							
								147e0a75fd 
								
							 
						 
						
							
							
								
								Merge pull request  #3170  from CodesWithWolves/sgemm_tcopy_16-invalid-read  
							
							... 
							
							
							
							Remove Unnecessary/Erroneous Adds/Reads In sgemm_tcopy_16.S COPY1x8 Macro 
							
						 
						
							2021-04-03 19:49:47 +02:00  
				
					
						
							
							
								 
						
							
								2dbcddd83d 
								
							 
						 
						
							
							
								
								POWER10:  Adding check for little endian  
							
							... 
							
							
							
							This patch makes sure that recent POWER10 patches are used
only for little endian. 
							
						 
						
							2021-03-31 21:32:42 -05:00  
				
					
						
							
							
								 
						
							
								d2bda3b56a 
								
							 
						 
						
							
							
								
								Remove Unnecessary/Erroneous Reads In sgemm_tcopy_16.S COPY1x8 Macro  
							
							... 
							
							
							
							There appears to have been some code leak when copying from the COPY2x8
macro above where we're reading 8 bytes into d4-d7 directly after
reading 4 bytes into s4-s7. These 32 bytes in d4-7 are unused and can
possibly overrun the boundary of allocated memory -- Valgrind detected
this which is what dragged my attention to it for a 128,1 copy.
Additionally, there is no need to update the addresses stored in A0-A7
as the only possible paths after running this macro will overwrite A0-7
if looping to the next 8 rows, or overwrite A0-3 if moving to 4 rows --
in which case A4-7 are unused. 
							
						 
						
							2021-03-31 15:44:25 -04:00  
				
					
						
							
							
								 
						
							
								bdd6e3a153 
								
							 
						 
						
							
							
								
								Merge pull request  #3157  from martin-frbg/issue3020-final  
							
							... 
							
							
							
							Add workaround for LAPACK testsuite failures with the NVIDIA HPC compiler on PPC 
							
						 
						
							2021-03-19 15:23:12 +01:00  
				
					
						
							
							
								 
						
							
								7b8f580941 
								
							 
						 
						
							
							
								
								Merge pull request  #3156  from martin-frbg/omatcopy_d  
							
							... 
							
							
							
							Move x86_64 DOMATCOPY_RT back to the C implementation 
							
						 
						
							2021-03-19 15:22:48 +01:00  
				
					
						
							
							
								 
						
							
								86c5a0013f 
								
							 
						 
						
							
							
								
								Add workaround for LAPACK testsuite failures with the NVIDIA HPC compiler  
							
							
							
						 
						
							2021-03-19 11:47:58 +01:00  
				
					
						
							
							
								 
						
							
								ef85c22474 
								
							 
						 
						
							
							
								
								Add workaround for LAPACK test failures with the NVIDIA HPC compiler  
							
							
							
						 
						
							2021-03-19 11:46:25 +01:00  
				
					
						
							
							
								 
						
							
								d3555d2e50 
								
							 
						 
						
							
							
								
								Add workaround for LAPACK test failures with the NVIDIA HPC compiler  
							
							
							
						 
						
							2021-03-19 11:44:31 +01:00  
				
					
						
							
							
								 
						
							
								0f5e86a0d9 
								
							 
						 
						
							
							
								
								Remove premature entry for DOMATCOPY_RT  
							
							
							
						 
						
							2021-03-18 21:53:50 +01:00  
				
					
						
							
							
								 
						
							
								7b294a99fd 
								
							 
						 
						
							
							
								
								Move common.h back to the top of the file so that SKYLAKEX (from config.h) is defined in time  
							
							
							
						 
						
							2021-03-18 21:28:19 +01:00  
				
					
						
							
							
								 
						
							
								0934568d9c 
								
							 
						 
						
							
							
								
								Move includes under the ifdef for compilers w/o intrinsics support  
							
							
							
						 
						
							2021-03-12 12:42:05 +01:00  
				
					
						
							
							
								 
						
							
								09d47af2c0 
								
							 
						 
						
							
							
								
								Optimize zscal function for POWER10  
							
							... 
							
							
							
							This patch makes use of new POWER10 vector pair instructions for
loads and stores. 
							
						 
						
							2021-03-10 17:15:33 -06:00  
				
					
						
							
							
								 
						
							
								ef0238ba2b 
								
							 
						 
						
							
							
								
								Merge pull request  #3130  from martin-frbg/issue3128  
							
							... 
							
							
							
							Replace spurious AVX512 requirement in the Haswell srot microkernel with an AVX2/FMA3 guard 
							
						 
						
							2021-03-06 19:15:53 +01:00  
				
					
						
							
							
								 
						
							
								a9f6f7ad39 
								
							 
						 
						
							
							
								
								Remove spurious AVX512 requirement and add AVX2/FMA3 guard  
							
							
							
						 
						
							2021-03-06 14:35:49 +01:00  
				
					
						
							
							
								 
						
							
								41646ed006 
								
							 
						 
						
							
							
								
								Optimize s/dasum function for POWER10  
							
							... 
							
							
							
							This patch makes use of new POWER10 vector pair instructions for
loads and stores. 
							
						 
						
							2021-03-05 16:22:36 -06:00