22bf5c27ba 
								
							 
						 
						
							
							
								
								Add basic support for the Fujitsu A64FX ( #3415 )  
							
							... 
							
							
							
							* Add initial support for Fujitsu A64FX as generic ARMV8 
							
						 
						
							2021-10-18 15:00:19 +02:00  
				
					
						
							
							
								 
						
							
								63a103ba6e 
								
							 
						 
						
							
							
								
								sbgemm: spr: disable small matrix path by default  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								82194ea9d2 
								
							 
						 
						
							
							
								
								sbgemm: spr: implement otcopy_16  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								8632380a96 
								
							 
						 
						
							
							
								
								sbgemm: spr: reuse ncopy_16 from cooperlake as incopy  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								6bc8204ce5 
								
							 
						 
						
							
							
								
								sbgemm: spr: optimization for tmp_c buffer  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								f018aa342a 
								
							 
						 
						
							
							
								
								sbgemm: spr: kernel handle alpha != 1.0  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								a52456b168 
								
							 
						 
						
							
							
								
								sbgemm: spr: oncopy: use tile load/store instead  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								f2485352a6 
								
							 
						 
						
							
							
								
								sbgemm: spr: only load A once in tail_k handling  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								9ab33228bb 
								
							 
						 
						
							
							
								
								sbgemm: spr: process k2 and odd k at the same time  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								10d52646e2 
								
							 
						 
						
							
							
								
								sbgemm: spr: oncopy: avoid handling too many pointers at a time  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								88154ed02d 
								
							 
						 
						
							
							
								
								sbgemm: spr: reduce tile conf loading by separate tail k handling  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								a70bfb52d5 
								
							 
						 
						
							
							
								
								sbgemm: spr: kernel works for NN case when alpha is 1.0  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								6051c86741 
								
							 
						 
						
							
							
								
								sbgemm: spr: kernel works for m32 in NN case  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								d0b253ac6e 
								
							 
						 
						
							
							
								
								sbgemm: spr: implement oncopy_16  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								1d48b7cb16 
								
							 
						 
						
							
							
								
								sbgemm: spr: add dummy source files  
							
							
							
						 
						
							2021-10-17 19:08:03 -07:00  
				
					
						
							
							
								 
						
							
								3dc6052c7e 
								
							 
						 
						
							
							
								
								initial support for Sapphire Rapids platform  
							
							
							
						 
						
							2021-10-12 01:30:40 -07:00  
				
					
						
							
							
								 
						
							
								8c20ca345a 
								
							 
						 
						
							
							
								
								Use Neoverse's current mix of ThunderX2 kernels for Vortex as well  
							
							
							
						 
						
							2021-10-06 11:06:43 +02:00  
				
					
						
							
							
								 
						
							
								8e4c209002 
								
							 
						 
						
							
							
								
								Merge pull request  #3398  from kavanabhat/aix_p10_gnuas  
							
							... 
							
							
							
							Big Endian Changes for Power10 kernels 
							
						 
						
							2021-10-05 18:59:47 +02:00  
				
					
						
							
							
								 
						
							
								9cc95e5657 
								
							 
						 
						
							
							
								
								AIX changes for P10 with GNU Compiler  
							
							
							
						 
						
							2021-10-01 05:18:35 -05:00  
				
					
						
							
							
								 
						
							
								fe3c778c51 
								
							 
						 
						
							
							
								
								AIX changes for P10 with GNU Compiler  
							
							
							
						 
						
							2021-09-30 06:06:27 -05:00  
				
					
						
							
							
								 
						
							
								ee5ca8a328 
								
							 
						 
						
							
							
								
								x86_64: BFLOAT16: fix build warning  
							
							
							
						 
						
							2021-09-28 18:30:06 +08:00  
				
					
						
							
							
								 
						
							
								90cc944625 
								
							 
						 
						
							
							
								
								Move alphaI to x22 to leave x18 unused (reserved on OSX)  
							
							
							
						 
						
							2021-09-17 09:53:18 +02:00  
				
					
						
							
							
								 
						
							
								590fbff06e 
								
							 
						 
						
							
							
								
								move alpha to x19/x20 to leave x18 unused for OSX  
							
							
							
						 
						
							2021-09-17 09:42:17 +02:00  
				
					
						
							
							
								 
						
							
								380940271b 
								
							 
						 
						
							
							
								
								Move temp to x21 to leave x18 unused (reserved on OSX)  
							
							
							
						 
						
							2021-09-17 09:28:19 +02:00  
				
					
						
							
							
								 
						
							
								7d75177446 
								
							 
						 
						
							
							
								
								Move temp to x21 to leave x18 unused (reserved on OSX)  
							
							
							
						 
						
							2021-09-17 09:24:11 +02:00  
				
					
						
							
							
								 
						
							
								0a4ac4b585 
								
							 
						 
						
							
							
								
								Use x21 for I to leave x18 unused (reserved on OSX)  
							
							
							
						 
						
							2021-09-17 09:19:51 +02:00  
				
					
						
							
							
								 
						
							
								7d4a221579 
								
							 
						 
						
							
							
								
								Remove unused TEMP2 and reshuffle to leave x18 unused (reserved on OSX)  
							
							
							
						 
						
							2021-09-17 09:18:25 +02:00  
				
					
						
							
							
								 
						
							
								d3a9c7ef7f 
								
							 
						 
						
							
							
								
								Merge pull request  #3382  from rafaelcfsousa/rafael/cwarnings  
							
							... 
							
							
							
							[POWER] Remove unused variable warnings. 
							
						 
						
							2021-09-17 09:15:16 +02:00  
				
					
						
							
							
								 
						
							
								8dfa61a61c 
								
							 
						 
						
							
							
								
								Initialize abs_mask1 with itself to silence a gcc warning  
							
							
							
						 
						
							2021-09-15 22:11:35 +02:00  
				
					
						
							
							
								 
						
							
								99aa10b3ff 
								
							 
						 
						
							
							
								
								Initialize abs_mask1 with itself to silence a gcc warning  
							
							... 
							
							
							
							actual initialization is via the _mm_cmpeq_epi8, which I've seen claimed to be the fastest way to set an xmm register to all 1s 
							
						 
						
							2021-09-15 22:10:43 +02:00  
				
					
						
							
							
								 
						
							
								b751edf624 
								
							 
						 
						
							
							
								
								Fix unused variable warnings on Power  
							
							
							
						 
						
							2021-09-15 13:36:07 -05:00  
				
					
						
							
							
								 
						
							
								80346b8813 
								
							 
						 
						
							
							
								
								Merge pull request  #3379  from martin-frbg/issue3369-2  
							
							... 
							
							
							
							Add casts to fix compiler warnings for SkylakeX sasum/dasum 
							
						 
						
							2021-09-15 07:18:57 +02:00  
				
					
						
							
							
								 
						
							
								ce036a2fc0 
								
							 
						 
						
							
							
								
								Add casts  
							
							
							
						 
						
							2021-09-14 21:41:53 +02:00  
				
					
						
							
							
								 
						
							
								ddf106f769 
								
							 
						 
						
							
							
								
								Add dedicated entries for BFLOAT16 kernels  
							
							
							
						 
						
							2021-09-14 16:17:18 +02:00  
				
					
						
							
							
								 
						
							
								af8843875a 
								
							 
						 
						
							
							
								
								Merge pull request  #3376  from martin-frbg/issue3370  
							
							... 
							
							
							
							Fix a few harmless compiler warnings 
							
						 
						
							2021-09-12 00:01:31 +02:00  
				
					
						
							
							
								 
						
							
								0925dfe2c9 
								
							 
						 
						
							
							
								
								One instance of kernel_4x1 is used even on SKX  
							
							
							
						 
						
							2021-09-11 15:30:19 +02:00  
				
					
						
							
							
								 
						
							
								7d873a329f 
								
							 
						 
						
							
							
								
								Add ifdefs around conditionally used functions  
							
							
							
						 
						
							2021-09-11 14:38:47 +02:00  
				
					
						
							
							
								 
						
							
								ef24712030 
								
							 
						 
						
							
							
								
								Move a conditionally used variable  
							
							
							
						 
						
							2021-09-11 14:37:44 +02:00  
				
					
						
							
							
								 
						
							
								d17238599b 
								
							 
						 
						
							
							
								
								Add casts  
							
							
							
						 
						
							2021-09-11 13:38:28 +02:00  
				
					
						
							
							
								 
						
							
								59a1114d03 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: tuning for small matrix  
							
							
							
						 
						
							2021-09-07 21:30:46 +08:00  
				
					
						
							
							
								 
						
							
								682d66555d 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: implement ncopy_16  
							
							
							
						 
						
							2021-09-07 21:30:46 +08:00  
				
					
						
							
							
								 
						
							
								beccb83b16 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: add n24 kernel for tcopy_4  
							
							
							
						 
						
							2021-09-07 21:30:46 +08:00  
				
					
						
							
							
								 
						
							
								5fcacad32b 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: implement tcopy_4  
							
							
							
						 
						
							2021-09-07 21:30:46 +08:00  
				
					
						
							
							
								 
						
							
								bb1c4fa5bd 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: prefetch A & B  
							
							
							
						 
						
							2021-09-07 21:30:46 +08:00  
				
					
						
							
							
								 
						
							
								7a2d1601ec 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: unroll core loop by 2  
							
							
							
						 
						
							2021-09-07 21:30:46 +08:00  
				
					
						
							
							
								 
						
							
								45fdf951b6 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: reorder ptr increase for performance  
							
							
							
						 
						
							2021-09-07 21:30:46 +08:00  
				
					
						
							
							
								 
						
							
								cece3541ab 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: fix bug in m64n12  
							
							
							
						 
						
							2021-09-07 21:30:46 +08:00  
				
					
						
							
							
								 
						
							
								9df0953cde 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: kernel works for NN  
							
							
							
						 
						
							2021-09-07 21:30:45 +08:00  
				
					
						
							
							
								 
						
							
								2ec9f3a8aa 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: change kernel size to 16x4  
							
							
							
						 
						
							2021-09-07 21:30:45 +08:00  
				
					
						
							
							
								 
						
							
								ef8f5fecc8 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: implement sbgemm_tcopy_32  
							
							
							
						 
						
							2021-09-07 21:30:45 +08:00  
				
					
						
							
							
								 
						
							
								4c294336e6 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: add dummy source files  
							
							
							
						 
						
							2021-09-07 21:30:45 +08:00  
				
					
						
							
							
								 
						
							
								f1e3305974 
								
							 
						 
						
							
							
								
								Add workaround for Windows10 macro name clash  
							
							
							
						 
						
							2021-09-01 21:36:50 +02:00  
				
					
						
							
							
								 
						
							
								619588fbab 
								
							 
						 
						
							
							
								
								sbgemm: remove unnecessary b0 files  
							
							
							
						 
						
							2021-08-30 17:55:01 +08:00  
				
					
						
							
							
								 
						
							
								f39301935c 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: make sure hot buffer aligned to 64  
							
							
							
						 
						
							2021-08-30 17:40:30 +08:00  
				
					
						
							
							
								 
						
							
								7d27b182fc 
								
							 
						 
						
							
							
								
								sbgemm: cooperlake: enable SBGEMM by small matrix path  
							
							
							
						 
						
							2021-08-30 17:40:30 +08:00  
				
					
						
							
							
								 
						
							
								1d83ca4bca 
								
							 
						 
						
							
							
								
								Small Matrix: support BFLOAT16 data type  
							
							
							
						 
						
							2021-08-30 17:40:20 +08:00  
				
					
						
							
							
								 
						
							
								bec9d9f63d 
								
							 
						 
						
							
							
								
								Merge pull request  #3335  from guowangy/small-matrix-latest  
							
							... 
							
							
							
							Add GEMM optimization for small matrix and single/double kernel for skylakex 
							
						 
						
							2021-08-29 22:33:33 +02:00  
				
					
						
							
							
								 
						
							
								dbbb39199f 
								
							 
						 
						
							
							
								
								sgemv: skylakex: fix build warning  
							
							
							
						 
						
							2021-08-25 07:13:00 +00:00  
				
					
						
							
							
								 
						
							
								e9acb46431 
								
							 
						 
						
							
							
								
								sgemv: skylakex: bug fix for sgemv_t kernel in corner case  
							
							
							
						 
						
							2021-08-25 07:07:27 +00:00  
				
					
						
							
							
								 
						
							
								f9dba63c28 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: remove unnecessary b0 source files  
							
							
							
						 
						
							2021-08-13 03:28:44 +00:00  
				
					
						
							
							
								 
						
							
								989e6bbdd3 
								
							 
						 
						
							
							
								
								Small Matrix: reduce generic kernel source files  
							
							
							
						 
						
							2021-08-13 03:17:38 +00:00  
				
					
						
							
							
								 
						
							
								04255be948 
								
							 
						 
						
							
							
								
								Merge pull request  #3344  from gxw-loongson/develop  
							
							... 
							
							
							
							Delete the macro instruction "li" and use "li.d" instead 
							
						 
						
							2021-08-12 15:16:46 +02:00  
				
					
						
							
							
								 
						
							
								a7bc8ec1f1 
								
							 
						 
						
							
							
								
								Delete the macro instruction "li" and use "li.d" instead  
							
							... 
							
							
							
							Change-Id: Icff7981e2eb7df29ba5af1f8eb5be8443c67450f 
							
						 
						
							2021-08-12 17:02:54 +08:00  
				
					
						
							
							
								 
						
							
								b06880c2cd 
								
							 
						 
						
							
							
								
								POWER10: Improving dasum performance  
							
							... 
							
							
							
							Unrolling a loop in dasum micro code to help in improving
POWER10 performance. 
							
						 
						
							2021-08-10 22:06:04 -05:00  
				
					
						
							
							
								 
						
							
								44d0032f3b 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: fix build error in old compiler  
							
							
							
						 
						
							2021-08-05 04:43:47 +00:00  
				
					
						
							
							
								 
						
							
								5d86becdae 
								
							 
						 
						
							
							
								
								Add all SBGEMM kernels for IA AVX512-BF16 based platforms  
							
							... 
							
							
							
							Added all SBGEMM kernels including NN/NT/TN/TT for both ColMajor and
RowMajor, based on AVX512-BF16 ISA set on IA.
Signed-off-by: Chen, Guobing <guobing.chen@intel.com> 
							
						 
						
							2021-08-05 11:11:29 +08:00  
				
					
						
							
							
								 
						
							
								fee5abd84b 
								
							 
						 
						
							
							
								
								Small Matrix: support cmake build  
							
							
							
						 
						
							2021-08-04 08:50:15 +00:00  
				
					
						
							
							
								 
						
							
								478d1086c1 
								
							 
						 
						
							
							
								
								Small Matrix: support DYNAMIC_ARCH build  
							
							
							
						 
						
							2021-08-04 03:12:41 +00:00  
				
					
						
							
							
								 
						
							
								6b58bca18b 
								
							 
						 
						
							
							
								
								Small Matrix: disable low performance default kernel  
							
							
							
						 
						
							2021-08-03 06:49:03 +00:00  
				
					
						
							
							
								 
						
							
								fa777f5517 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: add DGEMM_SMALL_M_PERMIT and tune for TN kernel  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								8592c21af4 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: dgemm nn: fix typo in idx load  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								3e79f6d89a 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: add dgemm tn kernel  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								323d7da4f7 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: add dgemm tt kernel  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								f57fc932ac 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: add dgemm nt kernel  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								91ec21202b 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: add dgemm nn kernel  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								72e070539c 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: add sgemm tt kernel  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								02c6e764f2 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: add SGEMM_SMALL_M_PERMIT and tune for TN kernel  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								5dc7c3c8e5 
								
							 
						 
						
							
							
								
								Small Matrix: add GEMM_SMALL_MATRIX_PERMIT to tune small matrix case  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								642c393879 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: add sgemm tn kernel  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								ae3f5c737c 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: sgemm nt: optimize for M < 12  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								0d72d75bf9 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: add sgemm nt kernel  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								ca7682e3a3 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: sgemm nn: fix n6 conflicts with n4  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								9967e61abb 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: sgemm nn: fix error when beta not zero  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								a87736346f 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: sgemm nn: add n6 to improve performance  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								4c9d9940fd 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: sgemm nn: reduce store 4 N at a time  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								13b32f69b7 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: sgemm nn: reduce store 4 M at a time  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								3d8c6d9607 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: sgemm nn: clean up unused code  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								49b61a3f30 
								
							 
						 
						
							
							
								
								Small Matrix: skylakex: sgemm_nn: optimize for M <= 8  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								f88470323b 
								
							 
						 
						
							
							
								
								Optimize M < 16 using AVX512 mask  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								9186456a12 
								
							 
						 
						
							
							
								
								small matrix: SkylakeX: add SGEMM NN kernel  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								6022e5629c 
								
							 
						 
						
							
							
								
								Refs  #2587  fix small matrix c/zgemm bug.  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								57ed58cefe 
								
							 
						 
						
							
							
								
								Refs  #2587  Add small matrix optimization reference kernel for c/zgemm.  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								17d32a4a82 
								
							 
						 
						
							
							
								
								Change a1b0 gemm to b0 gemm.  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								59cb5de46b 
								
							 
						 
						
							
							
								
								Refs  #2587  Fix typos.  
							
							
							
						 
						
							2021-08-02 07:06:54 +00:00  
				
					
						
							
							
								 
						
							
								be3349405d 
								
							 
						 
						
							
							
								
								Add alpha=1.0 beta=0.0 for small gemm.  
							
							
							
						 
						
							2021-08-02 07:01:47 +00:00  
				
					
						
							
							
								 
						
							
								0a2077901c 
								
							 
						 
						
							
							
								
								Add small matrix optimization kernel interface.  
							
							... 
							
							
							
							make SMALL_MATRIX_OPT=1 
							
						 
						
							2021-08-02 07:01:47 +00:00  
				
					
						
							
							
								 
						
							
								0b8f7c8c10 
								
							 
						 
						
							
							
								
								Add cmake support for LOONGARCH64  
							
							
							
						 
						
							2021-08-02 10:00:41 +08:00  
				
					
						
							
							
								 
						
							
								af0a69f355 
								
							 
						 
						
							
							
								
								Add support for LOONGARCH64  
							
							
							
						 
						
							2021-07-27 15:29:12 +08:00  
				
					
						
							
							
								 
						
							
								49bbf330ca 
								
							 
						 
						
							
							
								
								Empirical workaround for numpy SVD NaN problem from issue 3318  
							
							
							
						 
						
							2021-07-18 22:19:19 +02:00  
				
					
						
							
							
								 
						
							
								5b4b385ecf 
								
							 
						 
						
							
							
								
								Temporarily disable the SkylakeX sgemv_t microkernel due to LAPACK testsuite failures  
							
							
							
						 
						
							2021-07-14 20:50:14 +02:00  
				
					
						
							
							
								 
						
							
								39ef0880ae 
								
							 
						 
						
							
							
								
								copy conf  
							
							
							
						 
						
							2021-06-19 21:49:58 +02:00  
				
					
						
							
							
								 
						
							
								c4b464cac6 
								
							 
						 
						
							
							
								
								Merge pull request  #3273  from austinpagan/sbgemm_gcc10_fix  
							
							... 
							
							
							
							Power10: Fix for SBGEMM 
							
						 
						
							2021-06-15 22:58:48 +02:00  
				
					
						
							
							
								 
						
							
								e6dd44d989 
								
							 
						 
						
							
							
								
								Power10: Fix for SBGEMM  
							
							... 
							
							
							
							While testing bfloat16 sbgemm kernel, there are some failures for odd value inputs due to updating result for
additional bytes. 
							
						 
						
							2021-06-15 13:07:47 -05:00  
				
					
						
							
							
								 
						
							
								9d292d37b2 
								
							 
						 
						
							
							
								
								arm64: add the missing d9 register to the clobber list  
							
							... 
							
							
							
							Refs. numpy/numpy#18422 
Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp> 
							
						 
						
							2021-06-14 17:01:28 +09:00  
				
					
						
							
							
								 
						
							
								2e8ff4a781 
								
							 
						 
						
							
							
								
								Merge pull request  #3266  from martin-frbg/powerparam  
							
							... 
							
							
							
							Remove spurious casts from PPC parameters and fix compilation for older targets 
							
						 
						
							2021-06-10 18:05:47 +02:00  
				
					
						
							
							
								 
						
							
								dbba381dc3 
								
							 
						 
						
							
							
								
								Merge pull request  #3260  from intelmy/sgemv_t_opt  
							
							... 
							
							
							
							Optimized sgemv_t for small N based on AVX512 
							
						 
						
							2021-06-10 16:08:24 +02:00  
				
					
						
							
							
								 
						
							
								efdbdd8f82 
								
							 
						 
						
							
							
								
								Add prefetch values for power3  
							
							
							
						 
						
							2021-06-10 11:20:29 +02:00  
				
					
						
							
							
								 
						
							
								3906ef3b0f 
								
							 
						 
						
							
							
								
								Add prefetch values for power3  
							
							
							
						 
						
							2021-06-10 11:19:40 +02:00  
				
					
						
							
							
								 
						
							
								8adf0971d8 
								
							 
						 
						
							
							
								
								Add prefetch values for power3  
							
							
							
						 
						
							2021-06-10 11:18:22 +02:00  
				
					
						
							
							
								 
						
							
								08e2e60762 
								
							 
						 
						
							
							
								
								Add prefetch values for power3  
							
							
							
						 
						
							2021-06-10 11:17:33 +02:00  
				
					
						
							
							
								 
						
							
								fb9e678235 
								
							 
						 
						
							
							
								
								Fix caxpy/zaxpy for big-endian  
							
							
							
						 
						
							2021-06-10 11:15:48 +02:00  
				
					
						
							
							
								 
						
							
								dc4fcb48df 
								
							 
						 
						
							
							
								
								Fix inverted conditional for caxpy/zaxpy  
							
							
							
						 
						
							2021-06-10 11:14:03 +02:00  
				
					
						
							
							
								 
						
							
								7a48247761 
								
							 
						 
						
							
							
								
								fix c/zrot and sgemv for POWER5  
							
							
							
						 
						
							2021-06-10 11:11:56 +02:00  
				
					
						
							
							
								 
						
							
								cbb70438df 
								
							 
						 
						
							
							
								
								POWER10: Fixes for sbgemm kernel  
							
							... 
							
							
							
							While testing bfloat16 sbgemm kernel, there are some failures
for odd value inputs due to array access beyond the boundary. 
							
						 
						
							2021-06-09 12:20:09 -05:00  
				
					
						
							
							
								 
						
							
								706a08d4a0 
								
							 
						 
						
							
							
								
								Optimized sgemv_t for small N based on AVX512  
							
							
							
						 
						
							2021-06-08 15:08:28 -04:00  
				
					
						
							
							
								 
						
							
								590be3fae3 
								
							 
						 
						
							
							
								
								riscv64: Add Makefile  
							
							
							
						 
						
							2021-06-07 22:55:56 +00:00  
				
					
						
							
							
								 
						
							
								3521cd48cb 
								
							 
						 
						
							
							
								
								RISCV64_GENERIC: Use generic kernel for DSDOT for better precision  
							
							... 
							
							
							
							The implementation in `riscv64/dot.c` fails the `test_dsdot` test, and
the generic kernel seems to have better precision. Tested on SiFive
FU740 (HiFive Unmatched) and QEMU.
Also see #1469 . 
							
						 
						
							2021-06-07 22:50:23 +00:00  
				
					
						
							
							
								 
						
							
								1e0192a5cc 
								
							 
						 
						
							
							
								
								riscv64/imin: Fix wrong comparison  
							
							... 
							
							
							
							Same as #1990 . 
							
						 
						
							2021-06-07 22:49:39 +00:00  
				
					
						
							
							
								 
						
							
								5f677e782e 
								
							 
						 
						
							
							
								
								Merge pull request  #3196  from guowangy/skylakex-gemm-batch-k  
							
							... 
							
							
							
							GEMM: skylake: improve the performance when m is small 
							
						 
						
							2021-05-22 19:25:28 +02:00  
				
					
						
							
							
								 
						
							
								02087a62e7 
								
							 
						 
						
							
							
								
								Merge pull request  #3205  from intelmy/sgemv_n_opt  
							
							... 
							
							
							
							optimize on sgemv_n for small n 
							
						 
						
							2021-05-17 17:49:01 +02:00  
				
					
						
							
							
								 
						
							
								4ecf631f95 
								
							 
						 
						
							
							
								
								Merge pull request  #3228  from martin-frbg/issue3226  
							
							... 
							
							
							
							filter out -mavx flag on Sandybridge zgemm/ztrmm kernels 
							
						 
						
							2021-05-15 09:06:12 +02:00  
				
					
						
							
							
								 
						
							
								310b76aad7 
								
							 
						 
						
							
							
								
								Merge pull request  #3231  from martin-frbg/issue3227  
							
							... 
							
							
							
							Support compilation with pre-C99 versions of MSVC 
							
						 
						
							2021-05-14 23:28:06 +02:00  
				
					
						
							
							
								 
						
							
								c4da892ba0 
								
							 
						 
						
							
							
								
								Only filter out -mavx on Sandybridge ZGEMM/ZTRMM kernels  
							
							
							
						 
						
							2021-05-14 23:19:10 +02:00  
				
					
						
							
							
								 
						
							
								8b90e5f202 
								
							 
						 
						
							
							
								
								Drop redundant inclusion of complex.h  
							
							
							
						 
						
							2021-05-14 15:06:44 +02:00  
				
					
						
							
							
								 
						
							
								bd60fb6ffc 
								
							 
						 
						
							
							
								
								filter out -mavx flag on zgemm kernels as it can cause problems with older gcc  
							
							
							
						 
						
							2021-05-13 23:05:00 +02:00  
				
					
						
							
							
								 
						
							
								37ea8702ee 
								
							 
						 
						
							
							
								
								Merge pull request  #3192  from damonyu1989/develop  
							
							... 
							
							
							
							Update the intrinsic api to the official name. 
							
						 
						
							2021-05-11 16:00:45 +02:00  
				
					
						
							
							
								 
						
							
								c0ca63ea46 
								
							 
						 
						
							
							
								
								Fix missing conditionals for non-SKX kernels  
							
							
							
						 
						
							2021-05-05 14:55:36 +02:00  
				
					
						
							
							
								 
						
							
								3d4ccd2a13 
								
							 
						 
						
							
							
								
								fix for build error  
							
							
							
						 
						
							2021-04-30 12:25:33 -04:00  
				
					
						
							
							
								 
						
							
								c59652f0ce 
								
							 
						 
						
							
							
								
								optimize on sgemv_n for small n  
							
							
							
						 
						
							2021-04-30 12:14:58 -04:00  
				
					
						
							
							
								 
						
							
								aa7b3dc3db 
								
							 
						 
						
							
							
								
								GEMM: skylake: improve the performance when m is small  
							
							
							
						 
						
							2021-04-28 13:56:06 +00:00  
				
					
						
							
							
								 
						
							
								ceb44bef14 
								
							 
						 
						
							
							
								
								update the intrinsic api to the official name.  
							
							
							
						 
						
							2021-04-27 11:12:29 +08:00  
				
					
						
							
							
								 
						
							
								3d511f0e66 
								
							 
						 
						
							
							
								
								replace spurious avx512 requirement with fma check  
							
							
							
						 
						
							2021-04-26 21:55:30 +02:00  
				
					
						
							
							
								 
						
							
								2379abaa5e 
								
							 
						 
						
							
							
								
								POWER10: Improve dgemm performance  
							
							... 
							
							
							
							This patch uses vector pair pointer for input load operation
which helps to generate power10 lxvp instructions. 
							
						 
						
							2021-04-13 22:30:06 -05:00  
				
					
						
							
							
								 
						
							
								55bb9f639a 
								
							 
						 
						
							
							
								
								POWER10: Optimized zgemv  
							
							... 
							
							
							
							This patch makes use of Matrix-Multiply Assist (MMA)
feature introduced in POWER ISA v3.1 for zgemv_n and zgemv_t. 
							
						 
						
							2021-04-10 19:00:24 -05:00  
				
					
						
							
							
								 
						
							
								2dfb24730d 
								
							 
						 
						
							
							
								
								Use "old" compute(24) function with clang due to register limitations  
							
							
							
						 
						
							2021-04-06 19:58:32 +02:00  
				
					
						
							
							
								 
						
							
								147e0a75fd 
								
							 
						 
						
							
							
								
								Merge pull request  #3170  from CodesWithWolves/sgemm_tcopy_16-invalid-read  
							
							... 
							
							
							
							Remove Unnecessary/Erroneous Adds/Reads In sgemm_tcopy_16.S COPY1x8 Macro 
							
						 
						
							2021-04-03 19:49:47 +02:00  
				
					
						
							
							
								 
						
							
								2dbcddd83d 
								
							 
						 
						
							
							
								
								POWER10:  Adding check for little endian  
							
							... 
							
							
							
							This patch makes sure that recent POWER10 patches are used
only for little endian. 
							
						 
						
							2021-03-31 21:32:42 -05:00  
				
					
						
							
							
								 
						
							
								d2bda3b56a 
								
							 
						 
						
							
							
								
								Remove Unnecessary/Erroneous Reads In sgemm_tcopy_16.S COPY1x8 Macro  
							
							... 
							
							
							
							There appears to have been some code leak when copying from the COPY2x8
macro above where we're reading 8 bytes into d4-d7 directly after
reading 4 bytes into s4-s7. These 32 bytes in d4-7 are unused and can
possibly overrun the boundary of allocated memory -- Valgrind detected
this which is what dragged my attention to it for a 128,1 copy.
Additionally, there is no need to update the addresses stored in A0-A7
as the only possible paths after running this macro will overwrite A0-7
if looping to the next 8 rows, or overwrite A0-3 if moving to 4 rows --
in which case A4-7 are unused. 
							
						 
						
							2021-03-31 15:44:25 -04:00  
				
					
						
							
							
								 
						
							
								bdd6e3a153 
								
							 
						 
						
							
							
								
								Merge pull request  #3157  from martin-frbg/issue3020-final  
							
							... 
							
							
							
							Add workaround for LAPACK testsuite failures with the NVIDIA HPC compiler on PPC 
							
						 
						
							2021-03-19 15:23:12 +01:00  
				
					
						
							
							
								 
						
							
								7b8f580941 
								
							 
						 
						
							
							
								
								Merge pull request  #3156  from martin-frbg/omatcopy_d  
							
							... 
							
							
							
							Move x86_64 DOMATCOPY_RT back to the C implementation 
							
						 
						
							2021-03-19 15:22:48 +01:00  
				
					
						
							
							
								 
						
							
								86c5a0013f 
								
							 
						 
						
							
							
								
								Add workaround for LAPACK testsuite failures with the NVIDIA HPC compiler  
							
							
							
						 
						
							2021-03-19 11:47:58 +01:00  
				
					
						
							
							
								 
						
							
								ef85c22474 
								
							 
						 
						
							
							
								
								Add workaround for LAPACK test failures with the NVIDIA HPC compiler  
							
							
							
						 
						
							2021-03-19 11:46:25 +01:00  
				
					
						
							
							
								 
						
							
								d3555d2e50 
								
							 
						 
						
							
							
								
								Add workaround for LAPACK test failures with the NVIDIA HPC compiler  
							
							
							
						 
						
							2021-03-19 11:44:31 +01:00  
				
					
						
							
							
								 
						
							
								0f5e86a0d9 
								
							 
						 
						
							
							
								
								Remove premature entry for DOMATCOPY_RT  
							
							
							
						 
						
							2021-03-18 21:53:50 +01:00  
				
					
						
							
							
								 
						
							
								7b294a99fd 
								
							 
						 
						
							
							
								
								Move common.h back to the top of the file so that SKYLAKEX (from config.h) is defined in time  
							
							
							
						 
						
							2021-03-18 21:28:19 +01:00  
				
					
						
							
							
								 
						
							
								0934568d9c 
								
							 
						 
						
							
							
								
								Move includes under the ifdef for compilers w/o intrinsics support  
							
							
							
						 
						
							2021-03-12 12:42:05 +01:00  
				
					
						
							
							
								 
						
							
								09d47af2c0 
								
							 
						 
						
							
							
								
								Optimize zscal function for POWER10  
							
							... 
							
							
							
							This patch makes use of new POWER10 vector pair instructions for
loads and stores. 
							
						 
						
							2021-03-10 17:15:33 -06:00  
				
					
						
							
							
								 
						
							
								ef0238ba2b 
								
							 
						 
						
							
							
								
								Merge pull request  #3130  from martin-frbg/issue3128  
							
							... 
							
							
							
							Replace spurious AVX512 requirement in the Haswell srot microkernel with an AVX2/FMA3 guard 
							
						 
						
							2021-03-06 19:15:53 +01:00  
				
					
						
							
							
								 
						
							
								a9f6f7ad39 
								
							 
						 
						
							
							
								
								Remove spurious AVX512 requirement and add AVX2/FMA3 guard  
							
							
							
						 
						
							2021-03-06 14:35:49 +01:00  
				
					
						
							
							
								 
						
							
								41646ed006 
								
							 
						 
						
							
							
								
								Optimize s/dasum function for POWER10  
							
							... 
							
							
							
							This patch makes use of new POWER10 vector pair instructions for
loads and stores. 
							
						 
						
							2021-03-05 16:22:36 -06:00  
				
					
						
							
							
								 
						
							
								0571c3187b 
								
							 
						 
						
							
							
								
								POWER10: Rename mma builtins  
							
							... 
							
							
							
							The LLVM and GCC teams agreed to rename the __builtin_mma_assemble_pair and
__builtin_mma_disassemble_pair built-ins to __builtin_vsx_assemble_pair and
__builtin_vsx_disassemble_pair respectively. This patch is to make
corresponding changes in dgemm kernel. Also made changes in
inputs to those builtins to avoid some potential typecasting issues.
Reference gcc commit id:77ef995c1fbcab76a2a69b9f4700bcfd005d8e62 
							
						 
						
							2021-02-26 20:56:34 -06:00  
				
					
						
							
							
								 
						
							
								292d1af1a0 
								
							 
						 
						
							
							
								
								Update omatcopy_rt.c  
							
							
							
						 
						
							2021-02-24 09:34:14 +01:00  
				
					
						
							
							
								 
						
							
								325b398e3c 
								
							 
						 
						
							
							
								
								Update omatcopy_rt.c  
							
							
							
						 
						
							2021-02-24 09:13:12 +01:00  
				
					
						
							
							
								 
						
							
								6f5667b4d4 
								
							 
						 
						
							
							
								
								Enable optimized S/D OMATCOPY_RT  
							
							
							
						 
						
							2021-02-24 09:03:41 +01:00  
				
					
						
							
							
								 
						
							
								cceeee7806 
								
							 
						 
						
							
							
								
								Add optimized omatcopy_rt  
							
							
							
						 
						
							2021-02-24 09:00:54 +01:00  
				
					
						
							
							
								 
						
							
								0a4546b742 
								
							 
						 
						
							
							
								
								Typo fix  
							
							
							
						 
						
							2021-02-23 13:14:35 +01:00  
				
					
						
							
							
								 
						
							
								b1eed27a54 
								
							 
						 
						
							
							
								
								Replace naive omatcopy_rt with 4x4 blocked implementation  
							
							... 
							
							
							
							as suggested by MigMuc in issue 2532 
							
						 
						
							2021-02-22 21:35:42 +01:00  
				
					
						
							
							
								 
						
							
								47691c031f 
								
							 
						 
						
							
							
								
								Use Haswell optimizations for Zen as well  
							
							
							
						 
						
							2021-02-11 09:26:15 +01:00  
				
					
						
							
							
								 
						
							
								ce7ddd8921 
								
							 
						 
						
							
							
								
								Use Haswell optimizations for Zen as well  
							
							
							
						 
						
							2021-02-11 09:25:36 +01:00  
				
					
						
							
							
								 
						
							
								950c047b49 
								
							 
						 
						
							
							
								
								Use Haswell optimizations for Zen as well  
							
							
							
						 
						
							2021-02-11 09:24:51 +01:00  
				
					
						
							
							
								 
						
							
								46509953a9 
								
							 
						 
						
							
							
								
								Use Haswell optimizations for Zen as well  
							
							
							
						 
						
							2021-02-11 09:24:16 +01:00  
				
					
						
							
							
								 
						
							
								db348dcff2 
								
							 
						 
						
							
							
								
								Enable optimized srot/drot kernels from Haswell  
							
							
							
						 
						
							2021-02-11 09:23:05 +01:00  
				
					
						
							
							
								 
						
							
								2056ffc227 
								
							 
						 
						
							
							
								
								Optimize cscal function for POWER10  
							
							... 
							
							
							
							This patch makes use of new POWER10 vector pair instructions for
loads and stores. 
							
						 
						
							2021-01-29 13:51:43 -06:00  
				
					
						
							
							
								 
						
							
								3ede843d50 
								
							 
						 
						
							
							
								
								Optimize s/dscal function for POWER10  
							
							... 
							
							
							
							This patch makes use of new POWER10 vector pair instructions for
loads and stores. 
							
						 
						
							2021-01-24 07:48:28 -06:00  
				
					
						
							
							
								 
						
							
								69a5558203 
								
							 
						 
						
							
							
								
								Merge pull request  #3059  from Guobing-Chen/BF16_gemm  
							
							... 
							
							
							
							Initial code for Cooperlake BF16 GEMM kernel 
							
						 
						
							2021-01-23 19:08:05 +01:00  
				
					
						
							
							
								 
						
							
								d6905403e3 
								
							 
						 
						
							
							
								
								Merge pull request  #3068  from alexhenrie/scan-build  
							
							... 
							
							
							
							scan-build fixes 
							
						 
						
							2021-01-23 19:06:29 +01:00  
				
					
						
							
							
								 
						
							
								439b93f6d2 
								
							 
						 
						
							
							
								
								Optimize s/drot function for POWER10  
							
							... 
							
							
							
							This patch makes use of new POWER10 vector pair instructions for
loads and stores. 
							
						 
						
							2021-01-21 13:24:45 -06:00  
				
					
						
							
							
								 
						
							
								eff7c9166e 
								
							 
						 
						
							
							
								
								Optimize cdot function for POWER10  
							
							... 
							
							
							
							This patch makes use of new POWER10 vector pair instructions for
loads and stores. 
							
						 
						
							2021-01-15 13:40:34 -06:00  
				
					
						
							
							
								 
						
							
								202fc9e8ed 
								
							 
						 
						
							
							
								
								Fix uninitialized argument value in dasum_k  
							
							
							
						 
						
							2021-01-14 19:40:31 -07:00  
				
					
						
							
							
								 
						
							
								e378b24487 
								
							 
						 
						
							
							
								
								Merge pull request  #3067  from albertziegenhagel/fix-generic-cmake  
							
							... 
							
							
							
							Fix building "generic" TRMM kernel with CMake 
							
						 
						
							2021-01-14 21:35:19 +01:00  
				
					
						
							
							
								 
						
							
								e3f4063683 
								
							 
						 
						
							
							
								
								Fix building "generic" TRMM kernel with CMake  
							
							... 
							
							
							
							The CMake "TARGET_CORE" variables stores the "generic" target name in all lowercase letters, but gets compared to an all uppercase string, which results in the wrong TRMM kernel being selected.
This commit converts the TARGET_CORE to all uppercase before comparing its value to make sure case mismatches are not an issue in the future anymore. 
							
						 
						
							2021-01-14 10:00:49 +01:00  
				
					
						
							
							
								 
						
							
								b716c0ef01 
								
							 
						 
						
							
							
								
								Add workaround for NVIDIA HPC  
							
							
							
						 
						
							2021-01-12 16:51:35 +01:00  
				
					
						
							
							
								 
						
							
								2efa3b70dc 
								
							 
						 
						
							
							
								
								Add workaround for NVIDIA HPC  
							
							
							
						 
						
							2021-01-12 16:49:39 +01:00  
				
					
						
							
							
								 
						
							
								49959d4f1c 
								
							 
						 
						
							
							
								
								Add workaround for NVIDIA HPC  
							
							
							
						 
						
							2021-01-12 16:47:15 +01:00  
				
					
						
							
							
								 
						
							
								0f27a03607 
								
							 
						 
						
							
							
								
								Add workaround for NVIDIA HPC mishandling of the asm DOT kernels  
							
							
							
						 
						
							2021-01-12 16:39:35 +01:00  
				
					
						
							
							
								 
						
							
								c2a8ebfe69 
								
							 
						 
						
							
							
								
								Add workaround for NVIDIA HPC mishandling of the asm DOT kernels  
							
							
							
						 
						
							2021-01-12 16:38:51 +01:00  
				
					
						
							
							
								 
						
							
								43aac5bacc 
								
							 
						 
						
							
							
								
								Support NVIDIA HPC compiler  
							
							
							
						 
						
							2021-01-12 16:36:12 +01:00  
				
					
						
							
							
								 
						
							
								b0beb0b1ca 
								
							 
						 
						
							
							
								
								Initial code for Cooperlake BF16 GEMM kernel  
							
							
							
						 
						
							2021-01-11 02:15:21 +08:00  
				
					
						
							
							
								 
						
							
								601b711c78 
								
							 
						 
						
							
							
								
								Optimize swap function for POWER10  
							
							... 
							
							
							
							This patch makes use of new POWER10 vector pair instructions for
loads and stores. 
							
						 
						
							2021-01-08 08:01:36 -06:00  
				
					
						
							
							
								 
						
							
								1b2508362b 
								
							 
						 
						
							
							
								
								arm64: Fix nrm2 for input vectors with Inf  
							
							... 
							
							
							
							Fix double precision nrm2 kernels returning NaN when the
input vectors contain Inf/-Inf. 
							
						 
						
							2021-01-01 02:49:37 -08:00  
				
					
						
							
							
								 
						
							
								3559c5d7a2 
								
							 
						 
						
							
							
								
								Merge pull request  #3048  from martin-frbg/issue2998  
							
							... 
							
							
							
							Temporarily revert to the old NRM2 kernels for ThunderX2/3 and NeoverseN1 
							
						 
						
							2020-12-21 13:30:08 +01:00  
				
					
						
							
							
								 
						
							
								8631e2976a 
								
							 
						 
						
							
							
								
								Temporarily revert to the old nrm2 kernels  
							
							
							
						 
						
							2020-12-21 07:45:13 +01:00  
				
					
						
							
							
								 
						
							
								2768bc1764 
								
							 
						 
						
							
							
								
								Temporarily revert to the old nrm2 kernels  
							
							
							
						 
						
							2020-12-21 07:42:51 +01:00  
				
					
						
							
							
								 
						
							
								6f4698ee1f 
								
							 
						 
						
							
							
								
								Temporarily revert to the old nrm2 kernel  
							
							
							
						 
						
							2020-12-21 07:41:18 +01:00  
				
					
						
							
							
								 
						
							
								114eb159a4 
								
							 
						 
						
							
							
								
								Disable FMA intrinsics in the srot kernel when the compiler is PGI/NVIDIA  
							
							
							
						 
						
							2020-12-19 22:15:58 +01:00  
				
					
						
							
							
								 
						
							
								005cce5507 
								
							 
						 
						
							
							
								
								Amend SkylakeX options to support the NVIDIA compiler  
							
							
							
						 
						
							2020-12-19 22:11:49 +01:00  
				
					
						
							
							
								 
						
							
								a3cac9cca0 
								
							 
						 
						
							
							
								
								Update sgemm kernel 1x4 for C910.  
							
							
							
						 
						
							2020-12-18 11:53:23 +08:00  
				
					
						
							
							
								 
						
							
								c73d8ee40d 
								
							 
						 
						
							
							
								
								Conditionally add -mfma to compiler options where needed  
							
							
							
						 
						
							2020-12-17 11:34:05 +01:00  
				
					
						
							
							
								 
						
							
								2fb11f873b 
								
							 
						 
						
							
							
								
								POWER10: Improve copy performance  
							
							... 
							
							
							
							This patch aligns the stores to 32 byte boundary for scopy and dcopy
before entering into vector pair loop. For ccopy, changed the store
instructions to stxv to improve performance of unaligned cases. 
							
						 
						
							2020-12-13 10:41:45 -06:00  
				
					
						
							
							
								 
						
							
								043128cbe5 
								
							 
						 
						
							
							
								
								Merge pull request  #3029  from RajalakshmiSR/axpyp10  
							
							... 
							
							
							
							POWER10: Improve axpy performance 
							
						 
						
							2020-12-10 22:49:28 +01:00  
				
					
						
							
							
								 
						
							
								3331ca492d 
								
							 
						 
						
							
							
								
								Merge pull request  #3021  from austinpagan/trsm_p10  
							
							... 
							
							
							
							POWER: Added special unrolled vectorized versions of "Solve" for specific si… 
							
						 
						
							2020-12-10 19:42:54 +01:00  
				
					
						
							
							
								 
						
							
								346e30a46a 
								
							 
						 
						
							
							
								
								POWER10: Improve axpy performance  
							
							... 
							
							
							
							This patch aligns the stores to 32 byte boundary for saxpy and daxpy
before entering into vector pair loop. For caxpy, changed the store
instructions to stxv to improve performance of unaligned cases. 
							
						 
						
							2020-12-10 11:51:42 -06:00  
				
					
						
							
							
								 
						
							
								4b548857d6 
								
							 
						 
						
							
							
								
								Add msa support for loongson  
							
							... 
							
							
							
							1. Using core loongson3r3 and loongson3r4 for loongson
2. Add DYNAMIC_ARCH for loongson
Change-Id: I1c6b54dbeca3a0cc31d1222af36a7e9bd6ab54c1 
							
						 
						
							2020-12-09 10:28:46 +08:00  
				
					
						
							
							
								 
						
							
								7f11e33e8d 
								
							 
						 
						
							
							
								
								Merge pull request  #3025  from TiredNotTear/develop  
							
							... 
							
							
							
							MIPS: Fix two bugs 
							
						 
						
							2020-12-08 09:39:27 +01:00  
				
					
						
							
							
								 
						
							
								53e0837809 
								
							 
						 
						
							
							
								
								Merge pull request  #3022  from jinboson/develop  
							
							... 
							
							
							
							Fix test errors reported by cblas_cgemm & cblas_ctrmm 
							
						 
						
							2020-12-07 08:09:11 +01:00  
				
					
						
							
							
								 
						
							
								ad38bd0e89 
								
							 
						 
						
							
							
								
								Fix failed cgemv and zgemv test case after using msa optimization  
							
							... 
							
							
							
							The cgemv and zgemv test case will call cgemv_n/t_msa.c zgemv_n/t_msa.c files in MIPS environment.
When the macro CONJ is defined, the calculation result will be wrong due to the wrong definition of OP2.
This patch updates the value of OP2 and passes the corresponding test. 
							
						 
						
							2020-12-07 10:25:01 +08:00  
				
					
						
							
							
								 
						
							
								47b639cc9b 
								
							 
						 
						
							
							
								
								Fix failed sswap and dswap case by using msa optimization  
							
							... 
							
							
							
							The swap test case will call sswap_msa.c and dswap_msa.c files in MIPS environment.
When inc_x or inc_y is equal to zero, the calculation result of the two functions will be wrong.
This patch adds the processing of inc_x or inc_y equal to zero, and the swap test case has passed. 
							
						 
						
							2020-12-07 10:24:49 +08:00  
				
					
						
							
							
								 
						
							
								b660008c7e 
								
							 
						 
						
							
							
								
								Work around DOT and SWAP test failures  
							
							
							
						 
						
							2020-12-06 19:15:37 +01:00  
				
					
						
							
							
								 
						
							
								f8346603cf 
								
							 
						 
						
							
							
								
								Fix compilation with SolarisStudio  
							
							
							
						 
						
							2020-12-06 19:14:16 +01:00  
				
					
						
							
							
								 
						
							
								65de6f5957 
								
							 
						 
						
							
							
								
								Fix test errors reported by cblas_cgemm & cblas_ctrmm  
							
							... 
							
							
							
							The file cgemm_kernel_8x4_msa.c holds the MSA optimization
codes of cblas_cgemm and cblas_ctrmm. It defines two
macros: CGEMM_SCALE_1X2 and CGEMM_TRMM_SCALE_1X2. The pc1
array index in the two macros should be 0 and 1. 
							
						 
						
							2020-12-05 15:08:17 +08:00  
				
					
						
							
							
								 
						
							
								213c0e7abb 
								
							 
						 
						
							
							
								
								Added special unrolled vectorized versions of "Solve" for specific sizes,  
							
							... 
							
							
							
							in DTRSM and STRSM, to improve performance in Power9 and Power10. 
							
						 
						
							2020-12-04 17:07:06 -06:00  
				
					
						
							
							
								 
						
							
								441c08c9ff 
								
							 
						 
						
							
							
								
								Merge pull request  #3016  from xiegengxin/complex-asum  
							
							... 
							
							
							
							Improve the performance of zasum and casum with AVX512 intrinsic 
							
						 
						
							2020-12-04 22:07:16 +01:00  
				
					
						
							
							
								 
						
							
								0cb7a403b2 
								
							 
						 
						
							
							
								
								fix error declare function blas_level1_thread_with_return_value  
							
							
							
						 
						
							2020-12-02 09:51:52 +08:00  
				
					
						
							
							
								 
						
							
								b766c1e9bb 
								
							 
						 
						
							
							
								
								Improve the performance of zasum and casum with AVX512 intrinsic  
							
							
							
						 
						
							2020-12-01 16:49:26 +08:00  
				
					
						
							
							
								 
						
							
								7d46e31de1 
								
							 
						 
						
							
							
								
								POWER10:  Optimize dgemv_n  
							
							... 
							
							
							
							Handling as 4x8 with vector pairs gives better performance than
existing code in POWER10. 
							
						 
						
							2020-11-29 15:28:28 -06:00  
				
					
						
							
							
								 
						
							
								f1bf040b25 
								
							 
						 
						
							
							
								
								Merge pull request  #2988  from xiegengxin/smp-asum  
							
							... 
							
							
							
							Improve the performance of dasum and sasum when SMP is defined 
							
						 
						
							2020-11-22 12:24:13 +01:00  
				
					
						
							
							
								 
						
							
								7037849498 
								
							 
						 
						
							
							
								
								Merge branch 'develop' into risc-v  
							
							
							
						 
						
							2020-11-22 16:04:50 +08:00  
				
					
						
							
							
								 
						
							
								7e9cb39a25 
								
							 
						 
						
							
							
								
								Merge pull request  #2981  from Qiyu8/fix-sum  
							
							... 
							
							
							
							Fix sum optimize issues 
							
						 
						
							2020-11-16 08:40:46 +01:00  
				
					
						
							
							
								 
						
							
								d6e7e05bb3 
								
							 
						 
						
							
							
								
								Improve the performance of dasum and sasum when SMP is defined  
							
							
							
						 
						
							2020-11-13 14:20:52 +08:00  
				
					
						
							
							
								 
						
							
								ae0b1dea19 
								
							 
						 
						
							
							
								
								modify system.cmake to enable fma flag  
							
							
							
						 
						
							2020-11-13 10:20:24 +08:00  
				
					
						
							
							
								 
						
							
								e0dac6b53b 
								
							 
						 
						
							
							
								
								fix the CI failure of target specific option mismatch  
							
							
							
						 
						
							2020-11-12 20:31:03 +08:00  
				
					
						
							
							
								 
						
							
								e5c2ceb675 
								
							 
						 
						
							
							
								
								fix the CI failure of lack the head  
							
							
							
						 
						
							2020-11-12 17:35:17 +08:00  
				
					
						
							
							
								 
						
							
								a87e537b8c 
								
							 
						 
						
							
							
								
								modify macro  
							
							
							
						 
						
							2020-11-11 15:53:48 +08:00  
				
					
						
							
							
								 
						
							
								5bc0a7583f 
								
							 
						 
						
							
							
								
								Only FMA3 and vectors larger than 128 have positive effects.  
							
							
							
						 
						
							2020-11-11 15:18:01 +08:00  
				
					
						
							
							
								 
						
							
								8c0b206d4c 
								
							 
						 
						
							
							
								
								Optimize the performance of rot by using universal intrinsics  
							
							
							
						 
						
							2020-11-11 14:33:12 +08:00  
				
					
						
							
							
								 
						
							
								c4c591ac5a 
								
							 
						 
						
							
							
								
								fix sum optimize issues  
							
							
							
						 
						
							2020-11-10 16:16:38 +08:00  
				
					
						
							
							
								 
						
							
								fc35b72ae1 
								
							 
						 
						
							
							
								
								Refs  #2899  
							
							... 
							
							
							
							Merge branch 'openblas-open-910' of git://github.com/damonyu1989/OpenBLAS into damonyu1989-openblas-open-910 
							
						 
						
							2020-11-10 09:38:04 +08:00  
				
					
						
							
							
								 
						
							
								913cc9a4ca 
								
							 
						 
						
							
							
								
								Merge branch 'develop' into risc-v  
							
							
							
						 
						
							2020-11-10 09:18:25 +08:00  
				
					
						
							
							
								 
						
							
								ff16329cb7 
								
							 
						 
						
							
							
								
								Merge pull request  #2972  from xiegengxin/rot-intrinsic  
							
							... 
							
							
							
							Improve the performance of rot by using AVX512 and AVX2 intrinsic 
							
						 
						
							2020-11-08 22:43:00 +01:00  
				
					
						
							
							
								 
						
							
								110c7a6de0 
								
							 
						 
						
							
							
								
								Merge pull request  #2979  from RajalakshmiSR/dot_power10  
							
							... 
							
							
							
							Optimize sdot/ddot for POWER10 
							
						 
						
							2020-11-08 10:19:34 +01:00  
				
					
						
							
							
								 
						
							
								6e364981a8 
								
							 
						 
						
							
							
								
								Optimize sdot/ddot for POWER10  
							
							... 
							
							
							
							This patch makes use of new POWER10 vector pair instructions for
loads and stores. 
							
						 
						
							2020-11-07 15:21:58 -06:00  
				
					
						
							
							
								 
						
							
								b976a0bf40 
								
							 
						 
						
							
							
								
								Remove previous workaround for compiler flags related to cpu capabilities in x86_64 DYNAMIC_ARCH builds  
							
							
							
						 
						
							2020-11-07 20:39:56 +01:00  
				
					
						
							
							
								 
						
							
								ff74319ea5 
								
							 
						 
						
							
							
								
								Merge pull request  #2977  from martin-frbg/issue2976  
							
							... 
							
							
							
							Fix macro name used in ifdef for POWERPC/PGI 
							
						 
						
							2020-11-07 14:41:34 +01:00  
				
					
						
							
							
								 
						
							
								28d2dfe2b3 
								
							 
						 
						
							
							
								
								Fix macro name used in ifdef  
							
							
							
						 
						
							2020-11-07 12:17:49 +01:00  
				
					
						
							
							
								 
						
							
								725ffbf041 
								
							 
						 
						
							
							
								
								fix typo  
							
							
							
						 
						
							2020-11-05 16:25:17 +08:00  
				
					
						
							
							
								 
						
							
								d9ba49165a 
								
							 
						 
						
							
							
								
								Improve the performance of rot by using AVX512 and AVX2 intrinsic  
							
							
							
						 
						
							2020-11-05 15:12:36 +08:00  
				
					
						
							
							
								 
						
							
								dd7a9cc5bf 
								
							 
						 
						
							
							
								
								POWER10:  Change dgemm unroll factors  
							
							... 
							
							
							
							Changing the unroll factors for dgemm to 8 shows improved performance with
the POWER10 MMA feature. Also made some minor changes in sgemm for edge cases. 
							
						 
						
							2020-10-31 18:28:57 -05:00  
				
					
						
							
							
								 
						
							
								b435491885 
								
							 
						 
						
							
							
								
								Optimize caxpy for POWER10  
							
							... 
							
							
							
							This patch makes use of new POWER10 vector pair instructions for
loads and stores. 
							
						 
						
							2020-10-29 14:57:51 -05:00  
				
					
						
							
							
								 
						
							
								a7b1f9b1bb 
								
							 
						 
						
							
							
								
								Implementation of BF16 based gemv  
							
							... 
							
							
							
							1. Add a new API -- sbgemv to support bfloat16 based gemv
2. Implement a generic kernel for sbgemv
3. Implement an avx512-bf16 based kernel for sbgemv
Signed-off-by: Chen, Guobing <guobing.chen@intel.com> 
							
						 
						
							2020-10-29 02:08:23 +08:00  
				
					
						
							
							
								 
						
							
								67f39ad813 
								
							 
						 
						
							
							
								
								Merge pull request  #2939  from thrasibule/Makefile_cleanup  
							
							... 
							
							
							
							reuse variables defined in Makefile.system 
							
						 
						
							2020-10-28 09:38:40 +01:00  
				
					
						
							
							
								 
						
							
								c24ba8b1dd 
								
							 
						 
						
							
							
								
								Optimize saxpy for POWER10  
							
							... 
							
							
							
							This patch makes use of new POWER10 vector pair instructions for
loads and stores. 
							
						 
						
							2020-10-26 13:24:59 -05:00  
				
					
						
							
							
								 
						
							
								6f9460f0f6 
								
							 
						 
						
							
							
								
								Merge pull request  #2937  from martin-frbg/pwr-buffersz  
							
							... 
							
							
							
							Increase and unify BUFFERSIZE on POWER; fix gcc inline warning 
							
						 
						
							2020-10-23 07:15:32 +02:00  
				
					
						
							
							
								 
						
							
								1917a4e7b8 
								
							 
						 
						
							
							
								
								reuse variables defined in Makefile.system  
							
							
							
						 
						
							2020-10-22 22:04:25 -04:00  
				
					
						
							
							
								 
						
							
								34c3c407ef 
								
							 
						 
						
							
							
								
								label always_inline function as inline to silence a gcc warning  
							
							
							
						 
						
							2020-10-22 22:14:26 +02:00  
				
					
						
							
							
								 
						
							
								2e48d560ba 
								
							 
						 
						
							
							
								
								Fix compiler version check  
							
							
							
						 
						
							2020-10-22 16:23:29 +02:00  
				
					
						
							
							
								 
						
							
								ad745c0bae 
								
							 
						 
						
							
							
								
								Optimize scopy/ccopy for POWER10  
							
							... 
							
							
							
							This patch makes use of new POWER10 vector pair instructions for
loads and stores. Also reorganized all variants of copy functions
to make use of same kernel. 
							
						 
						
							2020-10-21 09:53:45 -05:00  
				
					
						
							
							
								 
						
							
								4a1d00f589 
								
							 
						 
						
							
							
								
								Fix build with -Werror=return-type  
							
							... 
							
							
							
							The CNAME function in dgemm_tcopy_16_skylakex.c should return an int; add a
return 0, as in other files. 
							
						 
						
							2020-10-21 08:43:39 +02:00  
				
					
						
							
							
								 
						
							
								b073d759d0 
								
							 
						 
						
							
							
								
								x86_64: clobber all xmm registers after vzeroupper  
							
							... 
							
							
							
							As observed using GCC 10 using -march=native -ftree-vectorize
on Knights Landing, it is now smart enough to find clobbers inside
non-inlined static functions.
In particular, sgemv counted on a kernel to preserve the whole
%ymm2 register (since it was not in the clobber list), but the top
part was destroyed by vzeroupper. This caused many tests to fail.
This patch makes sure all xmm (and ymm/zmm by extension) registers
are listed as clobbered to avoid this happening, as most kernels
already did correctly in fact. 
							
						 
						
							2020-10-20 02:16:47 +00:00  
				
					
						
							
							
								 
						
							
								dc6e44c3f8 
								
							 
						 
						
							
							
								
								Merge pull request  #2916  from martin-frbg/issue2911  
							
							... 
							
							
							
							Clean up duplicate definitions in POWER8 kernels and fix power10 option passing 
							
						 
						
							2020-10-19 23:33:31 +02:00  
				
					
						
							
							
								 
						
							
								a61c086408 
								
							 
						 
						
							
							
								
								Fix spurious trailing whitespace in comment  
							
							
							
						 
						
							2020-10-19 09:12:12 +02:00  
				
					
						
							
							
								 
						
							
								03e781b766 
								
							 
						 
						
							
							
								
								sgemm_direct_skylakex: fix  75eeb26 regression.  
							
							... 
							
							
							
							The
`#if defined(SKYLAKEX) || defined (COOPERLAKE)`
from that commit was placed before #include "common.h", so it caused the
compiled function to be empty, returning garbage results for
qualifying sgemm calls on those architectures.
Closes  #2914  
							
						 
						
							2020-10-18 19:58:07 +00:00  
				
					
						
							
							
								 
						
							
								f1a4071d8c 
								
							 
						 
						
							
							
								
								Clean up STACKSIZE redefinition  
							
							
							
						 
						
							2020-10-18 19:41:43 +02:00  
				
					
						
							
							
								 
						
							
								97cf10062f 
								
							 
						 
						
							
							
								
								Clean up STACKSIZE redefinition  
							
							
							
						 
						
							2020-10-18 19:39:18 +02:00  
				
					
						
							
							
								 
						
							
								17e288e18d 
								
							 
						 
						
							
							
								
								Clean up STACKSIZE redefinition  
							
							
							
						 
						
							2020-10-18 19:37:04 +02:00  
				
					
						
							
							
								 
						
							
								c1422f3e46 
								
							 
						 
						
							
							
								
								Clean up STACKSIZE redefinition  
							
							
							
						 
						
							2020-10-18 19:31:01 +02:00  
				
					
						
							
							
								 
						
							
								d85b24e103 
								
							 
						 
						
							
							
								
								Clean up STACKSIZE redefinition  
							
							
							
						 
						
							2020-10-18 19:29:45 +02:00  
				
					
						
							
							
								 
						
							
								d7ba7679b6 
								
							 
						 
						
							
							
								
								Merge branch 'develop' into risc-v  
							
							
							
						 
						
							2020-10-16 23:27:38 +08:00  
				
					
						
							
							
								 
						
							
								df70667043 
								
							 
						 
						
							
							
								
								fix core list for sse/sse2  
							
							
							
						 
						
							2020-10-16 09:55:48 +02:00  
				
					
						
							
							
								 
						
							
								f071d1207a 
								
							 
						 
						
							
							
								
								add sse2  
							
							
							
						 
						
							2020-10-15 22:10:32 +02:00  
				
					
						
							
							
								 
						
							
								dc6cefd2f5 
								
							 
						 
						
							
							
								
								Expressly enable -msse for 32bit DYNAMIC_ARCH kernels  
							
							
							
						 
						
							2020-10-15 20:16:15 +02:00