af0a69f355 
								
							 
						 
						
							
							
								
								Add support for LOONGARCH64  
							
							
							
						 
						
							2021-07-27 15:29:12 +08:00  
				
					
						
							
							
								 
						
							
								a6351e32f0 
								
							 
						 
						
							
							
								
								Remove BLASLONG casts from SPARC entries  
							
							... 
							
							
							
							in response to https://github.com/xianyi/OpenBLAS/pull/3266#issuecomment-878637675  
							
						 
						
							2021-07-14 21:09:36 +02:00  
				
					
						
							
							
								 
						
							
								b7da75e4fd 
								
							 
						 
						
							
							
								
								WiP CORTEX A55 support  
							
							
							
						 
						
							2021-06-19 21:37:51 +02:00  
				
					
						
							
							
								 
						
							
								7dfc45e840 
								
							 
						 
						
							
							
								
								Remove casts for PPC/POWER and complete parameters for POWER3/4  
							
							
							
						 
						
							2021-06-10 11:09:50 +02:00  
				
					
						
							
							
								 
						
							
								198adea961 
								
							 
						 
						
							
							
								
								Changed default P/Q values for CGEMM and ZGEMM (Power10 only)  
							
							
							
						 
						
							2021-03-19 10:05:23 -04:00  
				
					
						
							
							
								 
						
							
								8cdf0825de 
								
							 
						 
						
							
							
								
								Add workaround for older gcc on ppc64be not supporting casts in defines  
							
							
							
						 
						
							2021-03-16 21:20:05 +01:00  
				
					
						
							
							
								 
						
							
								ecb4babcf4 
								
							 
						 
						
							
							
								
								remove inclusion of common.h again to avoid circular dependency  
							
							
							
						 
						
							2021-03-14 17:36:51 +01:00  
				
					
						
							
							
								 
						
							
								30d835168a 
								
							 
						 
						
							
							
								
								Merge pull request  #3088  from xoviat/msvc  
							
							... 
							
							
							
							add misc fixes. 
							
						 
						
							2021-03-14 17:14:28 +01:00  
				
					
						
							
							
								 
						
							
								9579bd47e5 
								
							 
						 
						
							
							
								
								Modifying a couple parameters in the "POWER10"-specific section of param.h, for performance enhancements for SGEMM and DGEMM.  
							
							
							
						 
						
							2021-03-10 18:19:12 -05:00  
				
					
						
							
							
								 
						
							
								63fa6c832e 
								
							 
						 
						
							
							
								
								Fix build issue on POWER8 with DYNAMIC_ARCH  
							
							... 
							
							
							
							Running make DYNAMIC_ARCH=1 on POWER 8 BE with gcc10.2 version, gives
the following error due to the difference in UNROLL_M/N.
'No rule to make target 'dgemm_incopy_POWER10.o', needed by kernel' 
							
						 
						
							2021-02-11 21:28:03 -06:00  
				
					
						
							
							
								 
						
							
								457ccc42c9 
								
							 
						 
						
							
							
								
								Merge branch 'develop' into msvc  
							
							
							
						 
						
							2021-01-27 14:15:59 -06:00  
				
					
						
							
							
								 
						
							
								ed652d8136 
								
							 
						 
						
							
							
								
								Added definitions for GEMM_PREFERED_SIZE and SWITCH_RATIO to the POWER9 and POWER10 specific sections of param.h.  
							
							
							
						 
						
							2021-01-11 21:13:53 -05:00  
				
					
						
							
							
								 
						
							
								83de62c20d 
								
							 
						 
						
							
							
								
								Merge pull request  #3026  from martin-frbg/revert747  
							
							... 
							
							
							
							Revert PR747 - SYRK parameter changes for Haswell and related targets 
							
						 
						
							2020-12-10 16:29:41 +01:00  
				
					
						
							
							
								 
						
							
								4b548857d6 
								
							 
						 
						
							
							
								
								Add msa support for loongson  
							
							... 
							
							
							
							1. Using core loongson3r3 and loongson3r4 for loongson
2. Add DYNAMIC_ARCH for loongson
Change-Id: I1c6b54dbeca3a0cc31d1222af36a7e9bd6ab54c1 
							
						 
						
							2020-12-09 10:28:46 +08:00  
				
					
						
							
							
								 
						
							
								d71fe4ed4e 
								
							 
						 
						
							
							
								
								Remove GEMM_DEFAULT_UNROLL_MN parameters for Haswell and ZEN (introduced in PR747)  
							
							
							
						 
						
							2020-12-08 21:07:57 +01:00  
				
					
						
							
							
								 
						
							
								b0b14f4e9b 
								
							 
						 
						
							
							
								
								Change comments to C style for compatibility  
							
							
							
						 
						
							2020-12-06 19:12:02 +01:00  
				
					
						
							
							
								 
						
							
								41fe6e864e 
								
							 
						 
						
							
							
								
								POWER10: Update param.h  
							
							... 
							
							
							
							Increasing the values of DGEMM_DEFAULT_P and DGEMM_DEFAULT_Q helps
in improving performance ~10% for DGEMM. 
							
						 
						
							2020-12-03 14:40:11 -06:00  
				
					
						
							
							
								 
						
							
								fc35b72ae1 
								
							 
						 
						
							
							
								
								Refs  #2899  
							
							... 
							
							
							
							Merge branch 'openblas-open-910' of git://github.com/damonyu1989/OpenBLAS into damonyu1989-openblas-open-910 
							
						 
						
							2020-11-10 09:38:04 +08:00  
				
					
						
							
							
								 
						
							
								913cc9a4ca 
								
							 
						 
						
							
							
								
								Merge branch 'develop' into risc-v  
							
							
							
						 
						
							2020-11-10 09:18:25 +08:00  
				
					
						
							
							
								 
						
							
								dd7a9cc5bf 
								
							 
						 
						
							
							
								
								POWER10:  Change dgemm unroll factors  
							
							... 
							
							
							
							Changing the unroll factors for dgemm to 8 shows improved performance with
POWER10 MMA feature.   Also made some minor changes in sgemm for edge cases. 
							
						 
						
							2020-10-31 18:28:57 -05:00  
				
					
						
							
							
								 
						
							
								d7ba7679b6 
								
							 
						 
						
							
							
								
								Merge branch 'develop' into risc-v  
							
							
							
						 
						
							2020-10-16 23:27:38 +08:00  
				
					
						
							
							
								 
						
							
								ef8e7d0279 
								
							 
						 
						
							
							
								
								Add the support for RISC-V Vector.  
							
							... 
							
							
							
							Change-Id: Iae7800a32f5af3903c330882cdf6f292d885f266 
							
						 
						
							2020-10-15 16:09:02 +08:00  
				
					
						
							
							
								 
						
							
								ca31c32693 
								
							 
						 
						
							
							
								
								Rename "HALF" and "sh" to "BFLOAT16" and "sb"  
							
							
							
						 
						
							2020-10-11 23:49:22 +02:00  
				
					
						
							
							
								 
						
							
								e740c4873d 
								
							 
						 
						
							
							
								
								Enable COOPERLAKE build target  
							
							... 
							
							
							
							Enable new build target platform -- COOPERLAKE. This target platform
supports all the SKYLAKEX supported ISAs + avx512bf16. So all the
SKYLAKEX specific kernels/drivers and related code are now extended
to be also active on COOPERLAKE. Besides, new BF16 related kernels
are active under this target. 
							
						 
						
							2020-08-13 06:18:00 +08:00  
				
					
						
							
							
								 
						
							
								e115c97e05 
								
							 
						 
						
							
							
								
								s390x/SGEMM: adjust default P and Q to multiples of M  
							
							... 
							
							
							
							We recently changed the register blocking for SGEMM on s390x to 16x4.
However, we did not adjust Q to a multiple of 16 and thus fell back to
the 8x4 kernel at each block's margin, without need. Adjust P and Q to
multiples of 16 to employ the faster 16x4 kernel for complete full-sized
blocks.
Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com> 
							
						 
						
							2020-08-11 12:56:46 +02:00  
				
					
						
							
							
								 
						
							
								4e1be0e481 
								
							 
						 
						
							
							
								
								ARM64: Add THUNDERX3T110 Target  
							
							
							
						 
						
							2020-07-26 23:32:24 -07:00  
				
					
						
							
							
								 
						
							
								bd2498c886 
								
							 
						 
						
							
							
								
								Use POWER6 GEMM parameters on 32bit POWER8  
							
							
							
						 
						
							2020-07-14 18:07:58 +02:00  
				
					
						
							
							
								 
						
							
								d23419accc 
								
							 
						 
						
							
							
								
								powerpc: Optimized SHGEMM kernel for POWER10  
							
							... 
							
							
							
							This patch introduces new optimized version of SHGEMM kernel
using power10 Matrix-Multiply Assist (MMA) feature introduced in
POWER ISA v3.1. This patch makes use of new POWER10 compute instructions
for matrix multiplication operation.
Tested on simulator and there are no new test failures. 
							
						 
						
							2020-06-25 22:19:08 -05:00  
				
					
						
							
							
								 
						
							
								9fe930f205 
								
							 
						 
						
							
							
								
								powerpc: Add support for future processor  
							
							... 
							
							
							
							This is the initial patch to support build infrastructure
for POWER10 architecture. 
							
						 
						
							2020-06-11 15:47:20 -05:00  
				
					
						
							
							
								 
						
							
								f16e39554d 
								
							 
						 
						
							
							
								
								Change PPCG4 CGEMM_M to match kernel change  
							
							
							
						 
						
							2020-06-03 09:15:29 +02:00  
				
					
						
							
							
								 
						
							
								ea5bdc3f72 
								
							 
						 
						
							
							
								
								split cortex-a53 param to match 8x8 kernel  
							
							
							
						 
						
							2020-05-20 22:34:47 +08:00  
				
					
						
							
							
								 
						
							
								1b0b4349a1 
								
							 
						 
						
							
							
								
								s390x/Z14: Change register blocking for SGEMM to 16x4  
							
							... 
							
							
							
							Change register blocking for SGEMM (and STRMM) on z14 from 8x4 to 16x4
by adjusting SGEMM_DEFAULT_UNROLL_M and choosing the appropriate copy
implementations. Actually make KERNEL.Z14 more flexible, so that the
change in param.h suffices. As a result, performance for SGEMM improves
by around 30% on z15.
On z14, FP SIMD instructions can operate on float-sized scalars in
vector registers, while z13 could do that for double-sized scalars only.
Thus, we can double the amount of elements of C that are held in
registers in an SGEMM kernel.
Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com> 
							
						 
						
							2020-05-12 15:59:51 +02:00  
				
					
						
							
							
								 
						
							
								03ff213c51 
								
							 
						 
						
							
							
								
								Increase POWER8 ZGEMM_R and use same R values for POWER9  
							
							... 
							
							
							
							fixes lapack-test zger failures seen in #2299  after application of my PR #2551  
							
						 
						
							2020-04-24 21:46:54 +02:00  
				
					
						
							
							
								 
						
							
								00172d440b 
								
							 
						 
						
							
							
								
								Typo fix in MIPS24K addition  
							
							
							
						 
						
							2020-04-18 21:16:49 +02:00  
				
					
						
							
							
								 
						
							
								61bbae3ac1 
								
							 
						 
						
							
							
								
								Handle  MIPS24K like P5600  
							
							... 
							
							
							
							and allow enforcing TARGET=1004K as well (omission from earlier 1004K merge and later introduction of TARGET check) 
							
						 
						
							2020-04-18 21:09:32 +02:00  
				
					
						
							
							
								 
						
							
								a33d177430 
								
							 
						 
						
							
							
								
								Increase default BUFFER_SIZE on ARM, ZARCH and newer x86_64, add GEMM_R for POWER8/9  
							
							... 
							
							
							
							As shown in #2538 , default buffersizes on some platforms were smaller than required in memory.c
and the requirement could never be fulfilled for a calculated GEMM_R on PPC given the formula used 
							
						 
						
							2020-04-12 19:44:48 +02:00  
				
					
						
							
							
								 
						
							
								567d2760e6 
								
							 
						 
						
							
							
								
								Merge pull request  #2520  from wjc404/develop  
							
							... 
							
							
							
							Fix avx512 sgemm performance bug when ldc is a multiple of 1024 
							
						 
						
							2020-03-30 20:15:59 +02:00  
				
					
						
							
							
								 
						
							
								64daad4365 
								
							 
						 
						
							
							
								
								Update param.h  
							
							
							
						 
						
							2020-03-20 21:46:18 +00:00  
				
					
						
							
							
								 
						
							
								ea8eec5d17 
								
							 
						 
						
							
							
								
								Merge pull request  #2422  from wjc404/develop  
							
							... 
							
							
							
							Adjust SkylakeX GEMM3M parameters, add an AVX512 STRMM kernel and fix performance bugs in AVX2 s/c/z GEMM 
							
						 
						
							2020-02-29 19:07:35 +01:00  
				
					
						
							
							
								 
						
							
								c623a965f9 
								
							 
						 
						
							
							
								
								Add Neoverse-N1 core  
							
							... 
							
							
							
							The implementation is a hybrid of the ARMV8 one with some of the
improved TX2 routines along with specifying -march=v8.2-a 
							
						 
						
							2020-02-29 03:22:04 +00:00  
				
					
						
							
							
								 
						
							
								265ab484c8 
								
							 
						 
						
							
							
								
								Change default RISC-V 64-bit corename to RISCV64_GENERIC  
							
							... 
							
							
							
							e.g. make CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran TARGET=RISCV64_GENERIC HOSTCC=gcc 
							
						 
						
							2020-02-27 14:46:15 +08:00  
				
					
						
							
							
								 
						
							
								4aa2d89217 
								
							 
						 
						
							
							
								
								Merge branch 'develop' into risc-v  
							
							
							
						 
						
							2020-02-27 13:53:49 +08:00  
				
					
						
							
							
								 
						
							
								8164fd1328 
								
							 
						 
						
							
							
								
								Always assume server-class cpu count for TSV110 and EMAG8180  
							
							
							
						 
						
							2020-02-26 22:19:57 +01:00  
				
					
						
							
							
								 
						
							
								71e5669c3e 
								
							 
						 
						
							
							
								
								Add preliminary support for EMAG8180 ARMV8 processor  
							
							
							
						 
						
							2020-02-19 18:57:26 +01:00  
				
					
						
							
							
								 
						
							
								b0558c11b9 
								
							 
						 
						
							
							
								
								Update param.h  
							
							
							
						 
						
							2020-02-16 23:01:31 +08:00  
				
					
						
							
							
								 
						
							
								83b6be7976 
								
							 
						 
						
							
							
								
								Update param.h  
							
							
							
						 
						
							2020-02-04 19:55:26 +08:00  
				
					
						
							
							
								 
						
							
								f3f969f681 
								
							 
						 
						
							
							
								
								Update param.h  
							
							
							
						 
						
							2020-02-03 21:34:12 +08:00  
				
					
						
							
							
								 
						
							
								fbf4f48f4a 
								
							 
						 
						
							
							
								
								fix a few performance drops in some matrix sizes per data type  
							
							... 
							
							
							
							Signed-off-by: Wang,Long <long1.wang@intel.com> 
							
						 
						
							2020-01-22 15:15:04 +00:00  
				
					
						
							
							
								 
						
							
								1c67567008 
								
							 
						 
						
							
							
								
								improve skylakex paralleled sgemm performance  
							
							
							
						 
						
							2020-01-13 16:26:03 +08:00  
				
					
						
							
							
								 
						
							
								b7b408a120 
								
							 
						 
						
							
							
								
								optimize AVX2 SGEMM  
							
							
							
						 
						
							2020-01-06 12:16:09 +08:00