wernsaar
							
						 
						
							 
							
							
							
							
								
							
							
								d2385f0d52 
								
							 
						 
						
							
							
								
								modified param.h  
							
							 
							
							
							
						 
						
							2013-12-01 18:02:54 +01:00  
						
					 
				
					
						
							
							
								 
								wernsaar
							
						 
						
							 
							
							
							
							
								
							
							
								5118a7f4d1 
								
							 
						 
						
							
							
								
								small optimizations on dgemm_kernel for Piledriver  
							
							 
							
							
							
						 
						
							2013-10-31 11:53:26 +01:00  
						
					 
				
					
						
							
							
								 
								wernsaar
							
						 
						
							 
							
							
							
							
								
							
							
								e172b70ea2 
								
							 
						 
						
							
							
								
								added cgemm_kernel for Piledriver  
							
							 
							
							
							
						 
						
							2013-10-31 08:38:17 +01:00  
						
					 
				
					
						
							
							
								 
								wernsaar
							
						 
						
							 
							
							
							
							
								
							
							
								7bccff1512 
								
							 
						 
						
							
							
								
								added sgemm_kernel for PILEDRIVER  
							
							 
							
							
							
						 
						
							2013-10-29 22:53:04 +01:00  
						
					 
				
					
						
							
							
								 
								wernsaar
							
						 
						
							 
							
							
							
							
								
							
							
								2840d56aeb 
								
							 
						 
						
							
							
								
								added dgemm_kernel for Piledriver  
							
							 
							
							
							
						 
						
							2013-10-19 09:47:15 +02:00  
						
					 
				
					
						
							
							
								 
								Zhang Xianyi
							
						 
						
							 
							
							
							
							
								
							
							
								6c4a7d0828 
								
							 
						 
						
							
							
								
								Import AMD Piledriver DGEMM kernel generated by AUGEM.  
							
							 
							
							... 
							
							
							
							So far, this kernel doesn't deal with edge.
AUGEM: Automatically Generate High Performance Dense Linear Algebra
Kernels on x86 CPUs.
Qian Wang, Xianyi Zhang, Yunquan Zhang, and Qing Yi. In the
International Conference for High Performance Computing, Networking,
Storage and Analysis (SC'13). Denver, CO. Nov, 2013. 
							
						 
						
							2013-08-25 10:16:01 -03:00  
						
					 
				
					
						
							
							
								 
								Zhang Xianyi
							
						 
						
							 
							
							
							
							
								
							
							
								886cbaf4e4 
								
							 
						 
						
							
							
								
								Support AMD Piledriver by bulldozer kernels.  
							
							 
							
							
							
						 
						
							2013-07-06 12:06:43 -03:00  
						
					 
				
					
						
							
							
								 
								Zhang Xianyi
							
						 
						
							 
							
							
							
							
								
							
							
								6e8501c8a1 
								
							 
						 
						
							
							
								
								Fixed   #239  bug in param.h about BARCELONA and BULLDOZER.  
							
							 
							
							
							
						 
						
							2013-06-29 10:36:01 +08:00  
						
					 
				
					
						
							
							
								 
								wernsaar
							
						 
						
							 
							
							
							
							
								
							
							
								f67fa62851 
								
							 
						 
						
							
							
								
								added dgemv_n_bulldozer.S  
							
							 
							
							
							
						 
						
							2013-06-15 16:42:37 +02:00  
						
					 
				
					
						
							
							
								 
								wernsaar
							
						 
						
							 
							
							
							
							
								
							
							
								d65bbec99b 
								
							 
						 
						
							
							
								
								added new sgemm kernel for BULLDOZER  
							
							 
							
							
							
						 
						
							2013-06-09 15:57:42 +02:00  
						
					 
				
					
						
							
							
								 
								wernsaar
							
						 
						
							 
							
							
							
							
								
							
							
								ba800f0883 
								
							 
						 
						
							
							
								
								correct GEMM_THREAD in param.h  
							
							 
							
							
							
						 
						
							2013-06-08 10:03:59 +02:00  
						
					 
				
					
						
							
							
								 
								wernsaar
							
						 
						
							 
							
							
							
							
								
							
							
								25491e42f9 
								
							 
						 
						
							
							
								
								New dgemm kernel for BULLDOZER: dgemm_kernel_8x2_bulldozer.S  
							
							 
							
							
							
						 
						
							2013-06-08 09:40:17 +02:00  
						
					 
				
					
						
							
							
								 
								wernsaar
							
						 
						
							 
							
							
							
							
								
							
							
								731220f870 
								
							 
						 
						
							
							
								
								changed DGEMM_DEFAULT_P and DGEMM_DEFAULT_Q to 248 for BULLDOZER 64bit  
							
							 
							
							
							
						 
						
							2013-04-30 10:07:17 +02:00  
						
					 
				
					
						
							
							
								 
								Zhang Xianyi
							
						 
						
							 
							
							
							
							
								
							
							
								b7c0fa6bd2 
								
							 
						 
						
							
							
								
								Init AMD Bulldozer codebase.  
							
							 
							
							
							
						 
						
							2012-12-06 07:29:54 -05:00  
						
					 
				
					
						
							
							
								 
								Sébastien Villemot
							
						 
						
							 
							
							
							
							
								
							
							
								01e3c984ce 
								
							 
						 
						
							
							
								
								Fix compilation with TARGET=GENERIC  
							
							 
							
							... 
							
							
							
							Patch applied to Debian package 
							
						 
						
							2012-11-14 21:04:05 +01:00  
						
					 
				
					
						
							
							
								 
								Sylvestre Ledru
							
						 
						
							 
							
							
							
							
								
							
							
								3692b4d631 
								
							 
						 
						
							
							
								
								Improve the detection of sparc  
							
							 
							
							
							
						 
						
							2012-07-02 02:51:38 +02:00  
						
					 
				
					
						
							
							
								 
								Xianyi Zhang
							
						 
						
							 
							
							
							
							
								
							
							
								b39c51195b 
								
							 
						 
						
							
							
								
								Fixed the build bug about Sandy Bridge on 32-bit.  
							
							 
							
							... 
							
							
							
							We used Nehalem/Penryn codes on Sandy Bridge 32-bit. 
							
						 
						
							2012-06-25 14:29:17 +08:00  
						
					 
				
					
						
							
							
								 
								Xianyi Zhang
							
						 
						
							 
							
							
							
							
								
							
							
								996dc6d1c8 
								
							 
						 
						
							
							
								
								Fixed dynamic_arch building bug.  
							
							 
							
							
							
						 
						
							2012-06-19 17:29:06 +08:00  
						
					 
				
					
						
							
							
								 
								wangqian
							
						 
						
							 
							
							
							
							
								
							
							
								f76f952547 
								
							 
						 
						
							
							
								
								Refs  #83   #53 . Adding Intel Sandy Bridge (AVX supported) kernel codes for BLAS level 3 functions.  
							
							 
							
							
							
						 
						
							2012-06-19 16:37:12 +08:00  
						
					 
				
					
						
							
							
								 
								Zhang Xianyi
							
						 
						
							 
							
							
							
							
								
							
							
								d3b67d0bd8 
								
							 
						 
						
							
							
								
								Refs  #113 . Fixed the typo BOBCATE -> BOBCAT  
							
							 
							
							
							
						 
						
							2012-05-31 22:40:15 +08:00  
						
					 
				
					
						
							
							
								 
								Zhang Xianyi
							
						 
						
							 
							
							
							
							
								
							
							
								d6cab3f37e 
								
							 
						 
						
							
							
								
								Refs  #113 . Support AMD Bobcat using Barcelona kernel codes. Replace 3DNow! with MMX.  
							
							 
							
							
							
						 
						
							2012-05-31 18:17:45 +08:00  
						
					 
				
					
						
							
							
								 
								Xianyi Zhang
							
						 
						
							 
							
							
							
							
								
							
							
								19a48b82cf 
								
							 
						 
						
							
							
								
								Init Sandybridge codes based on Nehalem.  
							
							 
							
							
							
						 
						
							2012-03-30 20:01:03 +08:00  
						
					 
				
					
						
							
							
								 
								traz
							
						 
						
							 
							
							
							
							
								
							
							
								7af0139a09 
								
							 
						 
						
							
							
								
								Modify P Q R size of Loongson3b.  
							
							 
							
							
							
						 
						
							2012-01-11 16:05:39 +00:00  
						
					 
				
					
						
							
							
								 
								Wang Qian
							
						 
						
							 
							
							
							
							
								
							
							
								66904fc4e8 
								
							 
						 
						
							
							
								
								BLAS3 used standard MIPS instructions without extensions on Loongson 3B.  
							
							 
							
							
							
						 
						
							2011-11-25 11:20:25 +00:00  
						
					 
				
					
						
							
							
								 
								Wang Qian
							
						 
						
							 
							
							
							
							
								
							
							
								8163ab7e55 
								
							 
						 
						
							
							
								
								Change the block size on Loongson 3B.  
							
							 
							
							
							
						 
						
							2011-11-23 18:41:49 +00:00  
						
					 
				
					
						
							
							
								 
								Xianyi Zhang
							
						 
						
							 
							
							
							
							
								
							
							
								b95ad4cfaf 
								
							 
						 
						
							
							
								
								Support detecting ICT Loongson-3B CPU.  
							
							 
							
							
							
						 
						
							2011-11-09 19:29:50 +00:00  
						
					 
				
					
						
							
							
								 
								traz
							
						 
						
							 
							
							
							
							
								
							
							
								831858b883 
								
							 
						 
						
							
							
								
								Modify aligned address of sa and sb to improve the performance of multi-threads.  
							
							 
							
							
							
						 
						
							2011-09-23 20:59:48 +00:00  
						
					 
				
					
						
							
							
								 
								traz
							
						 
						
							 
							
							
							
							
								
							
							
								d238a768ab 
								
							 
						 
						
							
							
								
								Use ps instructions in cgemm.  
							
							 
							
							
							
						 
						
							2011-09-14 15:32:25 +00:00  
						
					 
				
					
						
							
							
								 
								Xianyi Zhang
							
						 
						
							 
							
							
							
							
								
							
							
								4727fe8abf 
								
							 
						 
						
							
							
								
								Refs  #47 . On Loongson 3A, set DGEMM_R parameter depending on different number of threads. It would improve double precision BLAS3 on multi-threads.  
							
							 
							
							
							
						 
						
							2011-09-05 15:13:52 +00:00  
						
					 
				
					
						
							
							
								 
								traz
							
						 
						
							 
							
							
							
							
								
							
							
								74a3f63489 
								
							 
						 
						
							
							
								
								Tuning mb, kb, nb size to get the best performance.  
							
							 
							
							
							
						 
						
							2011-09-01 17:15:28 +00:00  
						
					 
				
					
						
							
							
								 
								traz
							
						 
						
							 
							
							
							
							
								
							
							
								cb0214787b 
								
							 
						 
						
							
							
								
								Modify compile options.  
							
							 
							
							
							
						 
						
							2011-08-30 20:57:00 +00:00  
						
					 
				
					
						
							
							
								 
								traz
							
						 
						
							 
							
							
							
							
								
							
							
								c8360e3ae5 
								
							 
						 
						
							
							
								
								Complete all the complex single-precision Level 3 functions on Loongson3a; the performance is 2.3 GFlops.  
							
							 
							
							
							
						 
						
							2011-07-18 17:03:38 +00:00  
						
					 
				
					
						
							
							
								 
								traz
							
						 
						
							 
							
							
							
							
								
							
							
								e72113f06a 
								
							 
						 
						
							
							
								
								Add ztrmm and ztrsm part on loongson3a. The average performance is 2.2G.  
							
							 
							
							
							
						 
						
							2011-06-23 21:11:00 +00:00  
						
					 
				
					
						
							
							
								 
								traz
							
						 
						
							 
							
							
							
							
								
							
							
								1c96d345e2 
								
							 
						 
						
							
							
								
								Improve zgemm performance from 1G to 1.8G, change block size in param.h.  
							
							 
							
							
							
						 
						
							2011-06-21 22:16:23 +00:00  
						
					 
				
					
						
							
							
								 
								traz
							
						 
						
							 
							
							
							
							
								
							
							
								88d94d0ec8 
								
							 
						 
						
							
							
								
								Fixed   #30  strmm computational error on Loongson3A.  
							
							 
							
							
							
						 
						
							2011-05-28 09:48:34 +00:00  
						
					 
				
					
						
							
							
								 
								traz
							
						 
						
							 
							
							
							
							
								
							
							
								ab9e4ce351 
								
							 
						 
						
							
							
								
								Adjust kc size from 112 to 116 .  
							
							 
							
							
							
						 
						
							2011-04-11 22:17:57 +00:00  
						
					 
				
					
						
							
							
								 
								traz
							
						 
						
							 
							
							
							
							
								
							
							
								1aa9a298e1 
								
							 
						 
						
							
							
								
								Change BLOCK SIZE of LOONGSON3A TARGET.  
							
							 
							
							
							
						 
						
							2011-04-06 10:39:31 +00:00  
						
					 
				
					
						
							
							
								 
								Xianyi Zhang
							
						 
						
							 
							
							
							
							
								
							
							
								0597c1076f 
								
							 
						 
						
							
							
								
								Added the configuration for Loongson 3A. Refs  #1  
							
							 
							
							
							
						 
						
							2011-01-24 22:45:35 +00:00  
						
					 
				
					
						
							
							
								 
								Xianyi Zhang
							
						 
						
							 
							
							
							
							
								
							
							
								342bbc3871 
								
							 
						 
						
							
							
								
								Import GotoBLAS2 1.13 BSD version codes.  
							
							 
							
							
							
						 
						
							2011-01-24 14:54:24 +00:00