1d33547222 
								
							 
						 
						
							
							
								
								optimized zgemm kernel for haswell  
							
							
							
						 
						
							2014-07-27 11:51:42 +02:00  
				
					
						
							
							
								 
						
							
								6c2ead30f0 
								
							 
						 
						
							
							
								
								Remove all trailing whitespace except lapack-netlib  
							
							... 
							
							
							
							Signed-off-by: Timothy Gu <timothygu99@gmail.com> 
							
						 
						
							2014-06-27 12:05:18 -07:00  
				
					
						
							
							
								 
						
							
								c947ab85dc 
								
							 
						 
						
							
							
								
								changed level3.c  
							
							
							
						 
						
							2013-12-01 13:46:30 +01:00  
				
					
						
							
							
								 
						
							
								2840d56aeb 
								
							 
						 
						
							
							
								
								added dgemm_kernel for Piledriver  
							
							
							
						 
						
							2013-10-19 09:47:15 +02:00  
				
					
						
							
							
								 
						
							
								32d2ca3035 
								
							 
						 
						
							
							
								
								Refs  #214 ,  #221 ,  #246 . Fixed the getrf overflow bug on Windows.  
							
							... 
							
							
							
							I used a smaller threshold since the stack size is 1MB on Windows. 
							
						 
						
							2013-07-11 03:20:02 +08:00  
				
					
						
							
							
								 
						
							
								6f008abcef 
								
							 
						 
						
							
							
								
								replaced defined(DOUBLE) by !defined(XDOUBLE)  
							
							
							
						 
						
							2013-07-09 18:17:50 +02:00  
				
					
						
							
							
								 
						
							
								5d3312142a 
								
							 
						 
						
							
							
								
								Refs  #221   #246 . Fixed the stack overflow bug in multithreading BLAS3.  
							
							... 
							
							
							
							When NUM_THREADS (MAX_CPU_NUMBER) is very large, e.g. 256, consider:
typedef struct {
  volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;
job_t          job[MAX_CPU_NUMBER];
The job array occupies 8MB.
Thus, we use malloc instead of stack allocation. 
							
						 
						
							2013-07-08 01:07:05 +08:00  
				
					
						
							
							
								 
						
							
								25491e42f9 
								
							 
						 
						
							
							
								
								New dgemm kernel for BULLDOZER: dgemm_kernel_8x2_bulldozer.S  
							
							
							
						 
						
							2013-06-08 09:40:17 +02:00  
				
					
						
							
							
								 
						
							
								342bbc3871 
								
							 
						 
						
							
							
								
								Import GotoBLAS2 1.13 BSD version codes.  
							
							
							
						 
						
							2011-01-24 14:54:24 +00:00