1569bf14f8 
								
							 
						 
						
							
							
								
								Refs  #282 . Fixed zgemv_n typo bug on Win64.  
							
							
							
						 
						
							2013-08-23 16:27:17 +08:00  
				
					
						
							
							
								 
						
							
								f51a849d91 
								
							 
						 
						
							
							
								
								Merge pull request  #278  from wernsaar/haswell  
							
							... 
							
							
							
							Merge wernsaar's Haswell gemm kernels. 
							
						 
						
							2013-08-17 08:24:37 -07:00  
				
					
						
							
							
								 
						
							
								44ef70420c 
								
							 
						 
						
							
							
								
								added cgemm_kernel_8x2_haswell.S  
							
							
							
						 
						
							2013-08-16 18:54:56 +02:00  
				
					
						
							
							
								 
						
							
								d488b1b1aa 
								
							 
						 
						
							
							
								
								added zgemm_kernel_4x2_haswell.S  
							
							
							
						 
						
							2013-08-16 10:29:47 +02:00  
				
					
						
							
							
								 
						
							
								4070d9a123 
								
							 
						 
						
							
							
								
								added dgemm_kernel_16x2_haswell.S  
							
							
							
						 
						
							2013-08-15 19:17:20 +02:00  
				
					
						
							
							
								 
						
							
								0b90c0ec64 
								
							 
						 
						
							
							
								
								added sgemm_kernel_16x4_haswell.S  
							
							
							
						 
						
							2013-08-15 18:46:14 +02:00  
				
					
						
							
							
								 
						
							
								2b8ab8f55b 
								
							 
						 
						
							
							
								
								sgemm_kernel_16x4_haswell.S minor changes  
							
							
							
						 
						
							2013-08-14 01:44:41 +02:00  
				
					
						
							
							
								 
						
							
								1cb9579cd0 
								
							 
						 
						
							
							
								
								added zgemm_kernel_4x2_haswell.S and fixed a bug in sgemm_kernel_16x4_haswell.S  
							
							
							
						 
						
							2013-08-14 01:23:15 +02:00  
				
					
						
							
							
								 
						
							
								2638370844 
								
							 
						 
						
							
							
								
								Init code base for Intel Haswell.  
							
							
							
						 
						
							2013-08-13 00:54:59 +08:00  
				
					
						
							
							
								 
						
							
								89637f87c8 
								
							 
						 
						
							
							
								
								added sgemm- and dgemm-kernel for HASWELL processor  
							
							
							
						 
						
							2013-08-12 18:04:10 +02:00  
				
					
						
							
							
								 
						
							
								c0159d44a3 
								
							 
						 
						
							
							
								
								Merge branch 'develop' of  https://github.com/wernsaar/OpenBLAS  into wernsaar-develop  
							
							
							
						 
						
							2013-08-09 10:48:46 +08:00  
				
					
						
							
							
								 
						
							
								c17a850c1c 
								
							 
						 
						
							
							
								
								modified KERNEL.BULLDOZER  
							
							
							
						 
						
							2013-08-08 17:49:30 +02:00  
				
					
						
							
							
								 
						
							
								099853fff6 
								
							 
						 
						
							
							
								
								added dtrsm_kernel_RN_8x2_bulldozer.S  
							
							
							
						 
						
							2013-08-08 07:14:08 +02:00  
				
					
						
							
							
								 
						
							
								44d23881b5 
								
							 
						 
						
							
							
								
								dtrsm_kernel_LT_8x2_bulldozer.S performance optimization  
							
							
							
						 
						
							2013-08-05 11:27:16 +02:00  
				
					
						
							
							
								 
						
							
								32fb6b9bb2 
								
							 
						 
						
							
							
								
								Merge branch 'develop' of  https://github.com/wernsaar/OpenBLAS  into wernsaar-develop  
							
							
							
						 
						
							2013-08-05 16:09:47 +08:00  
				
					
						
							
							
								 
						
							
								aaeb8eaecd 
								
							 
						 
						
							
							
								
								modified dtrsm_kernel_LT_8x2_bulldozer.S  
							
							
							
						 
						
							2013-08-04 12:16:12 +02:00  
				
					
						
							
							
								 
						
							
								8aeec32ea0 
								
							 
						 
						
							
							
								
								modified dtrsm_kernel_LT_8x2_bulldozer.S  
							
							
							
						 
						
							2013-08-04 10:15:33 +02:00  
				
					
						
							
							
								 
						
							
								87fc9de572 
								
							 
						 
						
							
							
								
								added dtrsm_kernel_LT_8x2_bulldozer.S  
							
							
							
						 
						
							2013-08-04 09:54:40 +02:00  
				
					
						
							
							
								 
						
							
								564aa60fec 
								
							 
						 
						
							
							
								
								removed dtrsm_kernel_LT_8x2_bulldozer.S  
							
							
							
						 
						
							2013-08-03 15:40:51 +02:00  
				
					
						
							
							
								 
						
							
								f645665dd6 
								
							 
						 
						
							
							
								
								fixed bug in dgemv_t_bulldozer.S  
							
							
							
						 
						
							2013-08-03 12:19:29 +02:00  
				
					
						
							
							
								 
						
							
								e45a347cd2 
								
							 
						 
						
							
							
								
								repaired trmm bug in sgemm_kernel_16x2_bulldozer.S  
							
							
							
						 
						
							2013-08-03 11:43:25 +02:00  
				
					
						
							
							
								 
						
							
								99727ac013 
								
							 
						 
						
							
							
								
								repaired trmm bug in cgemm_kernel_4x2_bulldozer.S  
							
							
							
						 
						
							2013-08-03 10:32:51 +02:00  
				
					
						
							
							
								 
						
							
								6e0a2fbc0c 
								
							 
						 
						
							
							
								
								repaired trmm bug in zgemm_kernel_2x2_bulldozer.S  
							
							
							
						 
						
							2013-08-03 10:17:08 +02:00  
				
					
						
							
							
								 
						
							
								0a22f99c58 
								
							 
						 
						
							
							
								
								repaired trmm bug in dgemm_kernel_8x2_bulldozer.S  
							
							
							
						 
						
							2013-08-03 09:35:39 +02:00  
				
					
						
							
							
								 
						
							
								cff70a666d 
								
							 
						 
						
							
							
								
								added generic trmm kernels and modified Makefile.L3  
							
							
							
						 
						
							2013-07-30 20:18:57 +02:00  
				
					
						
							
							
								 
						
							
								84bd0aabaa 
								
							 
						 
						
							
							
								
								added dtrsm_kernel_LT_8x2_bulldozer.S  
							
							
							
						 
						
							2013-07-28 16:47:58 +02:00  
				
					
						
							
							
								 
						
							
								72b1edaf1b 
								
							 
						 
						
							
							
								
								Merge branch 'develop' into bulldozer  
							
							... 
							
							
							
							Conflicts:
	kernel/x86_64/KERNEL.BULLDOZER 
							
						 
						
							2013-07-28 06:38:25 +02:00  
				
					
						
							
							
								 
						
							
								1b3b9e841d 
								
							 
						 
						
							
							
								
								Fixed a computational error in zgemm_kernel_4x4_sandy.S file.  
							
							
							
						 
						
							2013-07-18 20:23:21 +08:00  
				
					
						
							
							
								 
						
							
								2ed0f6ab60 
								
							 
						 
						
							
							
								
								Fixed the typo.  
							
							
							
						 
						
							2013-07-11 23:47:07 +08:00  
				
					
						
							
							
								 
						
							
								886cbaf4e4 
								
							 
						 
						
							
							
								
								Support AMD Piledriver by bulldozer kernels.  
							
							
							
						 
						
							2013-07-06 12:06:43 -03:00  
				
					
						
							
							
								 
						
							
								57944538b6 
								
							 
						 
						
							
							
								
								Use ALIGN_5 instead of .align 32 in assembly kernel. Added ALIGN_5 for 32-bit OSX.  
							
							
							
						 
						
							2013-07-01 16:09:05 +08:00  
				
					
						
							
							
								 
						
							
								fa916a0fac 
								
							 
						 
						
							
							
								
								Fixed   #238  bug in lsame on x86.  
							
							
							
						 
						
							2013-06-28 22:43:41 +08:00  
				
					
						
							
							
								 
						
							
								fb298b34ae 
								
							 
						 
						
							
							
								
								Merge pull request  #235  from wernsaar/develop  
							
							... 
							
							
							
							Added ddot, daxpy, dcopy kernels for AMD bulldozer. 
							
						 
						
							2013-06-21 17:59:26 -07:00  
				
					
						
							
							
								 
						
							
								16012767f4 
								
							 
						 
						
							
							
								
								added dcopy_bulldozer.S  
							
							
							
						 
						
							2013-06-21 16:06:51 +02:00  
				
					
						
							
							
								 
						
							
								bcbac31b47 
								
							 
						 
						
							
							
								
								added ddot_bulldozer.S  
							
							
							
						 
						
							2013-06-20 16:15:09 +02:00  
				
					
						
							
							
								 
						
							
								8dc0c72583 
								
							 
						 
						
							
							
								
								added daxpy_bulldozer.S  
							
							
							
						 
						
							2013-06-20 14:07:54 +02:00  
				
					
						
							
							
								 
						
							
								89405a1a0b 
								
							 
						 
						
							
							
								
								cleanup of dgemm_ncopy_8_bulldozer.S  
							
							
							
						 
						
							2013-06-19 19:31:38 +02:00  
				
					
						
							
							
								 
						
							
								4f2b12b8a8 
								
							 
						 
						
							
							
								
								added dgemv_t_bulldozer.S  
							
							
							
						 
						
							2013-06-19 17:32:42 +02:00  
				
					
						
							
							
								 
						
							
								646e168d26 
								
							 
						 
						
							
							
								
								Merge pull request  #233  from wernsaar/develop  
							
							... 
							
							
							
							added dgemv_n and some faster gemm_copy routines to BULLDOZER. 
							
						 
						
							2013-06-18 20:02:36 -07:00  
				
					
						
							
							
								 
						
							
								93dbbe1fb8 
								
							 
						 
						
							
							
								
								added dgemm_ncopy_8_bulldozer.S  
							
							
							
						 
						
							2013-06-18 13:29:23 +02:00  
				
					
						
							
							
								 
						
							
								a135f5d9ed 
								
							 
						 
						
							
							
								
								added gemm_tcopy_2_bulldozer.S  
							
							
							
						 
						
							2013-06-18 11:01:33 +02:00  
				
					
						
							
							
								 
						
							
								d0b6299b13 
								
							 
						 
						
							
							
								
								added dgemm_tcopy_8_bulldozer.S  
							
							
							
						 
						
							2013-06-17 14:19:09 +02:00  
				
					
						
							
							
								 
						
							
								9e58dd509e 
								
							 
						 
						
							
							
								
								added gemm_ncopy_2_bulldozer.S  
							
							
							
						 
						
							2013-06-17 12:55:12 +02:00  
				
					
						
							
							
								 
						
							
								7c8227101b 
								
							 
						 
						
							
							
								
								cleanup of dgemv_n_bulldozer.S and optimization of inner loop  
							
							
							
						 
						
							2013-06-16 12:50:45 +02:00  
				
					
						
							
							
								 
						
							
								f67fa62851 
								
							 
						 
						
							
							
								
								added dgemv_n_bulldozer.S  
							
							
							
						 
						
							2013-06-15 16:42:37 +02:00  
				
					
						
							
							
								 
						
							
								cd1d473ba0 
								
							 
						 
						
							
							
								
								Merge pull request  #230  from wernsaar/develop  
							
							... 
							
							
							
							Refs #230 . New dgemm and sgemm Kernel for BULLDOZER 
							
						 
						
							2013-06-13 07:29:27 -07:00  
				
					
						
							
							
								 
						
							
								0ded1fcc1c 
								
							 
						 
						
							
							
								
								performance optimizations in sgemm_kernel_16x2_bulldozer.S  
							
							
							
						 
						
							2013-06-13 11:35:15 +02:00  
				
					
						
							
							
								 
						
							
								a789b588cd 
								
							 
						 
						
							
							
								
								added cgemm_kernel_4x2_bulldozer.S  
							
							
							
						 
						
							2013-06-12 15:55:27 +02:00  
				
					
						
							
							
								 
						
							
								8eaa04acbb 
								
							 
						 
						
							
							
								
								added zgemm_kernel_2x2_bulldozer.S  
							
							
							
						 
						
							2013-06-11 12:00:49 +02:00  
				
					
						
							
							
								 
						
							
								d854b30ae6 
								
							 
						 
						
							
							
								
								Added UNROLL values for 3M to getarch_2nd.c, Makefile.system and Makefile.L3  
							
							
							
						 
						
							2013-06-09 17:26:42 +02:00  
				
					
						
							
							
								 
						
							
								d65bbec99b 
								
							 
						 
						
							
							
								
								added new sgemm kernel for BULLDOZER  
							
							
							
						 
						
							2013-06-09 15:57:42 +02:00  
				
					
						
							
							
								 
						
							
								e4c39c7c26 
								
							 
						 
						
							
							
								
								changed stack touching  
							
							
							
						 
						
							2013-06-08 10:43:08 +02:00  
				
					
						
							
							
								 
						
							
								25491e42f9 
								
							 
						 
						
							
							
								
								New dgemm kernel for BULLDOZER: dgemm_kernel_8x2_bulldozer.S  
							
							
							
						 
						
							2013-06-08 09:40:17 +02:00  
				
					
						
							
							
								 
						
							
								9f59f384d8 
								
							 
						 
						
							
							
								
								Refs  #223 . Fixed s/dgemv bug on windows.  
							
							
							
						 
						
							2013-06-04 16:01:05 +08:00  
				
					
						
							
							
								 
						
							
								23965f164c 
								
							 
						 
						
							
							
								
								Fixed overflow internal buffer bug of (s/d/c/z)gemv on x86_64.  
							
							
							
						 
						
							2013-05-29 19:48:31 +08:00  
				
					
						
							
							
								 
						
							
								6a72840945 
								
							 
						 
						
							
							
								
								Fixed overflow internal buffer bug of (s/d/c/z)gemv on x86.  
							
							
							
						 
						
							2013-05-29 13:23:12 +08:00  
				
					
						
							
							
								 
						
							
								69aa6c8fb1 
								
							 
						 
						
							
							
								
								bad performance with some data  
							
							
							
						 
						
							2013-04-28 11:14:23 +02:00  
				
					
						
							
							
								 
						
							
								60b263f3d2 
								
							 
						 
						
							
							
								
								removed trsm_kernel_RT_4x4_bulldozer.S. wrong results  
							
							
							
						 
						
							2013-04-27 17:23:08 +02:00  
				
					
						
							
							
								 
						
							
								7ac306e0da 
								
							 
						 
						
							
							
								
								added trsm_kernel_RT_4x4_bulldozer.S  
							
							
							
						 
						
							2013-04-27 16:48:48 +02:00  
				
					
						
							
							
								 
						
							
								4cb454cdf2 
								
							 
						 
						
							
							
								
								added trsm_kernel_LT_4x4_bulldozer.S  
							
							
							
						 
						
							2013-04-27 14:30:00 +02:00  
				
					
						
							
							
								 
						
							
								19ad2fb128 
								
							 
						 
						
							
							
								
								prefetch improved. Defined 2 different kernels for inner loop  
							
							
							
						 
						
							2013-04-27 13:40:49 +02:00  
				
					
						
							
							
								 
						
							
								6821677489 
								
							 
						 
						
							
							
								
								minor improvements and code cleanup  
							
							
							
						 
						
							2013-04-26 20:05:42 +02:00  
				
					
						
							
							
								 
						
							
								3326f3152c 
								
							 
						 
						
							
							
								
								Merge pull request  #213  from wernsaar/develop  
							
							... 
							
							
							
							Merged some improvements into dgemm_kernel_4x4_bulldozer.S. 
							
						 
						
							2013-04-17 23:56:09 -07:00  
				
					
						
							
							
								 
						
							
								7641f6e253 
								
							 
						 
						
							
							
								
								Merged some improvements into dgemm_kernel_4x4_bulldozer.S.  
							
							... 
							
							
							
							Changed the copy functions to generic to solve prefetch conflicts 
							
						 
						
							2013-04-16 19:05:06 +02:00  
				
					
						
							
							
								 
						
							
								3ad29452d1 
								
							 
						 
						
							
							
								
								Merge pull request  #211  from wernsaar/develop  
							
							... 
							
							
							
							New version of dgemm_kernel_4x4_bulldozer.S 
							
						 
						
							2013-04-15 00:20:55 -07:00  
				
					
						
							
							
								 
						
							
								6e3f6f25a5 
								
							 
						 
						
							
							
								
								New version of dgemm_kernel_4x4_bulldozer.S  
							
							... 
							
							
							
							The peak performance with 8 cores is now 90 GFlops 
							
						 
						
							2013-04-12 17:55:51 +02:00  
				
					
						
							
							
								 
						
							
								724ae159ce 
								
							 
						 
						
							
							
								
								Fixed the Windows x86_64 ABI bug in s/daxpy kernels.  
							
							
							
						 
						
							2013-03-08 22:28:34 +08:00  
				
					
						
							
							
								 
						
							
								f300ce3df5 
								
							 
						 
						
							
							
								
								new optimization of dgemm kernel for bulldozer: 10% performance increase  
							
							
							
						 
						
							2013-03-06 17:26:03 +01:00  
				
					
						
							
							
								 
						
							
								66e64131ed 
								
							 
						 
						
							
							
								
								optimized again bulldozer dgemm kernel  
							
							
							
						 
						
							2013-03-05 19:51:37 +01:00  
				
					
						
							
							
								 
						
							
								9405f26f4b 
								
							 
						 
						
							
							
								
								new dgemm_kernel for bulldozer  
							
							
							
						 
						
							2013-03-04 17:37:38 +01:00  
				
					
						
							
							
								 
						
							
								5c8bf6ae0e 
								
							 
						 
						
							
							
								
								Merge branch 'bulldozer' into develop  
							
							
							
						 
						
							2013-02-10 01:19:42 +08:00  
				
					
						
							
							
								 
						
							
								a1ead62f28 
								
							 
						 
						
							
							
								
								Disable the warning of sgemm bulldozer kernel.  
							
							
							
						 
						
							2013-02-09 17:03:13 +01:00  
				
					
						
							
							
								 
						
							
								0133580148 
								
							 
						 
						
							
							
								
								Used sgemm bulldozer kernel on 64 bit.  
							
							
							
						 
						
							2013-02-09 16:29:14 +01:00  
				
					
						
							
							
								 
						
							
								274246651d 
								
							 
						 
						
							
							
								
								Merge branch 'bulldozer' of git://github.com/wernsaar/OpenBLAS into bulldozer  
							
							
							
						 
						
							2013-02-09 16:25:07 +01:00  
				
					
						
							
							
								 
						
							
								299b5a44dc 
								
							 
						 
						
							
							
								
								Merge branch 'develop' of github.com:xianyi/OpenBLAS into bulldozer  
							
							
							
						 
						
							2013-02-09 16:22:04 +01:00  
				
					
						
							
							
								 
						
							
								d311236dfd 
								
							 
						 
						
							
							
								
								Refs  #189 . Fixed the bug of s/cdot about invalid reading NAN on x86_64.  
							
							
							
						 
						
							2013-01-25 20:56:14 +08:00  
				
					
						
							
							
								 
						
							
								0b08f7479e 
								
							 
						 
						
							
							
								
								Refs  #154 . Fixed gemv_t bug about overflow 16MB buffer on x86.  
							
							
							
						 
						
							2013-01-20 21:22:12 +08:00  
				
					
						
							
							
								 
						
							
								99d1978df7 
								
							 
						 
						
							
							
								
								Fixed   #180 . the typos in kernel/x86_64/sgemv_t.S  
							
							
							
						 
						
							2013-01-12 12:31:14 +08:00  
				
					
						
							
							
								 
						
							
								08bf6674d5 
								
							 
						 
						
							
							
								
								Refs  #177 . Fixed sgemv_t compiling bug on Win64.  
							
							
							
						 
						
							2013-01-05 11:36:39 +08:00  
				
					
						
							
							
								 
						
							
								69200884e1 
								
							 
						 
						
							
							
								
								Refs  #173 . Fixed overflow internal buffer bug of gemv_n on x86  
							
							
							
						 
						
							2012-12-25 09:27:49 +08:00  
				
					
						
							
							
								 
						
							
								0d1518add9 
								
							 
						 
						
							
							
								
								Refs  #173 . Fixed overflow internal buffer bug of sgemv_t on x86  
							
							
							
						 
						
							2012-12-25 09:10:17 +08:00  
				
					
						
							
							
								 
						
							
								91ed4e4450 
								
							 
						 
						
							
							
								
								Refs  #171 . Prevent loading the dirty number from the buffer in sgemv_t x86 kernel.  
							
							
							
						 
						
							2012-12-23 23:14:17 +08:00  
				
					
						
							
							
								 
						
							
								fd3046b32a 
								
							 
						 
						
							
							
								
								Refs  #173 . Fixed overflow internal buffer bug of gemv_t on x86.  
							
							
							
						 
						
							2012-12-23 21:47:22 +08:00  
				
					
						
							
							
								 
						
							
								9fb341a9f8 
								
							 
						 
						
							
							
								
								set parameters for CORE_ATHLON  
							
							... 
							
							
							
							else dgemm_p is set to zero leading to a segfault in alloc_mmap due to
allocsize being zero 
							
						 
						
							2012-12-15 16:05:33 +01:00  
				
					
						
							
							
								 
						
							
								d48cff8cf1 
								
							 
						 
						
							
							
								
								Added optimized sgemm_kernel  
							
							
							
						 
						
							2012-12-08 18:50:53 +01:00  
				
					
						
							
							
								 
						
							
								f19af5ecc0 
								
							 
						 
						
							
							
								
								Refs  #54 . Added AMD Bulldozer x86_64 dgemm kernel developed by Werner Saar <wernsaar at googlemail.com>  
							
							... 
							
							
							
							Based on the dgemm kernel for AMD Barcelona, he used AVX and FMA4 instructions.
Thank Werner Saar! 
							
						 
						
							2012-12-07 01:05:11 +08:00  
				
					
						
							
							
								 
						
							
								bfaaa975e6 
								
							 
						 
						
							
							
								
								Added BULLDOZER target. So far it uses barcelona kernels.  
							
							
							
						 
						
							2012-12-07 00:53:31 +08:00  
				
					
						
							
							
								 
						
							
								b7c0fa6bd2 
								
							 
						 
						
							
							
								
								Init AMD Bulldozer codebase.  
							
							
							
						 
						
							2012-12-06 07:29:54 -05:00  
				
					
						
							
							
								 
						
							
								cea1a885b5 
								
							 
						 
						
							
							
								
								Refs  #154 . Fixed the build bug of dgemv_t on MinGW64.  
							
							
							
						 
						
							2012-11-27 07:24:04 +08:00  
				
					
						
							
							
								 
						
							
								5f0117385e 
								
							 
						 
						
							
							
								
								Refs  #154 . Fixed a SEGFAULT bug of dgemv_t when m is very large.  
							
							... 
							
							
							
							It overflowed the internal buffer. Thus, we split vector x into blocks when m is very large.
Thank @wangqian for this patch. 
							
						 
						
							2012-11-19 22:32:27 +08:00  
				
					
						
							
							
								 
						
							
								2573311308 
								
							 
						 
						
							
							
								
								refs  #140 . Fixed zdot incompatibility ABI issue with GCC 4.7 on Win 32.  
							
							... 
							
							
							
							GCC 4.7 uses MSVC ABI on Win 32. This means the caller pops the hidden pointer for returning
aggregate structures larger than 8 bytes. 
							
						 
						
							2012-09-24 20:34:33 +08:00  
				
					
						
							
							
								 
						
							
								d0e731e8b8 
								
							 
						 
						
							
							
								
								provide support for passing CFLAGS, FFLAGS, PFLAGS, FPFLAGS to make on the command line  
							
							
							
						 
						
							2012-08-21 00:31:12 -04:00  
				
					
						
							
							
								 
						
							
								25f1a573fd 
								
							 
						 
						
							
							
								
								Fixed the build bug when DYNAMIC_ARCH=0.  
							
							
							
						 
						
							2012-07-07 12:12:24 +08:00  
				
					
						
							
							
								 
						
							
								857a0fa0df 
								
							 
						 
						
							
							
								
								Fixed the issue of mixing AVX and SSE codes in S/D/C/ZGEMM.  
							
							
							
						 
						
							2012-06-25 19:00:37 +08:00  
				
					
						
							
							
								 
						
							
								d34fce56e4 
								
							 
						 
						
							
							
								
								Refs  #83  Fixed S/DGEMM calling conventions bug on windows.  
							
							
							
						 
						
							2012-06-20 19:53:18 +08:00  
				
					
						
							
							
								 
						
							
								6cfcb54a28 
								
							 
						 
						
							
							
								
								Fixed align problem in S and C precision GEMM kernels.  
							
							
							
						 
						
							2012-06-20 07:38:39 +08:00  
				
					
						
							
							
								 
						
							
								3ef96aa567 
								
							 
						 
						
							
							
								
								Fixed bug in MOVQ redefine and ALIGN SIZE problem.  
							
							
							
						 
						
							2012-06-19 20:37:22 +08:00  
				
					
						
							
							
								 
						
							
								f76f952547 
								
							 
						 
						
							
							
								
								Refs  #83   #53 . Adding Intel Sandy Bridge (AVX supported) kernel codes for BLAS level 3 functions.  
							
							
							
						 
						
							2012-06-19 16:37:12 +08:00  
				
					
						
							
							
								 
						
							
								eefd30881c 
								
							 
						 
						
							
							
								
								Refs  #113 . Fixed the build bug on AMD Bobcat 64-bit OS.  
							
							
							
						 
						
							2012-06-02 21:34:23 +08:00  
				
					
						
							
							
								 
						
							
								d3b67d0bd8 
								
							 
						 
						
							
							
								
								Refs  #113 . Fixed the typo BOBCATE -> BOBCAT  
							
							
							
						 
						
							2012-05-31 22:40:15 +08:00  
				
					
						
							
							
								 
						
							
								d6cab3f37e 
								
							 
						 
						
							
							
								
								Refs  #113 . Support AMD Bobcat using Barcelona kernel codes. Replace 3DNow! with MMX.  
							
							
							
						 
						
							2012-05-31 18:17:45 +08:00  
				
					
						
							
							
								 
						
							
								a53c6e2440 
								
							 
						 
						
							
							
								
								Merge branch 'develop' into sandybridge  
							
							
							
						 
						
							2012-05-25 23:16:44 +08:00  
				
					
						
							
							
								 
						
							
								5d657c6e67 
								
							 
						 
						
							
							
								
								Fixed   #96  a SEGFAULT bug in samax on x86.  
							
							
							
						 
						
							2012-04-26 16:50:57 +08:00  
				
					
						
							
							
								 
						
							
								03b0eb19f7 
								
							 
						 
						
							
							
								
								Refs  #86 . Test alpha=Nan in x86/x86_64 dscale.  
							
							
							
						 
						
							2012-04-05 18:16:18 +08:00  
				
					
						
							
							
								 
						
							
								19a48b82cf 
								
							 
						 
						
							
							
								
								Init Sandybridge codes based on Nehalem.  
							
							
							
						 
						
							2012-03-30 20:01:03 +08:00  
				
					
						
							
							
								 
						
							
								3871b6a86d 
								
							 
						 
						
							
							
								
								Merge branch 'loongson3b' into release-0.1.0  
							
							
							
						 
						
							2012-03-23 01:26:44 +08:00  
				
					
						
							
							
								 
						
							
								83ecfbb9b3 
								
							 
						 
						
							
							
								
								Merge branch 'loongson3a' into release-0.1.0  
							
							
							
						 
						
							2012-03-23 01:26:27 +08:00  
				
					
						
							
							
								 
						
							
								dff146e306 
								
							 
						 
						
							
							
								
								refs  #80 . Used GEMV SSE2 kernels on x86.  
							
							
							
						 
						
							2012-03-19 17:56:22 +08:00  
				
					
						
							
							
								 
						
							
								8e53b57bb2 
								
							 
						 
						
							
							
								
								Appending gemmkernel and trmmkernel C code in kernel/generic, this code can be used to execute on a new platform which does not have optimized assembly kernel.  
							
							
							
						 
						
							2012-01-10 17:16:13 +00:00  
				
					
						
							
							
								 
						
							
								66904fc4e8 
								
							 
						 
						
							
							
								
								BLAS3 used standard MIPS instructions without extensions on Loongson 3B.  
							
							
							
						 
						
							2011-11-25 11:20:25 +00:00  
				
					
						
							
							
								 
						
							
								0884f6b78d 
								
							 
						 
						
							
							
								
								Merge branch 'loongson3a' of github.com:xianyi/OpenBLAS into loongson3b  
							
							
							
						 
						
							2011-11-11 14:26:49 +00:00  
				
					
						
							
							
								 
						
							
								2d78fb05c8 
								
							 
						 
						
							
							
								
								Add conjugate condition to gemv.  
							
							
							
						 
						
							2011-11-10 15:38:48 +00:00  
				
					
						
							
							
								 
						
							
								b95ad4cfaf 
								
							 
						 
						
							
							
								
								Support detecting ICT Loongson-3B CPU.  
							
							
							
						 
						
							2011-11-09 19:29:50 +00:00  
				
					
						
							
							
								 
						
							
								3bbe3ddb31 
								
							 
						 
						
							
							
								
								Merge branch 'develop' of github.com:xianyi/OpenBLAS into loongson3b  
							
							
							
						 
						
							2011-11-09 19:08:29 +00:00  
				
					
						
							
							
								 
						
							
								a32e56500a 
								
							 
						 
						
							
							
								
								Fix the compute error of gemv when incx and incy are negative numbers.  
							
							
							
						 
						
							2011-11-04 19:32:21 +00:00  
				
					
						
							
							
								 
						
							
								c1e618ea2d 
								
							 
						 
						
							
							
								
								Add complete gemv function on Loongson3a platform.  
							
							
							
						 
						
							2011-11-03 13:53:48 +00:00  
				
					
						
							
							
								 
						
							
								19f5b5c132 
								
							 
						 
						
							
							
								
								Fixed   #66  the bug in zgemv kernel with transpose matrix on 64-bit MingW (Windows).  
							
							
							
						 
						
							2011-10-18 18:44:23 +08:00  
				
					
						
							
							
								 
						
							
								c852ce3981 
								
							 
						 
						
							
							
								
								Ref  #65 . Fixed 64-bit Windows calling convention bug in cdot and zdot.  
							
							... 
							
							
							
							According to 64-bit Windows calling convention, the return value is in %rax instead of %xmm0 in cdot kernel.
In zdot, the caller allocates a memory space for return value and sets this memory address to the first hidden parameter. Thus, the callee (zdot) should assign the result to this memory space and return the memory address in %rax. 
							
						 
						
							2011-10-18 10:23:17 +08:00  
				
					
						
							
							
								 
						
							
								e08cfaf9ca 
								
							 
						 
						
							
							
								
								Complete all the complex single-precision functions of level3, but the performance needs further improvement.  
							
							
							
						 
						
							2011-09-16 17:50:40 +00:00  
				
					
						
							
							
								 
						
							
								ee4bb8bd25 
								
							 
						 
						
							
							
								
								Add ctrmm part in cgemm_kernel_loongson3a_4x2_ps.S.  
							
							
							
						 
						
							2011-09-16 16:08:39 +00:00  
				
					
						
							
							
								 
						
							
								7fa3d23dd9 
								
							 
						 
						
							
							
								
								Complete cgemm function, but no optimization.  
							
							
							
						 
						
							2011-09-15 16:08:23 +00:00  
				
					
						
							
							
								 
						
							
								9679dd077e 
								
							 
						 
						
							
							
								
								Fix some compute error.  
							
							
							
						 
						
							2011-09-14 20:00:35 +00:00  
				
					
						
							
							
								 
						
							
								7b410b7f0e 
								
							 
						 
						
							
							
								
								Fixed   #58  zdot SEGFAULT bug with GCC-4.6. Thank Mr. John for this patch.  
							
							... 
							
							
							
							In i386 calling convention, the caller put the address of return value of zdot into the first hidden parameter.
Thus, the callee should delete this address before return.
Actually, I have fixed the same bug on x86/zdot_sse2.S (issue #32 ). However, that was not a good implementation, as it used 3 instructions. Mr. John told me to use "ret $0x4" to skip the first hidden address (4 bytes). 
							
						 
						
							2011-09-14 23:52:51 +08:00  
				
					
						
							
							
								 
						
							
								d238a768ab 
								
							 
						 
						
							
							
								
								Use ps instructions in cgemm.  
							
							
							
						 
						
							2011-09-14 15:32:25 +00:00  
				
					
						
							
							
								 
						
							
								b1fe26c45a 
								
							 
						 
						
							
							
								
								refs  #55 . Changed  DTB_ENTRIES to DTB_DEFAULT_ENTRIES in x86 gemv_n kernel codes.  
							
							
							
						 
						
							2011-09-06 14:14:07 +08:00  
				
					
						
							
							
								 
						
							
								9fc6764fa7 
								
							 
						 
						
							
							
								
								refs  #55 . Added DTB_ENTRIES into dynamic arch setting parameters. Now, it can read DTB_ENTRIES on runtime.  
							
							
							
						 
						
							2011-09-05 17:37:07 +08:00  
				
					
						
							
							
								 
						
							
								74d4cdb81a 
								
							 
						 
						
							
							
								
								Fix an illegal instruction for strmm_RTLU.  
							
							
							
						 
						
							2011-09-02 19:41:06 +00:00  
				
					
						
							
							
								 
						
							
								7906146836 
								
							 
						 
						
							
							
								
								Fix an error for strmm_LLTN.  
							
							
							
						 
						
							2011-09-02 16:57:33 +00:00  
				
					
						
							
							
								 
						
							
								3274ff47b8 
								
							 
						 
						
							
							
								
								Fix an error for strmm_LLTN.  
							
							
							
						 
						
							2011-09-02 16:50:50 +00:00  
				
					
						
							
							
								 
						
							
								a059c553a1 
								
							 
						 
						
							
							
								
								Fix a compute error for strmm.  
							
							
							
						 
						
							2011-09-02 16:00:04 +00:00  
				
					
						
							
							
								 
						
							
								23e182ca7c 
								
							 
						 
						
							
							
								
								Fix stack-pointer bug for strmm.  
							
							
							
						 
						
							2011-09-02 15:28:01 +00:00  
				
					
						
							
							
								 
						
							
								a15bc95824 
								
							 
						 
						
							
							
								
								Add strmm part.  
							
							
							
						 
						
							2011-09-02 09:15:09 +00:00  
				
					
						
							
							
								 
						
							
								09f49fa891 
								
							 
						 
						
							
							
								
								Using PS instructions to improve the performance of sgemm and it is 4.2Gflops now.  
							
							
							
						 
						
							2011-08-31 21:24:03 +00:00  
				
					
						
							
							
								 
						
							
								cb0214787b 
								
							 
						 
						
							
							
								
								Modify compile options.  
							
							
							
						 
						
							2011-08-30 20:57:00 +00:00  
				
					
						
							
							
								 
						
							
								2e8cdd1542 
								
							 
						 
						
							
							
								
								Using ps instruction.  
							
							
							
						 
						
							2011-08-30 20:54:19 +00:00  
				
					
						
							
							
								 
						
							
								c8360e3ae5 
								
							 
						 
						
							
							
								
								Complete all the plura single precision functions of level3 on Loongson3a, the performance is 2.3GFlops.  
							
							
							
						 
						
							2011-07-18 17:03:38 +00:00  
				
					
						
							
							
								 
						
							
								68532fa9ec 
								
							 
						 
						
							
							
								
								Merge branch 'loongson3a' of github.com:xianyi/OpenBLAS into loongson3a  
							
							
							
						 
						
							2011-06-24 09:28:12 +00:00  
				
					
						
							
							
								 
						
							
								708d2b6255 
								
							 
						 
						
							
							
								
								Fix compute error in ztrmm.  
							
							
							
						 
						
							2011-06-24 09:27:41 +00:00  
				
					
						
							
							
								 
						
							
								e72113f06a 
								
							 
						 
						
							
							
								
								Add ztrmm and ztrsm part on loongson3a. The average performance is 2.2G.  
							
							
							
						 
						
							2011-06-23 21:11:00 +00:00  
				
					
						
							
							
								 
						
							
								14f81da375 
								
							 
						 
						
							
							
								
								Change prefetch length of A and B, the performance is 2.1G now.  
							
							
							
						 
						
							2011-06-23 10:46:58 +00:00  
				
					
						
							
							
								 
						
							
								fc21f7ad28 
								
							 
						 
						
							
							
								
								Merge branch 'release-v0.1alpha2' into loongson3a  
							
							
							
						 
						
							2011-06-23 16:08:23 +08:00  
				
					
						
							
							
								 
						
							
								1c96d345e2 
								
							 
						 
						
							
							
								
								Improve zgemm performance from 1G to 1.8G, change block size in param.h.  
							
							
							
						 
						
							2011-06-21 22:16:23 +00:00  
				
					
						
							
							
								 
						
							
								c4efde7713 
								
							 
						 
						
							
							
								
								Merge branch 'loongson3a' into release-v0.1alpha2  
							
							
							
						 
						
							2011-06-21 17:50:00 +08:00  
				
					
						
							
							
								 
						
							
								32353a9d30 
								
							 
						 
						
							
							
								
								Refs  #20 . Fixed the installation bug with DYNAMIC_ARCH=1.  
							
							
							
						 
						
							2011-06-21 17:39:08 +08:00  
				
					
						
							
							
								 
						
							
								b3d1887745 
								
							 
						 
						
							
							
								
								Fixed   #35  a build bug with NO_LAPACK=1 DYNAMIC_ARCH=1 FC=gfortran. I forgot to test it with gfortran in last bug fixed commit.  
							
							
							
						 
						
							2011-06-09 22:59:49 +08:00  
				
					
						
							
							
								 
						
							
								8d50a9fd1a 
								
							 
						 
						
							
							
								
								Fixed   #35  a build bug with NO_LAPACK=1 & DYNAMIC_ARCH=1.  
							
							
							
						 
						
							2011-06-09 11:38:59 +08:00  
				
					
						
							
							
								 
						
							
								4335bca2f7 
								
							 
						 
						
							
							
								
								Fixed   #33  ztrmm bug on Nehalem.  
							
							
							
						 
						
							2011-06-07 12:53:25 +08:00  
				
					
						
							
							
								 
						
							
								31040e4d80 
								
							 
						 
						
							
							
								
								Fixed   #32  a SEGFAULT bug with gcc-4.6. According to i386 calling convention, the called function should remove the hidden return value address from the stack.  
							
							
							
						 
						
							2011-06-03 13:19:54 +08:00  
				
					
						
							
							
								 
						
							
								88d94d0ec8 
								
							 
						 
						
							
							
								
								Fixed   #30  strmm computational error on Loongson3A.  
							
							
							
						 
						
							2011-05-28 09:48:34 +00:00  
				
					
						
							
							
								 
						
							
								fc84909115 
								
							 
						 
						
							
							
								
								Modify single precision compiler conditions, increasing single precision kernel code on Loongson3a.  
							
							
							
						 
						
							2011-05-27 09:47:17 +00:00  
				
					
						
							
							
								 
						
							
								5ca4e51df0 
								
							 
						 
						
							
							
								
								Remove the useless code, modify code comments and format.  
							
							
							
						 
						
							2011-05-18 10:54:51 +00:00  
				
					
						
							
							
								 
						
							
								fcb5ce011b 
								
							 
						 
						
							
							
								
								Fixed   #28 . Convert the result to double precision in MIPS64 dsdot_k kernel.  
							
							
							
						 
						
							2011-05-17 21:24:00 +00:00  
				
					
						
							
							
								 
						
							
								a9320f896e 
								
							 
						 
						
							
							
								
								Fixed   #25  dtrmm and dtrsm computational error on Loongson3A.  
							
							
							
						 
						
							2011-05-14 22:00:57 +00:00  
				
					
						
							
							
								 
						
							
								b206fc7075 
								
							 
						 
						
							
							
								
								Fixed   #28 . Convert the result to double precision in the end of dsdot kernel.  
							
							
							
						 
						
							2011-05-13 02:34:30 +08:00  
				
					
						
							
							
								 
						
							
								29dce62b8f 
								
							 
						 
						
							
							
								
								Finish dtrsm_kernel_Rx.S on Loongson3A.  
							
							
							
						 
						
							2011-05-11 10:44:23 +00:00  
				
					
						
							
							
								 
						
							
								432c309f63 
								
							 
						 
						
							
							
								
								Finish dtrsm_kernel_Lx.S on Loongson3A.  
							
							
							
						 
						
							2011-05-10 12:48:43 +00:00  
				
					
						
							
							
								 
						
							
								d2f351d819 
								
							 
						 
						
							
							
								
								Modify dtrsm compiler options  
							
							
							
						 
						
							2011-05-09 17:31:58 +00:00  
				
					
						
							
							
								 
						
							
								5a991b7149 
								
							 
						 
						
							
							
								
								Fixed   #24  dtrmm error on Loongson3A  
							
							
							
						 
						
							2011-05-09 17:28:20 +00:00  
				
					
						
							
							
								 
						
							
								9320933520 
								
							 
						 
						
							
							
								
								Completed dtrmm function.  
							
							
							
						 
						
							2011-04-17 20:26:49 +00:00  
				
					
						
							
							
								 
						
							
								921caefa56 
								
							 
						 
						
							
							
								
								Increased handling trmm part, no edge handling. Test size(M and N) must be a multiple of 4 .  
							
							
							
						 
						
							2011-04-15 21:56:25 +00:00  
				
					
						
							
							
								 
						
							
								ecd4c1f3d9 
								
							 
						 
						
							
							
								
								Modify prefetching C.  
							
							
							
						 
						
							2011-04-11 22:46:36 +00:00  
				
					
						
							
							
								 
						
							
								ab9e4ce351 
								
							 
						 
						
							
							
								
								Adjust kc size from 112 to 116 .  
							
							
							
						 
						
							2011-04-11 22:17:57 +00:00  
				
					
						
							
							
								 
						
							
								782205a693 
								
							 
						 
						
							
							
								
								Add dgemm compiler Options in KERNEL.LOONGSON3A.  
							
							
							
						 
						
							2011-04-06 10:38:34 +00:00  
				
					
						
							
							
								 
						
							
								ac494c0d04 
								
							 
						 
						
							
							
								
								New kernel in LOONGSON3A.  
							
							
							
						 
						
							2011-04-06 10:36:44 +00:00  
				
					
						
							
							
								 
						
							
								f405b5bcc5 
								
							 
						 
						
							
							
								
								Fixed the bug about Loongson3A gsLQC1 & gsSQC1 instructions in daxpy kernel. Now daxpy is correct.  
							
							
							
						 
						
							2011-03-18 23:05:56 +00:00  
				
					
						
							
							
								 
						
							
								d5cffd506a 
								
							 
						 
						
							
							
								
								Modified the default kernel makefile in MIPS64 arch.  
							
							
							
						 
						
							2011-03-07 11:23:12 +00:00  
				
					
						
							
							
								 
						
							
								5838f12995 
								
							 
						 
						
							
							
								
								Support unaligned addresses in daxpy on loongson3a simd.  
							
							
							
						 
						
							2011-03-05 10:17:10 +08:00  
				
					
						
							
							
								 
						
							
								5444a3f8f7 
								
							 
						 
						
							
							
								
								Unroll to 16 in daxpy on loongson3a.  
							
							
							
						 
						
							2011-03-04 17:50:17 +08:00  
				
					
						
							
							
								 
						
							
								88cbfcc5b5 
								
							 
						 
						
							
							
								
								Merge commit 'origin/x86' into loongson3a  
							
							
							
						 
						
							2011-03-04 14:11:52 +00:00  
				
					
						
							
							
								 
						
							
								ce78abe37e 
								
							 
						 
						
							
							
								
								Merge branch 'x86' of github.com:xianyi/OpenBLAS into x86  
							
							
							
						 
						
							2011-03-04 11:53:04 +08:00  
				
					
						
							
							
								 
						
							
								8f1090d32a 
								
							 
						 
						
							
							
								
								Support NO_LAPACK=1 to build the lib without LAPACK functions.  
							
							
							
						 
						
							2011-03-04 11:51:32 +08:00  
				
					
						
							
							
								 
						
							
								272f62a2b6 
								
							 
						 
						
							
							
								
								Changed movlps macro name in capital in x86/zdot_sse2.S file.  
							
							
							
						 
						
							2011-03-03 00:46:39 +08:00  
				
					
						
							
							
								 
						
							
								36016fe349 
								
							 
						 
						
							
							
								
								On x86 32bits, gcc 4.4.3 generated wrong codes (movsd) from movlps in zdot_sse2.S line 191.  
							
							... 
							
							
							
							This would cause zdotu & zdotc failures. Instead, use movlpd to work around it. Fixed  #8 . Fixed  #9 . 
							
						 
						
							2011-03-02 18:45:43 +08:00  
				
					
						
							
							
								 
						
							
								6eb02bbb9c 
								
							 
						 
						
							
							
								
								Merge remote branch 'origin/x86' into loongson3a  
							
							
							
						 
						
							2011-03-02 13:52:05 +08:00  
				
					
						
							
							
								 
						
							
								12214e1d0f 
								
							 
						 
						
							
							
								
								Fixed   #7 . Modified axpy kernel codes to avoid unloop with incx==0 or incy==0 in x86 32bits arch.  
							
							
							
						 
						
							2011-02-23 20:08:34 +08:00  
				
					
						
							
							
								 
						
							
								0cfd29a819 
								
							 
						 
						
							
							
								
								Fixed   #7 . 1)Disable the multi-thread and  2) Modified kernel codes to avoid unloop in axpy function when incx==0 or incy==0.  
							
							
							
						 
						
							2011-02-21 00:24:21 +08:00  
				
					
						
							
							
								 
						
							
								bfaa80c316 
								
							 
						 
						
							
							
								
								fixed   #4  csrot & drot returned the wrong result when incx==incy==0 on i686 arch.  
							
							
							
						 
						
							2011-02-18 03:00:58 +08:00  
				
					
						
							
							
								 
						
							
								c5852d4e30 
								
							 
						 
						
							
							
								
								fixed   #4  csrot returned the wrong result when incx==incy==0.  
							
							
							
						 
						
							2011-02-16 23:39:43 +08:00  
				
					
						
							
							
								 
						
							
								84ba64e65b 
								
							 
						 
						
							
							
								
								fixed a bug in drot when incx or incy equals zero.  
							
							
							
						 
						
							2011-02-16 23:35:41 +08:00  
				
					
						
							
							
								 
						
							
								1e671b49f3 
								
							 
						 
						
							
							
								
								Did the experiment with Loongson 3A 128bit load & store instruction.  
							
							
							
						 
						
							2011-01-29 03:05:27 +08:00  
				
					
						
							
							
								 
						
							
								77b7020d69 
								
							 
						 
						
							
							
								
								changed prefetch order.  
							
							
							
						 
						
							2011-01-29 03:03:34 +08:00  
				
					
						
							
							
								 
						
							
								e003b811ab 
								
							 
						 
						
							
							
								
								load x & y contiguously in axpy.  
							
							
							
						 
						
							2011-01-28 11:18:50 +08:00  
				
					
						
							
							
								 
						
							
								ebe2da8474 
								
							 
						 
						
							
							
								
								Modified aligned size. Added additional prefetch instruction because of cache line is 32 bytes in Loongson 3A.  
							
							
							
						 
						
							2011-01-27 23:07:06 +08:00  
				
					
						
							
							
								 
						
							
								c0b5992fab 
								
							 
						 
						
							
							
								
								added axpy kernel with prefetch for Loongson3A. To-Do: tuning prefetch distance & instruction order.  
							
							
							
						 
						
							2011-01-26 22:34:33 +08:00  
				
					
						
							
							
								 
						
							
								342bbc3871 
								
							 
						 
						
							
							
								
								Import GotoBLAS2 1.13 BSD version codes.  
							
							
							
						 
						
							2011-01-24 14:54:24 +00:00