402d6e91db 
								
							 
						 
						
							
							
								
								Merge remote branch 'origin/develop' into armv7  
							
							
							
						 
						
							2013-12-01 18:18:40 +01:00  
				
					
						
							
							
								 
						
							
								b3254eecaf 
								
							 
						 
						
							
							
								
								Merge remote branch 'origin/haswell' into develop  
							
							
							
						 
						
							2013-12-01 18:09:12 +01:00  
				
					
						
							
							
								 
						
							
								d910404f00 
								
							 
						 
						
							
							
								
								Merge remote branch 'origin/piledriver' into develop  
							
							
							
						 
						
							2013-12-01 18:06:51 +01:00  
				
					
						
							
							
								 
						
							
								ffe70b1fdc 
								
							 
						 
						
							
							
								
								modified Makefile.L3  
							
							
							
						 
						
							2013-12-01 17:58:46 +01:00  
				
					
						
							
							
								 
						
							
								0b6e13b689 
								
							 
						 
						
							
							
								
								Merge remote branch 'origin/develop' into haswell  
							
							
							
						 
						
							2013-12-01 13:38:11 +01:00  
				
					
						
							
							
								 
						
							
								e09dc279a2 
								
							 
						 
						
							
							
								
								Merge remote branch 'origin/develop' into piledriver  
							
							
							
						 
						
							2013-12-01 13:33:18 +01:00  
				
					
						
							
							
								 
						
							
								4be4db590c 
								
							 
						 
						
							
							
								
								Merge remote branch 'origin/develop' into armv7  
							
							
							
						 
						
							2013-12-01 13:16:41 +01:00  
				
					
						
							
							
								 
						
							
								5c648a8984 
								
							 
						 
						
							
							
								
								Merge remote branch 'origin/develop' into haswell  
							
							
							
						 
						
							2013-12-01 11:25:33 +01:00  
				
					
						
							
							
								 
						
							
								c44dc4dd3c 
								
							 
						 
						
							
							
								
								Merge remote branch 'origin/develop' into piledriver  
							
							
							
						 
						
							2013-12-01 11:06:36 +01:00  
				
					
						
							
							
								 
						
							
								9d3fae15a8 
								
							 
						 
						
							
							
								
								Merge branch 'develop' into armv7  
							
							
							
						 
						
							2013-12-01 10:12:07 +01:00  
				
					
						
							
							
								 
						
							
								2d3c884294 
								
							 
						 
						
							
							
								
								added complex gemv kernels for ARMV6 and ARMV7  
							
							
							
						 
						
							2013-11-29 17:06:33 +01:00  
				
					
						
							
							
								 
						
							
								d54a061713 
								
							 
						 
						
							
							
								
								optimized gemv_n_vfp.S  
							
							
							
						 
						
							2013-11-28 17:40:21 +01:00  
				
					
						
							
							
								 
						
							
								86afb47e83 
								
							 
						 
						
							
							
								
								added optimized ctrmm kernel for ARMV6  
							
							
							
						 
						
							2013-11-28 14:35:07 +01:00  
				
					
						
							
							
								 
						
							
								42a4dff056 
								
							 
						 
						
							
							
								
								added optimized ztrmm kernel for ARMV6  
							
							
							
						 
						
							2013-11-28 13:41:06 +01:00  
				
					
						
							
							
								 
						
							
								5bc322a66c 
								
							 
						 
						
							
							
								
								optimized strmm kernel for ARMV6  
							
							
							
						 
						
							2013-11-28 12:45:38 +01:00  
				
					
						
							
							
								 
						
							
								dec7ad0dfd 
								
							 
						 
						
							
							
								
								optimized dtrmm kernel for ARMV7  
							
							
							
						 
						
							2013-11-28 12:32:12 +01:00  
				
					
						
							
							
								 
						
							
								274304bd03 
								
							 
						 
						
							
							
								
								add optimized cgemm kernel for ARMV6  
							
							
							
						 
						
							2013-11-28 11:54:38 +01:00  
				
					
						
							
							
								 
						
							
								5007a534c4 
								
							 
						 
						
							
							
								
								optimized zgemm kernel for ARMV6  
							
							
							
						 
						
							2013-11-28 10:04:43 +01:00  
				
					
						
							
							
								 
						
							
								a537d7d8d7 
								
							 
						 
						
							
							
								
								optimized zgemm_kernel_2x2_vfp.S  
							
							
							
						 
						
							2013-11-28 08:33:44 +01:00  
				
					
						
							
							
								 
						
							
								b42145834f 
								
							 
						 
						
							
							
								
								optimized sgemm kernel for ARMV6  
							
							
							
						 
						
							2013-11-28 08:08:08 +01:00  
				
					
						
							
							
								 
						
							
								3d5e792c72 
								
							 
						 
						
							
							
								
								optimized sgemm kernel for ARMV6  
							
							
							
						 
						
							2013-11-27 18:38:32 +01:00  
				
					
						
							
							
								 
						
							
								a9bd12da2c 
								
							 
						 
						
							
							
								
								optimized dgemm kernel for ARMV6  
							
							
							
						 
						
							2013-11-27 17:37:38 +01:00  
				
					
						
							
							
								 
						
							
								697e198e8a 
								
							 
						 
						
							
							
								
								added zgemm_kernel for ARMV6  
							
							
							
						 
						
							2013-11-27 16:15:06 +01:00  
				
					
						
							
							
								 
						
							
								36b0f7fe1d 
								
							 
						 
						
							
							
								
								added optimized gemv_t kernel for ARMV6  
							
							
							
						 
						
							2013-11-25 19:31:27 +01:00  
				
					
						
							
							
								 
						
							
								d2b20c5c51 
								
							 
						 
						
							
							
								
								add optimized axpy kernel  
							
							
							
						 
						
							2013-11-25 12:25:58 +01:00  
				
					
						
							
							
								 
						
							
								fe5f46c330 
								
							 
						 
						
							
							
								
								added experimental support for ARMV8  
							
							
							
						 
						
							2013-11-24 15:47:00 +01:00  
				
					
						
							
							
								 
						
							
								25c6050593 
								
							 
						 
						
							
							
								
								add single and double precision gemv_n kernel  for ARMV6  
							
							
							
						 
						
							2013-11-24 12:03:28 +01:00  
				
					
						
							
							
								 
						
							
								12e02a00e0 
								
							 
						 
						
							
							
								
								added ncopy kernels for ARMV6  
							
							
							
						 
						
							2013-11-24 08:46:47 +01:00  
				
					
						
							
							
								 
						
							
								29a3196f56 
								
							 
						 
						
							
							
								
								added optimized sgemm and strmm kernel for ARMV6  
							
							
							
						 
						
							2013-11-23 18:09:41 +01:00  
				
					
						
							
							
								 
						
							
								8776a73773 
								
							 
						 
						
							
							
								
								added optimized dgemm and dtrmm kernel for ARMV6  
							
							
							
						 
						
							2013-11-23 16:24:52 +01:00  
				
					
						
							
							
								 
						
							
								7e84acd3e8 
								
							 
						 
						
							
							
								
								fixed bug in SAVE macros, that are not found by any test routine  
							
							
							
						 
						
							2013-11-23 14:35:19 +01:00  
				
					
						
							
							
								 
						
							
								33d3ab6e09 
								
							 
						 
						
							
							
								
								small optimizations for zgemv kernels  
							
							
							
						 
						
							2013-11-23 12:35:31 +01:00  
				
					
						
							
							
								 
						
							
								9a0f978929 
								
							 
						 
						
							
							
								
								added nrm2 kernel for ARMV6  
							
							
							
						 
						
							2013-11-22 17:21:10 +01:00  
				
					
						
							
							
								 
						
							
								7f210587f0 
								
							 
						 
						
							
							
								
								renamed some ncopy and tcopy files  
							
							
							
						 
						
							2013-11-22 00:20:25 +01:00  
				
					
						
							
							
								 
						
							
								9f0a3a35b3 
								
							 
						 
						
							
							
								
								removed obsolete file sdot_vfpv3.S  
							
							
							
						 
						
							2013-11-21 23:42:54 +01:00  
				
					
						
							
							
								 
						
							
								dbae93110b 
								
							 
						 
						
							
							
								
								added sdot_vfp.S  
							
							
							
						 
						
							2013-11-21 23:34:51 +01:00  
				
					
						
							
							
								 
						
							
								19cd5c64a2 
								
							 
						 
						
							
							
								
								renamed swap_vfpv3.S to swap_vfp.S  
							
							
							
						 
						
							2013-11-21 23:19:32 +01:00  
				
					
						
							
							
								 
						
							
								9adf87495e 
								
							 
						 
						
							
							
								
								renamed some dot kernels  
							
							
							
						 
						
							2013-11-21 23:07:51 +01:00  
				
					
						
							
							
								 
						
							
								440db4cdda 
								
							 
						 
						
							
							
								
								delete rot_vfpv3.S  
							
							
							
						 
						
							2013-11-21 22:52:24 +01:00  
				
					
						
							
							
								 
						
							
								cd93cae5a7 
								
							 
						 
						
							
							
								
								renamed rot_vfpv3.S to rot_vfp.S  
							
							
							
						 
						
							2013-11-21 22:49:28 +01:00  
				
					
						
							
							
								 
						
							
								8565afb3c2 
								
							 
						 
						
							
							
								
								renamed asum_vfpv3.S to asum_vfp.S  
							
							
							
						 
						
							2013-11-21 22:26:27 +01:00  
				
					
						
							
							
								 
						
							
								5bf7cf8d67 
								
							 
						 
						
							
							
								
								renamed scal_vfpv3.S to scal_vfp.S  
							
							
							
						 
						
							2013-11-21 22:03:36 +01:00  
				
					
						
							
							
								 
						
							
								29a005c635 
								
							 
						 
						
							
							
								
								renamed iamax assembler kernel  
							
							
							
						 
						
							2013-11-21 21:12:33 +01:00  
				
					
						
							
							
								 
						
							
								f1be3a168a 
								
							 
						 
						
							
							
								
								renamed some BLAS kernels, which are compatible to ARMV6  
							
							
							
						 
						
							2013-11-21 20:48:57 +01:00  
				
					
						
							
							
								 
						
							
								410afda9b4 
								
							 
						 
						
							
							
								
								added cpu detection and target ARMV6,  used in raspberry pi  
							
							
							
						 
						
							2013-11-21 20:18:51 +01:00  
				
					
						
							
							
								 
						
							
								bf04544902 
								
							 
						 
						
							
							
								
								added gemv_n kernel for single and double precision  
							
							
							
						 
						
							2013-11-19 15:07:20 +01:00  
				
					
						
							
							
								 
						
							
								86283c0be1 
								
							 
						 
						
							
							
								
								added gemv_t kernel for single and double precision  
							
							
							
						 
						
							2013-11-19 09:55:54 +01:00  
				
					
						
							
							
								 
						
							
								f27cabfd08 
								
							 
						 
						
							
							
								
								added nrm2 kernel for all precisions  
							
							
							
						 
						
							2013-11-16 16:17:17 +01:00  
				
					
						
							
							
								 
						
							
								23dd474cd0 
								
							 
						 
						
							
							
								
								added rot kernel for all precisions  
							
							
							
						 
						
							2013-11-15 14:08:57 +01:00  
				
					
						
							
							
								 
						
							
								f1b452e160 
								
							 
						 
						
							
							
								
								added scal kernel for all precisions  
							
							
							
						 
						
							2013-11-15 11:56:43 +01:00  
				
					
						
							
							
								 
						
							
								3dabd7e6e6 
								
							 
						 
						
							
							
								
								added swap-kernel for all precisions  
							
							
							
						 
						
							2013-11-14 19:06:19 +01:00  
				
					
						
							
							
								 
						
							
								6f4a0ebe38 
								
							 
						 
						
							
							
								
								added max- and min-kernels for all precisions  
							
							
							
						 
						
							2013-11-14 13:52:47 +01:00  
				
					
						
							
							
								 
						
							
								f1db386211 
								
							 
						 
						
							
							
								
								changes for compatibility with Pathscale compiler  
							
							
							
						 
						
							2013-11-13 17:59:11 +01:00  
				
					
						
							
							
								 
						
							
								6da558d2ab 
								
							 
						 
						
							
							
								
								changes for compatibility with Pathscale compiler  
							
							
							
						 
						
							2013-11-13 17:39:13 +01:00  
				
					
						
							
							
								 
						
							
								f750103336 
								
							 
						 
						
							
							
								
								small optimizations on dot-kernels  
							
							
							
						 
						
							2013-11-11 15:47:56 +01:00  
				
					
						
							
							
								 
						
							
								00f33c0134 
								
							 
						 
						
							
							
								
								added asum_kernel for all precisions and complex  
							
							
							
						 
						
							2013-11-11 14:20:59 +01:00  
				
					
						
							
							
								 
						
							
								5b36cc0f47 
								
							 
						 
						
							
							
								
								added blas level1 dot kernels for complex and double complex  
							
							
							
						 
						
							2013-11-08 09:08:11 +01:00  
				
					
						
							
							
								 
						
							
								c8f1aeb154 
								
							 
						 
						
							
							
								
								added optimized blas level1 dot kernels for single and double precision  
							
							
							
						 
						
							2013-11-07 17:22:03 +01:00  
				
					
						
							
							
								 
						
							
								8fa93be06e 
								
							 
						 
						
							
							
								
								added optimized blas level1 copy kernels  
							
							
							
						 
						
							2013-11-07 17:18:56 +01:00  
				
					
						
							
							
								 
						
							
								1e8128f41c 
								
							 
						 
						
							
							
								
								added cgemm_tcopy_2_vfpv3.S and zgemm_tcopy_2_vfpv3.S  
							
							
							
						 
						
							2013-11-07 17:15:50 +01:00  
				
					
						
							
							
								 
						
							
								2f5fdd2000 
								
							 
						 
						
							
							
								
								Refs  #314 . Fixed clang compiling bug on OSX.  
							
							
							
						 
						
							2013-11-07 08:12:03 +08:00  
				
					
						
							
							
								 
						
							
								80a2e901b1 
								
							 
						 
						
							
							
								
								added dgemm_tcopy_4_vfpv3.S and sgemm_tcopy_4_vfpv3.S  
							
							
							
						 
						
							2013-11-06 20:01:18 +01:00  
				
					
						
							
							
								 
						
							
								ac50bccbd2 
								
							 
						 
						
							
							
								
								added cgemm_ncopy_2_vfpv3.S and made assembler labels unique  
							
							
							
						 
						
							2013-11-05 20:21:35 +01:00  
				
					
						
							
							
								 
						
							
								82015beaef 
								
							 
						 
						
							
							
								
								added zgemm_ncopy_2_vfpv3.S and made assembler labels unique  
							
							
							
						 
						
							2013-11-05 19:31:22 +01:00  
				
					
						
							
							
								 
						
							
								6216ab8a7e 
								
							 
						 
						
							
							
								
								removed obsolete gemm_kernels from haswell branch  
							
							
							
						 
						
							2013-11-04 08:33:04 +01:00  
				
					
						
							
							
								 
						
							
								370e3834a9 
								
							 
						 
						
							
							
								
								added missing file kernel/arm/Makefile  
							
							
							
						 
						
							2013-11-03 11:54:39 +01:00  
				
					
						
							
							
								 
						
							
								e31186efd4 
								
							 
						 
						
							
							
								
								deleted obsolete dgemm_kernel and dtrmm_kernel  
							
							
							
						 
						
							2013-11-02 13:12:21 +01:00  
				
					
						
							
							
								 
						
							
								2b801a00a5 
								
							 
						 
						
							
							
								
								small optimizations on sgemm_kernel for ARMV7  
							
							
							
						 
						
							2013-11-02 13:06:11 +01:00  
				
					
						
							
							
								 
						
							
								b3eab8fcb7 
								
							 
						 
						
							
							
								
								minor optimizations on zgemm_kernel for ARMV7  
							
							
							
						 
						
							2013-11-02 09:43:53 +01:00  
				
					
						
							
							
								 
						
							
								02bc36ac79 
								
							 
						 
						
							
							
								
								added sgemm_ncopy routine and made some improvements on cgemm_kernel for ARMV7  
							
							
							
						 
						
							2013-11-01 18:22:27 +01:00  
				
					
						
							
							
								 
						
							
								5118a7f4d1 
								
							 
						 
						
							
							
								
								small optimizations on dgemm_kernel for Piledriver  
							
							
							
						 
						
							2013-10-31 11:53:26 +01:00  
				
					
						
							
							
								 
						
							
								e172b70ea2 
								
							 
						 
						
							
							
								
								added cgemm_kernel for Piledriver  
							
							
							
						 
						
							2013-10-31 08:38:17 +01:00  
				
					
						
							
							
								 
						
							
								1cf4b974b2 
								
							 
						 
						
							
							
								
								added zgemm_kernel for Piledriver  
							
							
							
						 
						
							2013-10-30 09:12:17 +01:00  
				
					
						
							
							
								 
						
							
								7bccff1512 
								
							 
						 
						
							
							
								
								added sgemm_kernel for PILEDRIVER  
							
							
							
						 
						
							2013-10-29 22:53:04 +01:00  
				
					
						
							
							
								 
						
							
								afe44b0241 
								
							 
						 
						
							
							
								
								tests and code cleanup of gemm_kernels for HASWELL  
							
							
							
						 
						
							2013-10-28 14:23:48 +01:00  
				
					
						
							
							
								 
						
							
								a77c71eaf5 
								
							 
						 
						
							
							
								
								added highly optimized dgemm_kernel for HASWELL  
							
							
							
						 
						
							2013-10-28 10:23:47 +01:00  
				
					
						
							
							
								 
						
							
								fe8c5666f9 
								
							 
						 
						
							
							
								
								optimized dgemm_kernel for HASWELL  
							
							
							
						 
						
							2013-10-20 16:52:26 +02:00  
				
					
						
							
							
								 
						
							
								f6b50057e2 
								
							 
						 
						
							
							
								
								corrected and tested FMA3 Code  
							
							
							
						 
						
							2013-10-19 10:52:20 +02:00  
				
					
						
							
							
								 
						
							
								2840d56aeb 
								
							 
						 
						
							
							
								
								added dgemm_kernel for Piledriver  
							
							
							
						 
						
							2013-10-19 09:47:15 +02:00  
				
					
						
							
							
								 
						
							
								85484a42df 
								
							 
						 
						
							
							
								
								added kernels for cgemm, ctrmm, zgemm and ztrmm  
							
							
							
						 
						
							2013-10-16 18:00:41 +02:00  
				
					
						
							
							
								 
						
							
								3983011f0b 
								
							 
						 
						
							
							
								
								added sgemm- and strmm_kernel  
							
							
							
						 
						
							2013-10-14 08:22:27 +02:00  
				
					
						
							
							
								 
						
							
								2a1515c9dd 
								
							 
						 
						
							
							
								
								added dgemm_ncopy_4_vfpv3.S  
							
							
							
						 
						
							2013-10-12 16:48:29 +02:00  
				
					
						
							
							
								 
						
							
								31f51e78bc 
								
							 
						 
						
							
							
								
								minor optimizations on dgemm_kernel  
							
							
							
						 
						
							2013-10-12 09:42:18 +02:00  
				
					
						
							
							
								 
						
							
								beffee7d91 
								
							 
						 
						
							
							
								
								Fixed buffer overflow bug in kernel/x86_64/dgemv_t.S file.  
							
							
							
						 
						
							2013-10-11 03:20:20 +08:00  
				
					
						
							
							
								 
						
							
								e0b968c3a7 
								
							 
						 
						
							
							
								
								Changed kernels for dgemm and dtrmm  
							
							
							
						 
						
							2013-10-05 12:59:44 +02:00  
				
					
						
							
							
								 
						
							
								1c63180bb6 
								
							 
						 
						
							
							
								
								updated dgemm_kernel_8x2_vfpv3.S  
							
							
							
						 
						
							2013-09-30 17:31:23 +02:00  
				
					
						
							
							
								 
						
							
								4a474ea7dc 
								
							 
						 
						
							
							
								
								changed dgemm_kernel to use fused multiply add  
							
							
							
						 
						
							2013-09-29 17:46:23 +02:00  
				
					
						
							
							
								 
						
							
								69ce737cc5 
								
							 
						 
						
							
							
								
								modified Makefile.L3 for ARM  
							
							
							
						 
						
							2013-09-28 19:13:47 +02:00  
				
					
						
							
							
								 
						
							
								70411af888 
								
							 
						 
						
							
							
								
								initial checkin of kernel/arm  
							
							
							
						 
						
							2013-09-28 19:02:25 +02:00  
				
					
						
							
							
								 
						
							
								6c4a7d0828 
								
							 
						 
						
							
							
								
								Import AMD Piledriver DGEMM kernel generated by AUGEM.  
							
							... 
							
							
							
							So far, this kernel doesn't deal with edge.
AUGEM: Automatically Generate High Performance Dense Linear Algebra
Kernels on x86 CPUs.
Qian Wang, Xianyi Zhang, Yunquan Zhang, and Qing Yi. In the
International Conference for High Performance Computing, Networking,
Storage and Analysis (SC'13). Denver, CO. Nov, 2013. 
							
						 
						
							2013-08-25 10:16:01 -03:00  
				
					
						
							
							
								 
						
							
								067e8417fd 
								
							 
						 
						
							
							
								
								removed unnecessary instructions from zgemm_kernel_2x2_bulldozer.S  
							
							
							
						 
						
							2013-08-23 22:22:43 +08:00  
				
					
						
							
							
								 
						
							
								a82da3d069 
								
							 
						 
						
							
							
								
								removed unnecessary instructions  
							
							
							
						 
						
							2013-08-23 22:22:27 +08:00  
				
					
						
							
							
								 
						
							
								1569bf14f8 
								
							 
						 
						
							
							
								
								Refs  #282 . Fixed zgemv_n typo bug on Win64.  
							
							
							
						 
						
							2013-08-23 16:27:17 +08:00  
				
					
						
							
							
								 
						
							
								f51a849d91 
								
							 
						 
						
							
							
								
								Merge pull request  #278  from wernsaar/haswell  
							
							... 
							
							
							
							Merge wernsaar's Haswell gemm kernels. 
							
						 
						
							2013-08-17 08:24:37 -07:00  
				
					
						
							
							
								 
						
							
								44ef70420c 
								
							 
						 
						
							
							
								
								added cgemm_kernel_8x2_haswell.S  
							
							
							
						 
						
							2013-08-16 18:54:56 +02:00  
				
					
						
							
							
								 
						
							
								d488b1b1aa 
								
							 
						 
						
							
							
								
								added zgemm_kernel_4x2_haswell.S  
							
							
							
						 
						
							2013-08-16 10:29:47 +02:00  
				
					
						
							
							
								 
						
							
								4070d9a123 
								
							 
						 
						
							
							
								
								added dgemm_kernel_16x2_haswell.S  
							
							
							
						 
						
							2013-08-15 19:17:20 +02:00  
				
					
						
							
							
								 
						
							
								0b90c0ec64 
								
							 
						 
						
							
							
								
								added sgemm_kernel_16x4_haswell.S  
							
							
							
						 
						
							2013-08-15 18:46:14 +02:00  
				
					
						
							
							
								 
						
							
								2b8ab8f55b 
								
							 
						 
						
							
							
								
								sgemm_kernel_16x4_haswell.S minor changes  
							
							
							
						 
						
							2013-08-14 01:44:41 +02:00  
				
					
						
							
							
								 
						
							
								1cb9579cd0 
								
							 
						 
						
							
							
								
								added zgemm_kernel_4x2_haswell.S and fixed a bug in sgemm_kernel_16x4_haswell.S  
							
							
							
						 
						
							2013-08-14 01:23:15 +02:00  
				
					
						
							
							
								 
						
							
								2638370844 
								
							 
						 
						
							
							
								
								Init code base for Intel Haswell.  
							
							
							
						 
						
							2013-08-13 00:54:59 +08:00  
				
					
						
							
							
								 
						
							
								89637f87c8 
								
							 
						 
						
							
							
								
								added sgemm- and dgemm-kernel for HASWELL processor  
							
							
							
						 
						
							2013-08-12 18:04:10 +02:00  
				
					
						
							
							
								 
						
							
								c0159d44a3 
								
							 
						 
						
							
							
								
								Merge branch 'develop' of  https://github.com/wernsaar/OpenBLAS  into wernsaar-develop  
							
							
							
						 
						
							2013-08-09 10:48:46 +08:00  
				
					
						
							
							
								 
						
							
								c17a850c1c 
								
							 
						 
						
							
							
								
								modified KERNEL.BULLDOZER  
							
							
							
						 
						
							2013-08-08 17:49:30 +02:00  
				
					
						
							
							
								 
						
							
								099853fff6 
								
							 
						 
						
							
							
								
								added dtrsm_kernel_RN_8x2_bulldozer.S  
							
							
							
						 
						
							2013-08-08 07:14:08 +02:00  
				
					
						
							
							
								 
						
							
								44d23881b5 
								
							 
						 
						
							
							
								
								dtrsm_kernel_LT_8x2_bulldozer.S performance optimization  
							
							
							
						 
						
							2013-08-05 11:27:16 +02:00  
				
					
						
							
							
								 
						
							
								32fb6b9bb2 
								
							 
						 
						
							
							
								
								Merge branch 'develop' of  https://github.com/wernsaar/OpenBLAS  into wernsaar-develop  
							
							
							
						 
						
							2013-08-05 16:09:47 +08:00  
				
					
						
							
							
								 
						
							
								aaeb8eaecd 
								
							 
						 
						
							
							
								
								modified dtrsm_kernel_LT_8x2_bulldozer.S  
							
							
							
						 
						
							2013-08-04 12:16:12 +02:00  
				
					
						
							
							
								 
						
							
								8aeec32ea0 
								
							 
						 
						
							
							
								
								modified dtrsm_kernel_LT_8x2_bulldozer.S  
							
							
							
						 
						
							2013-08-04 10:15:33 +02:00  
				
					
						
							
							
								 
						
							
								87fc9de572 
								
							 
						 
						
							
							
								
								added dtrsm_kernel_LT_8x2_bulldozer.S  
							
							
							
						 
						
							2013-08-04 09:54:40 +02:00  
				
					
						
							
							
								 
						
							
								564aa60fec 
								
							 
						 
						
							
							
								
								removed dtrsm_kernel_LT_8x2_bulldozer.S  
							
							
							
						 
						
							2013-08-03 15:40:51 +02:00  
				
					
						
							
							
								 
						
							
								f645665dd6 
								
							 
						 
						
							
							
								
								fixed bug in dgemv_t_bulldozer.S  
							
							
							
						 
						
							2013-08-03 12:19:29 +02:00  
				
					
						
							
							
								 
						
							
								e45a347cd2 
								
							 
						 
						
							
							
								
								repaired trmm bug in sgemm_kernel_16x2_bulldozer.S  
							
							
							
						 
						
							2013-08-03 11:43:25 +02:00  
				
					
						
							
							
								 
						
							
								99727ac013 
								
							 
						 
						
							
							
								
								repaired trmm bug in cgemm_kernel_4x2_bulldozer.S  
							
							
							
						 
						
							2013-08-03 10:32:51 +02:00  
				
					
						
							
							
								 
						
							
								6e0a2fbc0c 
								
							 
						 
						
							
							
								
								repaired trmm bug in zgemm_kernel_2x2_bulldozer.S  
							
							
							
						 
						
							2013-08-03 10:17:08 +02:00  
				
					
						
							
							
								 
						
							
								0a22f99c58 
								
							 
						 
						
							
							
								
								repaired trmm bug in dgemm_kernel_8x2_bulldozer.S  
							
							
							
						 
						
							2013-08-03 09:35:39 +02:00  
				
					
						
							
							
								 
						
							
								cff70a666d 
								
							 
						 
						
							
							
								
								added generic trmm kernels and modified Makefile.L3  
							
							
							
						 
						
							2013-07-30 20:18:57 +02:00  
				
					
						
							
							
								 
						
							
								84bd0aabaa 
								
							 
						 
						
							
							
								
								added dtrsm_kernel_LT_8x2_bulldozer.S  
							
							
							
						 
						
							2013-07-28 16:47:58 +02:00  
				
					
						
							
							
								 
						
							
								72b1edaf1b 
								
							 
						 
						
							
							
								
								Merge branch 'develop' into bulldozer  
							
							... 
							
							
							
							Conflicts:
	kernel/x86_64/KERNEL.BULLDOZER 
							
						 
						
							2013-07-28 06:38:25 +02:00  
				
					
						
							
							
								 
						
							
								1b3b9e841d 
								
							 
						 
						
							
							
								
								Fixed a computational error in zgemm_kernel_4x4_sandy.S file.  
							
							
							
						 
						
							2013-07-18 20:23:21 +08:00  
				
					
						
							
							
								 
						
							
								2ed0f6ab60 
								
							 
						 
						
							
							
								
								Fixed the typo.  
							
							
							
						 
						
							2013-07-11 23:47:07 +08:00  
				
					
						
							
							
								 
						
							
								886cbaf4e4 
								
							 
						 
						
							
							
								
								Support AMD Piledriver by bulldozer kernels.  
							
							
							
						 
						
							2013-07-06 12:06:43 -03:00  
				
					
						
							
							
								 
						
							
								57944538b6 
								
							 
						 
						
							
							
								
								Use ALIGN_5 instead of .algin 32 in assembly kernel. Added ALIGN_5 for 32-bit OSX.  
							
							
							
						 
						
							2013-07-01 16:09:05 +08:00  
				
					
						
							
							
								 
						
							
								fa916a0fac 
								
							 
						 
						
							
							
								
								Fixed   #238  bug in lsame on x86.  
							
							
							
						 
						
							2013-06-28 22:43:41 +08:00  
				
					
						
							
							
								 
						
							
								fb298b34ae 
								
							 
						 
						
							
							
								
								Merge pull request  #235  from wernsaar/develop  
							
							... 
							
							
							
							Added ddot, daxpy, dcopy kernels for AMD bulldozer. 
							
						 
						
							2013-06-21 17:59:26 -07:00  
				
					
						
							
							
								 
						
							
								16012767f4 
								
							 
						 
						
							
							
								
								added dcopy_bulldozer.S  
							
							
							
						 
						
							2013-06-21 16:06:51 +02:00  
				
					
						
							
							
								 
						
							
								bcbac31b47 
								
							 
						 
						
							
							
								
								added ddot_bulldozer.S  
							
							
							
						 
						
							2013-06-20 16:15:09 +02:00  
				
					
						
							
							
								 
						
							
								8dc0c72583 
								
							 
						 
						
							
							
								
								added daxpy_bulldozer.S  
							
							
							
						 
						
							2013-06-20 14:07:54 +02:00  
				
					
						
							
							
								 
						
							
								89405a1a0b 
								
							 
						 
						
							
							
								
								cleanup of dgemm_ncopy_8_bulldozer.S  
							
							
							
						 
						
							2013-06-19 19:31:38 +02:00  
				
					
						
							
							
								 
						
							
								4f2b12b8a8 
								
							 
						 
						
							
							
								
								added dgemv_t_bulldozer.S  
							
							
							
						 
						
							2013-06-19 17:32:42 +02:00  
				
					
						
							
							
								 
						
							
								646e168d26 
								
							 
						 
						
							
							
								
								Merge pull request  #233  from wernsaar/develop  
							
							... 
							
							
							
							added dgemv_n and some faster gemm_copy routines to BULLDOZER. 
							
						 
						
							2013-06-18 20:02:36 -07:00  
				
					
						
							
							
								 
						
							
								93dbbe1fb8 
								
							 
						 
						
							
							
								
								added dgemm_ncopy_8_bulldozer.S  
							
							
							
						 
						
							2013-06-18 13:29:23 +02:00  
				
					
						
							
							
								 
						
							
								a135f5d9ed 
								
							 
						 
						
							
							
								
								added gemm_tcopy_2_bulldozer.S  
							
							
							
						 
						
							2013-06-18 11:01:33 +02:00  
				
					
						
							
							
								 
						
							
								d0b6299b13 
								
							 
						 
						
							
							
								
								added dgemm_tcopy_8_bulldozer.S  
							
							
							
						 
						
							2013-06-17 14:19:09 +02:00  
				
					
						
							
							
								 
						
							
								9e58dd509e 
								
							 
						 
						
							
							
								
								added gemm_ncopy_2_bulldozer.S  
							
							
							
						 
						
							2013-06-17 12:55:12 +02:00  
				
					
						
							
							
								 
						
							
								7c8227101b 
								
							 
						 
						
							
							
								
								cleanup of dgemv_n_bulldozer.S and optimization of inner loop  
							
							
							
						 
						
							2013-06-16 12:50:45 +02:00  
				
					
						
							
							
								 
						
							
								f67fa62851 
								
							 
						 
						
							
							
								
								added dgemv_n_bulldozer.S  
							
							
							
						 
						
							2013-06-15 16:42:37 +02:00  
				
					
						
							
							
								 
						
							
								cd1d473ba0 
								
							 
						 
						
							
							
								
								Merge pull request  #230  from wernsaar/develop  
							
							... 
							
							
							
							Refs #230 . New dgemm and sgemm Kernel for BULLDOZER 
							
						 
						
							2013-06-13 07:29:27 -07:00  
				
					
						
							
							
								 
						
							
								0ded1fcc1c 
								
							 
						 
						
							
							
								
								performance optimizations in sgemm_kernel_16x2_bulldozer.S  
							
							
							
						 
						
							2013-06-13 11:35:15 +02:00  
				
					
						
							
							
								 
						
							
								a789b588cd 
								
							 
						 
						
							
							
								
								added cgemm_kernel_4x2_bulldozer.S  
							
							
							
						 
						
							2013-06-12 15:55:27 +02:00  
				
					
						
							
							
								 
						
							
								8eaa04acbb 
								
							 
						 
						
							
							
								
								added zgemm_kernel_2x2_bulldozer.S  
							
							
							
						 
						
							2013-06-11 12:00:49 +02:00  
				
					
						
							
							
								 
						
							
								d854b30ae6 
								
							 
						 
						
							
							
								
								Added UNROLL values for 3M to getarch_2nd.c, Makefile.system and Makefile.L3  
							
							
							
						 
						
							2013-06-09 17:26:42 +02:00  
				
					
						
							
							
								 
						
							
								d65bbec99b 
								
							 
						 
						
							
							
								
								added new sgemm kernel for BULLDOZER  
							
							
							
						 
						
							2013-06-09 15:57:42 +02:00  
				
					
						
							
							
								 
						
							
								e4c39c7c26 
								
							 
						 
						
							
							
								
								changed stack touching  
							
							
							
						 
						
							2013-06-08 10:43:08 +02:00  
				
					
						
							
							
								 
						
							
								25491e42f9 
								
							 
						 
						
							
							
								
								New dgemm kernel for BULLDOZER: dgemm_kernel_8x2_bulldozer.S  
							
							
							
						 
						
							2013-06-08 09:40:17 +02:00  
				
					
						
							
							
								 
						
							
								9f59f384d8 
								
							 
						 
						
							
							
								
								Refs  #223 . Fixed s/dgemv bug on windows.  
							
							
							
						 
						
							2013-06-04 16:01:05 +08:00  
				
					
						
							
							
								 
						
							
								23965f164c 
								
							 
						 
						
							
							
								
								Fixed overflow internal buffer bug of (s/d/c/z)gemv on x86_64.  
							
							
							
						 
						
							2013-05-29 19:48:31 +08:00  
				
					
						
							
							
								 
						
							
								6a72840945 
								
							 
						 
						
							
							
								
								Fixed overflow internal buffer bug of (s/d/c/z)gemv on x86.  
							
							
							
						 
						
							2013-05-29 13:23:12 +08:00  
				
					
						
							
							
								 
						
							
								69aa6c8fb1 
								
							 
						 
						
							
							
								
								bad performance with some data  
							
							
							
						 
						
							2013-04-28 11:14:23 +02:00  
				
					
						
							
							
								 
						
							
								60b263f3d2 
								
							 
						 
						
							
							
								
								removed trsm_kernel_RT_4x4_bulldozer.S. wrong results  
							
							
							
						 
						
							2013-04-27 17:23:08 +02:00  
				
					
						
							
							
								 
						
							
								7ac306e0da 
								
							 
						 
						
							
							
								
								added trsm_kernel_RT_4x4_bulldozer.S  
							
							
							
						 
						
							2013-04-27 16:48:48 +02:00  
				
					
						
							
							
								 
						
							
								4cb454cdf2 
								
							 
						 
						
							
							
								
								added trsm_kernel_LT_4x4_bulldozer.S  
							
							
							
						 
						
							2013-04-27 14:30:00 +02:00  
				
					
						
							
							
								 
						
							
								19ad2fb128 
								
							 
						 
						
							
							
								
								prefetch improved. Defined 2 different kernels for inner loop  
							
							
							
						 
						
							2013-04-27 13:40:49 +02:00  
				
					
						
							
							
								 
						
							
								6821677489 
								
							 
						 
						
							
							
								
								minor improvements and code cleanup  
							
							
							
						 
						
							2013-04-26 20:05:42 +02:00  
				
					
						
							
							
								 
						
							
								3326f3152c 
								
							 
						 
						
							
							
								
								Merge pull request  #213  from wernsaar/develop  
							
							... 
							
							
							
							Merged some improvements into dgemm_kernel_4x4_bulldozer.S. 
							
						 
						
							2013-04-17 23:56:09 -07:00  
				
					
						
							
							
								 
						
							
								7641f6e253 
								
							 
						 
						
							
							
								
								Merged some improvements into dgemm_kernel_4x4_bulldozer.S.  
							
							... 
							
							
							
							Changed the copy functions to generic to solve prefetch conflicts 
							
						 
						
							2013-04-16 19:05:06 +02:00  
				
					
						
							
							
								 
						
							
								3ad29452d1 
								
							 
						 
						
							
							
								
								Merge pull request  #211  from wernsaar/develop  
							
							... 
							
							
							
							New version of dgemm_kernel_4x4_bulldozer.S 
							
						 
						
							2013-04-15 00:20:55 -07:00  
				
					
						
							
							
								 
						
							
								6e3f6f25a5 
								
							 
						 
						
							
							
								
								New version of dgemm_kernel_4x4_bulldozer.S  
							
							... 
							
							
							
							The peak performance with 8 cores is now 90 GFlops 
							
						 
						
							2013-04-12 17:55:51 +02:00  
				
					
						
							
							
								 
						
							
								724ae159ce 
								
							 
						 
						
							
							
								
								Fixed the Windows x86_64 ABI bug in s/daxpy kernels.  
							
							
							
						 
						
							2013-03-08 22:28:34 +08:00  
				
					
						
							
							
								 
						
							
								f300ce3df5 
								
							 
						 
						
							
							
								
								new optimization of dgemm kernel for bulldozer: 10% performance increase  
							
							
							
						 
						
							2013-03-06 17:26:03 +01:00  
				
					
						
							
							
								 
						
							
								66e64131ed 
								
							 
						 
						
							
							
								
								optimized again bulldozer dgemm kernel  
							
							
							
						 
						
							2013-03-05 19:51:37 +01:00  
				
					
						
							
							
								 
						
							
								9405f26f4b 
								
							 
						 
						
							
							
								
								new dgemm_kernel for bulldozer  
							
							
							
						 
						
							2013-03-04 17:37:38 +01:00  
				
					
						
							
							
								 
						
							
								5c8bf6ae0e 
								
							 
						 
						
							
							
								
								Merge branch 'bulldozer' into develop  
							
							
							
						 
						
							2013-02-10 01:19:42 +08:00  
				
					
						
							
							
								 
						
							
								a1ead62f28 
								
							 
						 
						
							
							
								
								Disable the warning of sgemm bulldozer kernel.  
							
							
							
						 
						
							2013-02-09 17:03:13 +01:00  
				
					
						
							
							
								 
						
							
								0133580148 
								
							 
						 
						
							
							
								
								Used sgemm bulldozer kernel on 64 bit.  
							
							
							
						 
						
							2013-02-09 16:29:14 +01:00  
				
					
						
							
							
								 
						
							
								274246651d 
								
							 
						 
						
							
							
								
								Merge branch 'bulldozer' of git://github.com/wernsaar/OpenBLAS into bulldozer  
							
							
							
						 
						
							2013-02-09 16:25:07 +01:00  
				
					
						
							
							
								 
						
							
								299b5a44dc 
								
							 
						 
						
							
							
								
								Merge branch 'develop' of github.com:xianyi/OpenBLAS into bulldozer  
							
							
							
						 
						
							2013-02-09 16:22:04 +01:00  
				
					
						
							
							
								 
						
							
								d311236dfd 
								
							 
						 
						
							
							
								
								Refs  #189 . Fixed the bug of s/cdot about invalid reading NAN on x86_64.  
							
							
							
						 
						
							2013-01-25 20:56:14 +08:00  
				
					
						
							
							
								 
						
							
								0b08f7479e 
								
							 
						 
						
							
							
								
								Refs  #154 . Fixed gemv_t bug about overflow 16MB buffer on x86.  
							
							
							
						 
						
							2013-01-20 21:22:12 +08:00  
				
					
						
							
							
								 
						
							
								99d1978df7 
								
							 
						 
						
							
							
								
								Fixed   #180 . the typos in kernel/x86_64/sgemv_t.S  
							
							
							
						 
						
							2013-01-12 12:31:14 +08:00  
				
					
						
							
							
								 
						
							
								08bf6674d5 
								
							 
						 
						
							
							
								
								Refs  #177 . Fixed sgemv_t compiling bug on Win64.  
							
							
							
						 
						
							2013-01-05 11:36:39 +08:00  
				
					
						
							
							
								 
						
							
								69200884e1 
								
							 
						 
						
							
							
								
								Refs  #173 . Fixed overflow internal buffer bug of gemv_n on x86  
							
							
							
						 
						
							2012-12-25 09:27:49 +08:00  
				
					
						
							
							
								 
						
							
								0d1518add9 
								
							 
						 
						
							
							
								
								Refs  #173 . Fixed overflow internal buffer bug of sgemv_t on x86  
							
							
							
						 
						
							2012-12-25 09:10:17 +08:00  
				
					
						
							
							
								 
						
							
								91ed4e4450 
								
							 
						 
						
							
							
								
								Refs  #171 . Prevent loading the dirty number from the buffer in sgemv_t x86 kernel.  
							
							
							
						 
						
							2012-12-23 23:14:17 +08:00  
				
					
						
							
							
								 
						
							
								fd3046b32a 
								
							 
						 
						
							
							
								
								Refs  #173 . Fixed overflow internal buffer bug of gemv_t on x86.  
							
							
							
						 
						
							2012-12-23 21:47:22 +08:00  
				
					
						
							
							
								 
						
							
								9fb341a9f8 
								
							 
						 
						
							
							
								
								set parameters for CORE_ATHLON  
							
							... 
							
							
							
							else dgemm_p is set to zero leading to a segfault in alloc_mmap due to
allocsize being zero 
							
						 
						
							2012-12-15 16:05:33 +01:00  
				
					
						
							
							
								 
						
							
								d48cff8cf1 
								
							 
						 
						
							
							
								
								Added optimized sgemm_kernel  
							
							
							
						 
						
							2012-12-08 18:50:53 +01:00  
				
					
						
							
							
								 
						
							
								f19af5ecc0 
								
							 
						 
						
							
							
								
								Refs  #54 . Added AMD Bulldozer x86_64 dgemm kernel developed by Werner Saar <wernsaar at googlemail.com>  
							
							... 
							
							
							
							Based on the dgemm kernel for AMD Barcelona, he used AVX and FMA4 instructions.
Thank Werner Saar! 
							
						 
						
							2012-12-07 01:05:11 +08:00  
				
					
						
							
							
								 
						
							
								bfaaa975e6 
								
							 
						 
						
							
							
								
								Added BULLDOZER target. So far it uses barcelona kernels.  
							
							
							
						 
						
							2012-12-07 00:53:31 +08:00  
				
					
						
							
							
								 
						
							
								b7c0fa6bd2 
								
							 
						 
						
							
							
								
								Init AMD Bulldozer codebase.  
							
							
							
						 
						
							2012-12-06 07:29:54 -05:00  
				
					
						
							
							
								 
						
							
								cea1a885b5 
								
							 
						 
						
							
							
								
								Refs  #154 . Fixed the build bug of dgemv_t on MinW64.  
							
							
							
						 
						
							2012-11-27 07:24:04 +08:00  
				
					
						
							
							
								 
						
							
								5f0117385e 
								
							 
						 
						
							
							
								
								Refs  #154 . Fixed a SEGFAULT bug of dgemv_t when m is very large.  
							
							... 
							
							
							
							It overflowed the internal buffer. Thus, we split vector x into blocks when m is very large.
Thank @wangqian for this patch. 
							
						 
						
							2012-11-19 22:32:27 +08:00  
				
					
						
							
							
								 
						
							
								2573311308 
								
							 
						 
						
							
							
								
								refs  #140 . Fixed zdot incompatibility ABI issue with GCC 4.7 on Win 32.  
							
							... 
							
							
							
							GCC 4.7 uses MSVC ABI on Win 32. This means the caller pops the hidden pointer for returning
aggregate structures larger than 8 bytes. 
							
						 
						
							2012-09-24 20:34:33 +08:00  
				
					
						
							
							
								 
						
							
								d0e731e8b8 
								
							 
						 
						
							
							
								
								provide support for passing CFLAGS, FFLAGS, PFLAGS, FPFLAGS to make on the command line  
							
							
							
						 
						
							2012-08-21 00:31:12 -04:00  
				
					
						
							
							
								 
						
							
								25f1a573fd 
								
							 
						 
						
							
							
								
								Fixed the build bug when DYNAMIC_ARCH=0.  
							
							
							
						 
						
							2012-07-07 12:12:24 +08:00  
				
					
						
							
							
								 
						
							
								857a0fa0df 
								
							 
						 
						
							
							
								
								Fixed the issue of mixing AVX and SSE codes in S/D/C/ZGEMM.  
							
							
							
						 
						
							2012-06-25 19:00:37 +08:00  
				
					
						
							
							
								 
						
							
								d34fce56e4 
								
							 
						 
						
							
							
								
								Refs  #83  Fixed S/DGEMM calling conventions bug on windows.  
							
							
							
						 
						
							2012-06-20 19:53:18 +08:00  
				
					
						
							
							
								 
						
							
								6cfcb54a28 
								
							 
						 
						
							
							
								
								Fixed align problem in S and C precision GEMM kernels.  
							
							
							
						 
						
							2012-06-20 07:38:39 +08:00  
				
					
						
							
							
								 
						
							
								3ef96aa567 
								
							 
						 
						
							
							
								
								Fixed bug in MOVQ redefine and ALIGN SIZE problem.  
							
							
							
						 
						
							2012-06-19 20:37:22 +08:00  
				
					
						
							
							
								 
						
							
								f76f952547 
								
							 
						 
						
							
							
								
								Refs  #83   #53 . Adding Intel Sandy Bridge (AVX supported) kernel codes for BLAS level 3 functions.  
							
							
							
						 
						
							2012-06-19 16:37:12 +08:00  
				
					
						
							
							
								 
						
							
								eefd30881c 
								
							 
						 
						
							
							
								
								Refs  #113 . Fixed the build bug on AMD Bobcat 64-bit OS.  
							
							
							
						 
						
							2012-06-02 21:34:23 +08:00  
				
					
						
							
							
								 
						
							
								d3b67d0bd8 
								
							 
						 
						
							
							
								
								Refs  #113 . Fixed the typo BOBCATE -> BOBCAT  
							
							
							
						 
						
							2012-05-31 22:40:15 +08:00  
				
					
						
							
							
								 
						
							
								d6cab3f37e 
								
							 
						 
						
							
							
								
								Refs  #113 . Support AMD Bobcate using Barcelona kernel codes. Replace 3DNow! with MMX.  
							
							
							
						 
						
							2012-05-31 18:17:45 +08:00  
				
					
						
							
							
								 
						
							
								a53c6e2440 
								
							 
						 
						
							
							
								
								Merge branch 'develop' into sandybridge  
							
							
							
						 
						
							2012-05-25 23:16:44 +08:00  
				
					
						
							
							
								 
						
							
								5d657c6e67 
								
							 
						 
						
							
							
								
								Fixed   #96  a SEGFAULT bug in samax on x86.  
							
							
							
						 
						
							2012-04-26 16:50:57 +08:00  
				
					
						
							
							
								 
						
							
								03b0eb19f7 
								
							 
						 
						
							
							
								
								Refs  #86 . Test alpha=Nan in x86/x86_64 dscale.  
							
							
							
						 
						
							2012-04-05 18:16:18 +08:00  
				
					
						
							
							
								 
						
							
								19a48b82cf 
								
							 
						 
						
							
							
								
								Init Sandybridge codes based on Nehalem.  
							
							
							
						 
						
							2012-03-30 20:01:03 +08:00  
				
					
						
							
							
								 
						
							
								3871b6a86d 
								
							 
						 
						
							
							
								
								Merge branch 'loongson3b' into release-0.1.0  
							
							
							
						 
						
							2012-03-23 01:26:44 +08:00  
				
					
						
							
							
								 
						
							
								83ecfbb9b3 
								
							 
						 
						
							
							
								
								Merge branch 'loongson3a' into release-0.1.0  
							
							
							
						 
						
							2012-03-23 01:26:27 +08:00  
				
					
						
							
							
								 
						
							
								dff146e306 
								
							 
						 
						
							
							
								
								refs  #80 . Used GEMV SSE2 kernels on x86.  
							
							
							
						 
						
							2012-03-19 17:56:22 +08:00  
				
					
						
							
							
								 
						
							
								8e53b57bb2 
								
							 
						 
						
							
							
								
								Appending gemmkernel and trmmkernel C code in kernel/generic, this code can be used to execute on a new platform which dose not have optimized assemble kernel.  
							
							
							
						 
						
							2012-01-10 17:16:13 +00:00  
				
					
						
							
							
								 
						
							
								66904fc4e8 
								
							 
						 
						
							
							
								
								BLAS3 used standard MIPS instructions without extensions on Loongson 3B.  
							
							
							
						 
						
							2011-11-25 11:20:25 +00:00  
				
					
						
							
							
								 
						
							
								0884f6b78d 
								
							 
						 
						
							
							
								
								Merge branch 'loongson3a' of github.com:xianyi/OpenBLAS into loongson3b  
							
							
							
						 
						
							2011-11-11 14:26:49 +00:00  
				
					
						
							
							
								 
						
							
								2d78fb05c8 
								
							 
						 
						
							
							
								
								Add conjugate condition to gemv.  
							
							
							
						 
						
							2011-11-10 15:38:48 +00:00  
				
					
						
							
							
								 
						
							
								b95ad4cfaf 
								
							 
						 
						
							
							
								
								Support detecting ICT Loongson-3B CPU.  
							
							
							
						 
						
							2011-11-09 19:29:50 +00:00  
				
					
						
							
							
								 
						
							
								3bbe3ddb31 
								
							 
						 
						
							
							
								
								Merge branch 'develop' of github.com:xianyi/OpenBLAS into loongson3b  
							
							
							
						 
						
							2011-11-09 19:08:29 +00:00  
				
					
						
							
							
								 
						
							
								a32e56500a 
								
							 
						 
						
							
							
								
								Fix the compute error of gemv when incx and incy are negative numbers.  
							
							
							
						 
						
							2011-11-04 19:32:21 +00:00  
				
					
						
							
							
								 
						
							
								c1e618ea2d 
								
							 
						 
						
							
							
								
								Add complete gemv function on Loongson3a platform.  
							
							
							
						 
						
							2011-11-03 13:53:48 +00:00  
				
					
						
							
							
								 
						
							
								19f5b5c132 
								
							 
						 
						
							
							
								
								Fixed   #66  the bug in zgemv kernel with transpose matrix on 64-bit MingW (Windows).  
							
							
							
						 
						
							2011-10-18 18:44:23 +08:00  
				
					
						
							
							
								 
						
							
								c852ce3981 
								
							 
						 
						
							
							
								
								Ref  #65 . Fixed 64-bit Windows calling convention bug in cdot and zdot.  
							
							... 
							
							
							
							According to 64-bit Windows calling convention, the return value is in %rax instead of %xmm0 in cdot kernel.
In zdot, the caller allocates a memory space for return value and sets this memory address to the first hidden parameter. Thus, the callee (zdot) should assign the result to this memory space and return the memory address in %rax. 
							
						 
						
							2011-10-18 10:23:17 +08:00  
				
					
						
							
							
								 
						
							
								e08cfaf9ca 
								
							 
						 
						
							
							
								
								Complete all the complex single-precision functions of level3, but the performance needs further improve.  
							
							
							
						 
						
							2011-09-16 17:50:40 +00:00  
				
					
						
							
							
								 
						
							
								ee4bb8bd25 
								
							 
						 
						
							
							
								
								Add ctrmm part in cgemm_kernel_loongson3a_4x2_ps.S.  
							
							
							
						 
						
							2011-09-16 16:08:39 +00:00  
				
					
						
							
							
								 
						
							
								7fa3d23dd9 
								
							 
						 
						
							
							
								
								Complete cgemm function, but no optimization.  
							
							
							
						 
						
							2011-09-15 16:08:23 +00:00  
				
					
						
							
							
								 
						
							
								9679dd077e 
								
							 
						 
						
							
							
								
								Fix some compute error.  
							
							
							
						 
						
							2011-09-14 20:00:35 +00:00  
				
					
						
							
							
								 
						
							
								7b410b7f0e 
								
							 
						 
						
							
							
								
								Fixed   #58  zdot SEGFAULT bug with GCC-4.6. Thank Mr. John for this patch.  
							
							... 
							
							
							
							In i386 calling convention, the caller put the address of return value of zdot into the first hidden parameter.
Thus, the callee should delete this address before return.
Actually, I have fixed the same bug on x86/zdot_sse2.S (issue #32 ). However, that is not a good implementation which uses 3 instructions. Mr. John told me used "ret $0x4" to skip the first hidden address (4 bytes). 
							
						 
						
							2011-09-14 23:52:51 +08:00  
				
					
						
							
							
								 
						
							
								d238a768ab 
								
							 
						 
						
							
							
								
								Use ps instructions in cgemm.  
							
							
							
						 
						
							2011-09-14 15:32:25 +00:00  
				
					
						
							
							
								 
						
							
								b1fe26c45a 
								
							 
						 
						
							
							
								
								refs  #55 . Changed  DTB_ENTRIES to DTB_DEFAULT_ENTRIES in x86 gemv_n kernel codes.  
							
							
							
						 
						
							2011-09-06 14:14:07 +08:00  
				
					
						
							
							
								 
						
							
								9fc6764fa7 
								
							 
						 
						
							
							
								
								refs  #55 . Added DTB_ENTRIES into dynamic arch setting parameters. Now, it can read DTB_ENTRIES on runtime.  
							
							
							
						 
						
							2011-09-05 17:37:07 +08:00  
				
					
						
							
							
								 
						
							
								74d4cdb81a 
								
							 
						 
						
							
							
								
								Fix an illegal instruction for strmm_RTLU.  
							
							
							
						 
						
							2011-09-02 19:41:06 +00:00  
				
					
						
							
							
								 
						
							
								7906146836 
								
							 
						 
						
							
							
								
								Fix an error for strmm_LLTN.  
							
							
							
						 
						
							2011-09-02 16:57:33 +00:00  
				
					
						
							
							
								 
						
							
								3274ff47b8 
								
							 
						 
						
							
							
								
								Fix an error for strmm_LLTN.  
							
							
							
						 
						
							2011-09-02 16:50:50 +00:00  
				
					
						
							
							
								 
						
							
								a059c553a1 
								
							 
						 
						
							
							
								
								Fix a compute error for strmm.  
							
							
							
						 
						
							2011-09-02 16:00:04 +00:00  
				
					
						
							
							
								 
						
							
								23e182ca7c 
								
							 
						 
						
							
							
								
								Fix stack-pointer bug for strmm.  
							
							
							
						 
						
							2011-09-02 15:28:01 +00:00  
				
					
						
							
							
								 
						
							
								a15bc95824 
								
							 
						 
						
							
							
								
								Add strmm part.  
							
							
							
						 
						
							2011-09-02 09:15:09 +00:00  
				
					
						
							
							
								 
						
							
								09f49fa891 
								
							 
						 
						
							
							
								
								Using PS instructions to improve the performance of sgemm and it is 4.2Gflops now.  
							
							
							
						 
						
							2011-08-31 21:24:03 +00:00  
				
					
						
							
							
								 
						
							
								cb0214787b 
								
							 
						 
						
							
							
								
								Modify compile options.  
							
							
							
						 
						
							2011-08-30 20:57:00 +00:00  
				
					
						
							
							
								 
						
							
								2e8cdd1542 
								
							 
						 
						
							
							
								
								Using ps instruction.  
							
							
							
						 
						
							2011-08-30 20:54:19 +00:00  
				
					
						
							
							
								 
						
							
								c8360e3ae5 
								
							 
						 
						
							
							
								
								Complete all the plura single precision functions of level3 on Loongson3a, the performance is 2.3GFlops.  
							
							
							
						 
						
							2011-07-18 17:03:38 +00:00  
				
					
						
							
							
								 
						
							
								68532fa9ec 
								
							 
						 
						
							
							
								
								Merge branch 'loongson3a' of github.com:xianyi/OpenBLAS into loongson3a  
							
							
							
						 
						
							2011-06-24 09:28:12 +00:00  
				
					
						
							
							
								 
						
							
								708d2b6255 
								
							 
						 
						
							
							
								
								Fix compute error in ztrmm.  
							
							
							
						 
						
							2011-06-24 09:27:41 +00:00  
				
					
						
							
							
								 
						
							
								e72113f06a 
								
							 
						 
						
							
							
								
								Add ztrmm and ztrsm part on loongson3a. The average performance is 2.2G.  
							
							
							
						 
						
							2011-06-23 21:11:00 +00:00  
				
					
						
							
							
								 
						
							
								14f81da375 
								
							 
						 
						
							
							
								
								Change prefetch length of A and B, the performance is 2.1G now.  
							
							
							
						 
						
							2011-06-23 10:46:58 +00:00  
				
					
						
							
							
								 
						
							
								fc21f7ad28 
								
							 
						 
						
							
							
								
								Merge branch 'release-v0.1alpha2' into loongson3a  
							
							
							
						 
						
							2011-06-23 16:08:23 +08:00  
				
					
						
							
							
								 
						
							
								1c96d345e2 
								
							 
						 
						
							
							
								
								Improve zgemm performance from 1G to 1.8G, change block size in param.h.  
							
							
							
						 
						
							2011-06-21 22:16:23 +00:00  
				
					
						
							
							
								 
						
							
								c4efde7713 
								
							 
						 
						
							
							
								
								Merge branch 'loongson3a' into release-v0.1alpha2  
							
							
							
						 
						
							2011-06-21 17:50:00 +08:00  
				
					
						
							
							
								 
						
							
								32353a9d30 
								
							 
						 
						
							
							
								
								Refs  #20 . Fixed the installation bug with DYNAMIC_ARCH=1.  
							
							
							
						 
						
							2011-06-21 17:39:08 +08:00  
				
					
						
							
							
								 
						
							
								b3d1887745 
								
							 
						 
						
							
							
								
								Fixed   #35  a build bug with NO_LAPACK=1 DYNAMIC_ARCH=1 FC=gfortran. I forgot to test it with gfortran in last bug fixed commit.  
							
							
							
						 
						
							2011-06-09 22:59:49 +08:00  
				
					
						
							
							
								 
						
							
								8d50a9fd1a 
								
							 
						 
						
							
							
								
								Fixed   #35  a build bug with NO_LAPACK=1 & DYNAMIC_ARCH=1.  
							
							
							
						 
						
							2011-06-09 11:38:59 +08:00  
				
					
						
							
							
								 
						
							
								4335bca2f7 
								
							 
						 
						
							
							
								
								Fixed   #33  ztrmm bug on Nehalem.  
							
							
							
						 
						
							2011-06-07 12:53:25 +08:00  
				
					
						
							
							
								 
						
							
								31040e4d80 
								
							 
						 
						
							
							
								
								Fixed   #32  a SEGFAULT bug with gcc-4.6. According to i386 calling convention, the called function should remove the hidden return value address from the stack.  
							
							
							
						 
						
							2011-06-03 13:19:54 +08:00  
				
					
						
							
							
								 
						
							
								88d94d0ec8 
								
							 
						 
						
							
							
								
								Fixed   #30  strmm computational error on Loongson3A.  
							
							
							
						 
						
							2011-05-28 09:48:34 +00:00  
				
					
						
							
							
								 
						
							
								fc84909115 
								
							 
						 
						
							
							
								
								Modify single precision compiler conditions, increasing single precision kernel code on Loongson3a.  
							
							
							
						 
						
							2011-05-27 09:47:17 +00:00  
				
					
						
							
							
								 
						
							
								5ca4e51df0 
								
							 
						 
						
							
							
								
								Remove the useless code, modify code comments and format.  
							
							
							
						 
						
							2011-05-18 10:54:51 +00:00  
				
					
						
							
							
								 
						
							
								fcb5ce011b 
								
							 
						 
						
							
							
								
								Fixed   #28 . Convert the result to double precision in MIPS64 dsdot_k kernel.  
							
							
							
						 
						
							2011-05-17 21:24:00 +00:00  
				
					
						
							
							
								 
						
							
								a9320f896e 
								
							 
						 
						
							
							
								
								Fixed   #25  dtrmm and dtrsm computational error on Loongson3A.  
							
							
							
						 
						
							2011-05-14 22:00:57 +00:00  
				
					
						
							
							
								 
						
							
								b206fc7075 
								
							 
						 
						
							
							
								
								Fixed   #28 . Convert the result to double precision in the end of dsdot kernel.  
							
							
							
						 
						
							2011-05-13 02:34:30 +08:00  
				
					
						
							
							
								 
						
							
								29dce62b8f 
								
							 
						 
						
							
							
								
								Finish dtrsm_kernel_Rx.S on Loongson3A.  
							
							
							
						 
						
							2011-05-11 10:44:23 +00:00  
				
					
						
							
							
								 
						
							
								432c309f63 
								
							 
						 
						
							
							
								
								Finish dtrsm_kernel_Lx.S on Loongson3A.  
							
							
							
						 
						
							2011-05-10 12:48:43 +00:00  
				
					
						
							
							
								 
						
							
								d2f351d819 
								
							 
						 
						
							
							
								
								Modify dtrsm compiler options  
							
							
							
						 
						
							2011-05-09 17:31:58 +00:00  
				
					
						
							
							
								 
						
							
								5a991b7149 
								
							 
						 
						
							
							
								
								Fixed   #24  dtrmm error on Loongson3A  
							
							
							
						 
						
							2011-05-09 17:28:20 +00:00  
				
					
						
							
							
								 
						
							
								9320933520 
								
							 
						 
						
							
							
								
								Complete dtrmm function.  
							
							
							
						 
						
							2011-04-17 20:26:49 +00:00  
				
					
						
							
							
								 
						
							
								921caefa56 
								
							 
						 
						
							
							
								
								Added handling for the trmm part, no edge handling. Test sizes (M and N) must be a multiple of 4.  
							
							
							
						 
						
							2011-04-15 21:56:25 +00:00  
				
					
						
							
							
								 
						
							
								ecd4c1f3d9 
								
							 
						 
						
							
							
								
								Modify prefetching C.  
							
							
							
						 
						
							2011-04-11 22:46:36 +00:00  
				
					
						
							
							
								 
						
							
								ab9e4ce351 
								
							 
						 
						
							
							
								
								Adjust kc size from 112 to 116.  
							
							
							
						 
						
							2011-04-11 22:17:57 +00:00  
				
					
						
							
							
								 
						
							
								782205a693 
								
							 
						 
						
							
							
								
								Add dgemm compiler Options in KERNEL.LOONGSON3A.  
							
							
							
						 
						
							2011-04-06 10:38:34 +00:00  
				
					
						
							
							
								 
						
							
								ac494c0d04 
								
							 
						 
						
							
							
								
								New kernel in LOONGSON3A.  
							
							
							
						 
						
							2011-04-06 10:36:44 +00:00  
				
					
						
							
							
								 
						
							
								f405b5bcc5 
								
							 
						 
						
							
							
								
								Fixed the bug about Loongson3A gsLQC1 & gsSQC1 instructions in daxpy kernel. Now daxpy is correct.  
							
							
							
						 
						
							2011-03-18 23:05:56 +00:00  
				
					
						
							
							
								 
						
							
								d5cffd506a 
								
							 
						 
						
							
							
								
								Modified the default kernel makefile in MIPS64 arch.  
							
							
							
						 
						
							2011-03-07 11:23:12 +00:00  
				
					
						
							
							
								 
						
							
								5838f12995 
								
							 
						 
						
							
							
								
								Support unaligned addresses in daxpy on loongson3a simd.  
							
							
							
						 
						
							2011-03-05 10:17:10 +08:00  
				
					
						
							
							
								 
						
							
								5444a3f8f7 
								
							 
						 
						
							
							
								
								Unroll to 16 in daxpy on loongson3a.  
							
							
							
						 
						
							2011-03-04 17:50:17 +08:00  
				
					
						
							
							
								 
						
							
								88cbfcc5b5 
								
							 
						 
						
							
							
								
								Merge commit 'origin/x86' into loongson3a  
							
							
							
						 
						
							2011-03-04 14:11:52 +00:00  
				
					
						
							
							
								 
						
							
								ce78abe37e 
								
							 
						 
						
							
							
								
								Merge branch 'x86' of github.com:xianyi/OpenBLAS into x86  
							
							
							
						 
						
							2011-03-04 11:53:04 +08:00  
				
					
						
							
							
								 
						
							
								8f1090d32a 
								
							 
						 
						
							
							
								
								Support NO_LAPACK=1 to build the lib without LAPACK functions.  
							
							
							
						 
						
							2011-03-04 11:51:32 +08:00  
				
					
						
							
							
								 
						
							
								272f62a2b6 
								
							 
						 
						
							
							
								
								Changed movlps macro name in capital in x86/zdot_sse2.S file.  
							
							
							
						 
						
							2011-03-03 00:46:39 +08:00  
				
					
						
							
							
								 
						
							
								36016fe349 
								
							 
						 
						
							
							
								
								On x86 32bits, gcc 4.4.3 generated wrong codes (movsd) from movlps in zdot_sse2.S line 191.  
							
							... 
							
							
							
							This would cause zdotu & zdotc failures. Instead, use movlpd to work around it. Fixed  #8 . Fixed  #9 . 
							
						 
						
							2011-03-02 18:45:43 +08:00  
				
					
						
							
							
								 
						
							
								6eb02bbb9c 
								
							 
						 
						
							
							
								
								Merge remote branch 'origin/x86' into loongson3a  
							
							
							
						 
						
							2011-03-02 13:52:05 +08:00  
				
					
						
							
							
								 
						
							
								12214e1d0f 
								
							 
						 
						
							
							
								
								Fixed   #7 . Modified axpy kernel codes to avoid unloop with incx==0 or incy==0 in x86 32bits arch.  
							
							
							
						 
						
							2011-02-23 20:08:34 +08:00  
				
					
						
							
							
								 
						
							
								0cfd29a819 
								
							 
						 
						
							
							
								
								Fixed   #7 . 1)Disable the multi-thread and  2) Modified kernel codes to avoid unloop in axpy function when incx==0 or incy==0.  
							
							
							
						 
						
							2011-02-21 00:24:21 +08:00  
				
					
						
							
							
								 
						
							
								bfaa80c316 
								
							 
						 
						
							
							
								
								fixed   #4  csrot & drot returned the wrong result when incx==incy==0 on i686 arch.  
							
							
							
						 
						
							2011-02-18 03:00:58 +08:00  
				
					
						
							
							
								 
						
							
								c5852d4e30 
								
							 
						 
						
							
							
								
								fixed   #4  csrot returned the wrong result when incx==incy==0.  
							
							
							
						 
						
							2011-02-16 23:39:43 +08:00  
				
					
						
							
							
								 
						
							
								84ba64e65b 
								
							 
						 
						
							
							
								
								fixed a bug in drot when incx or incy equals zero.  
							
							
							
						 
						
							2011-02-16 23:35:41 +08:00  
				
					
						
							
							
								 
						
							
								1e671b49f3 
								
							 
						 
						
							
							
								
								Did the experiment with Loongson 3A 128bit load & store instruction.  
							
							
							
						 
						
							2011-01-29 03:05:27 +08:00  
				
					
						
							
							
								 
						
							
								77b7020d69 
								
							 
						 
						
							
							
								
								changed prefetch order.  
							
							
							
						 
						
							2011-01-29 03:03:34 +08:00  
				
					
						
							
							
								 
						
							
								e003b811ab 
								
							 
						 
						
							
							
								
								load x & y contiguously in axpy.  
							
							
							
						 
						
							2011-01-28 11:18:50 +08:00  
				
					
						
							
							
								 
						
							
								ebe2da8474 
								
							 
						 
						
							
							
								
								Modified aligned size. Added additional prefetch instruction because of cache line is 32 bytes in Loongson 3A.  
							
							
							
						 
						
							2011-01-27 23:07:06 +08:00  
				
					
						
							
							
								 
						
							
								c0b5992fab 
								
							 
						 
						
							
							
								
								added axpy kernel with prefetch for Loongson3A. To-Do: tuning prefetch distance & instruction order.  
							
							
							
						 
						
							2011-01-26 22:34:33 +08:00  
				
					
						
							
							
								 
						
							
								342bbc3871 
								
							 
						 
						
							
							
								
								Import GotoBLAS2 1.13 BSD version codes.  
							
							
							
						 
						
							2011-01-24 14:54:24 +00:00