Optimizations for trsm: let min_jj step by GEMM_UNROLL_N*3 when more than that remains, otherwise clamp to GEMM_UNROLL_N
This commit is contained in:
		
							parent
							
								
									1b10ff129a
								
							
						
					
					
						commit
						3ea4dadd30
					
				| 
						 | 
					@ -128,6 +128,8 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      for(jjs = js; jjs < js + min_j; jjs += min_jj){
 | 
					      for(jjs = js; jjs < js + min_j; jjs += min_jj){
 | 
				
			||||||
	min_jj = min_j + js - jjs;
 | 
						min_jj = min_j + js - jjs;
 | 
				
			||||||
 | 
						if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
 | 
				
			||||||
 | 
						else
 | 
				
			||||||
	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 | 
						  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
 | 
						GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
 | 
				
			||||||
| 
						 | 
					@ -194,6 +196,8 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      for(jjs = js; jjs < js + min_j; jjs += min_jj){
 | 
					      for(jjs = js; jjs < js + min_j; jjs += min_jj){
 | 
				
			||||||
	min_jj = min_j + js - jjs;
 | 
						min_jj = min_j + js - jjs;
 | 
				
			||||||
 | 
						if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
 | 
				
			||||||
 | 
						else
 | 
				
			||||||
	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 | 
						  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
 | 
						GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -123,6 +123,8 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      for(jjs = js; jjs < js + min_j; jjs += min_jj){
 | 
					      for(jjs = js; jjs < js + min_j; jjs += min_jj){
 | 
				
			||||||
	min_jj = min_j + js - jjs;
 | 
						min_jj = min_j + js - jjs;
 | 
				
			||||||
 | 
						if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
 | 
				
			||||||
 | 
						else
 | 
				
			||||||
	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 | 
						  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifndef TRANSA
 | 
					#ifndef TRANSA
 | 
				
			||||||
| 
						 | 
					@ -177,6 +179,8 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      for(jjs = 0; jjs < min_j - min_l - ls + js; jjs += min_jj){
 | 
					      for(jjs = 0; jjs < min_j - min_l - ls + js; jjs += min_jj){
 | 
				
			||||||
	min_jj = min_j - min_l - ls + js - jjs;
 | 
						min_jj = min_j - min_l - ls + js - jjs;
 | 
				
			||||||
 | 
						if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
 | 
				
			||||||
 | 
						else
 | 
				
			||||||
	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 | 
						  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifndef TRANSA
 | 
					#ifndef TRANSA
 | 
				
			||||||
| 
						 | 
					@ -238,6 +242,8 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      for(jjs = js; jjs < js + min_j; jjs += min_jj){
 | 
					      for(jjs = js; jjs < js + min_j; jjs += min_jj){
 | 
				
			||||||
	min_jj = min_j + js - jjs;
 | 
						min_jj = min_j + js - jjs;
 | 
				
			||||||
 | 
						if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
 | 
				
			||||||
 | 
						else
 | 
				
			||||||
	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 | 
						  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifndef TRANSA
 | 
					#ifndef TRANSA
 | 
				
			||||||
| 
						 | 
					@ -297,6 +303,8 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      for(jjs = 0; jjs < min_j - js + ls; jjs += min_jj){
 | 
					      for(jjs = 0; jjs < min_j - js + ls; jjs += min_jj){
 | 
				
			||||||
	min_jj = min_j - js + ls - jjs;
 | 
						min_jj = min_j - js + ls - jjs;
 | 
				
			||||||
 | 
						if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
 | 
				
			||||||
 | 
						else
 | 
				
			||||||
	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 | 
						  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifndef TRANSA
 | 
					#ifndef TRANSA
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue