optimizations for trmm
This commit is contained in:
		
							parent
							
								
									125610d23b
								
							
						
					
					
						commit
						1b10ff129a
					
				|  | @ -135,7 +135,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | ||||||
| 
 | 
 | ||||||
|     for(jjs = js; jjs < js + min_j; jjs += min_jj){ |     for(jjs = js; jjs < js + min_j; jjs += min_jj){ | ||||||
|       min_jj = min_j + js - jjs; |       min_jj = min_j + js - jjs; | ||||||
|       if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; |       if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; | ||||||
|  |       else | ||||||
|  |       	if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | ||||||
| 
 | 
 | ||||||
|       START_RPCC(); |       START_RPCC(); | ||||||
| 
 | 
 | ||||||
|  | @ -199,7 +201,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | ||||||
| 
 | 
 | ||||||
|       for(jjs = js; jjs < js + min_j; jjs += min_jj){ |       for(jjs = js; jjs < js + min_j; jjs += min_jj){ | ||||||
| 	min_jj = min_j + js - jjs; | 	min_jj = min_j + js - jjs; | ||||||
| 	if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; |         if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; | ||||||
|  |         else | ||||||
|  | 	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | ||||||
| 
 | 
 | ||||||
| 	START_RPCC(); | 	START_RPCC(); | ||||||
| 
 | 
 | ||||||
|  | @ -288,7 +292,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | ||||||
| 
 | 
 | ||||||
|     for(jjs = js; jjs < js + min_j; jjs += min_jj){ |     for(jjs = js; jjs < js + min_j; jjs += min_jj){ | ||||||
|       min_jj = min_j + js - jjs; |       min_jj = min_j + js - jjs; | ||||||
|       if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; |       if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; | ||||||
|  |       else | ||||||
|  |         if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | ||||||
| 
 | 
 | ||||||
|       START_RPCC(); |       START_RPCC(); | ||||||
| 
 | 
 | ||||||
|  | @ -352,7 +358,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | ||||||
| 
 | 
 | ||||||
|       for(jjs = js; jjs < js + min_j; jjs += min_jj){ |       for(jjs = js; jjs < js + min_j; jjs += min_jj){ | ||||||
| 	min_jj = min_j + js - jjs; | 	min_jj = min_j + js - jjs; | ||||||
| 	if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; |         if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; | ||||||
|  |         else | ||||||
|  | 	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | ||||||
| 
 | 
 | ||||||
| 	START_RPCC(); | 	START_RPCC(); | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -119,7 +119,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | ||||||
| 
 | 
 | ||||||
|       for(jjs = 0; jjs < ls - js; jjs += min_jj){ |       for(jjs = 0; jjs < ls - js; jjs += min_jj){ | ||||||
| 	min_jj = ls - js - jjs; | 	min_jj = ls - js - jjs; | ||||||
| 	if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | 	if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; | ||||||
|  | 	else | ||||||
|  | 	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | ||||||
| 
 | 
 | ||||||
| #ifndef TRANSA | #ifndef TRANSA | ||||||
| 	GEMM_ONCOPY(min_l, min_jj, a + (ls + (js + jjs) * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE); | 	GEMM_ONCOPY(min_l, min_jj, a + (ls + (js + jjs) * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE); | ||||||
|  | @ -137,7 +139,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | ||||||
| 
 | 
 | ||||||
|       for(jjs = 0; jjs < min_l; jjs += min_jj){ |       for(jjs = 0; jjs < min_l; jjs += min_jj){ | ||||||
| 	min_jj = min_l - jjs; | 	min_jj = min_l - jjs; | ||||||
| 	if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | 	if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; | ||||||
|  | 	else | ||||||
|  | 	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | ||||||
| 
 | 
 | ||||||
| #ifndef TRANSA | #ifndef TRANSA | ||||||
| 	TRMM_OLNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE); | 	TRMM_OLNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE); | ||||||
|  | @ -188,7 +192,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | ||||||
| 
 | 
 | ||||||
|       for(jjs = js; jjs < js + min_j; jjs += min_jj){ |       for(jjs = js; jjs < js + min_j; jjs += min_jj){ | ||||||
| 	min_jj = min_j + js - jjs; | 	min_jj = min_j + js - jjs; | ||||||
| 	if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | 	if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; | ||||||
|  | 	else | ||||||
|  | 	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | ||||||
| 
 | 
 | ||||||
| #ifndef TRANSA | #ifndef TRANSA | ||||||
| 	GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); | 	GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); | ||||||
|  | @ -239,7 +245,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | ||||||
| 
 | 
 | ||||||
|       for(jjs = 0; jjs < min_l; jjs += min_jj){ |       for(jjs = 0; jjs < min_l; jjs += min_jj){ | ||||||
| 	min_jj = min_l - jjs; | 	min_jj = min_l - jjs; | ||||||
| 	if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | 	if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; | ||||||
|  | 	else | ||||||
|  | 	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | ||||||
| 
 | 
 | ||||||
| #ifndef TRANSA | #ifndef TRANSA | ||||||
| 	TRMM_OUNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE); | 	TRMM_OUNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE); | ||||||
|  | @ -258,7 +266,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | ||||||
| 
 | 
 | ||||||
|       for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ |       for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ | ||||||
| 	min_jj = js - ls - min_l - jjs; | 	min_jj = js - ls - min_l - jjs; | ||||||
| 	if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | 	if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; | ||||||
|  | 	else | ||||||
|  | 	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | ||||||
| 
 | 
 | ||||||
| #ifndef TRANSA | #ifndef TRANSA | ||||||
| 	GEMM_ONCOPY(min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda, | 	GEMM_ONCOPY(min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda, | ||||||
|  | @ -313,7 +323,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | ||||||
| 
 | 
 | ||||||
|       for(jjs = js; jjs < js + min_j; jjs += min_jj){ |       for(jjs = js; jjs < js + min_j; jjs += min_jj){ | ||||||
| 	min_jj = min_j + js - jjs; | 	min_jj = min_j + js - jjs; | ||||||
| 	if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | 	if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; | ||||||
|  | 	else | ||||||
|  | 	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | ||||||
| 
 | 
 | ||||||
| #ifndef TRANSA | #ifndef TRANSA | ||||||
| 	GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); | 	GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue