Prepared lapack/potrf functions for UNROLL values that are not a power of two
This commit is contained in:
parent
2ffbbb54f6
commit
c81dc6322f
|
@ -239,7 +239,7 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
|
|||
|
||||
|
||||
for (i = 1; i < DIVIDE_RATE; i++) {
|
||||
buffer[i] = buffer[i - 1] + GEMM_Q * (((div_n + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N - 1) * COMPSIZE;
|
||||
buffer[i] = buffer[i - 1] + GEMM_Q * (((div_n + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N) * COMPSIZE;
|
||||
}
|
||||
|
||||
for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
|
||||
|
|
|
@ -89,7 +89,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
|||
newarg.beta = NULL;
|
||||
newarg.nthreads = args -> nthreads;
|
||||
|
||||
blocking = (n / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
|
||||
blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N;
|
||||
if (blocking > GEMM_Q) blocking = GEMM_Q;
|
||||
|
||||
for (i = 0; i < n; i += blocking) {
|
||||
|
|
|
@ -89,7 +89,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
|||
newarg.beta = NULL;
|
||||
newarg.nthreads = args -> nthreads;
|
||||
|
||||
blocking = (n / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
|
||||
blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N;
|
||||
if (blocking > GEMM_Q) blocking = GEMM_Q;
|
||||
|
||||
for (i = 0; i < n; i += blocking) {
|
||||
|
|
|
@ -163,7 +163,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
|||
min_i = GEMM_P;
|
||||
} else
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
}
|
||||
|
||||
#ifdef SHARED_ARRAY
|
||||
|
|
|
@ -183,7 +183,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld\n", mypos, m_from, m_to);
|
||||
#endif
|
||||
|
||||
div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
|
||||
buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
||||
for (i = 1; i < DIVIDE_RATE; i++) {
|
||||
|
@ -248,7 +248,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
min_i = GEMM_P;
|
||||
} else
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
}
|
||||
|
||||
#ifndef LOWER
|
||||
|
@ -265,7 +265,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
while (current >= 0)
|
||||
#endif
|
||||
{
|
||||
div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
|
||||
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
|
||||
|
||||
|
@ -296,7 +296,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
min_i = GEMM_P;
|
||||
} else
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
}
|
||||
|
||||
#ifndef LOWER
|
||||
|
@ -313,7 +313,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
while (current >= 0)
|
||||
#endif
|
||||
{
|
||||
div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||
div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
|
||||
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
|
||||
|
||||
|
@ -429,9 +429,9 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
|
|||
|
||||
double di = (double)i;
|
||||
|
||||
width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);
|
||||
width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1));
|
||||
|
||||
if (num_cpu == 0) width = n - ((n - width) & ~mask);
|
||||
if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1));
|
||||
|
||||
if ((width > n - i) || (width < mask)) width = n - i;
|
||||
|
||||
|
@ -471,7 +471,7 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
|
|||
|
||||
double di = (double)i;
|
||||
|
||||
width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);
|
||||
width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1));
|
||||
|
||||
if ((width > n - i) || (width < mask)) width = n - i;
|
||||
|
||||
|
@ -582,7 +582,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
|||
newarg.beta = NULL;
|
||||
newarg.nthreads = args -> nthreads;
|
||||
|
||||
blocking = (n / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
|
||||
blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N;
|
||||
if (blocking > GEMM_Q) blocking = GEMM_Q;
|
||||
|
||||
for (i = 0; i < n; i += blocking) {
|
||||
|
|
Loading…
Reference in New Issue