Merge f1aaf0777a
into 0ab5bf1746
This commit is contained in:
commit
83b0712602
|
@ -237,7 +237,7 @@ int main(int argc, char *argv[]){
|
|||
timeg = time1/loops;
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops %10.6f sec\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)n / timeg * 1.e-6, time1);
|
||||
COMPSIZE * COMPSIZE * (2.*(double)k+2.) * (double)m * (double)n / timeg * 1.e-6, time1);
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -801,33 +801,21 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
|
||||
i = (m & 1);
|
||||
if (i > 0) {
|
||||
if (X > posY) {
|
||||
/* a01 += 2;
|
||||
a02 += 2; */
|
||||
b += 4;
|
||||
} else
|
||||
#ifdef UNIT
|
||||
if (X < posY) {
|
||||
#endif
|
||||
b[ 0] = *(a01 + 0);
|
||||
b[ 1] = *(a01 + 1);
|
||||
b[ 2] = *(a01 + 2);
|
||||
b[ 3] = *(a01 + 3);
|
||||
|
||||
/* a01 += lda;
|
||||
a02 += lda; */
|
||||
b += 4;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
} else {
|
||||
b[ 0] = ONE;
|
||||
b[ 1] = ZERO;
|
||||
#else
|
||||
b[ 0] = *(a01 + 0);
|
||||
b[ 1] = *(a01 + 1);
|
||||
}
|
||||
#endif
|
||||
b[ 2] = *(a01 + 2);
|
||||
b[ 3] = *(a01 + 3);
|
||||
b += 4;
|
||||
}
|
||||
}
|
||||
posY += 2;
|
||||
}
|
||||
|
||||
|
|
|
@ -301,7 +301,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
|
|||
dtrmm_kernel_4x8( temp, &alpha , ptrba, ptrbb, C0, C1, C2, C3, C4, C5, C6, C7);
|
||||
|
||||
ptrba = ptrba + temp * 4;
|
||||
ptrbb = ptrbb + temp * 8;
|
||||
// ptrbb = ptrbb + temp * 8;
|
||||
|
||||
/*
|
||||
for (k=0; k<temp; k++)
|
||||
|
@ -446,7 +446,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
|
|||
temp - 8; // number of values in B
|
||||
|
||||
ptrba += temp*4; // number of values in A
|
||||
ptrbb += temp*8; // number of values in B
|
||||
// ptrbb += temp*8; // number of values in B
|
||||
}
|
||||
#ifdef LEFT
|
||||
off += 4; // number of values in A
|
||||
|
@ -709,14 +709,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
|
|||
off += 1; // number of values in A
|
||||
#endif
|
||||
|
||||
C0 = C0+1;
|
||||
/* C0 = C0+1;
|
||||
C1 = C1+1;
|
||||
C2 = C2+1;
|
||||
C3 = C3+1;
|
||||
C4 = C4+1;
|
||||
C5 = C5+1;
|
||||
C6 = C6+1;
|
||||
C7 = C7+1;
|
||||
C7 = C7+1; */
|
||||
|
||||
}
|
||||
|
||||
|
@ -862,7 +862,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
|
|||
temp - 4; // number of values in B */
|
||||
|
||||
ptrba += temp*4; // number of values in A
|
||||
ptrbb += temp*4; // number of values in B
|
||||
// ptrbb += temp*4; // number of values in B
|
||||
}
|
||||
#ifdef LEFT
|
||||
off += 4; // number of values in A
|
||||
|
@ -1049,10 +1049,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
|
|||
off += 1; // number of values in A
|
||||
#endif
|
||||
|
||||
C0 = C0+1;
|
||||
/* C0 = C0+1;
|
||||
C1 = C1+1;
|
||||
C2 = C2+1;
|
||||
C3 = C3+1;
|
||||
C3 = C3+1; */
|
||||
|
||||
}
|
||||
|
||||
|
@ -1311,8 +1311,8 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
|
|||
off += 1; // number of values in A
|
||||
#endif
|
||||
|
||||
C0 = C0+1;
|
||||
C1 = C1+1;
|
||||
/* C0 = C0+1;
|
||||
C1 = C1+1; */
|
||||
|
||||
}
|
||||
|
||||
|
@ -1532,7 +1532,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
|
|||
off += 1; // number of values in A
|
||||
#endif
|
||||
|
||||
C0 = C0+1;
|
||||
// C0 = C0+1;
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -124,13 +124,13 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
|
|||
min_jj = js + min_j - jjs;
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
if (0 && GEMM_UNROLL_N <= 8) {
|
||||
/* if (0 && GEMM_UNROLL_N <= 8) {
|
||||
|
||||
LASWP_NCOPY(min_jj, off + 1, off + k,
|
||||
c + (- off + jjs * lda) * COMPSIZE, lda,
|
||||
ipiv, sbb + k * (jjs - js) * COMPSIZE);
|
||||
|
||||
} else {
|
||||
} else { */
|
||||
|
||||
LASWP_PLUS(min_jj, off + 1, off + k, ZERO,
|
||||
#ifdef COMPLEX
|
||||
|
@ -140,7 +140,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
|
|||
|
||||
GEMM_ONCOPY (k, min_jj, c + jjs * lda * COMPSIZE, lda, sbb + (jjs - js) * k * COMPSIZE);
|
||||
|
||||
}
|
||||
// }
|
||||
|
||||
for (is = 0; is < k; is += GEMM_P) {
|
||||
min_i = k - is;
|
||||
|
@ -251,14 +251,14 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
|
|||
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
if (0 && GEMM_UNROLL_N <= 8) {
|
||||
/* if (0 && GEMM_UNROLL_N <= 8) {
|
||||
printf("helllo\n");
|
||||
|
||||
LASWP_NCOPY(min_jj, off + 1, off + k,
|
||||
b + (- off + jjs * lda) * COMPSIZE, lda,
|
||||
ipiv, buffer[bufferside] + (jjs - xxx) * k * COMPSIZE);
|
||||
|
||||
} else {
|
||||
} else { */
|
||||
|
||||
LASWP_PLUS(min_jj, off + 1, off + k, ZERO,
|
||||
#ifdef COMPLEX
|
||||
|
@ -268,7 +268,7 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
|
|||
|
||||
GEMM_ONCOPY (k, min_jj, b + jjs * lda * COMPSIZE, lda,
|
||||
buffer[bufferside] + (jjs - xxx) * k * COMPSIZE);
|
||||
}
|
||||
// }
|
||||
|
||||
for (is = 0; is < k; is += GEMM_P) {
|
||||
min_i = k - is;
|
||||
|
|
|
@ -54,7 +54,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
|||
BLASLONG n, info;
|
||||
BLASLONG bk, i, blocking, start_i;
|
||||
int mode;
|
||||
BLASLONG lda, range_N[2];
|
||||
BLASLONG lda;//, range_N[2];
|
||||
blas_arg_t newarg;
|
||||
FLOAT *a;
|
||||
FLOAT alpha[2] = { ONE, ZERO};
|
||||
|
@ -100,8 +100,8 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
|||
bk = n - i;
|
||||
if (bk > blocking) bk = blocking;
|
||||
|
||||
range_N[0] = i;
|
||||
range_N[1] = i + bk;
|
||||
/* range_N[0] = i;
|
||||
range_N[1] = i + bk; */
|
||||
|
||||
newarg.lda = lda;
|
||||
newarg.ldb = lda;
|
||||
|
|
|
@ -54,7 +54,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
|||
BLASLONG n, info;
|
||||
BLASLONG bk, i, blocking;
|
||||
int mode;
|
||||
BLASLONG lda, range_N[2];
|
||||
BLASLONG lda; // , range_N[2];
|
||||
blas_arg_t newarg;
|
||||
FLOAT *a;
|
||||
FLOAT alpha[2] = { ONE, ZERO};
|
||||
|
@ -96,8 +96,8 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
|||
bk = n - i;
|
||||
if (bk > blocking) bk = blocking;
|
||||
|
||||
range_N[0] = i;
|
||||
range_N[1] = i + bk;
|
||||
/* range_N[0] = i;
|
||||
range_N[1] = i + bk; */
|
||||
|
||||
newarg.lda = lda;
|
||||
newarg.ldb = lda;
|
||||
|
|
Loading…
Reference in New Issue