redefined functions for TIMING and YIELDING for ARMV7 processor
This commit is contained in:
parent
e31186efd4
commit
5400a9f4e4
8
common.h
8
common.h
|
@ -310,10 +310,18 @@ typedef int blasint;
|
|||
#define YIELDING SwitchToThread()
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef ARMV7
|
||||
/* ARMV7 yield: expands to a short run of eight NOP instructions.
 * The expansion deliberately carries no trailing semicolon, so that
 * `YIELDING;` is exactly one statement, consistent with the other
 * YIELDING definitions (SwitchToThread() / sched_yield()).
 * __asm__/__volatile__ are used instead of asm/volatile so the macro is
 * also accepted when compiling in a strict ISO C mode. */
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop; \n")
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef YIELDING
|
||||
#define YIELDING sched_yield()
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/***
|
||||
To alloc job_t on heap or stack.
|
||||
please see https://github.com/xianyi/OpenBLAS/issues/246
|
||||
|
|
|
@ -104,11 +104,13 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
|||
}
|
||||
|
||||
|
||||
/*
 * rpcc - return a wall-clock timestamp in milliseconds.
 *
 * Reads the current time of day with gettimeofday() and converts it to a
 * whole number of milliseconds (seconds * 1000 + microseconds / 1000).
 * The result is returned as unsigned long long.
 *
 * The conversion is done in integer arithmetic: it avoids the
 * non-standard `1000.0d` floating constant suffix and any rounding that a
 * detour through double could introduce.
 */
static inline unsigned long long rpcc(void){
  /* Zero-initialised so the result is well defined even if the call fails. */
  struct timeval tv = {0, 0};

  gettimeofday(&tv, NULL);

  return (unsigned long long) tv.tv_sec * 1000ULL
       + (unsigned long long) tv.tv_usec / 1000ULL;
}
|
||||
|
||||
|
|
|
@ -36,6 +36,8 @@
|
|||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
// #define TIMING 1
|
||||
|
||||
/* This file is a template for level 3 operation */
|
||||
|
||||
#ifndef BETA_OPERATION
|
||||
|
@ -341,8 +343,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
#else
|
||||
#elif defined(ARMV7)
|
||||
if (min_jj >= 32) min_jj = 32;
|
||||
else
|
||||
if (min_jj >= 16) min_jj = 16;
|
||||
else
|
||||
if (min_jj >= 8) min_jj = 8;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
#endif
|
||||
|
||||
|
@ -402,12 +412,22 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
#ifdef TIMING
|
||||
total = (double)outercost + (double)innercost + (double)kernelcost;
|
||||
|
||||
#ifdef ARMV7
|
||||
|
||||
printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f\n",
|
||||
innercost / total * 100., outercost / total * 100.,
|
||||
kernelcost / total * 100.);
|
||||
|
||||
|
||||
#else
|
||||
|
||||
printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n",
|
||||
innercost / total * 100., outercost / total * 100.,
|
||||
kernelcost / total * 100.,
|
||||
(double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / (double)kernelcost * 100. * (double)COMPSIZE / 2.,
|
||||
(double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. * (double)COMPSIZE / 2.);
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -36,6 +36,8 @@
|
|||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
// #define TIMING 1
|
||||
|
||||
#ifndef CACHE_LINE_SIZE
|
||||
#define CACHE_LINE_SIZE 8
|
||||
#endif
|
||||
|
@ -233,6 +235,21 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
BLASLONG l1stride, l2size;
|
||||
|
||||
#ifdef TIMING
|
||||
|
||||
#ifdef ARMV7
|
||||
|
||||
unsigned long long rpcc_counter;
|
||||
unsigned long long copy_A = 0;
|
||||
unsigned long long copy_B = 0;
|
||||
unsigned long long kernel = 0;
|
||||
unsigned long long waiting1 = 0;
|
||||
unsigned long long waiting2 = 0;
|
||||
unsigned long long waiting3 = 0;
|
||||
unsigned long long waiting6[MAX_CPU_NUMBER];
|
||||
unsigned long long ops = 0;
|
||||
|
||||
#else
|
||||
|
||||
BLASULONG rpcc_counter;
|
||||
BLASULONG copy_A = 0;
|
||||
BLASULONG copy_B = 0;
|
||||
|
@ -243,6 +260,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
BLASULONG waiting6[MAX_CPU_NUMBER];
|
||||
BLASULONG ops = 0;
|
||||
|
||||
#endif
|
||||
|
||||
for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0;
|
||||
#endif
|
||||
|
||||
|
@ -320,15 +339,35 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
min_l = k - ls;
|
||||
|
||||
#ifdef ARMV7_1
|
||||
if (min_l >= GEMM_Q / 4 * 2) {
|
||||
min_l = GEMM_Q / 4;
|
||||
} else {
|
||||
if (min_l > GEMM_Q / 4) min_l = (min_l + 1) / 2;
|
||||
}
|
||||
|
||||
#else
|
||||
if (min_l >= GEMM_Q * 2) {
|
||||
min_l = GEMM_Q;
|
||||
} else {
|
||||
if (min_l > GEMM_Q) min_l = (min_l + 1) / 2;
|
||||
}
|
||||
#endif
|
||||
|
||||
l1stride = 1;
|
||||
min_i = m_to - m_from;
|
||||
|
||||
#ifdef ARMV7_1
|
||||
if (min_i >= GEMM_P / 4 * 2) {
|
||||
min_i = GEMM_P / 4;
|
||||
} else {
|
||||
if (min_i > GEMM_P / 4) {
|
||||
min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
|
||||
} else {
|
||||
if (args -> nthreads == 1) l1stride = 0;
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (min_i >= GEMM_P * 2) {
|
||||
min_i = GEMM_P;
|
||||
} else {
|
||||
|
@ -339,6 +378,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
START_RPCC();
|
||||
|
||||
ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
|
||||
|
@ -375,6 +416,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
#elif defined(ARMV7)
|
||||
if (min_jj >= 16) min_jj = 16;
|
||||
else
|
||||
if (min_jj >= 8) min_jj = 8;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
|
||||
#else
|
||||
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
@ -506,6 +555,21 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
STOP_RPCC(waiting3);
|
||||
|
||||
#ifdef TIMING
|
||||
|
||||
#ifdef ARMV7
|
||||
|
||||
unsigned long long waiting = waiting1 + waiting2 + waiting3;
|
||||
unsigned long long total = copy_A + copy_B + kernel + waiting;
|
||||
|
||||
fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f",
|
||||
mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100.,
|
||||
(double)waiting1 /(double)total * 100.,
|
||||
(double)waiting2 /(double)total * 100.,
|
||||
(double)waiting3 /(double)total * 100.,
|
||||
(double)kernel /(double)total * 100.);
|
||||
|
||||
#else
|
||||
|
||||
BLASLONG waiting = waiting1 + waiting2 + waiting3;
|
||||
BLASLONG total = copy_A + copy_B + kernel + waiting;
|
||||
|
||||
|
@ -516,6 +580,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
(double)waiting3 /(double)total * 100.,
|
||||
(double)ops/(double)kernel / 4. * 100.);
|
||||
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n",
|
||||
mypos, copy_A, copy_B, waiting);
|
||||
|
|
Loading…
Reference in New Issue