Merge pull request #1623 from fenrus75/fast-thread
Initialize only the required subset of the jobs array, fix barriers, and improve the switch ratio on SkylakeX and Haswell. Addresses issue #1622.
This commit is contained in:
commit
5a6a2bed9a
|
@ -60,8 +60,13 @@
|
||||||
#endif
|
#endif
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#define MB
|
#ifdef __GNUC__
|
||||||
#define WMB
|
#define MB do { __asm__ __volatile__("": : :"memory"); } while (0)
|
||||||
|
#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
|
||||||
|
#else
|
||||||
|
#define MB do {} while (0)
|
||||||
|
#define WMB do {} while (0)
|
||||||
|
#endif
|
||||||
|
|
||||||
static void __inline blas_lock(volatile BLASULONG *address){
|
static void __inline blas_lock(volatile BLASULONG *address){
|
||||||
|
|
||||||
|
|
|
@ -91,11 +91,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
#if __STDC_VERSION__ >= 201112L
|
|
||||||
_Atomic
|
|
||||||
#else
|
|
||||||
volatile
|
volatile
|
||||||
#endif
|
|
||||||
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||||
} job_t;
|
} job_t;
|
||||||
|
|
||||||
|
@ -351,7 +347,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
/* Make sure if no one is using workspace */
|
/* Make sure if no one is using workspace */
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
for (i = 0; i < args -> nthreads; i++)
|
for (i = 0; i < args -> nthreads; i++)
|
||||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
|
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||||
STOP_RPCC(waiting1);
|
STOP_RPCC(waiting1);
|
||||||
|
|
||||||
#if defined(FUSED_GEMM) && !defined(TIMING)
|
#if defined(FUSED_GEMM) && !defined(TIMING)
|
||||||
|
@ -413,7 +409,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
/* Wait until other region of B is initialized */
|
/* Wait until other region of B is initialized */
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
|
||||||
STOP_RPCC(waiting2);
|
STOP_RPCC(waiting2);
|
||||||
|
|
||||||
/* Apply kernel with local region of A and part of other region of B */
|
/* Apply kernel with local region of A and part of other region of B */
|
||||||
|
@ -431,6 +427,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
/* Clear synchronization flag if this thread is done with other region of B */
|
/* Clear synchronization flag if this thread is done with other region of B */
|
||||||
if (m_to - m_from == min_i) {
|
if (m_to - m_from == min_i) {
|
||||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||||
|
WMB;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} while (current != mypos);
|
} while (current != mypos);
|
||||||
|
@ -492,7 +489,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
for (i = 0; i < args -> nthreads; i++) {
|
for (i = 0; i < args -> nthreads; i++) {
|
||||||
for (js = 0; js < DIVIDE_RATE; js++) {
|
for (js = 0; js < DIVIDE_RATE; js++) {
|
||||||
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;};
|
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
STOP_RPCC(waiting3);
|
STOP_RPCC(waiting3);
|
||||||
|
@ -658,8 +655,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Clear synchronization flags */
|
/* Clear synchronization flags */
|
||||||
for (i = 0; i < MAX_CPU_NUMBER; i++) {
|
for (i = 0; i < nthreads; i++) {
|
||||||
for (j = 0; j < MAX_CPU_NUMBER; j++) {
|
for (j = 0; j < nthreads; j++) {
|
||||||
for (k = 0; k < DIVIDE_RATE; k++) {
|
for (k = 0; k < DIVIDE_RATE; k++) {
|
||||||
job[i].working[j][CACHE_LINE_SIZE * k] = 0;
|
job[i].working[j][CACHE_LINE_SIZE * k] = 0;
|
||||||
}
|
}
|
||||||
|
|
4
param.h
4
param.h
|
@ -1507,7 +1507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#define SYMV_P 8
|
#define SYMV_P 8
|
||||||
|
|
||||||
#define SWITCH_RATIO 4
|
#define SWITCH_RATIO 32
|
||||||
|
|
||||||
#ifdef ARCH_X86
|
#ifdef ARCH_X86
|
||||||
|
|
||||||
|
@ -1626,7 +1626,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#define SYMV_P 8
|
#define SYMV_P 8
|
||||||
|
|
||||||
#define SWITCH_RATIO 4
|
#define SWITCH_RATIO 32
|
||||||
|
|
||||||
#ifdef ARCH_X86
|
#ifdef ARCH_X86
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue