Merge pull request #1623 from fenrus75/fast-thread

Initialize only the required subset of the jobs array, fix barriers and improve switch ratio on SkylakeX and Haswell. For issue #1622
This commit is contained in:
Martin Kroeker 2018-06-18 09:02:40 +02:00 committed by GitHub
commit 5a6a2bed9a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 15 additions and 13 deletions

View File

@ -60,8 +60,13 @@
#endif
*/
#define MB
#define WMB
#ifdef __GNUC__
#define MB do { __asm__ __volatile__("": : :"memory"); } while (0)
#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
#else
#define MB do {} while (0)
#define WMB do {} while (0)
#endif
static void __inline blas_lock(volatile BLASULONG *address){

View File

@ -91,11 +91,7 @@
#endif
typedef struct {
#if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;
@ -351,7 +347,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Make sure if no one is using workspace */
START_RPCC();
for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
STOP_RPCC(waiting1);
#if defined(FUSED_GEMM) && !defined(TIMING)
@ -413,7 +409,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Wait until other region of B is initialized */
START_RPCC();
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
STOP_RPCC(waiting2);
/* Apply kernel with local region of A and part of other region of B */
@ -431,6 +427,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Clear synchronization flag if this thread is done with other region of B */
if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB;
}
}
} while (current != mypos);
@ -492,7 +489,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
START_RPCC();
for (i = 0; i < args -> nthreads; i++) {
for (js = 0; js < DIVIDE_RATE; js++) {
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;};
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;};
}
}
STOP_RPCC(waiting3);
@ -658,8 +655,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
}
/* Clear synchronization flags */
for (i = 0; i < MAX_CPU_NUMBER; i++) {
for (j = 0; j < MAX_CPU_NUMBER; j++) {
for (i = 0; i < nthreads; i++) {
for (j = 0; j < nthreads; j++) {
for (k = 0; k < DIVIDE_RATE; k++) {
job[i].working[j][CACHE_LINE_SIZE * k] = 0;
}

View File

@ -1507,7 +1507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 8
#define SWITCH_RATIO 4
#define SWITCH_RATIO 32
#ifdef ARCH_X86
@ -1626,7 +1626,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 8
#define SWITCH_RATIO 4
#define SWITCH_RATIO 32
#ifdef ARCH_X86