Merge pull request #1623 from fenrus75/fast-thread
Initialize only the required subset of the jobs array, fix barriers and improve switch ratio on SkylakeX and Haswell. For issue #1622
This commit is contained in:
commit
5a6a2bed9a
|
@ -60,8 +60,13 @@
|
|||
#endif
|
||||
*/
|
||||
|
||||
#define MB
|
||||
#define WMB
|
||||
#ifdef __GNUC__
|
||||
#define MB do { __asm__ __volatile__("": : :"memory"); } while (0)
|
||||
#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
|
||||
#else
|
||||
#define MB do {} while (0)
|
||||
#define WMB do {} while (0)
|
||||
#endif
|
||||
|
||||
static void __inline blas_lock(volatile BLASULONG *address){
|
||||
|
||||
|
|
|
@ -91,11 +91,7 @@
|
|||
#endif
|
||||
|
||||
typedef struct {
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
_Atomic
|
||||
#else
|
||||
volatile
|
||||
#endif
|
||||
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
} job_t;
|
||||
|
||||
|
@ -351,7 +347,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
/* Make sure if no one is using workspace */
|
||||
START_RPCC();
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||
STOP_RPCC(waiting1);
|
||||
|
||||
#if defined(FUSED_GEMM) && !defined(TIMING)
|
||||
|
@ -413,7 +409,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
/* Wait until other region of B is initialized */
|
||||
START_RPCC();
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
|
||||
STOP_RPCC(waiting2);
|
||||
|
||||
/* Apply kernel with local region of A and part of other region of B */
|
||||
|
@ -431,6 +427,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
/* Clear synchronization flag if this thread is done with other region of B */
|
||||
if (m_to - m_from == min_i) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
} while (current != mypos);
|
||||
|
@ -492,7 +489,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
START_RPCC();
|
||||
for (i = 0; i < args -> nthreads; i++) {
|
||||
for (js = 0; js < DIVIDE_RATE; js++) {
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;};
|
||||
}
|
||||
}
|
||||
STOP_RPCC(waiting3);
|
||||
|
@ -658,8 +655,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
|||
}
|
||||
|
||||
/* Clear synchronization flags */
|
||||
for (i = 0; i < MAX_CPU_NUMBER; i++) {
|
||||
for (j = 0; j < MAX_CPU_NUMBER; j++) {
|
||||
for (i = 0; i < nthreads; i++) {
|
||||
for (j = 0; j < nthreads; j++) {
|
||||
for (k = 0; k < DIVIDE_RATE; k++) {
|
||||
job[i].working[j][CACHE_LINE_SIZE * k] = 0;
|
||||
}
|
||||
|
|
4
param.h
4
param.h
|
@ -1507,7 +1507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define SYMV_P 8
|
||||
|
||||
#define SWITCH_RATIO 4
|
||||
#define SWITCH_RATIO 32
|
||||
|
||||
#ifdef ARCH_X86
|
||||
|
||||
|
@ -1626,7 +1626,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define SYMV_P 8
|
||||
|
||||
#define SWITCH_RATIO 4
|
||||
#define SWITCH_RATIO 32
|
||||
|
||||
#ifdef ARCH_X86
|
||||
|
||||
|
|
Loading…
Reference in New Issue