diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c
index a7c28f4c2..c38a2632d 100644
--- a/lapack/potrf/potrf_parallel.c
+++ b/lapack/potrf/potrf_parallel.c
@@ -105,6 +105,14 @@ typedef struct {
   BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
 } job_t;
 
+#ifdef HAVE_C11
+#define atomic_load_long(p) __atomic_load_n(p, __ATOMIC_RELAXED)
+#define atomic_store_long(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED)
+#else
+#define atomic_load_long(p) (BLASLONG)(*(volatile BLASLONG*)(p))
+#define atomic_store_long(p, v) (*(volatile BLASLONG *)(p)) = (v)
+#endif
+
 
 #ifndef KERNEL_OPERATION
 #ifndef COMPLEX
@@ -233,14 +241,18 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
       }
 
 #ifndef LOWER
+      MB;
       for (i = 0; i <= mypos; i++)
-        job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
+        atomic_store_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside], (BLASLONG)buffer[bufferside]);
+        // job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
 #else
+      MB
       for (i = mypos; i < args -> nthreads; i++)
-        job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
+        atomic_store_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside], (BLASLONG)buffer[bufferside]);
+//      job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
 #endif
 
-      WMB;
+//      WMB;
     }
 
     min_i = m_to - m_from;
@@ -271,14 +283,21 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
       for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
 
         /* thread has to wait */
-        if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
+        if (current != mypos)
+          do {
+            jw = atomic_load_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside]);
+          } while (jw == 0);
+        MB;
+
+        //while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
 
         KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
                          sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                          c, lda, m_from, xxx);
 
         if (m_from + min_i >= m_to) {
-          job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
+          atomic_store_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside], job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0);
+//        job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
           WMB;
         }
       }
@@ -323,7 +342,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
                              c, lda, is, xxx);
 
             if (is + min_i >= m_to) {
-              job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
+              atomic_store_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside], job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0);
+//            job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
               WMB;
             }
           }
@@ -337,9 +357,18 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
   for (i = 0; i < args -> nthreads; i++) {
     if (i != mypos) {
-      for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
+      for (xxx = 0; xxx < DIVIDE_RATE; xxx++)
+#if 1
+      {
+        do {
+          jw = atomic_load_long(&job[mypos].working[i][CACHE_LINE_SIZE * xxx]);
+        } while (jw);
+        MB;
+      }
+#else
         while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
-      }
+#endif
+      //  }
     }
   }
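
Note (not part of the patch): below is a minimal standalone sketch of the hand-off pattern the diff introduces, for readers unfamiliar with it. A producer fills a buffer, issues a write barrier, then publishes a flag with a relaxed atomic store; a waiter spins on relaxed atomic loads and issues a read barrier before touching the buffer. The `atomic_load_long`/`atomic_store_long` fallbacks mirror the `#else` branch of the patch, while `MB`/`WMB` here stand in for OpenBLAS's architecture-specific barrier macros and are approximated with `__atomic_thread_fence`; the `flag`, `payload`, `producer`, and `consumer` names are illustrative only.

```c
/* Sketch of the relaxed load/store + barrier hand-off used in the patch.
 * Assumes GCC/Clang __atomic builtins and POSIX threads; not OpenBLAS code. */
#include <pthread.h>
#include <stdio.h>

#if defined(__GNUC__)
#define atomic_load_long(p)     __atomic_load_n(p, __ATOMIC_RELAXED)
#define atomic_store_long(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED)
#define MB  __atomic_thread_fence(__ATOMIC_ACQUIRE)   /* read barrier  */
#define WMB __atomic_thread_fence(__ATOMIC_RELEASE)   /* write barrier */
#else
/* Fallback mirrors the #else branch of the patch: plain volatile accesses. */
#define atomic_load_long(p)     (*(volatile long *)(p))
#define atomic_store_long(p, v) (*(volatile long *)(p)) = (v)
#define MB
#define WMB
#endif

static long   flag;     /* 0 = buffer not ready, nonzero = ready */
static double payload;  /* data published by the producer        */

static void *producer(void *arg) {
  (void)arg;
  payload = 42.0;               /* fill the buffer ...                 */
  WMB;                          /* ... make it visible before the flag */
  atomic_store_long(&flag, 1);  /* relaxed store of the ready flag     */
  return NULL;
}

static void *consumer(void *arg) {
  (void)arg;
  long jw;
  do {                          /* spin until the flag becomes set     */
    jw = atomic_load_long(&flag);
  } while (jw == 0);
  MB;                           /* order the flag read before payload  */
  printf("payload = %g\n", payload);
  atomic_store_long(&flag, 0);  /* hand the slot back                  */
  return NULL;
}

int main(void) {
  pthread_t p, c;
  pthread_create(&c, NULL, consumer, NULL);
  pthread_create(&p, NULL, producer, NULL);
  pthread_join(p, NULL);
  pthread_join(c, NULL);
  return 0;
}
```

Build with `gcc -O2 -pthread`. The point the sketch illustrates is that the relaxed atomic accesses only guarantee the flag itself is re-read on every iteration; the separate `MB`/`WMB` barriers are still what orders the payload accesses around the flag, which is why the patch keeps them next to the new `atomic_load_long`/`atomic_store_long` calls.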