From fcb77ab129821690fac4e532640c5cfa786c3a79 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Jun 2018 16:57:58 +0200 Subject: [PATCH 01/24] Update OSX deployment target to 10.8 fixes #1580 --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 62ba0e466..5dffd8d2e 100644 --- a/Makefile.system +++ b/Makefile.system @@ -248,7 +248,7 @@ endif ifeq ($(OSNAME), Darwin) ifndef MACOSX_DEPLOYMENT_TARGET -export MACOSX_DEPLOYMENT_TARGET=10.6 +export MACOSX_DEPLOYMENT_TARGET=10.8 endif MD5SUM = md5 -r endif From bf40f806efa55c7a7c7ec57535919598eaeb569d Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Thu, 14 Jun 2018 12:18:04 +0100 Subject: [PATCH 02/24] Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 
3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than it's neighbors, we must be hitting some optimal cache size or something. --- driver/others/memory.c | 199 +++++++++-------------------------------- 1 file changed, 43 insertions(+), 156 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index d69e52e97..85f790615 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -13,9 +13,9 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the OpenBLAS project nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" @@ -139,6 +139,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FIXED_PAGESIZE 4096 #endif +#ifndef BUFFERS_PER_THREAD +#ifdef USE_OPENMP +#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) +#else +#define BUFFERS_PER_THREAD NUM_BUFFERS +#endif +#endif + #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) #if defined(_MSC_VER) && !defined(__clang__) @@ -213,7 +221,7 @@ int i,n; ret = sched_getaffinity(0,size,cpusetp); if (ret!=0) return nums; ret = CPU_COUNT_S(size,cpusetp); - if (ret > 0 && ret < nums) nums = ret; + if (ret > 0 && ret < nums) nums = ret; CPU_FREE(cpusetp); return nums; #endif @@ -415,8 +423,15 @@ struct release_t { int hugetlb_allocated = 0; -static struct release_t release_info[NUM_BUFFERS]; -static int release_pos = 0; +#if defined(OS_WINDOWS) +#define THREAD_LOCAL __declspec(thread) +#define UNLIKELY_TO_BE_ZERO(x) (x) +#else +#define THREAD_LOCAL __thread +#define UNLIKELY_TO_BE_ZERO(x) (__builtin_expect(x, 0)) +#endif +static struct release_t THREAD_LOCAL release_info[BUFFERS_PER_THREAD]; +static int THREAD_LOCAL release_pos = 0; #if defined(OS_LINUX) && !defined(NO_WARMUP) static int hot_alloc = 0; @@ -459,15 +474,9 @@ static void *alloc_mmap(void *address){ } if (map_address != (void *)-1) { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif } #ifdef OS_LINUX @@ -611,15 +620,9 @@ static void *alloc_mmap(void *address){ #endif if (map_address != (void *)-1) { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif } return map_address; @@ -872,7 +875,7 @@ static void *alloc_hugetlb(void *address){ tp.PrivilegeCount = 1; tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; - + if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) { CloseHandle(hToken); return (void*)-1; @@ -961,20 +964,17 @@ static BLASULONG base_address = 0UL; static BLASULONG base_address = BASE_ADDRESS; #endif -static volatile struct { - BLASULONG lock; +struct memory_t { void *addr; -#if defined(WHEREAMI) && !defined(USE_OPENMP) - int pos; -#endif int used; #ifndef __64BIT__ char dummy[48]; #else char dummy[40]; #endif +}; -} memory[NUM_BUFFERS]; +static struct memory_t THREAD_LOCAL memory[BUFFERS_PER_THREAD]; static int memory_initialized = 0; @@ -987,9 +987,6 @@ static int memory_initialized = 0; void *blas_memory_alloc(int procpos){ int position; -#if defined(WHEREAMI) && !defined(USE_OPENMP) - int mypos; -#endif void *map_address; @@ -1020,102 +1017,48 @@ void *blas_memory_alloc(int procpos){ }; void *(**func)(void *address); -#if defined(USE_OPENMP) - if (!memory_initialized) { -#endif + if (UNLIKELY_TO_BE_ZERO(memory_initialized)) { - LOCK_COMMAND(&alloc_lock); + /* Only allow a single thread to initialize memory system */ + LOCK_COMMAND(&alloc_lock); - if (!memory_initialized) { - -#if defined(WHEREAMI) && !defined(USE_OPENMP) - for (position = 0; position < NUM_BUFFERS; position ++){ - memory[position].addr = (void *)0; - memory[position].pos = -1; - memory[position].used = 0; - memory[position].lock = 0; - } -#endif + if (!memory_initialized) { #ifdef DYNAMIC_ARCH - gotoblas_dynamic_init(); + gotoblas_dynamic_init(); #endif #if defined(SMP) && defined(OS_LINUX) && 
!defined(NO_AFFINITY) - gotoblas_affinity_init(); + gotoblas_affinity_init(); #endif #ifdef SMP - if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); + if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); #endif #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) #ifndef DYNAMIC_ARCH - blas_set_parameter(); + blas_set_parameter(); #endif #endif - memory_initialized = 1; + memory_initialized = 1; + } + UNLOCK_COMMAND(&alloc_lock); } - UNLOCK_COMMAND(&alloc_lock); -#if defined(USE_OPENMP) - } -#endif #ifdef DEBUG printf("Alloc Start ...\n"); -#endif - -#if defined(WHEREAMI) && !defined(USE_OPENMP) - - mypos = WhereAmI(); - - position = mypos; - while (position >= NUM_BUFFERS) position >>= 1; - - do { - if (!memory[position].used && (memory[position].pos == mypos)) { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#else - blas_lock(&memory[position].lock); -#endif - if (!memory[position].used) goto allocation; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#else - blas_unlock(&memory[position].lock); -#endif - } - - position ++; - - } while (position < NUM_BUFFERS); - - #endif position = 0; do { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#else - if (!memory[position].used) { - blas_lock(&memory[position].lock); -#endif if (!memory[position].used) goto allocation; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#else - blas_unlock(&memory[position].lock); - } -#endif - position ++; - } while (position < NUM_BUFFERS); + } while (position < BUFFERS_PER_THREAD); goto error; @@ -1126,11 +1069,6 @@ void *blas_memory_alloc(int procpos){ #endif memory[position].used = 1; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#else - blas_unlock(&memory[position].lock); -#endif if (!memory[position].addr) { do { @@ -1148,14 +1086,14 @@ void *blas_memory_alloc(int procpos){ #ifdef ALLOC_DEVICEDRIVER if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { - fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n"); } #endif #ifdef ALLOC_HUGETLBFILE if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { #ifndef OS_WINDOWS - fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n"); #endif } #endif @@ -1176,44 +1114,13 @@ void *blas_memory_alloc(int procpos){ } while ((BLASLONG)map_address == -1); -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif memory[position].addr = map_address; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif #ifdef DEBUG printf(" Mapping Succeeded. 
%p(%d)\n", (void *)memory[position].addr, position); #endif } -#if defined(WHEREAMI) && !defined(USE_OPENMP) - - if (memory[position].pos == -1) memory[position].pos = mypos; - -#endif - -#ifdef DYNAMIC_ARCH - - if (memory_initialized == 1) { - - LOCK_COMMAND(&alloc_lock); - - if (memory_initialized == 1) { - - if (!gotoblas) gotoblas_dynamic_init(); - - memory_initialized = 2; - } - - UNLOCK_COMMAND(&alloc_lock); - - } -#endif - - #ifdef DEBUG printf("Mapped : %p %3d\n\n", (void *)memory[position].addr, position); @@ -1222,7 +1129,7 @@ void *blas_memory_alloc(int procpos){ return (void *)memory[position].addr; error: - printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); + printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n"); return NULL; } @@ -1236,10 +1143,7 @@ void blas_memory_free(void *free_area){ #endif position = 0; -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif - while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) + while ((position < BUFFERS_PER_THREAD) && (memory[position].addr != free_area)) position++; if (memory[position].addr != free_area) goto error; @@ -1248,13 +1152,7 @@ void blas_memory_free(void *free_area){ printf(" Position : %d\n", position); #endif - // arm: ensure all writes are finished before other thread takes this memory - WMB; - memory[position].used = 0; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif #ifdef DEBUG printf("Unmap Succeeded.\n\n"); @@ -1266,11 +1164,8 @@ void blas_memory_free(void *free_area){ printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area); #ifdef DEBUG - for (position = 0; position < NUM_BUFFERS; position++) + for (position = 0; position < BUFFERS_PER_THREAD; position++) printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); -#endif -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); #endif return; } @@ -1293,8 +1188,6 @@ void blas_shutdown(void){ BLASFUNC(blas_thread_shutdown)(); #endif - LOCK_COMMAND(&alloc_lock); - for (pos = 0; pos < release_pos; pos ++) { release_info[pos].func(&release_info[pos]); } @@ -1305,17 +1198,11 @@ void blas_shutdown(void){ base_address = BASE_ADDRESS; #endif - for (pos = 0; pos < NUM_BUFFERS; pos ++){ + for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){ memory[pos].addr = (void *)0; memory[pos].used = 0; -#if defined(WHEREAMI) && !defined(USE_OPENMP) - memory[pos].pos = -1; -#endif - memory[pos].lock = 0; } - UNLOCK_COMMAND(&alloc_lock); - return; } From 47bf0dba8f7a9cbd559e2f9cabe0bf2c7d3ee7a8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 15 Jun 2018 11:25:05 +0200 Subject: [PATCH 03/24] Add build-time option for OMP scheduler; document MULTITHREAD_THRESHOLD range (#1620) * Allow choosing the OpenMP scheduler and add range hint for GEMM_MULTITHREAD_THRESHOLD * Amended description of GEMM_MULTITHREAD_THRESHOLD to reflect #742 making it track floating point operations rather than matrix size --- Makefile.rule | 15 +++++++++++++-- driver/others/blas_server_omp.c | 6 +++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 5c03d0195..649aabe70 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -60,6 +60,14 @@ VERSION = 0.3.1.dev # This flag is always set for POWER8. 
Don't modify the flag # USE_OPENMP = 1 +# The OpenMP scheduler to use - by default this is "static" and you +# will normally not want to change this unless you know that your main +# workload will involve tasks that have highly unbalanced running times +# for individual threads. Changing away from "static" may also adversely +# affect memory access locality in NUMA systems. Setting to "runtime" will +# allow you to select the scheduler from the environment variable OMP_SCHEDULE +# CCOMMON_OPT += -DOMP_SCHED=dynamic + # You can define maximum number of threads. Basically it should be # less than actual number of cores. If you don't specify one, it's # automatically detected by the the script. @@ -156,8 +164,11 @@ NO_AFFINITY = 1 # CONSISTENT_FPCSR = 1 # If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute -# with single thread. You can use this flag to avoid the overhead of multi-threading -# in small matrix sizes. The default value is 4. +# with single thread. (Actually in recent versions this is a factor proportional to the +# number of floating point operations necessary for the given problem size, no longer +# an individual dimension). You can use this setting to avoid the overhead of multi- +# threading in small matrix sizes. The default value is 4, but values as high as 50 have +# been reported to be optimal for certain workloads (50 is the recommended value for Julia). # GEMM_MULTITHREAD_THRESHOLD = 4 # If you need santy check by comparing reference BLAS. It'll be very diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index fccdb4320..4255852c8 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -48,6 +48,10 @@ #else +#ifndef OMP_SCHED +#define OMP_SCHED static +#endif + int blas_server_avail = 0; static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; @@ -331,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ break; } -#pragma omp parallel for schedule(static) +#pragma omp parallel for schedule(OMP_SCHED) for (i = 0; i < num; i ++) { #ifndef USE_SIMPLE_THREADED_LEVEL3 From 9e162146a93a58a06515bc53f07e37b8924e0d67 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 15:32:03 +0000 Subject: [PATCH 04/24] Only initialize the part of the jobs array that will get used The jobs array is getting initialized in O(compiled cpus^2) complexity. Distros and people with bigger systems will use pretty high values (128 or 256 or more) for this value, leading to interesting bubbles in performance. Baseline (single threaded performance) gets roughly 13 - 15 multiplications per cycle in the interesting range (threading kicks in at 65x65 mult by 65x65). The hardware is capable of 32 multiplications per cycle theoretically. 
Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10703.9 10.6 0.0% 17990.6 6.3 0.0% 64 x 64 20778.4 12.8 0.0% 40629.2 6.5 0.0% 65 x 65 26869.9 10.3 0.0% 52545.7 5.3 0.0% 80 x 80 38104.5 13.5 0.0% 72492.7 7.1 0.0% 96 x 96 61626.4 14.4 0.0% 113983.8 7.8 0.0% 112 x 112 91803.8 15.3 0.0% 180987.3 7.8 0.0% 128 x 128 133161.4 15.8 0.0% 258374.3 8.1 0.0% When threading is turned on TARGET=SKYLAKEX F_COMPILER=GFORTRAN SHARED=1 DYNAMIC_THREADS=1 USE_OPENMP=0 NUM_THREADS=128 Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10725.9 10.5 -0.2% 18134.9 6.2 -0.8% 64 x 64 20500.6 12.9 1.3% 40929.1 6.5 -0.7% 65 x 65 2040832.1 0.1 -7495.2% 2097633.6 0.1 -3892.0% 80 x 80 2063129.1 0.2 -5314.4% 2119925.2 0.2 -2824.3% 96 x 96 2070374.5 0.4 -3259.6% 2173604.4 0.4 -1806.9% 112 x 112 2111721.5 0.7 -2169.6% 2263330.8 0.6 -1170.0% 128 x 128 2276181.5 0.9 -1609.3% 2377228.9 0.9 -820.1% There is a deep deep cliff once you hit 65x65 With this patch Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10630.0 10.6 0.7% 18112.8 6.2 -0.7% 64 x 64 20374.8 13.0 1.9% 40487.0 6.5 0.4% 65 x 65 141955.2 1.9 -428.3% 146708.8 1.9 -179.2% 80 x 80 178921.1 2.9 -369.6% 186032.7 2.8 -156.6% 96 x 96 205436.2 4.3 -233.4% 224513.1 3.9 -97.0% 112 x 112 244408.2 5.8 -162.7% 262158.7 5.4 -47.1% 128 x 128 321334.5 6.5 -141.3% 333829.0 6.3 -29.2% The cliff is very significantly reduced. (more to follow) --- driver/level3/level3_thread.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 4ab1ee8cc..018813b8c 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -658,8 +658,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG } /* Clear synchronization flags */ - for (i = 0; i < MAX_CPU_NUMBER; i++) { - for (j = 0; j < MAX_CPU_NUMBER; j++) { + for (i = 0; i < nthreads; i++) { + for (j = 0; j < nthreads; j++) { for (k = 0; k < DIVIDE_RATE; k++) { job[i].working[j][CACHE_LINE_SIZE * k] = 0; } From d148ec4ea18e672dacb1270d4a5308ccaaae18bc Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 15:39:15 +0000 Subject: [PATCH 05/24] Don't use _Atomic for jobs sometimes... The use of _Atomic leads to really bad code generation in the compiler (on x86, you get 2 "mfence" memory barriers around each access with gcc8, despite x86 being ordered and cache coherent). But there's a fallback in the code that just uses volatile which is more than plenty in practice. If we're nervous about cross thread synchronization for these variables, we should make the YIELD function be a compiler/memory barrier instead. 
performance before (after last commit) Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10630.0 10.6 0.7% 18112.8 6.2 -0.7% 64 x 64 20374.8 13.0 1.9% 40487.0 6.5 0.4% 65 x 65 141955.2 1.9 -428.3% 146708.8 1.9 -179.2% 80 x 80 178921.1 2.9 -369.6% 186032.7 2.8 -156.6% 96 x 96 205436.2 4.3 -233.4% 224513.1 3.9 -97.0% 112 x 112 244408.2 5.8 -162.7% 262158.7 5.4 -47.1% 128 x 128 321334.5 6.5 -141.3% 333829.0 6.3 -29.2% Performance with this patch (roughly a 2x improvement): Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10756.0 10.5 -0.5% 18296.7 6.1 -1.7% 64 x 64 20490.0 12.9 1.4% 40615.0 6.5 0.0% 65 x 65 83528.3 3.3 -210.9% 96319.0 2.9 -83.3% 80 x 80 101453.5 5.1 -166.3% 128021.7 4.0 -76.6% 96 x 96 149795.1 5.9 -143.1% 168059.4 5.3 -47.4% 112 x 112 191481.2 7.3 -105.8% 204165.0 6.9 -14.6% 128 x 128 265019.2 7.9 -99.0% 272006.4 7.7 -5.3% --- driver/level3/level3_thread.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 018813b8c..7e75f69d1 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -91,11 +91,7 @@ #endif typedef struct { -#if __STDC_VERSION__ >= 201112L -_Atomic -#else volatile -#endif BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; From 5c6f008365ee3c6d42f8630d27259f130a688468 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 15:47:50 +0000 Subject: [PATCH 06/24] Tune param.h for SkylakeX param.h defines a per-platform SWITCH_RATIO, which is used as a measure for how fine grained the blocks for gemm need to be split up. Many platforms define this to 4. The reality is that the gemm low level implementation for SkylakeX likes bigger blocks due to the nature of SIMD... by tuning the SWITCH_RATIO to 32 the threading performance improves significantly: Before Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10756.0 10.5 -0.5% 18296.7 6.1 -1.7% 64 x 64 20490.0 12.9 1.4% 40615.0 6.5 0.0% 65 x 65 83528.3 3.3 -210.9% 96319.0 2.9 -83.3% 80 x 80 101453.5 5.1 -166.3% 128021.7 4.0 -76.6% 96 x 96 149795.1 5.9 -143.1% 168059.4 5.3 -47.4% 112 x 112 191481.2 7.3 -105.8% 204165.0 6.9 -14.6% 128 x 128 265019.2 7.9 -99.0% 272006.4 7.7 -5.3% After Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10666.3 10.6 0.4% 18236.9 6.2 -1.4% 64 x 64 20410.1 13.0 1.8% 39925.8 6.6 1.7% 65 x 65 34983.0 7.9 -30.2% 51494.6 5.4 2.0% 80 x 80 39769.1 13.0 -4.4% 63805.2 8.1 12.0% 96 x 96 45169.6 19.7 26.7% 80065.8 11.1 29.8% 112 x 112 57026.1 24.7 38.7% 99535.5 14.2 44.1% 128 x 128 64789.8 32.5 51.3% 117407.2 17.9 54.6% With this change, threading starts to be a win already at 96x96 --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 49a5e85e8..3573fffbb 100644 --- a/param.h +++ b/param.h @@ -1626,7 +1626,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 8 -#define SWITCH_RATIO 4 +#define SWITCH_RATIO 32 #ifdef ARCH_X86 From 6eb4b9ae7c7cc58af00ac21b52fed8810d7e5710 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 17:05:04 +0000 Subject: [PATCH 07/24] Tune HASWELL SWITCH_RATIO as well Similar to the SKYLAKEX patch, 32 seems to work best (much better than 4 or 16) Before (4) Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 15554.3 7.2 0.2% 30353.8 3.7 0.3% 64 x 64 30346.8 8.7 1.6% 63495.0 4.1 -0.1% 65 x 65 81668.1 3.4 -123.3% 82705.2 3.3 -21.2% 80 x 80 105045.9 4.9 -95.5% 115226.0 4.5 -2.2% 96 x 96 152461.2 5.8 -74.3% 148156.3 6.0 16.4% 112 x 112 188505.2 7.5 -42.2% 171187.3 8.2 36.4% 128 x 128 257884.0 8.1 -39.5% 224764.8 9.3 46.0% Intermediate (16) Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 15565.7 7.2 0.2% 30378.9 3.7 0.2% 64 x 64 30430.2 8.7 1.3% 63046.4 4.2 0.6% 65 x 65 27306.0 10.1 25.3% 38879.2 7.1 43.0% 80 x 80 51008.7 10.1 5.1% 61007.6 8.4 45.9% 96 x 96 70856.7 12.5 19.0% 83403.1 10.6 53.0% 112 x 112 84769.9 16.6 36.0% 99920.1 14.1 62.9% 128 x 128 84213.2 25.0 54.5% 113024.2 18.6 72.8% After (32) Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 15537.3 7.2 0.3% 30537.0 3.6 -0.3% 64 x 64 30352.7 8.7 1.6% 62597.8 4.2 1.3% 65 x 65 36857.0 7.5 -0.8% 56167.6 4.9 17.7% 80 x 80 42552.6 12.1 20.8% 69536.7 7.4 38.3% 96 x 96 52101.5 17.1 40.5% 91016.1 9.7 48.7% 112 x 112 63853.7 22.1 51.8% 110507.4 12.7 58.9% 128 x 128 73966.1 28.4 60.0% 163146.4 12.9 60.8% --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 3573fffbb..cfa4bba5c 100644 --- a/param.h +++ b/param.h @@ -1507,7 +1507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 8 -#define SWITCH_RATIO 4 +#define SWITCH_RATIO 32 #ifdef ARCH_X86 From 73de17664dfdf2934a2fdc6dd9442107e6c85035 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 17:50:43 +0000 Subject: [PATCH 08/24] Add missing barriers in gemm scheduler a few places in the gemm scheduler code were missing barriers; the code likely worked OK due to heavy use of volatile / _Atomic but there's no reason to get this incorrect --- driver/level3/level3_thread.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 7e75f69d1..aeb5e6ed4 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -347,7 +347,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Make sure if no one is using workspace */ START_RPCC(); for (i = 0; i < args -> nthreads; i++) - while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; STOP_RPCC(waiting1); #if defined(FUSED_GEMM) && !defined(TIMING) @@ -409,7 +409,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Wait until other region of B is initialized */ START_RPCC(); - while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; STOP_RPCC(waiting2); /* Apply kernel with local region of A and part of other region of B */ @@ -427,6 +427,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Clear synchronization flag if this thread is done with other region of B */ if (m_to - m_from == min_i) { job[current].working[mypos][CACHE_LINE_SIZE * 
bufferside] &= 0; + WMB; } } } while (current != mypos); @@ -488,7 +489,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, START_RPCC(); for (i = 0; i < args -> nthreads; i++) { for (js = 0; js < DIVIDE_RATE; js++) { - while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;}; + while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;}; } } STOP_RPCC(waiting3); From 7e39ffe1135ee6ca1dc119f6eea9566668fd0916 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 17:53:15 +0000 Subject: [PATCH 09/24] On x86-64, make MB/WMB compiler barriers While on x86(64) one does not normally need full memory barriers, it's good practice to at least use compiler barriers for places where on other architectures memory barriers are used; this prevents the compiler from over-optimizing. --- common_x86_64.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/common_x86_64.h b/common_x86_64.h index 7461aaf60..3236778b8 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -60,8 +60,13 @@ #endif */ +#ifdef __GNUC__ +#define MB __asm__ __volatile__("": : :"memory") +#define WMB __asm__ __volatile__("": : :"memory") +#else #define MB #define WMB +#endif static void __inline blas_lock(volatile BLASULONG *address){ From 2ddc96c9e5a86e3fd12954b3efc269f0cc8d07d8 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 18:06:24 +0000 Subject: [PATCH 10/24] make WMB / MB safer on x86-64 make it so that if (foo) RMB; else MB; is always done correctly and without syntax surprises --- common_x86_64.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common_x86_64.h b/common_x86_64.h index 3236778b8..62e138e34 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -61,11 +61,11 @@ */ #ifdef __GNUC__ -#define MB __asm__ __volatile__("": : :"memory") -#define WMB __asm__ __volatile__("": : :"memory") +#define MB do { __asm__ __volatile__("": : :"memory"); } while (0) +#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0) #else -#define MB -#define WMB +#define MB do {} while (0) +#define WMB do {} while (0) #endif static void __inline blas_lock(volatile BLASULONG *address){ From 2d8cc7193ace18c28ea05ef39e13bb28437b6d89 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 17 Jun 2018 23:38:14 +0200 Subject: [PATCH 11/24] Support upcoming Intel Cannon Lake CPUs as Skylake X (#1621) * Support upcoming Cannon Lake as Skylake X --- cpuid_x86.c | 17 +++++++++++++++ driver/others/dynamic.c | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index fc937865c..89eb809b0 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1339,6 +1339,23 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; + case 6: + switch (model) { + case 6: // Cannon Lake +#ifndef NO_AVX512 + return CPUTYPE_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_HASWELL; +#else + return CPUTYPE_SANDYBRIDGE; +#endif + else + return CPUTYPE_NEHALEM; +#endif + } + break; case 9: case 8: switch (model) { diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 4271c0a0d..bacd3b7fa 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -338,6 +338,23 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; } return NULL; + case 6: + if (model == 6) { + // Cannon Lake +#ifndef NO_AVX512 + return &gotoblas_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return &gotoblas_HASWELL; +#else + return &gotblas_SANDYBRIDGE; +#endif + else + return 
&gotoblas_NEHALEM; +#endif + } + return NULL; case 9: case 8: if (model == 14 ) { // Kaby Lake From 1f9e4f319327dd53d1243edb3a812c5a2366a938 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Jun 2018 20:46:36 +0200 Subject: [PATCH 12/24] Handle special case of gfortran+clang+OpenMP --- ctest/Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ctest/Makefile b/ctest/Makefile index 6eda43863..569a5dda3 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -102,7 +102,13 @@ clean :: rm -f x* FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) -CEXTRALIB = +ifeq ($(USE_OPENMP), 1) +ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(C_COMPILER), CLANG) +CEXTRALIB = -lomp +endif +endif +endif # Single real xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME) From 6a5ab083b7e78458861b197b8e98b2506345d6d7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Jun 2018 20:47:33 +0200 Subject: [PATCH 13/24] Handle special case of gfortran+clang+OpenMP --- test/Makefile | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/test/Makefile b/test/Makefile index 65fb6f438..074411b05 100644 --- a/test/Makefile +++ b/test/Makefile @@ -122,8 +122,13 @@ endif FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) -CEXTRALIB = - +ifeq ($(USE_OPENMP), 1) +ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(C_COMPILER), CLANG) +CEXTRALIB = -lomp +endif +endif +endif sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o sblat1 sblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) From 10b70c904d9e3b610d35f1efe8d89888da4011bb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Jun 2018 20:53:19 +0200 Subject: [PATCH 14/24] Handle erroneous user settings NOFORTRAN=0 and NO_FORTRAN --- Makefile | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Makefile b/Makefile index 56b4426f8..728567f80 100644 --- a/Makefile +++ b/Makefile @@ -21,6 +21,15 @@ ifeq ($(BUILD_RELAPACK), 1) RELA = re_lapack endif +ifeq ($(NOFORTRAN), 0) +undefine NOFORTRAN +endif + +ifeq ($(NO_FORTRAN), 1) +undefine NO_FORTRAN +NOFORTRAN=1 +endif + LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench From 9369d3e6e5207c6974af162e67d4060ed625c322 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Jun 2018 23:28:06 +0200 Subject: [PATCH 15/24] Modify NOFORTRAN tests to always check the value; fix rewriting of NO_FORTRAN --- Makefile | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index 728567f80..4760be0be 100644 --- a/Makefile +++ b/Makefile @@ -21,13 +21,15 @@ ifeq ($(BUILD_RELAPACK), 1) RELA = re_lapack endif -ifeq ($(NOFORTRAN), 0) -undefine NOFORTRAN -endif - ifeq ($(NO_FORTRAN), 1) -undefine NO_FORTRAN -NOFORTRAN=1 +define NOFORTRAN +1 +endef +define NO_LAPACK +1 +endef +export NOFORTRAN +export NO_LAPACK endif LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) @@ -56,7 +58,7 @@ endif endif @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) @echo " Fortran compiler ... 
$(F_COMPILER) (command line : $(FC))" endif ifneq ($(OSNAME), AIX) @@ -117,7 +119,7 @@ endif endif tests : -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all @@ -219,7 +221,7 @@ netlib : else netlib : lapack_prebuild -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib endif @@ -240,7 +242,7 @@ prof_lapack : lapack_prebuild @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof lapack_prebuild : -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc @@ -283,21 +285,21 @@ endif endif large.tgz : -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) if [ ! -a $< ]; then -wget http://www.netlib.org/lapack/timing/large.tgz; fi endif timing.tgz : -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) if [ ! -a $< ]; then -wget http://www.netlib.org/lapack/timing/timing.tgz; fi endif lapack-timing : large.tgz timing.tgz -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING From 952541e840bddbcdcdfce81aefc09edf7fbfb84f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 20 Jun 2018 13:20:30 +0200 Subject: [PATCH 16/24] Need to use filter-out to handle NOFORTRAN not set --- Makefile | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 4760be0be..49dab6484 100644 --- a/Makefile +++ b/Makefile @@ -58,7 +58,7 @@ endif endif @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) @echo " Fortran compiler ... 
$(F_COMPILER) (command line : $(FC))" endif ifneq ($(OSNAME), AIX) @@ -119,7 +119,7 @@ endif endif tests : -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all @@ -221,7 +221,7 @@ netlib : else netlib : lapack_prebuild -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib endif @@ -242,7 +242,10 @@ prof_lapack : lapack_prebuild @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof lapack_prebuild : -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) + $(info filter value of NOFORTRAN is:) + $(info x$(filter-out $(NOFORTRAN), 1 2)x) + +ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc From 0c5b7b400b3973d214ce24c566be4446743eacf7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 20 Jun 2018 15:16:19 +0200 Subject: [PATCH 17/24] Add -march=skylake-avx512 to flags if target is skylake x --- Makefile.x86_64 | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 1ba63278a..677c05d93 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -8,6 +8,13 @@ endif endif endif +ifeq ($(CORE), SKYLAKEX) +ifndef NO_AVX512 +CCOMMON_OPT += -march=skylake-avx512 +FCOMMON_OPT += -march=skylake-avx512 +endif +endif + ifeq ($(OSNAME), Interix) ARFLAGS = -m x64 endif From 05978528c3f3c61fb370e1fae0ac3013faaa595e Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Wed, 20 Jun 2018 17:03:18 +0100 Subject: [PATCH 18/24] Avoid declaring arrays of size 0 when making large stack allocations. --- common_stackalloc.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/common_stackalloc.h b/common_stackalloc.h index 71fb1a477..ec0fa1611 100644 --- a/common_stackalloc.h +++ b/common_stackalloc.h @@ -47,14 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - large enough to support all architectures and kernel * Chosing a too small SIZE will lead to a stack smashing. */ -#define STACK_ALLOC(SIZE, TYPE, BUFFER) \ - /* make it volatile because some function (ex: dgemv_n.S) */ \ - /* do not restore all register */ \ - volatile int stack_alloc_size = SIZE; \ - if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \ - stack_alloc_size = 0; \ - STACK_ALLOC_PROTECT_SET \ - TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \ +#define STACK_ALLOC(SIZE, TYPE, BUFFER) \ + /* make it volatile because some function (ex: dgemv_n.S) */ \ + /* do not restore all register */ \ + volatile int stack_alloc_size = SIZE; \ + if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) stack_alloc_size = 0; \ + STACK_ALLOC_PROTECT_SET \ + /* Avoid declaring an array of length 0 */ \ + TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] \ + __attribute__((aligned(0x20))); \ BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1); #else //Original OpenBLAS/GotoBLAS codes. From a399d004257b2f43e8211341f924f3a73171b98c Mon Sep 17 00:00:00 2001 From: oon3m0oo Date: Wed, 20 Jun 2018 21:04:03 +0100 Subject: [PATCH 19/24] Further improvements to memory.c. 
(#1625) - Compiler TLS is now used only when the compiler supports it - If compiler TLS is unsupported, we use platform-specific TLS - Only one variable (an index) is now in TLS - We only access TLS once per alloc, and never when freeing - Allocation / release info is now stored within the allocation itself, by over-allocating; this saves having external structures do the bookkeeping, and reduces some of the redundant data that was being stored (such as addresses) - We never hit the alloc lock when not using SMP or when using OpenMP (that was my fault) - Now that there are fewer tracking structures I think this is a bit easier to read than before --- driver/others/memory.c | 399 +++++++++++++++++++++++++---------------- 1 file changed, 243 insertions(+), 156 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 85f790615..ed20cf5cd 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -326,6 +326,8 @@ int goto_get_num_procs (void) { return blas_cpu_number; } +static void blas_memory_init(); + void openblas_fork_handler() { // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is @@ -337,7 +339,7 @@ void openblas_fork_handler() // implementation of OpenMP. #if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER) int err; - err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL); + err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, blas_memory_init); if(err != 0) openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n"); #endif @@ -415,23 +417,104 @@ int openblas_get_num_threads(void) { #endif } -struct release_t { - void *address; - void (*func)(struct release_t *); - long attr; -}; - int hugetlb_allocated = 0; #if defined(OS_WINDOWS) #define THREAD_LOCAL __declspec(thread) -#define UNLIKELY_TO_BE_ZERO(x) (x) +#define LIKELY_ONE(x) (x) #else #define THREAD_LOCAL __thread -#define UNLIKELY_TO_BE_ZERO(x) (__builtin_expect(x, 0)) +#define LIKELY_ONE(x) (__builtin_expect(x, 1)) #endif - -static struct release_t THREAD_LOCAL release_info[BUFFERS_PER_THREAD]; -static int THREAD_LOCAL release_pos = 0; + +/* Stores information about the allocation and how to release it */ +struct alloc_t { + /* Whether this allocation is being used */ + int used; + /* Any special attributes needed when releasing this allocation */ + int attr; + /* Function that can properly release this memory */ + void (*release_func)(struct alloc_t *); + /* Pad to 64-byte alignment */ + char pad[64 - 2 * sizeof(int) - sizeof(void(*))]; +}; + +/* Convenience macros for storing release funcs */ +#define STORE_RELEASE_FUNC(address, func) \ + if (address != (void *)-1) { \ + struct alloc_t *alloc_info = (struct alloc_t *)address; \ + alloc_info->release_func = func; \ + } + +#define STORE_RELEASE_FUNC_WITH_ATTR(address, func, attr) \ + if (address != (void *)-1) { \ + struct alloc_t *alloc_info = (struct alloc_t *)address; \ + alloc_info->release_func = func; \ + alloc_info->attr = attr; \ + } + +/* The number of bytes that will be allocated for each buffer. When allocating + memory, we store an alloc_t followed by the actual buffer memory. This means + that each allocation always has its associated alloc_t, without the need + for an auxiliary tracking structure. 
*/ +static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t); + +/* Clang supports TLS from version 2.8 */ +#if defined(__clang__) && __clang_major__ > 2 || \ + (__clang_minor__ == 2 || __clang_minor__ == 8) +#define HAS_COMPILER_TLS +#endif + +/* GCC supports TLS from version 4.1 */ +#if !defined(__clang__) && defined(__GNUC__) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) +#define HAS_COMPILER_TLS +#endif + +/* MSVC supports TLS from version 2005 */ +#if defined(_MSC_VER) && _MSC_VER >= 1400 +#define HAS_COMPILER_TLS +#endif + +/* Versions of XCode before 8 did not properly support TLS */ +#if defined(__apple_build_version__) && __apple_build_version__ < 8000042 +#undef HAS_COMPILER_TLS +#endif + +/* Android NDK's before version 12b did not support TLS */ +#if defined(__ANDROID__) && defined(__clang__) +#if __has_include() +#include +#endif +#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \ + defined(__NDK_MINOR__) && \ + ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1))) +#undef HAS_COMPILER_TLS +#endif +#endif + +/* Holds pointers to allocated memory */ +#if defined(SMP) && !defined(USE_OPENMP) +/* This is the number of threads than can be spawned by the server, which is the + server plus the number of threads in the thread pool */ +# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +static int next_memory_table_pos = 0; +# if defined(HAS_COMPILER_TLS) +/* Use compiler generated thread-local-storage */ +static int THREAD_LOCAL local_memory_table_pos = 0; +# else +/* Use system-dependent thread-local-storage */ +# if defined(OS_WINDOWS) +static DWORD local_storage_key; +# else +static pthread_key_t local_storage_key; +# endif /* defined(OS_WINDOWS) */ +# endif /* defined(HAS_COMPILER_TLS) */ +#else +/* There is only one allocating thread when in single-threaded mode and when using OpenMP */ +# define MAX_ALLOCATING_THREADS 1 +#endif /* defined(SMP) && !defined(USE_OPENMP) */ +static struct alloc_t * local_memory_table[MAX_ALLOCATING_THREADS][BUFFERS_PER_THREAD]; #if defined(OS_LINUX) && !defined(NO_WARMUP) static int hot_alloc = 0; @@ -447,11 +530,41 @@ static pthread_spinlock_t alloc_lock = 0; static BLASULONG alloc_lock = 0UL; #endif +/* Returns a pointer to the start of the per-thread memory allocation data */ +static __inline struct alloc_t ** get_memory_table() { +#if defined(SMP) && !defined(USE_OPENMP) +# if !defined(HAS_COMPILER_TLS) +# if defined(OS_WINDOWS) + int local_memory_table_pos = (int)::TlsGetValue(local_storage_key); +# else + int local_memory_table_pos = (int)pthread_getspecific(local_storage_key); +# endif /* defined(OS_WINDOWS) */ +# endif /* !defined(HAS_COMPILER_TLS) */ + if (!local_memory_table_pos) { + LOCK_COMMAND(&alloc_lock); + local_memory_table_pos = next_memory_table_pos++; + UNLOCK_COMMAND(&alloc_lock); + if (next_memory_table_pos > MAX_ALLOCATING_THREADS) + printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n"); +# if !defined(HAS_COMPILER_TLS) +# if defined(OS_WINDOWS) + ::TlsSetValue(local_storage_key, (void*)local_memory_table_pos); +# else + pthread_setspecific(local_storage_key, (void*)local_memory_table_pos); +# endif /* defined(OS_WINDOWS) */ +# endif /* !defined(HAS_COMPILER_TLS) */ + } + return local_memory_table[local_memory_table_pos]; +#else + return local_memory_table[0]; +#endif /* defined(SMP) && !defined(USE_OPENMP) */ +} + #ifdef ALLOC_MMAP -static void alloc_mmap_free(struct release_t 
*release){ +static void alloc_mmap_free(struct alloc_t *alloc_info){ - if (munmap(release -> address, BUFFER_SIZE)) { + if (munmap(alloc_info, allocation_block_size)) { printf("OpenBLAS : munmap failed\n"); } } @@ -465,22 +578,18 @@ static void *alloc_mmap(void *address){ if (address){ map_address = mmap(address, - BUFFER_SIZE, + allocation_block_size, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); } else { map_address = mmap(address, - BUFFER_SIZE, + allocation_block_size, MMAP_ACCESS, MMAP_POLICY, -1, 0); } - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_mmap_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_mmap_free); #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif return map_address; @@ -533,25 +642,25 @@ static void *alloc_mmap(void *address){ if (address){ /* Just give up use advanced operation */ - map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); + map_address = mmap(address, allocation_block_size, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif } else { #if defined(OS_LINUX) && !defined(NO_WARMUP) if (hot_alloc == 0) { - map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0); + map_address = mmap(NULL, allocation_block_size, MMAP_ACCESS, MMAP_POLICY, -1, 0); #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif } else { #endif - map_address = mmap(NULL, BUFFER_SIZE * SCALING, + map_address = mmap(NULL, allocation_block_size * SCALING, MMAP_ACCESS, MMAP_POLICY, -1, 0); if (map_address != (void *)-1) { @@ -559,7 +668,7 @@ static void *alloc_mmap(void *address){ #ifdef OS_LINUX #ifdef DEBUG int ret=0; - ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); + ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0); if(ret==-1){ int errsv=errno; perror("OpenBLAS alloc_mmap:"); @@ -567,7 +676,7 @@ static void *alloc_mmap(void *address){ } #else - my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0); #endif #endif @@ -575,7 +684,7 @@ static void *alloc_mmap(void *address){ allocsize = DGEMM_P * DGEMM_Q * sizeof(double); start = (BLASULONG)map_address; - current = (SCALING - 1) * BUFFER_SIZE; + current = (SCALING - 1) * allocation_block_size; while(current > 0) { *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; @@ -590,7 +699,7 @@ static void *alloc_mmap(void *address){ best = (BLASULONG)-1; best_address = map_address; - while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) { + while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) { current = run_bench(start, allocsize); @@ -606,7 +715,7 @@ static void *alloc_mmap(void *address){ if ((BLASULONG)best_address > (BLASULONG)map_address) munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address); - munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address); + munmap((void *)((BLASULONG)best_address + 
allocation_block_size), (SCALING - 1) * allocation_block_size + (BLASULONG)map_address - (BLASULONG)best_address); map_address = best_address; @@ -619,11 +728,7 @@ static void *alloc_mmap(void *address){ } #endif - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_mmap_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_mmap_free); return map_address; } @@ -635,9 +740,9 @@ static void *alloc_mmap(void *address){ #ifdef ALLOC_MALLOC -static void alloc_malloc_free(struct release_t *release){ +static void alloc_malloc_free(struct alloc_t *alloc_info){ - free(release -> address); + free(alloc_info); } @@ -645,15 +750,11 @@ static void *alloc_malloc(void *address){ void *map_address; - map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE); + map_address = (void *)malloc(allocation_block_size + FIXED_PAGESIZE); if (map_address == (void *)NULL) map_address = (void *)-1; - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_malloc_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_malloc_free); return map_address; @@ -670,24 +771,20 @@ void *qfree (void *address); #define QCOMMS 0x2 #define QFAST 0x4 -static void alloc_qalloc_free(struct release_t *release){ +static void alloc_qalloc_free(struct alloc_t *alloc_info){ - qfree(release -> address); + qfree(alloc_info); } static void *alloc_qalloc(void *address){ void *map_address; - map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE); + map_address = (void *)qalloc(QCOMMS | QFAST, allocation_block_size + FIXED_PAGESIZE); if (map_address == (void *)NULL) map_address = (void *)-1; - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_qalloc_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_qalloc_free); return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1)); } @@ -696,9 +793,9 @@ static void *alloc_qalloc(void *address){ #ifdef ALLOC_WINDOWS -static void alloc_windows_free(struct release_t *release){ +static void alloc_windows_free(struct alloc_t *alloc_info){ - VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT); + VirtualFree(alloc_info, allocation_block_size, MEM_DECOMMIT); } @@ -706,17 +803,13 @@ static void *alloc_windows(void *address){ void *map_address; map_address = VirtualAlloc(address, - BUFFER_SIZE, + allocation_block_size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); if (map_address == (void *)NULL) map_address = (void *)-1; - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_windows_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_windows_free); return map_address; } @@ -728,13 +821,14 @@ static void *alloc_windows(void *address){ #define DEVICEDRIVER_NAME "/dev/mapper" #endif -static void alloc_devicedirver_free(struct release_t *release){ +static void alloc_devicedirver_free(struct alloc_t *alloc_info){ - if (munmap(release -> address, BUFFER_SIZE)) { + int attr = alloc_info -> attr; + if (munmap(address, allocation_block_size)) { printf("OpenBLAS : Bugphysarea unmap failed.\n"); } - if (close(release -> attr)) { + if (close(attr)) { printf("OpenBLAS : Bugphysarea close failed.\n"); } @@ -751,17 +845,12 @@ static void *alloc_devicedirver(void *address){ } - map_address = mmap(address, BUFFER_SIZE, + map_address = 
mmap(address, allocation_block_size, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0); - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].attr = fd; - release_info[release_pos].func = alloc_devicedirver_free; - release_pos ++; - } + STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_devicedirver_free, fd); return map_address; } @@ -770,9 +859,9 @@ static void *alloc_devicedirver(void *address){ #ifdef ALLOC_SHM -static void alloc_shm_free(struct release_t *release){ +static void alloc_shm_free(struct alloc_t *alloc_info){ - if (shmdt(release -> address)) { + if (shmdt(alloc_info)) { printf("OpenBLAS : Shared memory unmap failed.\n"); } } @@ -781,22 +870,21 @@ static void *alloc_shm(void *address){ void *map_address; int shmid; - shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600); + shmid = shmget(IPC_PRIVATE, allocation_block_size,IPC_CREAT | 0600); map_address = (void *)shmat(shmid, address, 0); if (map_address != (void *)-1){ #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif shmctl(shmid, IPC_RMID, 0); - release_info[release_pos].address = map_address; - release_info[release_pos].attr = shmid; - release_info[release_pos].func = alloc_shm_free; - release_pos ++; + struct alloc_t *alloc_info = (struct alloc_t *)map_address; + alloc_info->release_func = alloc_shm_free; + alloc_info->attr = shmid; } return map_address; @@ -804,23 +892,23 @@ static void *alloc_shm(void *address){ #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS -static void alloc_hugetlb_free(struct release_t *release){ +static void alloc_hugetlb_free(struct alloc_t *alloc_info){ #if defined(OS_LINUX) || defined(OS_AIX) - if (shmdt(release -> address)) { + if (shmdt(alloc_info)) { printf("OpenBLAS : Hugepage unmap failed.\n"); } #endif #ifdef __sun__ - munmap(release -> address, BUFFER_SIZE); + munmap(alloc_info, allocation_block_size); #endif #ifdef OS_WINDOWS - VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT); + VirtualFree(alloc_info, allocation_block_size, MEM_LARGE_PAGES | MEM_DECOMMIT); #endif @@ -833,7 +921,7 @@ static void *alloc_hugetlb(void *address){ #if defined(OS_LINUX) || defined(OS_AIX) int shmid; - shmid = shmget(IPC_PRIVATE, BUFFER_SIZE, + shmid = shmget(IPC_PRIVATE, allocation_block_size, #ifdef OS_LINUX SHM_HUGETLB | #endif @@ -846,7 +934,7 @@ static void *alloc_hugetlb(void *address){ map_address = (void *)shmat(shmid, address, SHM_RND); #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif if (map_address != (void *)-1){ @@ -863,7 +951,7 @@ static void *alloc_hugetlb(void *address){ mha.mha_pagesize = HUGE_PAGESIZE; memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0); - map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE); + map_address = (BLASULONG)memalign(HUGE_PAGESIZE, allocation_block_size); #endif #ifdef OS_WINDOWS @@ -887,7 +975,7 @@ static void *alloc_hugetlb(void *address){ } map_address = (void *)VirtualAlloc(address, - BUFFER_SIZE, + allocation_block_size, MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); @@ -898,11 +986,7 @@ static void *alloc_hugetlb(void *address){ #endif - if (map_address != (void *)-1){ - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_hugetlb_free; - 
release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_hugetlb_free); return map_address; } @@ -914,13 +998,14 @@ static void *alloc_hugetlb(void *address){ static int hugetlb_pid = 0; -static void alloc_hugetlbfile_free(struct release_t *release){ +static void alloc_hugetlbfile_free(struct alloc_t *alloc_info){ - if (munmap(release -> address, BUFFER_SIZE)) { + int attr = alloc_info -> attr; + if (munmap(alloc_info, allocation_block_size)) { printf("OpenBLAS : HugeTLBfs unmap failed.\n"); } - if (close(release -> attr)) { + if (close(attr)) { printf("OpenBLAS : HugeTLBfs close failed.\n"); } } @@ -941,17 +1026,12 @@ static void *alloc_hugetlbfile(void *address){ unlink(filename); - map_address = mmap(address, BUFFER_SIZE, + map_address = mmap(address, allocation_block_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].attr = fd; - release_info[release_pos].func = alloc_hugetlbfile_free; - release_pos ++; - } + STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_hugetlbfile_free, fd); return map_address; } @@ -964,19 +1044,11 @@ static BLASULONG base_address = 0UL; static BLASULONG base_address = BASE_ADDRESS; #endif -struct memory_t { - void *addr; - int used; -#ifndef __64BIT__ - char dummy[48]; +#if __STDC_VERSION__ >= 201112L +static _Atomic int memory_initialized = 0; #else - char dummy[40]; +static volatile int memory_initialized = 0; #endif -}; - -static struct memory_t THREAD_LOCAL memory[BUFFERS_PER_THREAD]; - -static int memory_initialized = 0; /* Memory allocation routine */ /* procpos ... indicates where it comes from */ @@ -984,6 +1056,20 @@ static int memory_initialized = 0; /* 1 : Level 2 functions */ /* 2 : Thread */ +static void blas_memory_init(){ +#if defined(SMP) && !defined(USE_OPENMP) + next_memory_table_pos = 0; +# if !defined(HAS_COMPILER_TLS) +# if defined(OS_WINDOWS) + local_storage_key = ::TlsAlloc(); +# else + pthread_key_create(&local_storage_key, NULL); +# endif /* defined(OS_WINDOWS) */ +# endif /* defined(HAS_COMPILER_TLS) */ +#endif /* defined(SMP) && !defined(USE_OPENMP) */ + memset(local_memory_table, 0, sizeof(local_memory_table)); +} + void *blas_memory_alloc(int procpos){ int position; @@ -1016,14 +1102,17 @@ void *blas_memory_alloc(int procpos){ NULL, }; void *(**func)(void *address); + struct alloc_t * alloc_info; + struct alloc_t ** alloc_table; - if (UNLIKELY_TO_BE_ZERO(memory_initialized)) { - + if (!LIKELY_ONE(memory_initialized)) { +#if defined(SMP) && !defined(USE_OPENMP) /* Only allow a single thread to initialize memory system */ LOCK_COMMAND(&alloc_lock); if (!memory_initialized) { - +#endif + blas_memory_init(); #ifdef DYNAMIC_ARCH gotoblas_dynamic_init(); #endif @@ -1044,8 +1133,10 @@ void *blas_memory_alloc(int procpos){ memory_initialized = 1; +#if defined(SMP) && !defined(USE_OPENMP) } UNLOCK_COMMAND(&alloc_lock); +#endif } #ifdef DEBUG @@ -1053,9 +1144,9 @@ void *blas_memory_alloc(int procpos){ #endif position = 0; - + alloc_table = get_memory_table(); do { - if (!memory[position].used) goto allocation; + if (!alloc_table[position] || !alloc_table[position]->used) goto allocation; position ++; } while (position < BUFFERS_PER_THREAD); @@ -1068,9 +1159,8 @@ void *blas_memory_alloc(int procpos){ printf(" Position -> %d\n", position); #endif - memory[position].used = 1; - - if (!memory[position].addr) { + alloc_info = alloc_table[position]; + if (!alloc_info) { do { #ifdef DEBUG printf("Allocation Start : %lx\n", base_address); 
@@ -1082,7 +1172,7 @@ void *blas_memory_alloc(int procpos){ while ((func != NULL) && (map_address == (void *) -1)) { - map_address = (*func)((void *)base_address); + map_address = (*func)((void *)base_address); #ifdef ALLOC_DEVICEDRIVER if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { @@ -1110,23 +1200,24 @@ void *blas_memory_alloc(int procpos){ #endif if (((BLASLONG) map_address) == -1) base_address = 0UL; - if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE; + if (base_address) base_address += allocation_block_size + FIXED_PAGESIZE; } while ((BLASLONG)map_address == -1); - memory[position].addr = map_address; + alloc_table[position] = alloc_info = map_address; #ifdef DEBUG - printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); + printf(" Mapping Succeeded. %p(%d)\n", (void *)alloc_info, position); #endif } #ifdef DEBUG - printf("Mapped : %p %3d\n\n", - (void *)memory[position].addr, position); + printf("Mapped : %p %3d\n\n", (void *)alloc_info, position); #endif - return (void *)memory[position].addr; + alloc_info->used = 1; + + return (void *)(((char *)alloc_info) + sizeof(struct alloc_t)); error: printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n"); @@ -1134,25 +1225,19 @@ void *blas_memory_alloc(int procpos){ return NULL; } -void blas_memory_free(void *free_area){ - +void blas_memory_free(void *buffer){ +#ifdef DEBUG int position; + struct alloc_t ** alloc_table; +#endif + /* Since we passed an offset pointer to the caller, get back to the actual allocation */ + struct alloc_t *alloc_info = (void *)(((char *)buffer) - sizeof(struct alloc_t)); #ifdef DEBUG - printf("Unmapped Start : %p ...\n", free_area); + printf("Unmapped Start : %p ...\n", alloc_info); #endif - position = 0; - while ((position < BUFFERS_PER_THREAD) && (memory[position].addr != free_area)) - position++; - - if (memory[position].addr != free_area) goto error; - -#ifdef DEBUG - printf(" Position : %d\n", position); -#endif - - memory[position].used = 0; + alloc_info->used = 0; #ifdef DEBUG printf("Unmap Succeeded.\n\n"); @@ -1160,12 +1245,13 @@ void blas_memory_free(void *free_area){ return; - error: - printf("BLAS : Bad memory unallocation! 
: %4d %p\n", position, free_area); - #ifdef DEBUG - for (position = 0; position < BUFFERS_PER_THREAD; position++) - printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); + alloc_table = get_memory_table(); + for (position = 0; position < BUFFERS_PER_THREAD; position++){ + if (alloc_table[position]) { + printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used); + } + } #endif return; } @@ -1182,14 +1268,20 @@ void blas_memory_free_nolock(void * map_address) { void blas_shutdown(void){ - int pos; + int pos, thread; #ifdef SMP BLASFUNC(blas_thread_shutdown)(); #endif - for (pos = 0; pos < release_pos; pos ++) { - release_info[pos].func(&release_info[pos]); + for (thread = 0; thread < MAX_ALLOCATING_THREADS; thread ++){ + for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){ + struct alloc_t *alloc_info = local_memory_table[thread][pos]; + if (alloc_info) { + alloc_info->release_func(alloc_info); + alloc_info = (void *)0; + } + } } #ifdef SEEK_ADDRESS @@ -1198,11 +1290,6 @@ void blas_shutdown(void){ base_address = BASE_ADDRESS; #endif - for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){ - memory[pos].addr = (void *)0; - memory[pos].used = 0; - } - return; } @@ -1226,7 +1313,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, size_t size; BLASULONG buffer; - size = BUFFER_SIZE - PAGESIZE; + size = allocation_block_size - PAGESIZE; buffer = (BLASULONG)sa + GEMM_OFFSET_A; #if defined(OS_LINUX) && !defined(NO_WARMUP) @@ -1247,7 +1334,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, UNLOCK_COMMAND(&init_lock); #endif - size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE); + size = MIN((allocation_block_size - PAGESIZE), L2_SIZE); buffer = (BLASULONG)sa + GEMM_OFFSET_A; while (size > 0) { From 28c28ed275df2fd812bcdc75fdc04cdb6d9580b3 Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Thu, 21 Jun 2018 11:13:57 +0100 Subject: [PATCH 20/24] Fix data races reported by TSAN. --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index ed20cf5cd..7eff16ce3 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -543,9 +543,9 @@ static __inline struct alloc_t ** get_memory_table() { if (!local_memory_table_pos) { LOCK_COMMAND(&alloc_lock); local_memory_table_pos = next_memory_table_pos++; - UNLOCK_COMMAND(&alloc_lock); if (next_memory_table_pos > MAX_ALLOCATING_THREADS) printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n"); + UNLOCK_COMMAND(&alloc_lock); # if !defined(HAS_COMPILER_TLS) # if defined(OS_WINDOWS) ::TlsSetValue(local_storage_key, (void*)local_memory_table_pos); From 2aa0a5804e381f89a53fdbef9bd51e8af23c8940 Mon Sep 17 00:00:00 2001 From: oon3m0oo Date: Thu, 21 Jun 2018 17:47:45 +0100 Subject: [PATCH 21/24] Use BLAS rather than CBLAS in test_fork.c (#1626) This is handy for people not using lapack. 
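For reference, the switch below amounts to calling the Fortran-style `dgemm` symbol directly instead of going through the CBLAS wrapper. Here is a minimal standalone sketch of such a call; the trailing-underscore symbol name and plain `int` arguments are assumptions about the platform's Fortran name mangling and the width of `blasint`, whereas the test itself uses OpenBLAS's `BLASFUNC()` macro and the `blasint` typedef.

```c
/* Hypothetical standalone example (not the test itself): C = A * B through
 * the Fortran-style interface.  Matrices are column-major, as that interface
 * expects, and all scalars are passed by reference. */
#include <stdio.h>

extern void dgemm_(const char *transa, const char *transb,
                   const int *m, const int *n, const int *k,
                   const double *alpha, const double *a, const int *lda,
                   const double *b, const int *ldb,
                   const double *beta, double *c, const int *ldc);

int main(void) {
  double a[4] = {1, 2, 3, 4};   /* A = [1 3; 2 4], column-major */
  double b[4] = {5, 6, 7, 8};   /* B = [5 7; 6 8], column-major */
  double c[4] = {0};
  int n = 2;
  double one = 1.0, zero = 0.0;

  dgemm_("N", "N", &n, &n, &n, &one, a, &n, b, &n, &zero, c, &n);

  /* c is column-major as well: rows are (c[0], c[2]) and (c[1], c[3]). */
  printf("%g %g\n%g %g\n", c[0], c[2], c[1], c[3]);
  return 0;
}
```

Note that the fork test only checks that the same call reproduces the same result before and after the fork, so the replacement does not need to recreate the row-major semantics of the old `cblas_dgemm` call exactly.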
--- utest/CMakeLists.txt | 2 -- utest/Makefile | 2 -- utest/test_fork.c | 22 +++++++++++++--------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 77a42d84f..1b426afe7 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -25,7 +25,6 @@ endif () # known to hang with the native Windows and Android threads # FIXME needs checking if this works on any of the other platforms -if (NOT NO_CBLAS) if (NOT USE_OPENMP) if (OS_CYGWIN_NT OR OS_LINUX) set(OpenBLAS_utest_src @@ -34,7 +33,6 @@ set(OpenBLAS_utest_src ) endif() endif() -endif() if (NOT NO_LAPACK) set(OpenBLAS_utest_src diff --git a/utest/Makefile b/utest/Makefile index e071540dc..e40b3c6db 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -17,13 +17,11 @@ endif #this does not work with OpenMP nor with native Windows or Android threads # FIXME TBD if this works on OSX, SunOS, POWER and zarch -ifneq ($(NO_CBLAS), 1) ifndef USE_OPENMP ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT)) OBJS += test_fork.o endif endif -endif all : run_test diff --git a/utest/test_fork.c b/utest/test_fork.c index 9e0244305..9fc51287c 100644 --- a/utest/test_fork.c +++ b/utest/test_fork.c @@ -13,9 +13,9 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the OpenBLAS project nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" @@ -48,11 +48,13 @@ void* xmalloc(size_t n) } } -void check_dgemm(double *a, double *b, double *result, double *expected, int n) +void check_dgemm(double *a, double *b, double *result, double *expected, blasint n) { + char trans1 = 'T'; + char trans2 = 'N'; + double zerod = 0, oned = 1; int i; - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, - 1.0, a, n, b, n, 0.0, result, n); + BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, result, &n); for(i = 0; i < n * n; ++i) { ASSERT_DBL_NEAR_TOL(expected[i], result[i], DOUBLE_EPS); } @@ -60,7 +62,7 @@ void check_dgemm(double *a, double *b, double *result, double *expected, int n) CTEST(fork, safety) { - int n = 1000; + blasint n = 1000; int i; double *a, *b, *c, *d; @@ -84,8 +86,10 @@ CTEST(fork, safety) // Compute a DGEMM product in the parent process prior to forking to // ensure that the OpenBLAS thread pool is initialized. 
- cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, - 1.0, a, n, b, n, 0.0, c, n); + char trans1 = 'T'; + char trans2 = 'N'; + double zerod = 0, oned = 1; + BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, c, &n); fork_pid = fork(); if (fork_pid == -1) { From 9cf22b7d9129e186a1ee941fbab8e45328c50b61 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Jun 2018 13:27:30 +0200 Subject: [PATCH 22/24] Build cblas_iXamin interfaces --- interface/Makefile | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 9b2b93b83..20ec74e9e 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -260,7 +260,7 @@ HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \ idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX) CSBLAS1OBJS = \ - cblas_isamax.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ + cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) @@ -277,7 +277,7 @@ CSBLAS3OBJS = \ cblas_sgeadd.$(SUFFIX) CDBLAS1OBJS = \ - cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ + cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) @@ -294,7 +294,7 @@ CDBLAS3OBJS += \ cblas_dgeadd.$(SUFFIX) CCBLAS1OBJS = \ - cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ + cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ cblas_ccopy.$(SUFFIX) \ cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \ cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ @@ -320,7 +320,7 @@ CCBLAS3OBJS = \ CZBLAS1OBJS = \ - cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ + cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ cblas_zcopy.$(SUFFIX) \ cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ @@ -1359,6 +1359,18 @@ cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) +cblas_isamin.$(SUFFIX) cblas_isamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_idamin.$(SUFFIX) cblas_idamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_icamin.$(SUFFIX) cblas_icamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_izamin.$(SUFFIX) cblas_izamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) From eb71d61c7cb6640e66a5239d1113de8a8c1477df Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Jun 2018 13:31:09 +0200 Subject: [PATCH 23/24] Expose CBLAS interface to BLAS extensions iXamin --- cblas.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cblas.h b/cblas.h index 
89f78c133..6461f4209 100644 --- a/cblas.h +++ b/cblas.h @@ -82,6 +82,11 @@ CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); From 01440685379f11f158c5f612cf15fc279eb16c88 Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Mon, 25 Jun 2018 13:53:11 +0100 Subject: [PATCH 24/24] Rewrite &= -> = and simplify the initial blocking phase. --- driver/level3/level3_thread.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index aeb5e6ed4..ee3e3b9a9 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -344,12 +344,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) { - /* Make sure if no one is using workspace */ - START_RPCC(); - for (i = 0; i < args -> nthreads; i++) - while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; - STOP_RPCC(waiting1); - #if defined(FUSED_GEMM) && !defined(TIMING) /* Fused operation to copy region of B into workspace and apply kernel */ @@ -387,10 +381,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } #endif - /* Set flag so other threads can access local region of B */ - for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) + for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) { + /* Make sure if no one is using workspace */ + START_RPCC(); + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; + STOP_RPCC(waiting1); + /* Set flag so other threads can access local region of B */ job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; - WMB; + WMB; + } } /* Get regions of B from other threads and apply kernel */ @@ -426,13 +425,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Clear synchronization flag if this thread is done with other region of B */ if (m_to - m_from == min_i) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; WMB; } } } while (current != mypos); - /* Iterate through steps of m + /* Iterate through steps of m * Note: First step has already been finished */ 
for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; @@ -462,14 +461,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, is, js); STOP_RPCC(kernel); - + #ifdef TIMING ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l; #endif - + /* Clear synchronization flag if this thread is done with region of B */ if (is + min_i >= m_to) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; WMB; } }
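On the `&= 0` to `= 0` change: clearing a shared flag with `&= 0` is a read-modify-write, so the thread first reads a word it is about to discard anyway, while a plain `= 0` store expresses the intent (release the slot) without that extra read of shared memory. The sketch below restates the handoff pattern of the `job[].working[][]` slots with C11 atomics purely for illustration; it is not OpenBLAS code, which uses plain `BLASLONG` slots together with its own `WMB`/`MB` barrier macros and `YIELDING` rather than `<stdatomic.h>`.

```c
/* Illustrative sketch (assumed, not taken from OpenBLAS): one thread publishes
 * a buffer address in a shared slot, another consumes it and then releases the
 * slot with a plain store rather than a read-modify-write such as "slot &= 0". */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic long slot = 0;           /* 0 = free, nonzero = published buffer */

static void *producer(void *arg) {
  (void)arg;
  while (atomic_load(&slot) != 0)       /* wait until the slot is free */
    ;                                   /* real code yields instead of spinning hard */
  atomic_store(&slot, 0x1000L);         /* publish a (fake) buffer address */
  return NULL;
}

static void *consumer(void *arg) {
  long v;
  (void)arg;
  while ((v = atomic_load(&slot)) == 0) /* wait for a buffer to appear */
    ;
  printf("consumed buffer %#lx\n", (unsigned long)v);
  atomic_store(&slot, 0);               /* release: plain store, no "&=" */
  return NULL;
}

int main(void) {
  pthread_t p, c;
  pthread_create(&c, NULL, consumer, NULL);
  pthread_create(&p, NULL, producer, NULL);
  pthread_join(p, NULL);
  pthread_join(c, NULL);
  return 0;
}
```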