Merge pull request #1 from xianyi/develop

rebase
2019-05-08 19:46:44 +02:00 · 2019-05-08 19:46:44 +02:00 · ede3cab6e6
parent 91943b7325 ad20ceaa68
commit ede3cab6e6
46 changed files with 8469 additions and 78 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -25,6 +25,15 @@ matrix:
        - TARGET_BOX=LINUX64
        - BTYPE="BINARY=64"

+    - <<: *test-ubuntu
+      os: linux-ppc64le
+      before_script:
+        - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
+      env:
+        # for matrix annotation only
+        - TARGET_BOX=PPC64LE_LINUX
+        - BTYPE="BINARY=64 USE_OPENMP=1"
+
    - <<: *test-ubuntu
      env:
        - TARGET_BOX=LINUX64
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
 project(OpenBLAS C ASM)
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 6.dev)
+set(OpenBLAS_PATCH_VERSION 7.dev)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")

 # Adhere to GNU filesystem layout conventions
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@ -167,4 +167,7 @@ In chronological order:
  * [2017-02-26] ztrmm kernel for IBM z13
  * [2017-03-13] strmm and ctrmm kernel for IBM z13
  * [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13
-
+  * [2018-03-07] added missing Blas Level 1-2  (double precision) simd codes
+  * [2019-02-01] added missing Blas Level-1,2 (single precision)  simd codes
+  * [2019-03-14] power9 dgemm/dtrmm kernel
+  * [2019-04-29] power9 sgemm/strmm kernel 
--- a/Changelog.txt
+++ b/Changelog.txt
@ -1,4 +1,82 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.6
+29-Apr-2019
+
+common:
+	* the build tools now check that a given cpu TARGET is actually valid
+	* the build-time check of system features (c_check) has been made
+  	  less dependent on particular perl features (this should mainly
+  	  benefit building on Windows)
+	* several problems with the ReLAPACK integration were fixed,
+	  including INTERFACE64 support and building a shared library
+	* building with CMAKE on BSD systems was improved
+	* a non-absolute SUM function was added based on the
+  	  existing optimized code for ASUM
+	* CBLAS interfaces to the IxMIN and IxMAX functions were added
+	* a name clash between LAPACKE and BOOST headers was resolved
+	* CMAKE builds with OpenMP failed to include the appropriate getrf_parallel
+	  kernels
+	* a crash on thread (key) deletion with the USE_TLS=1 memory management
+	  option was fixed
+	* restored several earlier fixes, in particular for OpenMP performance,
+  	  building on BSD, and calling fork on CYGWIN, which had inadvertently
+  	  been dropped in the 0.3.3 rewrite of the memory management code.
+
+x86_64:
+	* the AVX512 DGEMM kernel has been disabled again due to unsolved problems
+	* building with old versions of MSVC was fixed
+	* it is now possible to build a static library on Windows with CMAKE
+	* accessing environment variables on CYGWIN at run time was fixed
+	* the CMAKE build system now recognizes 32bit userspace on 64bit hardware
+	* Intel "Denverton" atom and Hygon "Dhyana" zen CPUs are now autodetected
+	* building for DYNAMIC_ARCH with a DYNAMIC_LIST of targets is now supported
+  	  with CMAKE as well
+	* building for DYNAMIC_ARCH with GENERIC as the default target is now supported
+	* a buffer overflow in the SSE GEMM kernel for Intel Nano targets was fixed
+	* assembly bugs involving undeclared modification of input operands were fixed
+  	  in the AXPY, DOT, GEMV, GER, SCAL, SYMV and TRSM microkernels for Nehalem,
+	  Sandybridge, Haswell, Bulldozer and Piledriver. These would typically cause
+	  test failures or segfaults when compiled with recent versions of gcc from 8 onward.
+	* a similar bug was fixed in the blas_quickdivide code used to split workloads
+	  in most functions
+	* a bug in the IxMIN implementation for the GENERIC target made it return the result of IxMAX
+	* fixed building on SkylakeX systems when either the compiler or the (emulated) operating
+	  environment does not support AVX512
+	* improved GEMM performance on ZEN targets
+
+x86:
+	* build failures caused by the recently added checks for AVX512 were fixed
+	* an inline assembly bug involving undeclared modification of an input argument was
+  	  fixed in the blas_quickdivide code used to split workloads in most functions
+	* a bug in the IMIN implementation for the GENERIC target made it return the result of IMAX
+
+MIPS32:
+	* a bug in the IMIN implementation made it return the result of IMAX
+
+POWER:
+	* single precision BLAS1/2 functions have received optimized POWER8 kernels
+	* POWER9 is now a separate target, with an optimized DGEMM/DTRMM kernel
+	* building on PPC970 systems under OSX Leopard or Tiger is now supported
+	* out-of-bounds memory accesses in the gemm_beta microkernels were fixed
+	* building a shared library on AIX is now supported for POWER6
+	* DYNAMIC_ARCH support has been added for POWER6 and newer
+
+ARMv7:
+	* corrected xDOT behaviour with zero INC_X or INC_Y
+	* a bug in the IMIN implementation made it return the result of IMAX
+
+ARMv8:
+	* added support for HiSilicon TSV110 cpus
+	* the CMAKE build system now recognizes 32bit userspace on 64bit hardware
+	* cross-compilation with CMAKE now works again
+	* a bug in the IMIN implementation made it return the result of IMAX
+	* ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7
+
+IBM Z:
+	* optimized microkernels for single precision BLAS1/2 functions have been added
+	  for both Z13 and Z14
+
 ====================================================================
 Version 0.3.5
 31-Dec-2018
--- a/Makefile.rule
+++ b/Makefile.rule
@ -3,7 +3,7 @@
 #

 # This library's version
-VERSION = 0.3.6.dev
+VERSION = 0.3.7.dev

 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@ -181,17 +181,17 @@ NO_AFFINITY = 1
 # time out to improve performance. This number should be from 4 to 30
 # which corresponds to (1 << n) cycles. For example, if you set to 26,
 # thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz
-# system). Also you can control this mumber by THREAD_TIMEOUT
+# system). Also you can control this number by THREAD_TIMEOUT
 # CCOMMON_OPT	+= -DTHREAD_TIMEOUT=26

-# Using special device driver for mapping physically contigous memory
+# Using special device driver for mapping physically contiguous memory
 # to the user space. If bigphysarea is enabled, it will use it.
 # DEVICEDRIVER_ALLOCATION = 1

 # If you need to synchronize FP CSR between threads (for x86/x86_64 only).
 # CONSISTENT_FPCSR = 1

-# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
+# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
 # with single thread. (Actually in recent versions this is a factor proportional to the
 # number of floating point operations necessary for the given problem size, no longer
 # an individual dimension). You can use this setting to avoid the overhead of multi-
--- a/README.md
+++ b/README.md
@ -10,7 +10,7 @@ AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n

 OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.

-Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>.
+Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.

 ## Binary Packages

@ -22,7 +22,7 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge

 ## Installation from Source

-Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code
+Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
 using Git from https://github.com/xianyi/OpenBLAS.git.

 ### Dependencies
@ -63,9 +63,7 @@ A debug version can be built using `make DEBUG=1`.

 ### Compile with MASS support on Power CPU (optional)

-The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library
-consists of a set of mathematical functions for C, C++, and Fortran applications that are
-are tuned for optimum performance on POWER architectures.
+The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures.
 OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
 The library can be installed as shown:

@ -115,6 +113,7 @@ Please read `GotoBLAS_01Readme.txt`.
 - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
 - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
 - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
+- **AMD ZEN**: Uses Haswell codes with some optimizations.

 #### MIPS64

@ -133,11 +132,13 @@ Please read `GotoBLAS_01Readme.txt`.

 #### PPC/PPC64

- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1`
+- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1`
+- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only. 

 #### IBM zEnterprise System

 - **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
+- **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision)

 ### Supported OS

--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@ -0,0 +1,40 @@
+# Starter pipeline
+# Start with a minimal pipeline that you can customize to build and deploy your code.
+# Add steps that build, run tests, deploy, and more:
+# https://aka.ms/yaml
+
+trigger:
+- master
+
+pool:
+  vmImage: 'ubuntu-latest'
+
+steps:
+- script: echo Hello, world!
+  displayName: 'Run a one-line script'
+
+#- script: |
+#    docker run --rm --privileged multiarch/qemu-user-static:register --reset
+#    ls /proc/sys/fs/binfmt_misc/
+#  condition: not(startsWith(variables['CONFIG'], 'linux_64'))
+#  displayName: 'Configure binfmt_misc'
+
+- script: |
+    echo "FROM openblas/alpine:arm32
+        COPY . /tmp/openblas
+        RUN mkdir /tmp/openblas/build                             &&  \
+            cd /tmp/openblas/build                                &&  \
+            CC=gcc cmake -D DYNAMIC_ARCH=OFF                  \
+                                 -D TARGET=ARMV6             \
+                                 -D BUILD_SHARED_LIBS=ON              \
+                                 -D BUILD_WITHOUT_LAPACK=ON           \
+                                 -D BUILD_WITHOUT_CBLAS=ON            \
+                                 -D CMAKE_BUILD_TYPE=Release ../  &&  \
+            cmake --build ." > Dockerfile
+    docker build .
+  displayName: Run ARMV6 docker build
+
+#- script: |
+#    echo Add other tasks to build, test, and deploy your project.
+#    echo See https://aka.ms/yaml
+#  displayName: 'Run a multi-line script'
--- a/cmake/kernel.cmake
+++ b/cmake/kernel.cmake
@ -1,7 +1,7 @@
 # helper functions for the kernel CMakeLists.txt


-# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file.
+# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file.
 macro(SetDefaultL1)
  set(SAMAXKERNEL amax.S)
  set(DAMAXKERNEL amax.S)
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@ -283,7 +283,7 @@ endif ()

 set(KERNELDIR	"${PROJECT_SOURCE_DIR}/kernel/${ARCH}")

-# TODO: nead to convert these Makefiles
+# TODO: need to convert these Makefiles
 # include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake

 if (${CORE} STREQUAL "PPC440")
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in)
  set(CODES_OUT ${CODES_OUT} PARENT_SCOPE)
 endfunction ()

-# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition
+# generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition
 # @param sources_in the source files to build from
 # @param defines_in (optional) preprocessor definitions that will be applied to all objects
 # @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended.
--- a/common_stackalloc.h
+++ b/common_stackalloc.h
@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * SIZE must be carefully chosen to be:
 * - as small as possible to maximize the number of stack allocation
 * - large enough to support all architectures and kernel
- * Chosing a too small SIZE will lead to a stack smashing.
+ * Choosing a SIZE too small will lead to stack smashing.
 */
 #define STACK_ALLOC(SIZE, TYPE, BUFFER)                                        \
  /* make it volatile because some function (ex: dgemv_n.S) */                 \
--- a/common_x86.h
+++ b/common_x86.h
@ -214,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
 #endif

 #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
-//Enable some optimazation for barcelona.
+//Enable some optimization for barcelona.
 #define BARCELONA_OPTIMIZATION
 #endif

--- a/common_x86_64.h
+++ b/common_x86_64.h
@ -276,7 +276,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
 #ifdef ASSEMBLER

 #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
-//Enable some optimazation for barcelona.
+//Enable some optimization for barcelona.
 #define BARCELONA_OPTIMIZATION
 #endif

--- a/ctest/c_cblat1.f
+++ b/ctest/c_cblat1.f
@ -577,7 +577,7 @@
      SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
--- a/ctest/c_dblat1.f
+++ b/ctest/c_dblat1.f
@ -653,7 +653,7 @@
      SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
--- a/ctest/c_sblat1.f
+++ b/ctest/c_sblat1.f
@ -653,7 +653,7 @@
      SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
--- a/ctest/c_zblat1.f
+++ b/ctest/c_zblat1.f
@ -577,7 +577,7 @@
      SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
--- a/driver/others/blas_server.c
+++ b/driver/others/blas_server.c
@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout();
 /* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when     */
 /* jobs is queued.                                                  */

-/* We need this grobal for cheking if initialization is finished.   */
+/* We need this global for checking if initialization is finished.  */
 int blas_server_avail   __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;

 /* Local Variables */
@ -150,7 +150,7 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT));

 #ifdef MONITOR

-/* Monitor is a function to see thread's status for every seconds. */
+/* Monitor is a function to see thread's status for every second. */
 /* Usually it turns off and it's for debugging.                   */

 static pthread_t      monitor_thread;
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@ -50,7 +50,7 @@

 /* This is a thread implementation for Win32 lazy implementation */

-/* Thread server common infomation */
+/* Thread server common information */
 typedef struct{
  CRITICAL_SECTION lock;
  HANDLE filled;
@ -61,7 +61,7 @@ typedef struct{

 } blas_pool_t;

-/* We need this global for cheking if initialization is finished.   */
+/* We need this global for checking if initialization is finished.   */
 int blas_server_avail = 0;

 /* Local Variables */
--- a/driver/others/init.c
+++ b/driver/others/init.c
@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) {

  int mynode = 1;

-  /* if number of threads is larger than inital condition */
+  /* if number of threads is larger than initial condition */
  if (pos < 0) {
      sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]);
      return 0;
@ -857,7 +857,14 @@ void gotoblas_affinity_init(void) {
  common -> shmid = pshmid;

  if (common -> magic != SH_MAGIC) {
+
+#if defined(__GLIBC_PREREQ)
+#if __GLIBC_PREREQ(2, 7)
    cpu_set_t *cpusetp;
+#else
+    cpu_set_t cpuset;
+#endif
+#endif    
    int nums;
    int ret;

@ -890,7 +897,7 @@ void gotoblas_affinity_init(void) {
    }
    CPU_FREE(cpusetp);
 #else
-    ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
+    ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset);
    if (ret!=0) {
        common->num_procs = nums;
    } else {
@ -898,11 +905,11 @@ void gotoblas_affinity_init(void) {
    int i;
    int n = 0;
    for (i=0;i<nums;i++)
-        if (CPU_ISSET(i,cpusetp)) n++;
+        if (CPU_ISSET(i,&cpuset)) n++;
    common->num_procs = n;
    }
 #else
-    common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
+    common->num_procs = CPU_COUNT(&cpuset);
    }
 #endif

--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@ -229,7 +229,7 @@ int get_num_procs(void) {
  n=0;
  #if !__GLIBC_PREREQ(2, 6)
  for (i=0;i<nums;i++)
-     if (CPU_ISSET(i,cpuset)) n++;
+     if (CPU_ISSET(i,&cpuset)) n++;
  nums=n;
  #else
  nums = CPU_COUNT(sizeof(cpuset),&cpuset);
@ -1772,7 +1772,7 @@ int get_num_procs(void) {
  n=0;
  #if !__GLIBC_PREREQ(2, 6)
  for (i=0;i<nums;i++)
-     if (CPU_ISSET(i,cpuset)) n++;
+     if (CPU_ISSET(i,&cpuset)) n++;
  nums=n;
  #else
  nums = CPU_COUNT(sizeof(cpuset),&cpuset);
@ -2751,7 +2751,7 @@ void *blas_memory_alloc(int procpos){

 #ifdef ALLOC_DEVICEDRIVER
 	if ((*func ==  alloc_devicedirver) && (map_address == (void *)-1)) {
-	    fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
+	    fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n");
 	}
 #endif

--- a/2
+++ b/2
@ -125,7 +125,7 @@ if ($compiler eq "") {
 	    $openmp = "-openmp";
 	}

-	# for embeded underscore name, e.g. zho_ge, it may append 2 underscores.
+	# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
 	$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
 	if ($data =~ / zho_ge__/) {
 	    $need2bu       = 1;
--- a/interface/CMakeLists.txt
+++ b/interface/CMakeLists.txt
@ -24,7 +24,7 @@ set(BLAS1_MANGLED_SOURCES
  axpby.c
 )

-# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f
+# TODO: USE_NETLIB_GEMV should switch gemv.c to netlib/*gemv.f
 # these all have 'z' sources for complex versions
 set(BLAS2_SOURCES
  gemv.c ger.c
--- a/interface/axpy.c
+++ b/interface/axpy.c
@ -91,7 +91,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
  //disable multi-thread when incx==0 or incy==0
  //In that case, the threads would be dependent.
  //
-  //Temporarily work-around the low performance issue with small imput size &
+  //Temporarily work-around the low performance issue with small input size &
  //multithreads.
  if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
 	  nthreads = 1;
--- a/interface/zaxpy.c
+++ b/interface/zaxpy.c
@ -99,7 +99,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
  //disable multi-thread when incx==0 or incy==0
  //In that case, the threads would be dependent.
  //
-  //Temporarily work-around the low performance issue with small imput size &
+  //Temporarily work-around the low performance issue with small input size &
  //multithreads.
  if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
 	  nthreads = 1;
--- a/kernel/power/KERNEL.POWER9
+++ b/kernel/power/KERNEL.POWER9
@ -3,12 +3,12 @@
 #CGEMM_BETA = ../generic/zgemm_beta.c
 #ZGEMM_BETA = ../generic/zgemm_beta.c

-STRMMKERNEL	= strmm_kernel_16x8_power8.S
+STRMMKERNEL	= sgemm_kernel_power9.S
 DTRMMKERNEL	= dgemm_kernel_power9.S
 CTRMMKERNEL	= ctrmm_kernel_8x4_power8.S
 ZTRMMKERNEL	= ztrmm_kernel_8x2_power8.S

-SGEMMKERNEL    =  sgemm_kernel_16x8_power8.S
+SGEMMKERNEL    =  sgemm_kernel_power9.S
 SGEMMINCOPY    = ../generic/gemm_ncopy_16.c
 SGEMMITCOPY    = sgemm_tcopy_16_power8.S
 SGEMMONCOPY    =  ../generic/gemm_ncopy_8.c
--- a/kernel/power/icamax.c
+++ b/kernel/power/icamax.c
@ -75,7 +75,7 @@ static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector
 static BLASLONG   ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { 

    BLASLONG index;
-    BLASLONG i;
+    BLASLONG i=0;
 #if  defined(USE_MASK_PERMUTATIONS)    
    register __vector unsigned int static_index0 = {0,1,2,3};
 #else
--- a/kernel/power/icamin.c
+++ b/kernel/power/icamin.c
@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 static BLASLONG   ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { 

    BLASLONG index;
-    BLASLONG i;
+    BLASLONG i=0;
    register __vector unsigned int static_index0 = {0,1,2,3};
    register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
    register __vector unsigned int temp1=  temp0<<1;  //{8,8,8,8}
--- a/kernel/power/sgemm_kernel_power9.S
+++ b/kernel/power/sgemm_kernel_power9.S
@ -0,0 +1,286 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+ 
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+ 
+#define LOAD	ld
+#define STACKSIZE  (512 )  
+  
+#define	M	r3
+#define	N	r4
+#define	K	r5
+
+ 
+#define A	r7
+#define	B	r8
+#define	C	r9
+#define	LDC	r10
+#define OFFSET	r6
+ 
+ 
+
+#define alpha_r vs20
+#define save_permute_1 vs21
+#define save_permute_2 vs22
+#define permute_mask vs23
+#define o0	0
+ 
+
+#define T1	r11
+#define T2	r12
+#define T3	r14
+#define T4	r15
+#define T5	r16
+#define T6	r17
+#define L	r18
+#define T7	r19
+#define T8	r20
+#define TEMP_REG	r21
+#define	I	r22
+#define J	r23
+#define AO	r24
+#define	BO	r25
+#define	CO 	r26
+#define T9	r27
+#define	T10	r28
+#define	T11	r29
+
+#define T12	r30
+#define T13	r31
+
+#include "sgemm_macros_power9.S"
+
+.equ    perm_const1, 0x0405060700010203
+.equ    perm_const2, 0x0c0d0e0f08090a0b
+.equ save_permute_11, 0x1415161718191a1b
+.equ save_permute_12, 0x0405060708090a0b
+.equ save_permute_21, 0x101112131c1d1e1f
+.equ save_permute_22, 0x000102030c0d0e0f 
+
+
+#ifndef NEEDPARAM
+
+	PROLOGUE
+	PROFCODE
+
+	addi	SP, SP, -STACKSIZE
+	li	r0, 0
+
+	stfd	f14,    0(SP)
+	stfd	f15,    8(SP)
+	stfd	f16,   16(SP)
+	stfd	f17,   24(SP)
+
+	stfd	f18,   32(SP)
+	stfd	f19,   40(SP)
+	stfd	f20,   48(SP)
+	stfd	f21,   56(SP)
+
+	stfd	f22,   64(SP)
+	stfd	f23,   72(SP)
+	stfd	f24,   80(SP)
+	stfd	f25,   88(SP)
+
+	stfd	f26,   96(SP)
+	stfd	f27,  104(SP)
+	stfd	f28,  112(SP)
+	stfd	f29,  120(SP)
+
+	stfd	f30,  128(SP)
+	stfd	f31,  136(SP)
+
+ 
+	std	r31,  144(SP)
+	std	r30,  152(SP)
+	std	r29,  160(SP)
+	std	r28,  168(SP)
+	std	r27,  176(SP)
+	std	r26,  184(SP)
+	std	r25,  192(SP)
+	std	r24,  200(SP)
+	std	r23,  208(SP)
+	std	r22,  216(SP)
+	std	r21,  224(SP)
+	std	r20,  232(SP)
+	std	r19,  240(SP)
+	std	r18,  248(SP)
+	std	r17,  256(SP)
+	std	r16,  264(SP)
+	std	r15,  272(SP)
+	std	r14,  280(SP)
+ 
+ 
+	stxv    v20,  288(SP)
+	stxv    v21,  304(SP)
+	stxv    v22,  320(SP)
+	stxv    v23,  336(SP)
+	stxv    v24,  352(SP)
+	stxv    v25,  368(SP)
+	stxv    v26,  384(SP)
+	stxv    v27,  400(SP)
+	stxv    v28,  416(SP)
+	stxv    v29,  432(SP)
+	stxv    v30,  448(SP)
+	stxv    v31,  464(SP)
+
+ 
+
+#if defined(TRMMKERNEL) 
+	ld	OFFSET,  FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+   slwi    LDC, LDC, 2
+
+
+/*	cmpwi	cr0, M, 0
+	ble	.L999_H1
+	cmpwi	cr0, N, 0
+	ble	.L999_H1
+	cmpwi	cr0, K, 0
+	ble	.L999_H1
+*/
+ 
+ 
+	/*alpha is stored in f1. convert to single and splat*/
+  xscvdpspn alpha_r,vs1 
+	xxspltw   alpha_r,alpha_r,0
+ 
+ 
+/*load reverse permute mask for big endian
+  uint128 = 0x0c0d0e0f08090a0b0405060700010203
+*/ 
+		
+	lis T2, perm_const2@highest
+	ori T2, T2, perm_const2@higher
+	rldicr T2, T2, 32, 31
+	oris T2, T2, perm_const2@h
+	ori T2, T2, perm_const2@l 
+
+	lis T1, perm_const1@highest
+	ori T1, T1, perm_const1@higher
+	rldicr T1, T1, 32, 31
+	oris T1, T1, perm_const1@h
+	ori T1, T1, perm_const1@l
+
+	mtvsrdd permute_mask,T2,T1
+
+	lis T2, save_permute_12@highest
+	ori T2, T2, save_permute_12@higher
+	rldicr T2, T2, 32, 31
+	oris T2, T2, save_permute_12@h
+	ori T2, T2, save_permute_12@l 
+
+	lis T1, save_permute_11@highest
+	ori T1, T1, save_permute_11@higher
+	rldicr T1, T1, 32, 31
+	oris T1, T1, save_permute_11@h
+	ori T1, T1, save_permute_11@l
+
+	mtvsrdd save_permute_1,T2,T1	
+
+	lis T2, save_permute_22@highest
+	ori T2, T2, save_permute_22@higher
+	rldicr T2, T2, 32, 31
+	oris T2, T2, save_permute_22@h
+	ori T2, T2, save_permute_22@l 
+
+	lis T1, save_permute_21@highest
+	ori T1, T1, save_permute_21@higher
+	rldicr T1, T1, 32, 31
+	oris T1, T1, save_permute_21@h
+	ori T1, T1, save_permute_21@l
+
+	mtvsrdd save_permute_2,T2,T1	
+
+#include "sgemm_logic_power9.S"
+
+.L999:
+	addi	r3, 0, 0
+
+	lfd	f14,    0(SP)
+	lfd	f15,    8(SP)
+	lfd	f16,   16(SP)
+	lfd	f17,   24(SP)
+
+	lfd	f18,   32(SP)
+	lfd	f19,   40(SP)
+	lfd	f20,   48(SP)
+	lfd	f21,   56(SP)
+
+	lfd	f22,   64(SP)
+	lfd	f23,   72(SP)
+	lfd	f24,   80(SP)
+	lfd	f25,   88(SP)
+
+	lfd	f26,   96(SP)
+	lfd	f27,  104(SP)
+	lfd	f28,  112(SP)
+	lfd	f29,  120(SP)
+
+	lfd	f30,  128(SP)
+	lfd	f31,  136(SP)
+
+	ld	r31,  144(SP)
+	ld	r30,  152(SP)
+	ld	r29,  160(SP)
+	ld	r28,  168(SP)
+	ld	r27,  176(SP)
+	ld	r26,  184(SP)
+	ld	r25,  192(SP)
+	ld	r24,  200(SP)
+	ld	r23,  208(SP)
+	ld	r22,  216(SP)
+	ld	r21,  224(SP)
+	ld	r20,  232(SP)
+	ld	r19,  240(SP)
+	ld	r18,  248(SP)
+	ld	r17,  256(SP)
+	ld	r16,  264(SP)
+	ld	r15,  272(SP)
+	ld	r14,  280(SP)
+ 
+	lxv    v20,  288(SP)
+	lxv    v21,  304(SP)
+	lxv    v22,  320(SP)
+	lxv    v23,  336(SP)
+	lxv    v24,  352(SP)
+	lxv    v25,  368(SP)
+	lxv    v26,  384(SP)
+	lxv    v27,  400(SP)
+	lxv    v28,  416(SP)
+	lxv    v29,  432(SP)
+	lxv    v30,  448(SP)
+	lxv    v31,  464(SP)
+
+ 
+	addi	SP, SP, STACKSIZE
+	blr
+
+	EPILOGUE
+#endif
--- a/kernel/power/sgemm_logic_power9.S
+++ b/kernel/power/sgemm_logic_power9.S
--- a/kernel/power/sgemm_macros_power9.S
+++ b/kernel/power/sgemm_macros_power9.S
--- a/kernel/x86_64/KERNEL.SKYLAKEX
+++ b/kernel/x86_64/KERNEL.SKYLAKEX
@ -9,8 +9,8 @@ SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c

 #DGEMMKERNEL    =  dgemm_kernel_4x8_skylakex.c

-DGEMMINCOPY    =  dgemm_ncopy_8_skylakex.c
-DGEMMITCOPY    =  dgemm_tcopy_8_skylakex.c
+#DGEMMINCOPY    =  dgemm_ncopy_8_skylakex.c
+#DGEMMITCOPY    =  dgemm_tcopy_8_skylakex.c
 DGEMMONCOPY    =  dgemm_ncopy_8_skylakex.c
 DGEMMOTCOPY    =  dgemm_tcopy_8_skylakex.c

--- a/param.h
+++ b/param.h
@ -2248,12 +2248,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ZGEMM_DEFAULT_UNROLL_M 8
 #define ZGEMM_DEFAULT_UNROLL_N 2

-#define SGEMM_DEFAULT_P  1280
+#define SGEMM_DEFAULT_P 640
 #define DGEMM_DEFAULT_P  128
 #define CGEMM_DEFAULT_P  640
 #define ZGEMM_DEFAULT_P  320

-#define SGEMM_DEFAULT_Q  640
+#define SGEMM_DEFAULT_Q 1408
 #define DGEMM_DEFAULT_Q  384
 #define CGEMM_DEFAULT_Q  640
 #define ZGEMM_DEFAULT_Q  640
--- a/relapack/config.h
+++ b/relapack/config.h
@ -36,8 +36,8 @@
 // allow malloc in xsygst for improved performance
 #define XSYGST_ALLOW_MALLOC ALLOW_MALLOC
 // allow malloc in xsytrf if the passed work buffer is too small
-#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC
-
+//#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC
+#define XSYTRF_ALLOW_MALLOC 0

 ////////////////////////////////
 // LAPACK routine replacement //
--- a/relapack/src/cgbtrf.c
+++ b/relapack/src/cgbtrf.c
@ -221,7 +221,9 @@ static void RELAPACK_cgbtrf_rec(
    }

    // recursion(Ab_BR, ipiv_B)
-    RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
+    //RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
+       LAPACK(cgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info);
+       
    if (*info)
        *info += n1;
    // shift pivots
--- a/relapack/src/cgetrf.c
+++ b/relapack/src/cgetrf.c
@ -22,7 +22,7 @@ void RELAPACK_cgetrf(
        *info = -1;
    else if (*n < 0)
        *info = -2;
-    else if (*ldA < MAX(1, *n))
+    else if (*ldA < MAX(1, *m))
        *info = -4;
    if (*info) {
        const blasint minfo = -*info;
--- a/relapack/src/dgbtrf.c
+++ b/relapack/src/dgbtrf.c
@ -1,5 +1,6 @@
 #include "relapack.h"
-#include "stdlib.h"
+#include <stdlib.h>
+#include <stdio.h>
 static void RELAPACK_dgbtrf_rec(const blasint *, const blasint *, const blasint *,
    const blasint *, double *, const blasint *, blasint *, double *, const blasint *, double *,
    const blasint *, blasint *);
@ -218,7 +219,8 @@ static void RELAPACK_dgbtrf_rec(
    }

    // recursion(Ab_BR, ipiv_B)
-    RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
+//    RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
+        LAPACK(dgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info);
    if (*info)
        *info += n1;
    // shift pivots
--- a/relapack/src/dgetrf.c
+++ b/relapack/src/dgetrf.c
@ -15,16 +15,15 @@ void RELAPACK_dgetrf(
    double *A, const blasint *ldA, blasint *ipiv,
    blasint *info
 ) {
-
    // Check arguments
    *info = 0;
    if (*m < 0)
        *info = -1;
    else if (*n < 0)
        *info = -2;
-    else if (*ldA < MAX(1, *n))
+    else if (*ldA < MAX(1, *m))
        *info = -4;
-    if (*info) {
+    if (*info!=0) {
        const blasint minfo = -*info;
        LAPACK(xerbla)("DGETRF", &minfo, strlen("DGETRF"));
        return;
--- a/relapack/src/sgbtrf.c
+++ b/relapack/src/sgbtrf.c
@ -55,15 +55,16 @@ void RELAPACK_sgbtrf(

    // Allocate work space
    const blasint n1 = SREC_SPLIT(*n);
-    const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
-    const blasint nWorkl = (kv > n1) ? n1 : kv;
-    const blasint mWorku = (*kl > n1) ? n1 : *kl;
-    const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
+    const blasint mWorkl = abs( (kv > n1) ? MAX(1, *m - *kl) : kv );
+    const blasint nWorkl = abs( (kv > n1) ? n1 : kv );
+    const blasint mWorku = abs( (*kl > n1) ? n1 : *kl );
+    const blasint nWorku = abs( (*kl > n1) ? MAX(0, *n - *kl) : *kl );
    float *Workl = malloc(mWorkl * nWorkl * sizeof(float));
    float *Worku = malloc(mWorku * nWorku * sizeof(float));
    LAPACK(slaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
    LAPACK(slaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);

+
    // Recursive kernel
    RELAPACK_sgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);

@ -81,6 +82,7 @@ static void RELAPACK_sgbtrf_rec(
    blasint *info
 ) {

+
    if (*n <= MAX(CROSSOVER_SGBTRF, 1)) {
        // Unblocked
        LAPACK(sgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
@ -155,6 +157,7 @@ static void RELAPACK_sgbtrf_rec(
    float *const A_BRbl = A_BR              + m21;
    float *const A_BRbr = A_BR + *ldA * n21 + m21;

+
    // recursion(Ab_L, ipiv_T)
    RELAPACK_sgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);

@ -216,8 +219,11 @@ static void RELAPACK_sgbtrf_rec(
        }
    }

+
    // recursion(Ab_BR, ipiv_B)
-    RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
+// NOTE: the recursive call below is a suspected cause of infinite recursion and is disabled.
+//      RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
+        LAPACK(sgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info);
    if (*info)
        *info += n1;
    // shift pivots
--- a/relapack/src/sgetrf.c
+++ b/relapack/src/sgetrf.c
@ -1,5 +1,4 @@
 #include "relapack.h"
-
 static void RELAPACK_sgetrf_rec(const blasint *, const blasint *, float *, const blasint *,
    blasint *, blasint *);

@ -22,16 +21,14 @@ void RELAPACK_sgetrf(
        *info = -1;
    else if (*n < 0)
        *info = -2;
-    else if (*ldA < MAX(1, *n))
+    else if (*ldA < MAX(1, *m))
        *info = -4;
    if (*info) {
        const blasint minfo = -*info;
        LAPACK(xerbla)("SGETRF", &minfo, strlen("SGETRF"));
        return;
    }
-
    const blasint sn = MIN(*m, *n);
-
    RELAPACK_sgetrf_rec(m, &sn, A, ldA, ipiv, info);

    // Right remainder
@ -61,7 +58,6 @@ static void RELAPACK_sgetrf_rec(
    float *A, const blasint *ldA, blasint *ipiv,
    blasint *info
 ) {
-
    if (*n <= MAX(CROSSOVER_SGETRF, 1)) {
        // Unblocked
        LAPACK(sgetf2)(m, n, A, ldA, ipiv, info);
@ -77,7 +73,6 @@ static void RELAPACK_sgetrf_rec(
    const blasint n1 = SREC_SPLIT(*n);
    const blasint n2 = *n - n1;
    const blasint m2 = *m - n1;
-
    // A_L A_R
    float *const A_L = A;
    float *const A_R = A + *ldA * n1;
--- a/relapack/src/zgbtrf.c
+++ b/relapack/src/zgbtrf.c
@ -56,10 +56,10 @@ void RELAPACK_zgbtrf(

    // Allocate work space
    const blasint n1 = ZREC_SPLIT(*n);
-    const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
-    const blasint nWorkl = (kv > n1) ? n1 : kv;
-    const blasint mWorku = (*kl > n1) ? n1 : *kl;
-    const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
+    const blasint mWorkl = abs ( (kv > n1) ? MAX(1, *m - *kl) : kv);
+    const blasint nWorkl = abs ( (kv > n1) ? n1 : kv);
+    const blasint mWorku = abs ( (*kl > n1) ? n1 : *kl);
+    const blasint nWorku = abs ( (*kl > n1) ? MAX(0, *n - *kl) : *kl);
    double *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(double));
    double *Worku = malloc(mWorku * nWorku * 2 * sizeof(double));
    LAPACK(zlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
@ -221,7 +221,9 @@ static void RELAPACK_zgbtrf_rec(
    }

    // recursion(Ab_BR, ipiv_B)
-    RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
+ //   RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
+ LAPACK(zgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info);
+ 
    if (*info)
        *info += n1;
    // shift pivots
--- a/relapack/src/zgetrf.c
+++ b/relapack/src/zgetrf.c
@ -22,7 +22,7 @@ void RELAPACK_zgetrf(
        *info = -1;
    else if (*n < 0)
        *info = -2;
-    else if (*ldA < MAX(1, *n))
+    else if (*ldA < MAX(1, *m))
        *info = -4;
    if (*info) {
        const blasint minfo = -*info;
--- a/test/cblat1.f
+++ b/test/cblat1.f
@ -576,7 +576,7 @@
      SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
--- a/test/dblat1.f
+++ b/test/dblat1.f
@ -991,7 +991,7 @@
      SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
--- a/test/sblat1.f
+++ b/test/sblat1.f
@ -946,7 +946,7 @@
      SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
--- a/test/zblat1.f
+++ b/test/zblat1.f
@ -576,7 +576,7 @@
      SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *