From 7f58f3ad0e10304965a6573bb11208cb6e1df446 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 27 Sep 2019 00:44:26 +0200 Subject: [PATCH 01/11] Fix mis-edits in the gcc-derived power8 caxpy kernel --- kernel/power/caxpy_power8.S | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/power/caxpy_power8.S b/kernel/power/caxpy_power8.S index 09a423571..0ce61ca3b 100644 --- a/kernel/power/caxpy_power8.S +++ b/kernel/power/caxpy_power8.S @@ -34,9 +34,9 @@ caxpy_k: lfs 0,4(10) fmuls 10,2,10 #ifdef CONJ - fmsubs 11,11,1,10 -#else fmadds 11,11,1,10 +#else + fmsubs 11,11,1,10 #endif fadds 12,12,11 stfs 12,0(10) @@ -241,8 +241,13 @@ caxpy_k: lfsx 12,8,5 lfsx 0,10,5 fmuls 11,2,11 +#ifdef CONJ fmsubs 12,1,12,11 fsubs 0,0,12 +#else + fmadds 12,1,12,11 + fadds 0,0,12 +#endif stfsx 0,10,5 ble 7,.L39 sldi 6,6,2 From 596a22325a1123bed772c61d298c8d14d187cfe3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 27 Sep 2019 00:47:18 +0200 Subject: [PATCH 02/11] Fix prologue of power9 assembly cdot(c) kernel to provide cdotc --- kernel/power/cdot_power9.S | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/power/cdot_power9.S b/kernel/power/cdot_power9.S index 01d194c0c..9ec7cdd85 100644 --- a/kernel/power/cdot_power9.S +++ b/kernel/power/cdot_power9.S @@ -1,10 +1,16 @@ - .file "cdot.c" +#define ASSEMBLER +#include "common.h" +/* +.file "cdot.c" .abiversion 2 .section ".text" .align 2 .p2align 4,,15 .globl cdot_k .type cdot_k, @function +*/ + PROLOGUE + cdot_k: .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha From ede5efebabb5dbde46175b996df59c755248bf29 Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Sun, 29 Sep 2019 02:27:50 +0000 Subject: [PATCH 03/11] trmm fix --- kernel/power/sgemm_logic_power9.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S index 053836cbf..a34ed32b8 100644 --- a/kernel/power/sgemm_logic_power9.S +++ 
b/kernel/power/sgemm_logic_power9.S @@ -136,8 +136,8 @@ LSGEMM_L8x16_BEGIN: #endif ZERO8x16 - mtctr L ble LSGEMM_L8x16_SUB0 + mtctr L bl LSGEMM_L8x16_LMAIN_SUB andi. L, T12, 127 ble LSGEMM_L8x16_SAVE @@ -146,7 +146,7 @@ LSGEMM_L8x16_BEGIN: LSGEMM_L8x16_SUB0: #if defined(TRMMKERNEL) andi. L, T11, 255 - cmpwi T11,128 + cmpwi T11,129 #else andi. L, K, 255 cmpwi K,129 From 6355c25dde1ccba0fe6521dc0b36c0fcdddda0ef Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Sun, 29 Sep 2019 22:03:12 -0700 Subject: [PATCH 04/11] Avoid taking root of negative number in symv_thread.c This is similar to fixes in gh-1929, but there was one remaining occurrence of this type of pattern in the driver/level2/*_thread.c files. --- driver/level2/symv_thread.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/driver/level2/symv_thread.c b/driver/level2/symv_thread.c index ab783de2b..d7cc01768 100644 --- a/driver/level2/symv_thread.c +++ b/driver/level2/symv_thread.c @@ -166,7 +166,11 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i if (nthreads - num_cpu > 1) { double di = (double)i; - width = ((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask; + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } if (width < 4) width = 4; if (width > m - i) width = m - i; @@ -212,9 +216,9 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i double di = (double)(m - i); if (di * di - dnum > 0) { - width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { - width = m - i; + width = m - i; } if (width < 4) width = 4; From 8617d75548ae7be8f78406c15f98e218ad89a42a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 1 Oct 2019 23:50:41 +0200 Subject: [PATCH 05/11] Revert "Avoid taking root of negative number in symv_thread.c" --- driver/level2/symv_thread.c | 
10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/driver/level2/symv_thread.c b/driver/level2/symv_thread.c index d7cc01768..ab783de2b 100644 --- a/driver/level2/symv_thread.c +++ b/driver/level2/symv_thread.c @@ -166,11 +166,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i if (nthreads - num_cpu > 1) { double di = (double)i; - if (di * di - dnum > 0) { - width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; - } else { - width = m - i; - } + width = ((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask; if (width < 4) width = 4; if (width > m - i) width = m - i; @@ -216,9 +212,9 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i double di = (double)(m - i); if (di * di - dnum > 0) { - width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { - width = m - i; + width = m - i; } if (width < 4) width = 4; From ac10236cc8a7b61e3fa37741ca903ea4d990a62e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Oct 2019 22:35:34 +0200 Subject: [PATCH 06/11] Update the OSX BINARY=32 test to xcode9.2 in response to Homebrew updates --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 2b1b99b26..51c55acf5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -169,7 +169,7 @@ matrix: - BTYPE="BINARY=64 INTERFACE64=1" - <<: *test-macos - osx_image: xcode8.3 + osx_image: xcode9.2 env: - BTYPE="BINARY=32" From 32f5907fef1b1a68a3af20278c4f3b3b54b5268b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Oct 2019 01:09:02 +0200 Subject: [PATCH 07/11] Update 32bit macOS again to xcode 9.3 os version 10.13 "High Sierra" appears to be the oldest release now for which Homebrew provides a gcc package. 
Anything older and the Travis job will run out of time building gcc from source --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 51c55acf5..28f95f5e2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -169,7 +169,7 @@ matrix: - BTYPE="BINARY=64 INTERFACE64=1" - <<: *test-macos - osx_image: xcode9.2 + osx_image: xcode9.3 env: - BTYPE="BINARY=32" From bb5413863fbf52dc5b8f2fd1b814b80c938d8c39 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 4 Oct 2019 14:50:03 +0200 Subject: [PATCH 08/11] Rewrite ARM64 PROLOGUE to make it compatible with xcode/ios --- common_arm64.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/common_arm64.h b/common_arm64.h index c6ef2fb5d..c5e6948dc 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -103,12 +103,14 @@ static inline int blas_quickdivide(blasint x, blasint y){ #if defined(ASSEMBLER) && !defined(NEEDPARAM) -#define PROLOGUE \ - .text ;\ - .align 4 ;\ - .global REALNAME ;\ - .type REALNAME, %function ;\ +.macro PROLOGUE + .text ; + .p2align 2 ; + .global REALNAME ; + .type REALNAME, %function ; REALNAME: +.endm + #define EPILOGUE From 56837e9d92c41290b07bc924915c633e39401abb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 4 Oct 2019 14:53:23 +0200 Subject: [PATCH 09/11] Make local labels in macro compatible with the xcode assembler ... which does not perform the automatic numbering on instantiation that the _@ suffix signifies --- kernel/arm64/nrm2.S | 19 ++++++++++--------- kernel/arm64/znrm2.S | 38 +++++++++++++++++++------------------- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/kernel/arm64/nrm2.S b/kernel/arm64/nrm2.S index e2cbd4def..d4f0374cb 100644 --- a/kernel/arm64/nrm2.S +++ b/kernel/arm64/nrm2.S @@ -54,37 +54,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) ldr s4, [X], #4 fcmp s4, REGZERO - beq KERNEL_F1_NEXT_\@ + beq 2f /* KERNEL_F1_NEXT_\@ */ + beq 2f fabs s4, s4 fcmp SCALE, s4 - bge KERNEL_F1_SCALE_GE_X_\@ + bge 1f /* KERNEL_F1_SCALE_GE_X_\@ */ fdiv s2, SCALE, s4 fmul s2, s2, s2 fmul s3, SSQ, s2 fadd SSQ, REGONE, s3 fmov SCALE, s4 - b KERNEL_F1_NEXT_\@ -KERNEL_F1_SCALE_GE_X_\@: + b 2f /* KERNEL_F1_NEXT_\@ */ +1: /* KERNEL_F1_SCALE_GE_X_\@: */ fdiv s2, s4, SCALE fmla SSQ, s2, v2.s[0] #else ldr d4, [X], #8 fcmp d4, REGZERO - beq KERNEL_F1_NEXT_\@ + beq 2f /* KERNEL_F1_NEXT_\@ */ fabs d4, d4 fcmp SCALE, d4 - bge KERNEL_F1_SCALE_GE_X_\@ + bge 1f /* KERNEL_F1_SCALE_GE_X_\@ */ fdiv d2, SCALE, d4 fmul d2, d2, d2 fmul d3, SSQ, d2 fadd SSQ, REGONE, d3 fmov SCALE, d4 - b KERNEL_F1_NEXT_\@ -KERNEL_F1_SCALE_GE_X_\@: + b 2f /* KERNEL_F1_NEXT_\@ */ +1: /* KERNEL_F1_SCALE_GE_X_\@: */ fdiv d2, d4, SCALE fmla SSQ, d2, v2.d[0] #endif -KERNEL_F1_NEXT_\@: +2: /* KERNEL_F1_NEXT_\@: */ .endm .macro KERNEL_S1 diff --git a/kernel/arm64/znrm2.S b/kernel/arm64/znrm2.S index 1c89685ea..ce3f7d4ed 100644 --- a/kernel/arm64/znrm2.S +++ b/kernel/arm64/znrm2.S @@ -54,69 +54,69 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) ldr s4, [X], #4 fcmp s4, REGZERO - beq KERNEL_F1_NEXT_\@ + beq 2f /* KERNEL_F1_NEXT_\@ */ fabs s4, s4 fcmp SCALE, s4 - bge KERNEL_F1_SCALE_GE_XR_\@ + bge 1f /* KERNEL_F1_SCALE_GE_XR_\@ */ fdiv s2, SCALE, s4 fmul s2, s2, s2 fmul s3, SSQ, s2 fadd SSQ, REGONE, s3 fmov SCALE, s4 - b KERNEL_F1_NEXT_\@ -KERNEL_F1_SCALE_GE_XR_\@: + b 2f /* KERNEL_F1_NEXT_\@ */ +1: /* KERNEL_F1_SCALE_GE_XR_\@: */ fdiv s2, s4, SCALE fmla SSQ, s2, v2.s[0] -KERNEL_F1_NEXT_\@: +2: /* KERNEL_F1_NEXT_\@: */ ldr s5, [X], #4 fcmp s5, REGZERO - beq KERNEL_F1_END_\@ + beq 4f /* KERNEL_F1_END_\@ */ fabs s5, s5 fcmp SCALE, s5 - bge KERNEL_F1_SCALE_GE_XI_\@ + bge 3f /* KERNEL_F1_SCALE_GE_XI_\@ */ fdiv s2, SCALE, s5 fmul s2, s2, s2 fmul s3, SSQ, s2 fadd SSQ, REGONE, s3 fmov SCALE, s5 - b KERNEL_F1_END_\@ -KERNEL_F1_SCALE_GE_XI_\@: + b 4f /* KERNEL_F1_END_\@ */ +3: /* KERNEL_F1_SCALE_GE_XI_\@: */ fdiv s2, s5, SCALE fmla SSQ, s2, v2.s[0] #else ldr d4, [X], #8 fcmp d4, REGZERO - beq KERNEL_F1_NEXT_\@ + beq 2f /* KERNEL_F1_NEXT_\@ */ fabs d4, d4 fcmp SCALE, d4 - bge KERNEL_F1_SCALE_GE_XR_\@ + bge 1f /* KERNEL_F1_SCALE_GE_XR_\@ */ fdiv d2, SCALE, d4 fmul d2, d2, d2 fmul d3, SSQ, d2 fadd SSQ, REGONE, d3 fmov SCALE, d4 - b KERNEL_F1_NEXT_\@ -KERNEL_F1_SCALE_GE_XR_\@: + b 2f /* KERNEL_F1_NEXT_\@ */ +1: /* KERNEL_F1_SCALE_GE_XR_\@: */ fdiv d2, d4, SCALE fmla SSQ, d2, v2.d[0] -KERNEL_F1_NEXT_\@: +2: /* KERNEL_F1_NEXT_\@: */ ldr d5, [X], #8 fcmp d5, REGZERO - beq KERNEL_F1_END_\@ + beq 4f /* KERNEL_F1_END_\@ */ fabs d5, d5 fcmp SCALE, d5 - bge KERNEL_F1_SCALE_GE_XI_\@ + bge 3f /* KERNEL_F1_SCALE_GE_XI_\@ */ fdiv d2, SCALE, d5 fmul d2, d2, d2 fmul d3, SSQ, d2 fadd SSQ, REGONE, d3 fmov SCALE, d5 - b KERNEL_F1_END_\@ -KERNEL_F1_SCALE_GE_XI_\@: + b 4f /* KERNEL_F1_END_\@ */ +3: /* KERNEL_F1_SCALE_GE_XI_\@: */ fdiv d2, d5, SCALE fmla SSQ, d2, v2.d[0] #endif -KERNEL_F1_END_\@: +4: /* KERNEL_F1_END_\@: */ .endm .macro KERNEL_S1 From 258ac56e0aa46e9b7120bcb5635d1bba48f4c2aa Mon Sep 17 00:00:00 2001 
From: Martin Kroeker Date: Sat, 5 Oct 2019 10:52:47 +0200 Subject: [PATCH 10/11] Move 32bit OSX build back to xcode 8.3 but switch to gcc8 --- .travis.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 28f95f5e2..72e29091d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -162,16 +162,16 @@ matrix: before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - brew update - - brew install gcc # for gfortran + - brew install gcc@8 # for gfortran script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - - BTYPE="BINARY=64 INTERFACE64=1" + - BTYPE="BINARY=64 INTERFACE64=1 FC=gfortran-8" - <<: *test-macos - osx_image: xcode9.3 + osx_image: xcode8.3 env: - - BTYPE="BINARY=32" + - BTYPE="BINARY=32 FC=gfortran-8" # whitelist branches: From 3a2df19db6b9bacd88974fbf87ef5b335fa2856f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Oct 2019 08:09:26 +0200 Subject: [PATCH 11/11] Fix accidental duplication of jump instruction --- kernel/arm64/nrm2.S | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/arm64/nrm2.S b/kernel/arm64/nrm2.S index d4f0374cb..0e5a8eed1 100644 --- a/kernel/arm64/nrm2.S +++ b/kernel/arm64/nrm2.S @@ -55,7 +55,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s4, [X], #4 fcmp s4, REGZERO beq 2f /* KERNEL_F1_NEXT_\@ */ - beq 2f fabs s4, s4 fcmp SCALE, s4 bge 1f /* KERNEL_F1_SCALE_GE_X_\@ */