From 73478664d4fb01f93d1810e85e1b7a499288b5bd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 6 Aug 2018 16:40:32 +0200 Subject: [PATCH 001/236] Add workaround for avx512 compilations on Cygwin fixes #1708 --- Makefile.x86_64 | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 677c05d93..f831b5040 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -12,6 +12,9 @@ ifeq ($(CORE), SKYLAKEX) ifndef NO_AVX512 CCOMMON_OPT += -march=skylake-avx512 FCOMMON_OPT += -march=skylake-avx512 +ifeq ($(OSNAME), CYGWIN_NT) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +endif endif endif From 7932ff3ea9666ab022c20354672b2597c756ee02 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 8 Aug 2018 02:59:11 +0000 Subject: [PATCH 002/236] Add an AVX512 enabled DDOT function written in C intrinsics for best readability. (the same C code works for Haswell as well) For logistical reasons the code falls back to the existing haswell AVX2 implementation if the GCC or LLVM compiler is not new enough --- kernel/x86_64/ddot.c | 4 +- kernel/x86_64/ddot_microk_skylakex-2.c | 96 ++++++++++++++++++++++++++ 2 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/ddot_microk_skylakex-2.c diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 0dc9cd3da..969357614 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ddot_microk_piledriver-2.c" #elif defined(NEHALEM) #include "ddot_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) #include "ddot_microk_haswell-2.c" +#elif defined (SKYLAKEX) +#include "ddot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "ddot_microk_sandy-2.c" #endif diff --git a/kernel/x86_64/ddot_microk_skylakex-2.c b/kernel/x86_64/ddot_microk_skylakex-2.c new file mode 100644 index 000000000..8eabf225a --- /dev/null +++ b/kernel/x86_64/ddot_microk_skylakex-2.c @@ -0,0 +1,96 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_KERNEL_8 1 + +#include + +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + int i = 0; + __m256d accum_0, accum_1, accum_2, accum_3; + + accum_0 = _mm256_setzero_pd(); + accum_1 = _mm256_setzero_pd(); + accum_2 = _mm256_setzero_pd(); + accum_3 = _mm256_setzero_pd(); + +#ifdef __AVX512CD__ + __m512d accum_05, accum_15, accum_25, accum_35; + int n32; + n32 = n & (~31); + + accum_05 = _mm512_setzero_pd(); + accum_15 = _mm512_setzero_pd(); + accum_25 = _mm512_setzero_pd(); + accum_35 = _mm512_setzero_pd(); + + for (; i < n32; i += 32) { + accum_05 += _mm512_loadu_pd(&x[i+ 0]) * _mm512_loadu_pd(&y[i+ 0]); + accum_15 += _mm512_loadu_pd(&x[i+ 8]) * _mm512_loadu_pd(&y[i+ 8]); + accum_25 += _mm512_loadu_pd(&x[i+16]) * _mm512_loadu_pd(&y[i+16]); + accum_35 += _mm512_loadu_pd(&x[i+24]) * _mm512_loadu_pd(&y[i+24]); + } + + /* + * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code + * below can continue using the intermediate results in its loop + */ + accum_0 = _mm512_extractf64x4_pd(accum_05, 0) + _mm512_extractf64x4_pd(accum_05, 1); + accum_1 = _mm512_extractf64x4_pd(accum_15, 0) + _mm512_extractf64x4_pd(accum_15, 1); + accum_2 = _mm512_extractf64x4_pd(accum_25, 0) + _mm512_extractf64x4_pd(accum_25, 1); + accum_3 = _mm512_extractf64x4_pd(accum_35, 0) + _mm512_extractf64x4_pd(accum_35, 1); + +#endif + for (; i < n; i += 16) { + accum_0 += _mm256_loadu_pd(&x[i+ 0]) * _mm256_loadu_pd(&y[i+ 0]); + accum_1 += _mm256_loadu_pd(&x[i+ 4]) * _mm256_loadu_pd(&y[i+ 4]); + accum_2 += _mm256_loadu_pd(&x[i+ 8]) * _mm256_loadu_pd(&y[i+ 8]); + accum_3 += _mm256_loadu_pd(&x[i+12]) * _mm256_loadu_pd(&y[i+12]); + } + + /* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */ + + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + + __m128d half_accum0; + + /* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */ + half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1)); + + /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */ + half_accum0 = _mm_hadd_pd(half_accum0, half_accum0); + + *dot = half_accum0[0]; +} + +#else +#include "ddot_microk_haswell-2.c" +#endif From 33043f563fb6849d4afee45cbcf85d03aa561a4e Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 10 Aug 2018 01:54:18 +0300 Subject: [PATCH 003/236] Disable scal to benchmark zgemv separately by default --- benchmark/gemv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/gemv.c b/benchmark/gemv.c index c06e829d9..b6a42f42f 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -122,7 +122,7 @@ int main(int argc, char *argv[]){ FLOAT 
*a, *x, *y; FLOAT alpha[] = {1.0, 1.0}; - FLOAT beta [] = {1.0, 1.0}; + FLOAT beta [] = {1.0, 0.0}; char trans='N'; blasint m, i, j; blasint inc_x=1,inc_y=1; From 00abaa865bea441f20bb29b35dfb0524f112b34e Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 10 Aug 2018 02:31:48 +0000 Subject: [PATCH 004/236] Add an AVX512 enabled SDOT function written in C intrinsics for best readability. (the same C code works for Haswell as well) For logistical reasons the code falls back to the existing haswell AVX2 implementation if the GCC or LLVM compiler is not new enough --- kernel/x86_64/sdot.c | 4 +- kernel/x86_64/sdot_microk_skylakex-2.c | 98 ++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sdot_microk_skylakex-2.c diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index c3ab2ffe6..3536afc9e 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -34,8 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sdot_microk_steamroller-2.c" #elif defined(NEHALEM) #include "sdot_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) #include "sdot_microk_haswell-2.c" +#elif defined (SKYLAKEX) +#include "sdot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "sdot_microk_sandy-2.c" #endif diff --git a/kernel/x86_64/sdot_microk_skylakex-2.c b/kernel/x86_64/sdot_microk_skylakex-2.c new file mode 100644 index 000000000..4740161f4 --- /dev/null +++ b/kernel/x86_64/sdot_microk_skylakex-2.c @@ -0,0 +1,98 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_KERNEL_16 1 + +#include + +static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + +{ + int i = 0; + __m256 accum_0, accum_1, accum_2, accum_3; + + accum_0 = _mm256_setzero_ps(); + accum_1 = _mm256_setzero_ps(); + accum_2 = _mm256_setzero_ps(); + accum_3 = _mm256_setzero_ps(); + +#ifdef __AVX512CD__ + __m512 accum_05, accum_15, accum_25, accum_35; + int n64; + n64 = n & (~63); + + accum_05 = _mm512_setzero_ps(); + accum_15 = _mm512_setzero_ps(); + accum_25 = _mm512_setzero_ps(); + accum_35 = _mm512_setzero_ps(); + + for (; i < n64; i += 64) { + accum_05 += _mm512_loadu_ps(&x[i+ 0]) * _mm512_loadu_ps(&y[i+ 0]); + accum_15 += _mm512_loadu_ps(&x[i+16]) * _mm512_loadu_ps(&y[i+16]); + accum_25 += _mm512_loadu_ps(&x[i+32]) * _mm512_loadu_ps(&y[i+32]); + accum_35 += _mm512_loadu_ps(&x[i+48]) * _mm512_loadu_ps(&y[i+48]); + } + + /* + * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code + * below can continue using the intermediate results in its loop + */ + accum_0 = _mm512_extractf32x8_ps(accum_05, 0) + _mm512_extractf32x8_ps(accum_05, 1); + accum_1 = _mm512_extractf32x8_ps(accum_15, 0) + _mm512_extractf32x8_ps(accum_15, 1); + accum_2 = _mm512_extractf32x8_ps(accum_25, 0) + _mm512_extractf32x8_ps(accum_25, 1); + accum_3 = _mm512_extractf32x8_ps(accum_35, 0) + _mm512_extractf32x8_ps(accum_35, 1)) + +#endif + for (; i < n; i += 32) { + accum_0 += _mm256_loadu_ps(&x[i+ 0]) * _mm256_loadu_ps(&y[i+ 0]); + accum_1 += _mm256_loadu_ps(&x[i+ 8]) * _mm256_loadu_ps(&y[i+ 8]); + accum_2 += _mm256_loadu_ps(&x[i+16]) * _mm256_loadu_ps(&y[i+16]); + accum_3 += _mm256_loadu_ps(&x[i+24]) * _mm256_loadu_ps(&y[i+24]); + } + + /* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */ + + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + + __m128 half_accum0; + + /* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */ + half_accum0 = _mm256_extractf128_ps(accum_0, 0) + _mm256_extractf128_ps(accum_0, 1); + + /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */ + half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); + half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); + + *dot = half_accum0[0]; +} + +#else +#include "sdot_microk_haswell-2.c" +#endif From 2e99873ff7112b6b35d35cf87eb34762f3f3d38b Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 10 Aug 2018 02:58:32 +0000 Subject: [PATCH 005/236] Add a AVX512 enabled SAXPY/DAXPY functions written in C intrinsics for best readability. 
(the same C code works for Haswell as well) For logistical reasons the code falls back to the existing haswell AVX2 implementation if the GCC or LLVM compiler is not new enough --- kernel/x86_64/daxpy.c | 4 +- kernel/x86_64/daxpy_microk_skylakex-2.c | 71 +++++++++++++++++++++++++ kernel/x86_64/saxpy.c | 4 +- kernel/x86_64/saxpy_microk_skylakex-2.c | 69 ++++++++++++++++++++++++ 4 files changed, 146 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/daxpy_microk_skylakex-2.c create mode 100644 kernel/x86_64/saxpy_microk_skylakex-2.c diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index b4acdccd2..cde5bdaa6 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "daxpy_microk_steamroller-2.c" #elif defined(PILEDRIVER) #include "daxpy_microk_piledriver-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) #include "daxpy_microk_haswell-2.c" +#elif defined (SKYLAKEX) +#include "daxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "daxpy_microk_sandy-2.c" #endif diff --git a/kernel/x86_64/daxpy_microk_skylakex-2.c b/kernel/x86_64/daxpy_microk_skylakex-2.c new file mode 100644 index 000000000..e785a39f1 --- /dev/null +++ b/kernel/x86_64/daxpy_microk_skylakex-2.c @@ -0,0 +1,71 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#include + +#define HAVE_KERNEL_8 1 + +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i = 0; + + __m256d __alpha; + + __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); + +#ifdef __AVX512CD__ + BLASLONG n32; + __m512d __alpha5; + __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha)); + + n32 = n & ~31; + + for (; i < n32; i+= 32) { + _mm512_storeu_pd(&y[i + 0], _mm512_loadu_pd(&y[i + 0]) + __alpha5 * _mm512_loadu_pd(&x[i + 0])); + _mm512_storeu_pd(&y[i + 8], _mm512_loadu_pd(&y[i + 8]) + __alpha5 * _mm512_loadu_pd(&x[i + 8])); + _mm512_storeu_pd(&y[i + 16], _mm512_loadu_pd(&y[i + 16]) + __alpha5 * _mm512_loadu_pd(&x[i + 16])); + _mm512_storeu_pd(&y[i + 24], _mm512_loadu_pd(&y[i + 24]) + __alpha5 * _mm512_loadu_pd(&x[i + 24])); + } + +#endif + + for (; i < n; i+= 16) { + _mm256_storeu_pd(&y[i + 0], _mm256_loadu_pd(&y[i + 0]) + __alpha * _mm256_loadu_pd(&x[i + 0])); + _mm256_storeu_pd(&y[i + 4], _mm256_loadu_pd(&y[i + 4]) + __alpha * _mm256_loadu_pd(&x[i + 4])); + _mm256_storeu_pd(&y[i + 8], _mm256_loadu_pd(&y[i + 8]) + __alpha * _mm256_loadu_pd(&x[i + 8])); + _mm256_storeu_pd(&y[i + 12], _mm256_loadu_pd(&y[i + 12]) + __alpha * _mm256_loadu_pd(&x[i + 12])); + } +} +#else +#include "daxpy_microk_haswell-2.c" +#endif + + diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index d89c4070d..e1349da58 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "saxpy_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) #include "saxpy_microk_haswell-2.c" +#elif defined (SKYLAKEX) +#include "saxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "saxpy_microk_sandy-2.c" #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) diff --git a/kernel/x86_64/saxpy_microk_skylakex-2.c b/kernel/x86_64/saxpy_microk_skylakex-2.c new file mode 100644 index 000000000..950f10ba2 --- /dev/null +++ b/kernel/x86_64/saxpy_microk_skylakex-2.c @@ -0,0 +1,69 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_KERNEL_16 1 + +#include + +static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i = 0; + + __m256 __alpha; + + __alpha = _mm256_broadcastss_ps(_mm_load_ss(alpha)); + +#ifdef __AVX512CD__ + BLASLONG n64; + __m512 __alpha5; + __alpha5 = _mm512_broadcastss_ps(_mm_load_ss(alpha)); + + n64 = n & ~63; + + for (; i < n64; i+= 64) { + _mm512_storeu_ps(&y[i + 0], _mm512_loadu_ps(&y[i + 0]) + __alpha5 * _mm512_loadu_ps(&x[i + 0])); + _mm512_storeu_ps(&y[i + 16], _mm512_loadu_ps(&y[i + 16]) + __alpha5 * _mm512_loadu_ps(&x[i + 16])); + _mm512_storeu_ps(&y[i + 32], _mm512_loadu_ps(&y[i + 32]) + __alpha5 * _mm512_loadu_ps(&x[i + 32])); + _mm512_storeu_ps(&y[i + 48], _mm512_loadu_ps(&y[i + 48]) + __alpha5 * _mm512_loadu_ps(&x[i + 48])); + } + +#endif + + for (; i < n; i+= 32) { + _mm256_storeu_ps(&y[i + 0], _mm256_loadu_ps(&y[i + 0]) + __alpha * _mm256_loadu_ps(&x[i + 0])); + _mm256_storeu_ps(&y[i + 8], _mm256_loadu_ps(&y[i + 8]) + __alpha * _mm256_loadu_ps(&x[i + 8])); + _mm256_storeu_ps(&y[i + 16], _mm256_loadu_ps(&y[i + 16]) + __alpha * _mm256_loadu_ps(&x[i + 16])); + _mm256_storeu_ps(&y[i + 24], _mm256_loadu_ps(&y[i + 24]) + __alpha * _mm256_loadu_ps(&x[i + 24])); + } +} +#else +#include "saxpy_microk_haswell-2.c" +#endif + From c52a831ae446a4ea9ead4948a2d1ab38034677b5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 10 Aug 2018 13:23:47 +0200 Subject: [PATCH 006/236] Add changes from the 0.3.x releases fixes #1727 --- Changelog.txt | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index cb6fee70a..33dcacc51 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,115 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.2 +30-Jul-2018 + +common: + * fixes for regressions caused by the rewrite of the thread + initialization code in 0.3.1 + +POWER: + * fixed cpu autodetection for the BSDs + +MIPS64: + * fixed utest errors in AXPY, DSDOT, ROT and SWAP + +x86_64: + * added autodetection of AMD Ryzen 2 + * fixed build with older versions of MSVC + +==================================================================== +Version 0.3.1 +01-Jul-2018 + +common: + * rewritten thread initialization code with significantly reduced overhead + * added CBLAS interfaces to the IxAMIN BLAS extension functions + * fixed the lapack-test target + * CMAKE builds now create an OpenBLASConfig.cmake file + * ZAXPY now uses a single thread for small input sizes + * the LAPACK code was updated from Reference-LAPACK/lapack#253 + (fixing LAPACKE interfaces to Aasen's functions) + +POWER: + * corrected CROT and ZROT behaviour with zero INC_X + +ARMV7: + * corrected xDOT behaviour with zero 
INC_X or INC_Y + +x86_64: + * retired some older targets of DYNAMIC_ARCH builds to a new option DYNAMIC_OLDER, + this affects PENRYN,DUNNINGTON,OPTERON,OPTERON_SSE3,BOBCAT,ATOM and NANO + (which will still be supported via the slower PRESCOTT kernels when this option is not set) + * added an option DYNAMIC_LIST that (used in conjunction with DYNAMIC_ARCH) allows to + specify the list of x86_64 targets to include. Any target not on the list will be supported + by the Sandybridge or Nehalem kernels if available, or by Prescott. + * improved SWITCH_RATIO on Haswell for increased GEMM throughput + * added initial support for Intel Skylake X, including an AVX512 SGEMM kernel + * added autodetection of Intel Cannon Lake series as Skylake X + * added a default L2 cache size for hypervisors that return zero here (Chromebook) + * fixed a name clash with recent Windows10 headers that broke the build with (at least) + recent mingw from MSYS2 + * fixed a link error in mixed clang/gfortran builds with OpenMP + * updated the OSX deployment target to 10.8 + * switched on parallel make for builds on MS Windows by default + +x86: + * fixed SSWAP and DSWAP behaviour with zero INC_X and INC_Y + +==================================================================== +Version 0.3.0 +23-May-2108 + +common: + * fixed some more thread race and locking bugs + * added preliminary support for calling an OpenMP build of the library from multiple threads + * removed performance impact of thread locks added in 0.2.20 on OpenMP code + * general code cleanup + * optimized DSDOT implementation + * improved thread distribution for GEMM + * corrected IMATCOPY/OMATCOPY implementation + * fixed out-of-bounds accesses in the multithreaded xBMV/xPMV and SYMV implementations + * cmake build improvements + * pkgconfig file now contains build options + * openblas_get_config() now reports USE_OPENMP and NUM_THREADS settings used for the build + * corrections and improvements for systems with more than 64 cpus + * LAPACK code updated to 3.8.0 including later fixes + * added ReLAPACK, a recursive implementation of several LAPACK functions + * Rewrote ROTMG to handle cases that the netlib code failed to address + * Disabled (broken) multithreading code for xTRMV + * corrected prototypes of complex CBLAS functions to make our cblas.h match the generally accepted standard + * shared memory access failures on startup are now handled more gracefully + * restored utests from earlier releases (and made them pass on all affected systems) + +SPARC: + * several fixes for cpu autodetection + +POWER: + * corrected vector register overwriting in several Power8 kernels + * optimized additional BLAS functions + +ARM: + * added support for CortexA53 and A72 + * added autodetection for ThunderX2T99 + * made most optimized kernels the default for generic ARMv8 targets + +x86_64: + * parallelized DDOT kernel for Haswell + * changed alignment directives in assembly kernels to boost performance on OSX + * fixed register handling in the GEMV microkernels (bug exposed by gcc7) + * added support for building on OpenBSD and Dragonfly + * updated compiler options to work with Intel release 2018 + * support fully optimized build with clang/flang on Microsoft Windows + * fixed building on AIX + +IBM Z: + * added optimized BLAS 1/2 functions + +MIPS: + * fixed cpu autodetection helper code + * added mips32 1004K cpu (Mediatek MT7621 and similar SoC) + * added mips64 I6500 cpu + ==================================================================== Version 0.2.20 
24-Jul-2017 From cacacc8007eaf8c01ca32f289980ee8b91016b8f Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 11 Aug 2018 17:14:57 +0000 Subject: [PATCH 007/236] Add an AVX512 enabled DSCAL function written in C intrinsics for best readability. (the same C code works for Haswell as well) For logistical reasons the code falls back to the existing haswell AVX2 implementation if the GCC or LLVM compiler is not new enough --- kernel/x86_64/dscal.c | 4 +- kernel/x86_64/dscal_microk_skylakex-2.c | 77 +++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/dscal_microk_skylakex-2.c diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index 2c7b3b17c..ef9a0a6ba 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dscal_microk_bulldozer-2.c" #elif defined(SANDYBRIDGE) #include "dscal_microk_sandy-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) #include "dscal_microk_haswell-2.c" +#elif defined (SKYLAKEX) +#include "dscal_microk_skylakex-2.c" #endif diff --git a/kernel/x86_64/dscal_microk_skylakex-2.c b/kernel/x86_64/dscal_microk_skylakex-2.c new file mode 100644 index 000000000..e0598272e --- /dev/null +++ b/kernel/x86_64/dscal_microk_skylakex-2.c @@ -0,0 +1,77 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#include + +#define HAVE_KERNEL_8 1 + +static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + int i = 0; + +#ifdef __AVX512CD__ + __m512d __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha)); + for (; i < n; i += 8) { + _mm512_storeu_pd(&x[i + 0], __alpha5 * _mm512_loadu_pd(&x[i + 0])); + } +#else + __m256d __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); + for (; i < n; i += 8) { + _mm256_storeu_pd(&x[i + 0], __alpha * _mm256_loadu_pd(&x[i + 0])); + _mm256_storeu_pd(&x[i + 4], __alpha * _mm256_loadu_pd(&x[i + 4])); + } +#endif +} + + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + int i = 0; + + /* question to self: Why is this not just memset() */ + +#ifdef __AVX512CD__ + __m512d zero = _mm512_setzero_pd(); + for (; i < n; i += 8) { + _mm512_storeu_pd(&x[i], zero); + } +#else + __m256d zero = _mm256_setzero_pd(); + for (; i < n; i += 8) { + _mm256_storeu_pd(&x[i + 0], zero); + _mm256_storeu_pd(&x[i + 4], zero); + } +#endif + +} + +#else +#include "dscal_microk_haswell-2.c" +#endif From 36add7570a17c859ed51cb8e016286ce40c09293 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 11 Aug 2018 17:16:45 +0000 Subject: [PATCH 008/236] Fix typo in sdot function it looks like my previous pull request was short the final commit; fix a typo in sdot --- kernel/x86_64/sdot_microk_skylakex-2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/sdot_microk_skylakex-2.c b/kernel/x86_64/sdot_microk_skylakex-2.c index 4740161f4..1fcb7f27c 100644 --- a/kernel/x86_64/sdot_microk_skylakex-2.c +++ b/kernel/x86_64/sdot_microk_skylakex-2.c @@ -67,7 +67,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) accum_0 = _mm512_extractf32x8_ps(accum_05, 0) + _mm512_extractf32x8_ps(accum_05, 1); accum_1 = _mm512_extractf32x8_ps(accum_15, 0) + _mm512_extractf32x8_ps(accum_15, 1); accum_2 = _mm512_extractf32x8_ps(accum_25, 0) + _mm512_extractf32x8_ps(accum_25, 1); - accum_3 = _mm512_extractf32x8_ps(accum_35, 0) + _mm512_extractf32x8_ps(accum_35, 1)) + accum_3 = _mm512_extractf32x8_ps(accum_35, 0) + _mm512_extractf32x8_ps(accum_35, 1); #endif for (; i < n; i += 32) { From 9493f263092d059fcf28f17e621f7396f776db80 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 11 Aug 2018 17:21:46 +0000 Subject: [PATCH 009/236] add short blurb about avx512 and needed compiler to README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 02d087334..9ed9be337 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,7 @@ Please read `GotoBLAS_01Readme.txt`. - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. +- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64. - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. @@ -200,6 +201,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. 
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. Clang 3.0 will generate the wrong AVX binary code. +* Please use GCC version 6 or LLVM version 6 and above to compile Skyalke AVX512 kernels. * The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`), there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build the library with `BIGNUMA=1`. From 87bebdbd8aacf30741222b722d5f7bde1e51c739 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 11 Aug 2018 17:38:12 +0000 Subject: [PATCH 010/236] Add an AVX512 enabled DGEMV (n) function written in C intrinsics for best readability. (the same C code works for Haswell as well) For logistical reasons the code falls back to the existing haswell AVX2 implementation if the GCC or LLVM compiler is not new enough --- kernel/x86_64/dgemv_n_4.c | 4 +- kernel/x86_64/dgemv_n_microk_skylakex-4.c | 126 ++++++++++++++++++++++ 2 files changed, 129 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/dgemv_n_microk_skylakex-4.c diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 309fbe767..6d2530e81 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "dgemv_n_microk_nehalem-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dgemv_n_microk_haswell-4.c" +#elif defined (SKYLAKEX) +#include "dgemv_n_microk_skylakex-4.c" #endif diff --git a/kernel/x86_64/dgemv_n_microk_skylakex-4.c b/kernel/x86_64/dgemv_n_microk_skylakex-4.c new file mode 100644 index 000000000..4030399ab --- /dev/null +++ b/kernel/x86_64/dgemv_n_microk_skylakex-4.c @@ -0,0 +1,126 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_KERNEL_4x4 1 + +#include + +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + int i = 0; + + __m256d x0, x1, x2, x3; + __m256d __alpha; + + x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0])); + x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1])); + x2 = _mm256_broadcastsd_pd(_mm_load_sd(&x[2])); + x3 = _mm256_broadcastsd_pd(_mm_load_sd(&x[3])); + + __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); + +#ifdef __AVX512CD__ + int n5; + __m512d x05, x15, x25, x35; + __m512d __alpha5; + n5 = n & ~7; + + x05 = _mm512_broadcastsd_pd(_mm_load_sd(&x[0])); + x15 = _mm512_broadcastsd_pd(_mm_load_sd(&x[1])); + x25 = _mm512_broadcastsd_pd(_mm_load_sd(&x[2])); + x35 = _mm512_broadcastsd_pd(_mm_load_sd(&x[3])); + + __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha)); + + for (; i < n5; i+= 8) { + __m512d tempY; + __m512d sum; + + sum = _mm512_loadu_pd(&ap[0][i]) * x05 + + _mm512_loadu_pd(&ap[1][i]) * x15 + + _mm512_loadu_pd(&ap[2][i]) * x25 + + _mm512_loadu_pd(&ap[3][i]) * x35; + + tempY = _mm512_loadu_pd(&y[i]); + tempY += sum * __alpha5; + _mm512_storeu_pd(&y[i], tempY); + } +#endif + + for (; i < n; i+= 4) { + __m256d tempY; + __m256d sum; + + sum = _mm256_loadu_pd(&ap[0][i]) * x0 + + _mm256_loadu_pd(&ap[1][i]) * x1 + + _mm256_loadu_pd(&ap[2][i]) * x2 + + _mm256_loadu_pd(&ap[3][i]) * x3; + + tempY = _mm256_loadu_pd(&y[i]); + tempY += sum * __alpha; + _mm256_storeu_pd(&y[i], tempY); + } + +} + + +#define HAVE_KERNEL_4x2 + +static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + int i = 0; + + __m256d x0, x1; + __m256d __alpha; + + x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0])); + x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1])); + + __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); + + + for (i = 0; i < n; i+= 4) { + __m256d tempY; + __m256d sum; + + sum = _mm256_loadu_pd(&ap[0][i]) * x0 + _mm256_loadu_pd(&ap[1][i]) * x1; + + tempY = _mm256_loadu_pd(&y[i]); + tempY += sum * __alpha; + _mm256_storeu_pd(&y[i], tempY); + } + +} + +#else +#include "dgemv_n_microk_haswell-4.c" +#endif From 9bec34cb672843a872bf5338518c73bf32414239 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 11 Aug 2018 17:46:24 +0000 Subject: [PATCH 011/236] Add an AVX512 enabled DSYMV (L) function written in C intrinsics for best readability. (the same C code works for Haswell as well) For logistical reasons the code falls back to the existing haswell AVX2 implementation if the GCC or LLVM compiler is not new enough --- kernel/x86_64/dsymv_L.c | 4 +- kernel/x86_64/dsymv_L_microk_skylakex-2.c | 161 ++++++++++++++++++++++ 2 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/dsymv_L_microk_skylakex-2.c diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index 73099462c..a722cc9df 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -30,8 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_L_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) #include "dsymv_L_microk_haswell-2.c" +#elif defined (SKYLAKEX) +#include "dsymv_L_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_L_microk_sandy-2.c" #elif defined(NEHALEM) diff --git a/kernel/x86_64/dsymv_L_microk_skylakex-2.c b/kernel/x86_64/dsymv_L_microk_skylakex-2.c new file mode 100644 index 000000000..8244dffa1 --- /dev/null +++ b/kernel/x86_64/dsymv_L_microk_skylakex-2.c @@ -0,0 +1,161 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#include + +#define HAVE_KERNEL_4x4 1 + +static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + + + __m256d accum_0, accum_1, accum_2, accum_3; + __m256d temp1_0, temp1_1, temp1_2, temp1_3; + + /* the 256 bit wide acculmulator vectors start out as zero */ + accum_0 = _mm256_setzero_pd(); + accum_1 = _mm256_setzero_pd(); + accum_2 = _mm256_setzero_pd(); + accum_3 = _mm256_setzero_pd(); + + temp1_0 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[0])); + temp1_1 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[1])); + temp1_2 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[2])); + temp1_3 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[3])); + +#ifdef __AVX512CD__ + __m512d accum_05, accum_15, accum_25, accum_35; + __m512d temp1_05, temp1_15, temp1_25, temp1_35; + BLASLONG to2; + int delta; + + /* the 512 bit wide accumulator vectors start out as zero */ + accum_05 = _mm512_setzero_pd(); + accum_15 = _mm512_setzero_pd(); + accum_25 = _mm512_setzero_pd(); + accum_35 = _mm512_setzero_pd(); + + temp1_05 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[0])); + temp1_15 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[1])); + temp1_25 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[2])); + temp1_35 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[3])); + + delta = (to - from) & ~7; + to2 = from + delta; + + + for (; from < to2; from += 8) { + __m512d _x, _y; + __m512d a0, a1, a2, a3; + + _y = _mm512_loadu_pd(&y[from]); + _x = _mm512_loadu_pd(&x[from]); + + a0 = _mm512_loadu_pd(&a[0][from]); + a1 = _mm512_loadu_pd(&a[1][from]); + a2 = _mm512_loadu_pd(&a[2][from]); + a3 = _mm512_loadu_pd(&a[3][from]); + + _y += temp1_05 * a0 + temp1_15 * a1 + temp1_25 * a2 + temp1_35 * a3; + + accum_05 += _x * a0; + accum_15 += _x * a1; + accum_25 += _x * a2; + accum_35 += _x * a3; + + _mm512_storeu_pd(&y[from], _y); + + }; + + /* + * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code + * below can continue using the intermediate results in its loop + */ + accum_0 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_05, 0), _mm512_extractf64x4_pd(accum_05, 1)); + accum_1 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_15, 0), _mm512_extractf64x4_pd(accum_15, 1)); + accum_2 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_25, 0), _mm512_extractf64x4_pd(accum_25, 1)); + accum_3 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_35, 0), _mm512_extractf64x4_pd(accum_35, 1)); + +#endif + + for (; from != to; from += 4) { + __m256d _x, _y; + __m256d a0, a1, a2, a3; + + _y = _mm256_loadu_pd(&y[from]); + _x = _mm256_loadu_pd(&x[from]); + + /* load 4 rows of matrix data */ + a0 = _mm256_loadu_pd(&a[0][from]); + a1 = _mm256_loadu_pd(&a[1][from]); + a2 = _mm256_loadu_pd(&a[2][from]); + a3 = _mm256_loadu_pd(&a[3][from]); + + _y += temp1_0 * a0 + temp1_1 * a1 + temp1_2 * a2 + temp1_3 * a3; + + accum_0 += _x * a0; + accum_1 += _x * a1; + accum_2 += _x * a2; + accum_3 += _x * a3; + + _mm256_storeu_pd(&y[from], _y); + + }; + + /* + * we now have 4 accumulator vectors. Each vector needs to be summed up element wise and stored in the temp2 + * output array. There is no direct instruction for this in 256 bit space, only in 128 space. 
+ */ + + __m128d half_accum0, half_accum1, half_accum2, half_accum3; + + + /* Add upper half to lower half of each of the four 256 bit vectors to get to four 128 bit vectors */ + half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1)); + half_accum1 = _mm_add_pd(_mm256_extractf128_pd(accum_1, 0), _mm256_extractf128_pd(accum_1, 1)); + half_accum2 = _mm_add_pd(_mm256_extractf128_pd(accum_2, 0), _mm256_extractf128_pd(accum_2, 1)); + half_accum3 = _mm_add_pd(_mm256_extractf128_pd(accum_3, 0), _mm256_extractf128_pd(accum_3, 1)); + + /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */ + half_accum0 = _mm_hadd_pd(half_accum0, half_accum0); + half_accum1 = _mm_hadd_pd(half_accum1, half_accum1); + half_accum2 = _mm_hadd_pd(half_accum2, half_accum2); + half_accum3 = _mm_hadd_pd(half_accum3, half_accum3); + + /* and store the lowest double value from each of these vectors in the temp2 output */ + temp2[0] += half_accum0[0]; + temp2[1] += half_accum1[0]; + temp2[2] += half_accum2[0]; + temp2[3] += half_accum3[0]; +} +#else +#include "dsymv_L_microk_haswell-2.c" +#endif \ No newline at end of file From 5c6e020f4951ee572a0c875c23d75b6e8a3b3567 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dan=20Hor=C3=A1k?= Date: Tue, 14 Aug 2018 12:30:38 +0200 Subject: [PATCH 012/236] detect z14 arch on s390x --- cpuid_zarch.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 4e1935429..e0d9221f3 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -29,15 +29,18 @@ #define CPU_GENERIC 0 #define CPU_Z13 1 +#define CPU_Z14 2 static char *cpuname[] = { "ZARCH_GENERIC", - "Z13" + "Z13", + "Z14" }; static char *cpuname_lower[] = { "zarch_generic", - "z13" + "z13", + "z14" }; int detect(void) @@ -62,6 +65,10 @@ int detect(void) if (strstr(p, "2964")) return CPU_Z13; if (strstr(p, "2965")) return CPU_Z13; + /* detect z14, but fall back to z13 */ + if (strstr(p, "3906")) return CPU_Z13; + if (strstr(p, "3907")) return CPU_Z13; + return CPU_GENERIC; } @@ -107,5 +114,9 @@ void get_cpuconfig(void) printf("#define Z13\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); break; + case CPU_Z14: + printf("#define Z14\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + break; } } From fd42ca462d2df0eece73b26865fa55f7bfa07e53 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Aug 2018 19:35:16 +0200 Subject: [PATCH 013/236] Combo of default pre-0.3.1 memory.c and band-aided version of PR1739 --- driver/others/memory.c | 1723 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 1605 insertions(+), 118 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 98bcfb216..6bca1e11f 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -72,6 +72,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //#undef DEBUG #include "common.h" + +#if defined(USE_TLS) && ( !defined(__GLIBC_PREREQ) || __GLIBC_PREREQ(2,20)) +#warning "using tls version of memory.c" #include #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) @@ -108,6 +111,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif +#ifdef OS_HAIKU +#include +#endif + #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) #include #include @@ -139,14 +146,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FIXED_PAGESIZE 4096 #endif -#ifndef BUFFERS_PER_THREAD -#ifdef USE_OPENMP -#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) -#else -#define BUFFERS_PER_THREAD NUM_BUFFERS -#endif -#endif - #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) #if defined(_MSC_VER) && !defined(__clang__) @@ -238,6 +237,14 @@ int get_num_procs(void) { } #endif +#ifdef OS_HAIKU +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif + #ifdef OS_WINDOWS int get_num_procs(void) { @@ -363,7 +370,7 @@ int blas_get_cpu_number(void){ #endif // blas_goto_num = 0; -#ifndef USE_OPENMP +#ifndef USE_OPENMP_UNUSED blas_goto_num=openblas_num_threads_env(); if (blas_goto_num < 0) blas_goto_num = 0; @@ -420,10 +427,8 @@ int openblas_get_num_threads(void) { int hugetlb_allocated = 0; #if defined(OS_WINDOWS) -#define THREAD_LOCAL __declspec(thread) #define LIKELY_ONE(x) (x) #else -#define THREAD_LOCAL __thread #define LIKELY_ONE(x) (__builtin_expect(x, 1)) #endif @@ -459,62 +464,15 @@ struct alloc_t { for an auxiliary tracking structure. */ static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t); -/* Clang supports TLS from version 2.8 */ -#if defined(__clang__) && __clang_major__ > 2 || \ - (__clang_minor__ == 2 || __clang_minor__ == 8) -#define HAS_COMPILER_TLS -#endif - -/* GCC supports TLS from version 4.1 */ -#if !defined(__clang__) && defined(__GNUC__) && \ - (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) -#define HAS_COMPILER_TLS -#endif - -/* MSVC supports TLS from version 2005 */ -#if defined(_MSC_VER) && _MSC_VER >= 1400 -#define HAS_COMPILER_TLS -#endif - -/* Versions of XCode before 8 did not properly support TLS */ -#if defined(__apple_build_version__) && __apple_build_version__ < 8000042 -#undef HAS_COMPILER_TLS -#endif - -/* Android NDK's before version 12b did not support TLS */ -#if defined(__ANDROID__) && defined(__clang__) -#if __has_include() -#include -#endif -#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \ - defined(__NDK_MINOR__) && \ - ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1))) -#undef HAS_COMPILER_TLS -#endif -#endif - -/* Holds pointers to allocated memory */ -#if defined(SMP) && !defined(USE_OPENMP) -/* This is the number of threads than can be spawned by the server, which is the - server plus the number of threads in the thread pool */ -# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +1 -static int next_memory_table_pos = 0; -# if defined(HAS_COMPILER_TLS) -/* Use compiler generated thread-local-storage */ -static int THREAD_LOCAL local_memory_table_pos = 0; +#if defined(SMP) +# if defined(OS_WINDOWS) +static DWORD local_storage_key = 0; +DWORD lsk; # else -/* Use system-dependent thread-local-storage */ -# if defined(OS_WINDOWS) -static DWORD local_storage_key; -# else -static pthread_key_t local_storage_key; -# endif /* defined(OS_WINDOWS) */ -# endif /* defined(HAS_COMPILER_TLS) */ -#else -/* There is only one allocating thread when in single-threaded mode and when using OpenMP */ -# define MAX_ALLOCATING_THREADS 1 -#endif /* defined(SMP) && !defined(USE_OPENMP) */ -static struct alloc_t * local_memory_table[MAX_ALLOCATING_THREADS][BUFFERS_PER_THREAD]; +static pthread_key_t local_storage_key = 0; +pthread_key_t lsk; +# endif /* defined(OS_WINDOWS) */ +#endif /* defined(SMP) */ #if defined(OS_LINUX) && !defined(NO_WARMUP) static int hot_alloc = 0; @@ -530,34 +488,54 @@ static 
pthread_spinlock_t alloc_lock = 0; static BLASULONG alloc_lock = 0UL; #endif +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t key_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t key_lock = 0; +#else +static BLASULONG key_lock = 0UL; +#endif + /* Returns a pointer to the start of the per-thread memory allocation data */ static __inline struct alloc_t ** get_memory_table() { -#if defined(SMP) && !defined(USE_OPENMP) -# if !defined(HAS_COMPILER_TLS) -# if defined(OS_WINDOWS) - int local_memory_table_pos = (int)::TlsGetValue(local_storage_key); -# else - int local_memory_table_pos = (int)pthread_getspecific(local_storage_key); -# endif /* defined(OS_WINDOWS) */ -# endif /* !defined(HAS_COMPILER_TLS) */ - if (!local_memory_table_pos) { - LOCK_COMMAND(&alloc_lock); - local_memory_table_pos = next_memory_table_pos++; - if (next_memory_table_pos > MAX_ALLOCATING_THREADS) - printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n"); - UNLOCK_COMMAND(&alloc_lock); -# if !defined(HAS_COMPILER_TLS) -# if defined(OS_WINDOWS) - ::TlsSetValue(local_storage_key, (void*)local_memory_table_pos); -# else - pthread_setspecific(local_storage_key, (void*)local_memory_table_pos); -# endif /* defined(OS_WINDOWS) */ -# endif /* !defined(HAS_COMPILER_TLS) */ +#if defined(SMP) +LOCK_COMMAND(&key_lock); +lsk=local_storage_key; +UNLOCK_COMMAND(&key_lock); + if (!lsk) { + blas_memory_init(); } - return local_memory_table[local_memory_table_pos]; +# if defined(OS_WINDOWS) + struct alloc_t ** local_memory_table = (struct alloc_t **)TlsGetValue(local_storage_key); +# else + struct alloc_t ** local_memory_table = (struct alloc_t **)pthread_getspecific(local_storage_key); +# endif /* defined(OS_WINDOWS) */ #else - return local_memory_table[0]; -#endif /* defined(SMP) && !defined(USE_OPENMP) */ + static struct alloc_t ** local_memory_table = NULL; +#endif /* defined(SMP) */ +#if defined (SMP) +LOCK_COMMAND(&key_lock); +lsk=local_storage_key; +UNLOCK_COMMAND(&key_lock); + if (lsk && !local_memory_table) { +#else + if (!local_memory_table) { +#endif /* defined(SMP) */ + local_memory_table = (struct alloc_t **)malloc(sizeof(struct alloc_t *) * NUM_BUFFERS); + memset(local_memory_table, 0, sizeof(struct alloc_t *) * NUM_BUFFERS); +#if defined(SMP) +# if defined(OS_WINDOWS) +LOCK_COMMAND(&key_lock); + TlsSetValue(local_storage_key, (void*)local_memory_table); +UNLOCK_COMMAND(&key_lock); +# else +LOCK_COMMAND(&key_lock); + pthread_setspecific(local_storage_key, (void*)local_memory_table); +UNLOCK_COMMAND(&key_lock); +# endif /* defined(OS_WINDOWS) */ +#endif /* defined(SMP) */ + } + return local_memory_table; } #ifdef ALLOC_MMAP @@ -637,7 +615,7 @@ static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) { static void *alloc_mmap(void *address){ void *map_address, *best_address; - BLASULONG best, start, current; + BLASULONG best, start, current, original; BLASULONG allocsize; if (address){ @@ -685,8 +663,9 @@ static void *alloc_mmap(void *address){ start = (BLASULONG)map_address; current = (SCALING - 1) * allocation_block_size; + original = current; - while(current > 0) { + while(current > 0 && current <= original) { *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; start += PAGESIZE; current -= PAGESIZE; @@ -1056,18 +1035,29 @@ static volatile int memory_initialized = 0; /* 1 : Level 2 functions */ /* 2 : Thread */ + static void blas_memory_cleanup(void* ptr){ + if (ptr) { + struct alloc_t ** table = (struct alloc_t **)ptr; + int 
pos; + for (pos = 0; pos < NUM_BUFFERS; pos ++){ + struct alloc_t *alloc_info = table[pos]; + if (alloc_info) { + alloc_info->release_func(alloc_info); + table[pos] = (void *)0; + } + } + free(table); + } +} + static void blas_memory_init(){ -#if defined(SMP) && !defined(USE_OPENMP) - next_memory_table_pos = 0; -# if !defined(HAS_COMPILER_TLS) -# if defined(OS_WINDOWS) - local_storage_key = ::TlsAlloc(); -# else - pthread_key_create(&local_storage_key, NULL); -# endif /* defined(OS_WINDOWS) */ -# endif /* defined(HAS_COMPILER_TLS) */ -#endif /* defined(SMP) && !defined(USE_OPENMP) */ - memset(local_memory_table, 0, sizeof(local_memory_table)); +#if defined(SMP) +# if defined(OS_WINDOWS) + local_storage_key = TlsAlloc(); +# else + pthread_key_create(&local_storage_key, blas_memory_cleanup); +# endif /* defined(OS_WINDOWS) */ +#endif /* defined(SMP) */ } void *blas_memory_alloc(int procpos){ @@ -1105,7 +1095,16 @@ void *blas_memory_alloc(int procpos){ struct alloc_t * alloc_info; struct alloc_t ** alloc_table; + +#if defined(SMP) && !defined(USE_OPENMP) +int mi; +LOCK_COMMAND(&alloc_lock); +mi=memory_initialized; +UNLOCK_COMMAND(&alloc_lock); + if (!LIKELY_ONE(mi)) { +#else if (!LIKELY_ONE(memory_initialized)) { +#endif #if defined(SMP) && !defined(USE_OPENMP) /* Only allow a single thread to initialize memory system */ LOCK_COMMAND(&alloc_lock); @@ -1149,7 +1148,7 @@ void *blas_memory_alloc(int procpos){ if (!alloc_table[position] || !alloc_table[position]->used) goto allocation; position ++; - } while (position < BUFFERS_PER_THREAD); + } while (position < NUM_BUFFERS); goto error; @@ -1247,7 +1246,7 @@ void blas_memory_free(void *buffer){ #ifdef DEBUG alloc_table = get_memory_table(); - for (position = 0; position < BUFFERS_PER_THREAD; position++){ + for (position = 0; position < NUM_BUFFERS; position++){ if (alloc_table[position]) { printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used); } @@ -1267,22 +1266,14 @@ void blas_memory_free_nolock(void * map_address) { } void blas_shutdown(void){ - - int pos, thread; - #ifdef SMP BLASFUNC(blas_thread_shutdown)(); #endif - - for (thread = 0; thread < MAX_ALLOCATING_THREADS; thread ++){ - for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){ - struct alloc_t *alloc_info = local_memory_table[thread][pos]; - if (alloc_info) { - alloc_info->release_func(alloc_info); - alloc_info = (void *)0; - } - } - } +#ifdef SMP + /* Only cleanupIf we were built for threading and TLS was initialized */ + if (local_storage_key) +#endif + blas_memory_cleanup((void*)get_memory_table()); #ifdef SEEK_ADDRESS base_address = 0UL; @@ -1492,6 +1483,1500 @@ void DESTRUCTOR gotoblas_quit(void) { #endif } +#if defined(_MSC_VER) && !defined(__clang__) +BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved) +{ + switch (ul_reason_for_call) + { + case DLL_PROCESS_ATTACH: + gotoblas_init(); + break; + case DLL_THREAD_ATTACH: + break; + case DLL_THREAD_DETACH: +#if defined(SMP) + blas_memory_cleanup((void*)get_memory_table()); +#endif + break; + case DLL_PROCESS_DETACH: + gotoblas_quit(); + break; + default: + break; + } + return TRUE; +} + +/* + This is to allow static linking. 
+ Code adapted from Google performance tools: + https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc + Reference: + https://sourceware.org/ml/pthreads-win32/2008/msg00028.html + http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp +*/ +static int on_process_term(void) +{ + gotoblas_quit(); + return 0; +} +#ifdef _WIN64 +#pragma comment(linker, "/INCLUDE:_tls_used") +#else +#pragma comment(linker, "/INCLUDE:__tls_used") +#endif + +#ifdef _WIN64 +#pragma const_seg(".CRT$XLB") +#else +#pragma data_seg(".CRT$XLB") +#endif +static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; +#ifdef _WIN64 +#pragma const_seg() +#else +#pragma data_seg() +#endif + +#ifdef _WIN64 +#pragma const_seg(".CRT$XTU") +#else +#pragma data_seg(".CRT$XTU") +#endif +static int(*p_process_term)(void) = on_process_term; +#ifdef _WIN64 +#pragma const_seg() +#else +#pragma data_seg() +#endif +#endif + +#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) +/* Don't call me; this is just work around for PGI / Sun bug */ +void gotoblas_dummy_for_PGI(void) { + + gotoblas_init(); + gotoblas_quit(); + +#if 0 + asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); + asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); +#else + asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); + asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); +#endif +} +#endif + +#else +#include + +#ifdef OS_WINDOWS +#define ALLOC_WINDOWS +#ifndef MEM_LARGE_PAGES +#define MEM_LARGE_PAGES 0x20000000 +#endif +#else +#define ALLOC_MMAP +#define ALLOC_MALLOC +#endif + +#include +#include +#include + +#ifndef OS_WINDOWS +#include +#ifndef NO_SYSV_IPC +#include +#endif +#include +#endif + +#include + +#ifdef OS_LINUX +#include +#include +#include +#include +#include +#include +#include +#endif + +#if defined(OS_FREEBSD) || defined(OS_DARWIN) +#include +#include +#endif + +#if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__)) +#include +#undef printf +#define printf _cprintf +#endif + +#ifdef OS_LINUX + +#ifndef MPOL_PREFERRED +#define MPOL_PREFERRED 1 +#endif + +#endif + +#if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP) +#define NO_WARMUP +#endif + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + +#ifndef FIXED_PAGESIZE +#define FIXED_PAGESIZE 4096 +#endif + +#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) + +#if defined(_MSC_VER) && !defined(__clang__) +#define CONSTRUCTOR __cdecl +#define DESTRUCTOR __cdecl +#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) +#else +#define CONSTRUCTOR __attribute__ ((constructor(101))) +#define DESTRUCTOR __attribute__ ((destructor(101))) +#endif + +#ifdef DYNAMIC_ARCH +gotoblas_t *gotoblas = NULL; +#endif +extern void openblas_warning(int verbose, const char * msg); + +#ifndef SMP + +#define blas_cpu_number 1 +#define blas_num_threads 1 + +/* Dummy Function */ +int goto_get_num_procs (void) { return 1;}; +void goto_set_num_threads(int num_threads) {}; + +#else + +#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD) +#ifndef NO_AFFINITY +int get_num_procs(void); +#else +int get_num_procs(void) { + static int nums = 0; +cpu_set_t *cpusetp; +size_t size; +int ret; +int i,n; + + 
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); +#if !defined(OS_LINUX) + return nums; +#endif + +#if !defined(__GLIBC_PREREQ) + return nums; +#else + #if !__GLIBC_PREREQ(2, 3) + return nums; + #endif + + #if !__GLIBC_PREREQ(2, 7) + ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); + if (ret!=0) return nums; + n=0; + #if !__GLIBC_PREREQ(2, 6) + for (i=0;i 0) blas_num_threads = blas_goto_num; + else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; + else blas_num_threads = MAX_CPU_NUMBER; + +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) + if (blas_num_threads > max_num) blas_num_threads = max_num; +#endif + + if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER; + +#ifdef DEBUG + printf( "Adjusted number of threads : %3d\n", blas_num_threads); +#endif + + blas_cpu_number = blas_num_threads; + + return blas_num_threads; +} +#endif + + +int openblas_get_num_procs(void) { +#ifndef SMP + return 1; +#else + return get_num_procs(); +#endif +} + +int openblas_get_num_threads(void) { +#ifndef SMP + return 1; +#else + // init blas_cpu_number if needed + blas_get_cpu_number(); + return blas_cpu_number; +#endif +} + +struct release_t { + void *address; + void (*func)(struct release_t *); + long attr; +}; + +int hugetlb_allocated = 0; + +static struct release_t release_info[NUM_BUFFERS]; +static int release_pos = 0; + +#if defined(OS_LINUX) && !defined(NO_WARMUP) +static int hot_alloc = 0; +#endif + +/* Global lock for memory allocation */ + +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t alloc_lock = 0; +#else +static BLASULONG alloc_lock = 0UL; +#endif + +#ifdef ALLOC_MMAP + +static void alloc_mmap_free(struct release_t *release){ + + if (munmap(release -> address, BUFFER_SIZE)) { + printf("OpenBLAS : munmap failed\n"); + } +} + + + +#ifdef NO_WARMUP + +static void *alloc_mmap(void *address){ + void *map_address; + + if (address){ + map_address = mmap(address, + BUFFER_SIZE, + MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); + } else { + map_address = mmap(address, + BUFFER_SIZE, + MMAP_ACCESS, MMAP_POLICY, -1, 0); + } + + if (map_address != (void *)-1) { + LOCK_COMMAND(&alloc_lock); + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_mmap_free; + release_pos ++; + UNLOCK_COMMAND(&alloc_lock); + } + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + return map_address; +} + +#else + +#define BENCH_ITERATION 4 +#define SCALING 2 + +static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) { + + BLASULONG original, *p; + BLASULONG start, stop, min; + int iter, i, count; + + min = (BLASULONG)-1; + + original = *(BLASULONG *)(address + size - PAGESIZE); + + *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address; + + for (iter = 0; iter < BENCH_ITERATION; iter ++ ) { + + p = (BLASULONG *)address; + + count = size / PAGESIZE; + + start = rpcc(); + + for (i = 0; i < count; i ++) { + p = (BLASULONG *)(*p); + } + + stop = rpcc(); + + if (min > stop - start) min = stop - start; + } + + *(BLASULONG *)(address + size - PAGESIZE + 0) = original; + *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p; + + return min; +} + +static void *alloc_mmap(void *address){ + void *map_address, *best_address; + BLASULONG best, start, current; + BLASULONG allocsize; + + if (address){ + /* Just give up use advanced 
operation */ + map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + } else { +#if defined(OS_LINUX) && !defined(NO_WARMUP) + if (hot_alloc == 0) { + map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0); + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + } else { +#endif + + map_address = mmap(NULL, BUFFER_SIZE * SCALING, + MMAP_ACCESS, MMAP_POLICY, -1, 0); + + if (map_address != (void *)-1) { + +#ifdef OS_LINUX +#ifdef DEBUG + int ret=0; + ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); + if(ret==-1){ + int errsv=errno; + perror("OpenBLAS alloc_mmap:"); + printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); + } + +#else + my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); +#endif +#endif + + + allocsize = DGEMM_P * DGEMM_Q * sizeof(double); + + start = (BLASULONG)map_address; + current = (SCALING - 1) * BUFFER_SIZE; + + while(current > 0) { + *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; + start += PAGESIZE; + current -= PAGESIZE; + } + + *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address; + + start = (BLASULONG)map_address; + + best = (BLASULONG)-1; + best_address = map_address; + + while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) { + + current = run_bench(start, allocsize); + + if (best > current) { + best = current; + best_address = (void *)start; + } + + start += PAGESIZE; + + } + + if ((BLASULONG)best_address > (BLASULONG)map_address) + munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address); + + munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address); + + map_address = best_address; + +#if defined(OS_LINUX) && !defined(NO_WARMUP) + hot_alloc = 2; +#endif + } + } +#if defined(OS_LINUX) && !defined(NO_WARMUP) + } +#endif + LOCK_COMMAND(&alloc_lock); + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_mmap_free; + release_pos ++; + } + UNLOCK_COMMAND(&alloc_lock); + + return map_address; +} + +#endif + +#endif + + +#ifdef ALLOC_MALLOC + +static void alloc_malloc_free(struct release_t *release){ + + free(release -> address); + +} + +static void *alloc_malloc(void *address){ + + void *map_address; + + map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE); + + if (map_address == (void *)NULL) map_address = (void *)-1; + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_malloc_free; + release_pos ++; + } + + return map_address; + +} + +#endif + +#ifdef ALLOC_QALLOC + +void *qalloc(int flags, size_t bytes); +void *qfree (void *address); + +#define QNONCACHE 0x1 +#define QCOMMS 0x2 +#define QFAST 0x4 + +static void alloc_qalloc_free(struct release_t *release){ + + qfree(release -> address); + +} + +static void *alloc_qalloc(void *address){ + void *map_address; + + map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE); + + if (map_address == (void *)NULL) map_address = (void *)-1; + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_qalloc_free; + release_pos ++; + } + + return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 
1)); +} + +#endif + +#ifdef ALLOC_WINDOWS + +static void alloc_windows_free(struct release_t *release){ + + VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT); + +} + +static void *alloc_windows(void *address){ + void *map_address; + + map_address = VirtualAlloc(address, + BUFFER_SIZE, + MEM_RESERVE | MEM_COMMIT, + PAGE_READWRITE); + + if (map_address == (void *)NULL) map_address = (void *)-1; + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_windows_free; + release_pos ++; + } + + return map_address; +} + +#endif + +#ifdef ALLOC_DEVICEDRIVER +#ifndef DEVICEDRIVER_NAME +#define DEVICEDRIVER_NAME "/dev/mapper" +#endif + +static void alloc_devicedirver_free(struct release_t *release){ + + if (munmap(release -> address, BUFFER_SIZE)) { + printf("OpenBLAS : Bugphysarea unmap failed.\n"); + } + + if (close(release -> attr)) { + printf("OpenBLAS : Bugphysarea close failed.\n"); + } + +} + +static void *alloc_devicedirver(void *address){ + + int fd; + void *map_address; + + if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) { + + return (void *)-1; + + } + + map_address = mmap(address, BUFFER_SIZE, + PROT_READ | PROT_WRITE, + MAP_FILE | MAP_SHARED, + fd, 0); + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].attr = fd; + release_info[release_pos].func = alloc_devicedirver_free; + release_pos ++; + } + + return map_address; +} + +#endif + +#ifdef ALLOC_SHM + +static void alloc_shm_free(struct release_t *release){ + + if (shmdt(release -> address)) { + printf("OpenBLAS : Shared memory unmap failed.\n"); + } +} + +static void *alloc_shm(void *address){ + void *map_address; + int shmid; + + shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600); + + map_address = (void *)shmat(shmid, address, 0); + + if (map_address != (void *)-1){ + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + shmctl(shmid, IPC_RMID, 0); + + release_info[release_pos].address = map_address; + release_info[release_pos].attr = shmid; + release_info[release_pos].func = alloc_shm_free; + release_pos ++; + } + + return map_address; +} + +#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS + +static void alloc_hugetlb_free(struct release_t *release){ + +#if defined(OS_LINUX) || defined(OS_AIX) + if (shmdt(release -> address)) { + printf("OpenBLAS : Hugepage unmap failed.\n"); + } +#endif + +#ifdef __sun__ + + munmap(release -> address, BUFFER_SIZE); + +#endif + +#ifdef OS_WINDOWS + + VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT); + +#endif + +} + +static void *alloc_hugetlb(void *address){ + + void *map_address = (void *)-1; + +#if defined(OS_LINUX) || defined(OS_AIX) + int shmid; + + shmid = shmget(IPC_PRIVATE, BUFFER_SIZE, +#ifdef OS_LINUX + SHM_HUGETLB | +#endif +#ifdef OS_AIX + SHM_LGPAGE | SHM_PIN | +#endif + IPC_CREAT | SHM_R | SHM_W); + + if (shmid != -1) { + map_address = (void *)shmat(shmid, address, SHM_RND); + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + if (map_address != (void *)-1){ + shmctl(shmid, IPC_RMID, 0); + } + } +#endif + +#ifdef __sun__ + struct memcntl_mha mha; + + mha.mha_cmd = MHA_MAPSIZE_BSSBRK; + mha.mha_flags = 0; + mha.mha_pagesize = HUGE_PAGESIZE; + memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0); + + map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE); +#endif + +#ifdef OS_WINDOWS 
+ + HANDLE hToken; + TOKEN_PRIVILEGES tp; + + if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1; + + tp.PrivilegeCount = 1; + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + + if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) { + CloseHandle(hToken); + return (void*)-1; + } + + if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) { + CloseHandle(hToken); + return (void*)-1; + } + + map_address = (void *)VirtualAlloc(address, + BUFFER_SIZE, + MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT, + PAGE_READWRITE); + + tp.Privileges[0].Attributes = 0; + AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL); + + if (map_address == (void *)NULL) map_address = (void *)-1; + +#endif + + if (map_address != (void *)-1){ + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_hugetlb_free; + release_pos ++; + } + + return map_address; +} +#endif + +#endif + +#ifdef ALLOC_HUGETLBFILE + +static int hugetlb_pid = 0; + +static void alloc_hugetlbfile_free(struct release_t *release){ + + if (munmap(release -> address, BUFFER_SIZE)) { + printf("OpenBLAS : HugeTLBfs unmap failed.\n"); + } + + if (close(release -> attr)) { + printf("OpenBLAS : HugeTLBfs close failed.\n"); + } +} + +static void *alloc_hugetlbfile(void *address){ + + void *map_address = (void *)-1; + int fd; + char filename[64]; + + if (!hugetlb_pid) hugetlb_pid = getpid(); + + sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid); + + if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) { + return (void *)-1; + } + + unlink(filename); + + map_address = mmap(address, BUFFER_SIZE, + PROT_READ | PROT_WRITE, + MAP_SHARED, + fd, 0); + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].attr = fd; + release_info[release_pos].func = alloc_hugetlbfile_free; + release_pos ++; + } + + return map_address; +} +#endif + + +#ifdef SEEK_ADDRESS +static BLASULONG base_address = 0UL; +#else +static BLASULONG base_address = BASE_ADDRESS; +#endif + +static volatile struct { + BLASULONG lock; + void *addr; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + int pos; +#endif + int used; +#ifndef __64BIT__ + char dummy[48]; +#else + char dummy[40]; +#endif + +} memory[NUM_BUFFERS]; + +static int memory_initialized = 0; + +/* Memory allocation routine */ +/* procpos ... 
indicates where it comes from */ +/* 0 : Level 3 functions */ +/* 1 : Level 2 functions */ +/* 2 : Thread */ + +void *blas_memory_alloc(int procpos){ + + int position; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + int mypos; +#endif + + void *map_address; + + void *(*memoryalloc[])(void *address) = { +#ifdef ALLOC_DEVICEDRIVER + alloc_devicedirver, +#endif +/* Hugetlb implicitly assumes ALLOC_SHM */ +#ifdef ALLOC_SHM + alloc_shm, +#endif +#if ((defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)) + alloc_hugetlb, +#endif +#ifdef ALLOC_MMAP + alloc_mmap, +#endif +#ifdef ALLOC_QALLOC + alloc_qalloc, +#endif +#ifdef ALLOC_WINDOWS + alloc_windows, +#endif +#ifdef ALLOC_MALLOC + alloc_malloc, +#endif + NULL, + }; + void *(**func)(void *address); + LOCK_COMMAND(&alloc_lock); + + if (!memory_initialized) { + +#if defined(WHEREAMI) && !defined(USE_OPENMP) + for (position = 0; position < NUM_BUFFERS; position ++){ + memory[position].addr = (void *)0; + memory[position].pos = -1; + memory[position].used = 0; + memory[position].lock = 0; + } +#endif + +#ifdef DYNAMIC_ARCH + gotoblas_dynamic_init(); +#endif + +#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) + gotoblas_affinity_init(); +#endif + +#ifdef SMP + if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); +#endif + +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) +#ifndef DYNAMIC_ARCH + blas_set_parameter(); +#endif +#endif + + memory_initialized = 1; + + } + UNLOCK_COMMAND(&alloc_lock); + +#ifdef DEBUG + printf("Alloc Start ...\n"); +#endif + +#if defined(WHEREAMI) && !defined(USE_OPENMP) + + mypos = WhereAmI(); + + position = mypos; + while (position >= NUM_BUFFERS) position >>= 1; + + do { + if (!memory[position].used && (memory[position].pos == mypos)) { + LOCK_COMMAND(&alloc_lock); +/* blas_lock(&memory[position].lock);*/ + + if (!memory[position].used) goto allocation; + + UNLOCK_COMMAND(&alloc_lock); +/* blas_unlock(&memory[position].lock);*/ + } + + position ++; + + } while (position < NUM_BUFFERS); + + +#endif + + position = 0; + + do { +/* if (!memory[position].used) { */ + LOCK_COMMAND(&alloc_lock); +/* blas_lock(&memory[position].lock);*/ + + if (!memory[position].used) goto allocation; + + UNLOCK_COMMAND(&alloc_lock); +/* blas_unlock(&memory[position].lock);*/ +/* } */ + + position ++; + + } while (position < NUM_BUFFERS); + + goto error; + + allocation : + +#ifdef DEBUG + printf(" Position -> %d\n", position); +#endif + + memory[position].used = 1; + + UNLOCK_COMMAND(&alloc_lock); +/* blas_unlock(&memory[position].lock);*/ + + if (!memory[position].addr) { + do { +#ifdef DEBUG + printf("Allocation Start : %lx\n", base_address); +#endif + + map_address = (void *)-1; + + func = &memoryalloc[0]; + + while ((func != NULL) && (map_address == (void *) -1)) { + + map_address = (*func)((void *)base_address); + +#ifdef ALLOC_DEVICEDRIVER + if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { + fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); + } +#endif + +#ifdef ALLOC_HUGETLBFILE + if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { +#ifndef OS_WINDOWS + fprintf(stderr, "OpenBLAS Warning ... 
HugeTLB(File) allocation was failed.\n"); +#endif + } +#endif + +#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) + if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; +#endif + + func ++; + } + +#ifdef DEBUG + printf(" Success -> %08lx\n", map_address); +#endif + if (((BLASLONG) map_address) == -1) base_address = 0UL; + + if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE; + + } while ((BLASLONG)map_address == -1); + + LOCK_COMMAND(&alloc_lock); + memory[position].addr = map_address; + UNLOCK_COMMAND(&alloc_lock); + +#ifdef DEBUG + printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); +#endif + } + +#if defined(WHEREAMI) && !defined(USE_OPENMP) + + if (memory[position].pos == -1) memory[position].pos = mypos; + +#endif + +#ifdef DYNAMIC_ARCH + + if (memory_initialized == 1) { + + LOCK_COMMAND(&alloc_lock); + + if (memory_initialized == 1) { + + if (!gotoblas) gotoblas_dynamic_init(); + + memory_initialized = 2; + } + + UNLOCK_COMMAND(&alloc_lock); + + } +#endif + + +#ifdef DEBUG + printf("Mapped : %p %3d\n\n", + (void *)memory[position].addr, position); +#endif + + return (void *)memory[position].addr; + + error: + printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); + + return NULL; +} + +void blas_memory_free(void *free_area){ + + int position; + +#ifdef DEBUG + printf("Unmapped Start : %p ...\n", free_area); +#endif + + position = 0; + LOCK_COMMAND(&alloc_lock); + + while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) + position++; + + if (memory[position].addr != free_area) goto error; + +#ifdef DEBUG + printf(" Position : %d\n", position); +#endif + + // arm: ensure all writes are finished before other thread takes this memory + WMB; + + memory[position].used = 0; + UNLOCK_COMMAND(&alloc_lock); + +#ifdef DEBUG + printf("Unmap Succeeded.\n\n"); +#endif + + return; + + error: + printf("BLAS : Bad memory unallocation! 
: %4d %p\n", position, free_area); + +#ifdef DEBUG + for (position = 0; position < NUM_BUFFERS; position++) + printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); +#endif + UNLOCK_COMMAND(&alloc_lock); + + return; +} + +void *blas_memory_alloc_nolock(int unused) { + void *map_address; + map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE); + return map_address; +} + +void blas_memory_free_nolock(void * map_address) { + free(map_address); +} + +void blas_shutdown(void){ + + int pos; + +#ifdef SMP + BLASFUNC(blas_thread_shutdown)(); +#endif + + LOCK_COMMAND(&alloc_lock); + + for (pos = 0; pos < release_pos; pos ++) { + release_info[pos].func(&release_info[pos]); + } + +#ifdef SEEK_ADDRESS + base_address = 0UL; +#else + base_address = BASE_ADDRESS; +#endif + + for (pos = 0; pos < NUM_BUFFERS; pos ++){ + memory[pos].addr = (void *)0; + memory[pos].used = 0; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + memory[pos].pos = -1; +#endif + memory[pos].lock = 0; + } + + UNLOCK_COMMAND(&alloc_lock); + + return; +} + +#if defined(OS_LINUX) && !defined(NO_WARMUP) + +#ifdef SMP +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t init_lock = 0; +#else +static BLASULONG init_lock = 0UL; +#endif +#endif + +static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, + void *sa, void *sb, BLASLONG pos) { + +#if !defined(ARCH_POWER) && !defined(ARCH_SPARC) + + size_t size; + BLASULONG buffer; + + size = BUFFER_SIZE - PAGESIZE; + buffer = (BLASULONG)sa + GEMM_OFFSET_A; + +#if defined(OS_LINUX) && !defined(NO_WARMUP) + if (hot_alloc != 2) { +#endif + +#ifdef SMP + LOCK_COMMAND(&init_lock); +#endif + + while (size > 0) { + *(int *)buffer = size; + buffer += PAGESIZE; + size -= PAGESIZE; + } + +#ifdef SMP + UNLOCK_COMMAND(&init_lock); +#endif + + size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE); + buffer = (BLASULONG)sa + GEMM_OFFSET_A; + + while (size > 0) { + *(int *)buffer = size; + buffer += 64; + size -= 64; + } + +#if defined(OS_LINUX) && !defined(NO_WARMUP) + } +#endif + +#endif +} + +#ifdef SMP + +static void _init_thread_memory(void *buffer) { + + blas_queue_t queue[MAX_CPU_NUMBER]; + int num_cpu; + + for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) { + + blas_queue_init(&queue[num_cpu]); + queue[num_cpu].mode = BLAS_DOUBLE | BLAS_REAL; + queue[num_cpu].routine = &_touch_memory; + queue[num_cpu].args = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + } + + queue[num_cpu - 1].next = NULL; + queue[0].sa = buffer; + + exec_blas(num_cpu, queue); + +} +#endif + +static void gotoblas_memory_init(void) { + + void *buffer; + + hot_alloc = 1; + + buffer = (void *)blas_memory_alloc(0); + +#ifdef SMP + if (blas_cpu_number == 0) blas_get_cpu_number(); +#ifdef SMP_SERVER + if (blas_server_avail == 0) blas_thread_init(); +#endif + + _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A)); + +#else + + _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0); + +#endif + + blas_memory_free(buffer); +} +#endif + +/* Initialization for all function; this function should be called before main */ + +static int gotoblas_initialized = 0; +extern void openblas_read_env(); + +void CONSTRUCTOR gotoblas_init(void) { + + if (gotoblas_initialized) return; + +#ifdef SMP + openblas_fork_handler(); +#endif + + openblas_read_env(); + +#ifdef PROFILE + moncontrol (0); +#endif + +#ifdef DYNAMIC_ARCH + gotoblas_dynamic_init(); 
+#endif + +#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) + gotoblas_affinity_init(); +#endif + +#if defined(OS_LINUX) && !defined(NO_WARMUP) + gotoblas_memory_init(); +#endif + +//#if defined(OS_LINUX) +#if 0 + struct rlimit curlimit; + if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 ) + { + if ( curlimit.rlim_cur != curlimit.rlim_max ) + { + curlimit.rlim_cur = curlimit.rlim_max; + setrlimit(RLIMIT_STACK, &curlimit); + } + } +#endif + +#ifdef SMP + if (blas_cpu_number == 0) blas_get_cpu_number(); +#ifdef SMP_SERVER + if (blas_server_avail == 0) blas_thread_init(); +#endif +#endif + +#ifdef FUNCTION_PROFILE + gotoblas_profile_init(); +#endif + + gotoblas_initialized = 1; + +#ifdef PROFILE + moncontrol (1); +#endif + +} + +void DESTRUCTOR gotoblas_quit(void) { + + if (gotoblas_initialized == 0) return; + + blas_shutdown(); + +#ifdef PROFILE + moncontrol (0); +#endif + +#ifdef FUNCTION_PROFILE + gotoblas_profile_quit(); +#endif + +#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) + gotoblas_affinity_quit(); +#endif + +#ifdef DYNAMIC_ARCH + gotoblas_dynamic_quit(); +#endif + + gotoblas_initialized = 0; + +#ifdef PROFILE + moncontrol (1); +#endif +} + #if defined(_MSC_VER) && !defined(__clang__) BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved) { @@ -1573,3 +3058,5 @@ void gotoblas_dummy_for_PGI(void) { #endif } #endif + +#endif From 2a589c4b286b4ab2f117efdc501d2facc547a401 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Aug 2018 19:36:12 +0200 Subject: [PATCH 014/236] Add USE_TLS option to switch between old and new memory.c --- cmake/system.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index 48e8f75bc..18b2c3b87 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -214,6 +214,10 @@ if (CONSISTENT_FPCSR) set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR") endif () +if (USE_TLS) + set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_TLS") +endif () + # Only for development # set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST") # set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST") From 2caa2210bbfb5b69c3758b8158bb0bad4a0f5e58 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Aug 2018 19:37:11 +0200 Subject: [PATCH 015/236] Add USE_TLS option to choose between old and new implementation of memory.c --- Makefile.rule | 10 ++++++++-- Makefile.system | 4 ++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 649aabe70..4b815d7a8 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.1.dev +VERSION = 0.3.3.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library @@ -107,7 +107,13 @@ BUILD_LAPACK_DEPRECATED = 1 # BUILD_RELAPACK = 1 # If you want to use legacy threaded Level 3 implementation. -# USE_SIMPLE_THREADED_LEVEL3 = 1 +USE_SIMPLE_THREADED_LEVEL3 = 1 + +# If you want to use the new, still somewhat experimental code that uses +# thread-local storage instead of a central memory buffer in memory.c +# Note that if your system uses GLIBC, it needs to have at least glibc 2.21 +# for this to work. +USE_TLS = 1 # If you want to drive whole 64bit region by BLAS. Not all Fortran # compiler supports this. 
It's safe to keep comment it out if you diff --git a/Makefile.system b/Makefile.system index 4712d9525..2123af204 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1018,6 +1018,10 @@ ifdef USE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 endif +ifdef USE_TLS +CCOMMON_OPT += -DUSE_TLS +endif + ifndef SYMBOLPREFIX SYMBOLPREFIX = endif From 5991d1a6cd9d7340d2ea7e393a00eab8e232394f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Aug 2018 22:12:40 +0200 Subject: [PATCH 016/236] Update memory.c --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 1d408fcda..7688937e5 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(USE_TLS) && ( !defined(__GLIBC_PREREQ) || __GLIBC_PREREQ(2,20)) +#if defined(USE_TLS) && ( !defined(__GLIBC_PREREQ) || (defined(__GLIBC_PREREQ) && __GLIBC_PREREQ(2,20))) #warning "using tls version of memory.c" #include From b902a409863f14e3334ae79265fa353f21f98ed7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 26 Aug 2018 11:18:02 +0200 Subject: [PATCH 017/236] Rewrite glibc version check --- driver/others/memory.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 7688937e5..b2e154e8b 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -73,8 +73,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(USE_TLS) && ( !defined(__GLIBC_PREREQ) || (defined(__GLIBC_PREREQ) && __GLIBC_PREREQ(2,20))) -#warning "using tls version of memory.c" +#if defined(USE_TLS) +#define COMPILE_TLS +#if defined(__GLIBC_PREREQ) +#if !__GLIBC_PREREQ(2,20)) +#undef COMPILE_TLS +#endif +#endif +#endif + +#if defined(COMPILE_TLS) + #include #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) From b55690a659fbc1b9cd267da26e2e54e3bdf7be52 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 26 Aug 2018 11:31:07 +0200 Subject: [PATCH 018/236] typo fix --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index b2e154e8b..9d4ab19f5 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -76,7 +76,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(USE_TLS) #define COMPILE_TLS #if defined(__GLIBC_PREREQ) -#if !__GLIBC_PREREQ(2,20)) +#if !__GLIBC_PREREQ(2,20) #undef COMPILE_TLS #endif #endif From 9e917b16dbba25c013b3fa32d22476eb4ed15541 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Aug 2018 21:11:54 +0200 Subject: [PATCH 019/236] Fix missing replacements of ILAENV by ILAENV_2STAGE (lapack PR 272) This could cause spurious "parameter has an illegal value" errors in DSYEVR and related routines, see https://github.com/Reference-LAPACK/lapack/issues/262 --- lapack-netlib/SRC/chetrd_hb2st.F | 10 +++++----- lapack-netlib/SRC/chetrd_he2hb.f | 6 +++--- lapack-netlib/SRC/dsytrd_sb2st.F | 10 +++++----- lapack-netlib/SRC/dsytrd_sy2sb.f | 6 +++--- lapack-netlib/SRC/ssytrd_sb2st.F | 10 +++++----- lapack-netlib/SRC/ssytrd_sy2sb.f | 6 +++--- lapack-netlib/SRC/zhetrd_hb2st.F | 10 +++++----- lapack-netlib/SRC/zhetrd_he2hb.f | 6 +++--- 8 files changed, 32 insertions(+), 32 deletions(-) diff --git a/lapack-netlib/SRC/chetrd_hb2st.F b/lapack-netlib/SRC/chetrd_hb2st.F index 91806bb1d..43da45640 100644 --- a/lapack-netlib/SRC/chetrd_hb2st.F +++ b/lapack-netlib/SRC/chetrd_hb2st.F @@ -280,8 +280,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. * @@ -297,9 +297,9 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV( 18, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV( 19, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV( 20, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) + LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/chetrd_he2hb.f b/lapack-netlib/SRC/chetrd_he2hb.f index fd8c3fbe0..e334532fe 100644 --- a/lapack-netlib/SRC/chetrd_he2hb.f +++ b/lapack-netlib/SRC/chetrd_he2hb.f @@ -285,8 +285,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. * @@ -296,7 +296,7 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV( 20, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) + LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/dsytrd_sb2st.F b/lapack-netlib/SRC/dsytrd_sb2st.F index 4ca0507e4..4d81fe226 100644 --- a/lapack-netlib/SRC/dsytrd_sb2st.F +++ b/lapack-netlib/SRC/dsytrd_sb2st.F @@ -277,8 +277,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. * @@ -294,9 +294,9 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV( 18, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV( 19, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV( 20, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) + LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) * IF( .NOT.AFTERS1 .AND. 
.NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/dsytrd_sy2sb.f b/lapack-netlib/SRC/dsytrd_sy2sb.f index 85337f792..e0a5debc5 100644 --- a/lapack-netlib/SRC/dsytrd_sy2sb.f +++ b/lapack-netlib/SRC/dsytrd_sy2sb.f @@ -285,8 +285,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. * @@ -296,7 +296,7 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV( 20, 'DSYTRD_SY2SB', '', N, KD, -1, -1 ) + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', '', N, KD, -1, -1 ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/ssytrd_sb2st.F b/lapack-netlib/SRC/ssytrd_sb2st.F index bd645327e..0df1173e4 100644 --- a/lapack-netlib/SRC/ssytrd_sb2st.F +++ b/lapack-netlib/SRC/ssytrd_sb2st.F @@ -277,8 +277,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. * @@ -294,9 +294,9 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV( 18, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV( 19, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV( 20, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) + LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/ssytrd_sy2sb.f b/lapack-netlib/SRC/ssytrd_sy2sb.f index c01fe3598..272876700 100644 --- a/lapack-netlib/SRC/ssytrd_sy2sb.f +++ b/lapack-netlib/SRC/ssytrd_sy2sb.f @@ -285,8 +285,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. * @@ -296,7 +296,7 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV( 20, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/zhetrd_hb2st.F b/lapack-netlib/SRC/zhetrd_hb2st.F index 508afca06..86122cccc 100644 --- a/lapack-netlib/SRC/zhetrd_hb2st.F +++ b/lapack-netlib/SRC/zhetrd_hb2st.F @@ -280,8 +280,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. * @@ -297,9 +297,9 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV( 18, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV( 19, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV( 20, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 ) + LHMIN = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/zhetrd_he2hb.f b/lapack-netlib/SRC/zhetrd_he2hb.f index e35578b42..e33bf4b2b 100644 --- a/lapack-netlib/SRC/zhetrd_he2hb.f +++ b/lapack-netlib/SRC/zhetrd_he2hb.f @@ -285,8 +285,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. 
* .. Executable Statements .. * @@ -296,7 +296,7 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV( 20, 'ZHETRD_HE2HB', '', N, KD, -1, -1 ) + LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 From f3fd44a731c1997b1d79d4d16abc25d78dce88a7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Aug 2018 21:34:07 +0200 Subject: [PATCH 020/236] Set USE_TRMM for all ZARCH variants to fix TRMM faults with zarch-generic fixes #1743 --- kernel/Makefile.L3 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index b37e536ef..9258f216d 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -44,7 +44,7 @@ ifeq ($(CORE), POWER8) USE_TRMM = 1 endif -ifeq ($(CORE), Z13) +ifeq ($(ARCH), zarch) USE_TRMM = 1 endif From e17f969fa0f7e8c9f5525577198a17fd7a9da21a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 30 Aug 2018 13:28:46 +0200 Subject: [PATCH 021/236] Assume cross-compilation if host and target os differ fixes 1674 --- c_check | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/c_check b/c_check index 3831d7aa3..64009504c 100644 --- a/c_check +++ b/c_check @@ -223,7 +223,6 @@ $data =~ /globl\s([_\.]*)(.*)/; $need_fu = $1; $cross = 0; -$cross = 1 if ($os ne $hostos); if ($architecture ne $hostarch) { $cross = 1; @@ -231,6 +230,8 @@ if ($architecture ne $hostarch) { $cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips")); } +$cross = 1 if ($os ne $hostos); + $openmp = "" if $ENV{USE_OPENMP} != 1; $linker_L = ""; From 3197f86762f14753517dfebd7f8665cb6bf6c344 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 30 Aug 2018 23:43:14 +0200 Subject: [PATCH 022/236] Version 0.3.3 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 20ce02e87..0f985455b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 3.dev) +set(OpenBLAS_PATCH_VERSION 3) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From f0563f14bab6afcb3263a4710087c704bddfbb98 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 30 Aug 2018 23:43:57 +0200 Subject: [PATCH 023/236] Version 0.3.3 --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 4b815d7a8..6457532c8 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.3.dev +VERSION = 0.3.3 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library From fd8d1868a126bb9f12bbc43b36ee30d1ba943fbb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 31 Aug 2018 00:07:48 +0200 Subject: [PATCH 024/236] Updates for 0.3.3 --- Changelog.txt | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 33dcacc51..faecd82e3 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,31 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.3 +31-Aug-2018 + +common: + * thread memory allocation has been switched back to the method + used before version 0.3.1 due to unexpected problems caused by + the new code under some circumstances. A new compile-time option + USE_TLS has been added to enable the new code, and it is hoped + that this can become the default again in the next version. + * LAPAck PR272 has been integrated, which fixes spurious errors + in DSYEVR and related functions caused by missing conversion + from ILAENV to ILAENV_2STAGE in several _2stage routines. + * the cmake-generated OpenBLASConfig.cmake now uses correct case + for the name of the library + * added support for Haiku OS + +x86_64: + * added AVX512 implementations of SDOT, DDOT, SAXPY, DAXPY, + DSCAL, DGEMVN and DSYMVL + * added a workaround for a cygwin issue that prevented compilation + of AVX512 code + +IBM Z: + * added autodetection of Z14 + * fixed TRMM errors in the generic target + ==================================================================== Version 0.3.2 30-Jul-2018 From 2982ce505d35bde04013b3e1cf4755954901efe5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 31 Aug 2018 00:18:37 +0200 Subject: [PATCH 025/236] Update version to 0.3.4.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 20ce02e87..97c3b7777 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 3.dev) +set(OpenBLAS_PATCH_VERSION 4.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From dbfd7524cd94fe15930ed2f78b7789f15b22fec0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 31 Aug 2018 00:19:21 +0200 Subject: [PATCH 026/236] Update version to 0.3.4.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 4b815d7a8..25ed0357d 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.3.dev +VERSION = 0.3.4.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library From 9e2bb0c6417ade4a9cf4a5787e0eb9fd491e8fc3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 31 Aug 2018 00:21:13 +0200 Subject: [PATCH 027/236] Update with the changes from 0.3.3 --- Changelog.txt | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 33dcacc51..faecd82e3 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,31 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.3 +31-Aug-2018 + +common: + * thread memory allocation has been switched back to the method + used before version 0.3.1 due to unexpected problems caused by + the new code under some circumstances. A new compile-time option + USE_TLS has been added to enable the new code, and it is hoped + that this can become the default again in the next version. + * LAPAck PR272 has been integrated, which fixes spurious errors + in DSYEVR and related functions caused by missing conversion + from ILAENV to ILAENV_2STAGE in several _2stage routines. + * the cmake-generated OpenBLASConfig.cmake now uses correct case + for the name of the library + * added support for Haiku OS + +x86_64: + * added AVX512 implementations of SDOT, DDOT, SAXPY, DAXPY, + DSCAL, DGEMVN and DSYMVL + * added a workaround for a cygwin issue that prevented compilation + of AVX512 code + +IBM Z: + * added autodetection of Z14 + * fixed TRMM errors in the generic target + ==================================================================== Version 0.3.2 30-Jul-2018 From a4bd41e9f2bbebfe2453de7a43194b185fd72da5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 4 Sep 2018 10:51:19 +0200 Subject: [PATCH 028/236] Fix paths to C kernels for nrm2 --- kernel/arm64/KERNEL | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/KERNEL b/kernel/arm64/KERNEL index aeccfbf4c..f936cdf47 100644 --- a/kernel/arm64/KERNEL +++ b/kernel/arm64/KERNEL @@ -1,17 +1,17 @@ ifndef SNRM2KERNEL -SNRM2KERNEL = nrm2.c +SNRM2KERNEL = ../arm/nrm2.c endif ifndef DNRM2KERNEL -DNRM2KERNEL = nrm2.c +DNRM2KERNEL = ../arm/nrm2.c endif ifndef CNRM2KERNEL -CNRM2KERNEL = znrm2.c +CNRM2KERNEL = ../arm/znrm2.c endif ifndef ZNRM2KERNEL -ZNRM2KERNEL = znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c endif ifndef SCABS_KERNEL From 1cb7b9015ebd49e1cbf09eb289b7a6d5bba5ea31 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 4 Sep 2018 11:06:51 +0200 Subject: [PATCH 029/236] Conditional compilation of assembly files that IOS does not like --- kernel/arm64/KERNEL.ARMV8 | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index d05754628..4c6d6fb71 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -51,10 +51,12 @@ CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S DSDOTKERNEL = dot.S +ifneq ($(OS_DARWIN)$(CROSS),11) SNRM2KERNEL = nrm2.S DNRM2KERNEL = nrm2.S CNRM2KERNEL = znrm2.S ZNRM2KERNEL = znrm2.S +endif SROTKERNEL = rot.S DROTKERNEL = rot.S @@ -86,7 +88,11 @@ DTRMMKERNEL = ../generic/trmmkernel_2x2.c CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ifneq ($(OS_DARWIN)$(CROSS),11) SGEMMKERNEL = sgemm_kernel_4x4.S +else +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +endif SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMONCOPYOBJ = sgemm_oncopy.o From 8aeab0601e9787698a2af16e21bbaba9621183dd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Sep 2018 16:39:52 +0200 
Subject: [PATCH 030/236] Follow netlib renaming/aliasing CBLAS_ORDER to CBLAS_LAYOUT fixes #1754 --- cblas.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cblas.h b/cblas.h index 6461f4209..347089e5b 100644 --- a/cblas.h +++ b/cblas.h @@ -46,12 +46,13 @@ int openblas_get_parallel(void); #define CBLAS_INDEX size_t -typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; +typedef enum CBLAS_LAYOUT {CblasRowMajor=101, CblasColMajor=102} CBLAS_LAYOUT; typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; - +typedef CBLAS_LAYOUT CBLAS_ORDER; + float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); float cblas_sdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); From b57af9379270753ef69f4934ed7c57ee89f5833b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Sep 2018 16:54:31 +0200 Subject: [PATCH 031/236] just make CBLAS_LAYOUT an alias of the existing CBLAS_ORDER to avoid having to change all instances of enum CBLAS_ORDER in this file --- cblas.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cblas.h b/cblas.h index 347089e5b..d340a2037 100644 --- a/cblas.h +++ b/cblas.h @@ -46,12 +46,12 @@ int openblas_get_parallel(void); #define CBLAS_INDEX size_t -typedef enum CBLAS_LAYOUT {CblasRowMajor=101, CblasColMajor=102} CBLAS_LAYOUT; +typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; -typedef CBLAS_LAYOUT CBLAS_ORDER; +typedef CBLAS_ORDER CBLAS_LAYOUT; float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); From 4cf7315a5d5c512b1f38c523d4cd28c399b2000d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Sep 2018 21:41:54 +0200 Subject: [PATCH 032/236] Adjust ARMV8 SGEMM unrolling when using the C fallback kernel_2x2 for IOS --- param.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/param.h b/param.h index cfa4bba5c..ded9fe0b8 100644 --- a/param.h +++ b/param.h @@ -2590,8 +2590,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
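
[Editor's note] Net effect of the two cblas.h patches above (030/031): CBLAS_ORDER remains the real enum and CBLAS_LAYOUT becomes a plain typedef alias for it, so callers written against either the traditional OpenBLAS spelling or the netlib one compile unchanged. A trivial illustration:

#include <cblas.h>

void layout_demo(void) {
  CBLAS_ORDER  legacy = CblasColMajor;   /* traditional OpenBLAS spelling */
  CBLAS_LAYOUT netlib = CblasRowMajor;   /* netlib spelling, now an alias */
  (void)legacy; (void)netlib;
}
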
#define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL +#if defined(OS_DARWIN) && defined(CROSS) +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL N 2 +#else #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 +#endif #define DGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_N 2 From 1e531701b7ab24a069ec5e549fc08eaca49050a1 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sun, 9 Sep 2018 16:52:25 +0200 Subject: [PATCH 033/236] fix small typo --- kernel/generic/trmm_lncopy_16.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/generic/trmm_lncopy_16.c b/kernel/generic/trmm_lncopy_16.c index 4c0a76cbd..0f4b0a9f7 100644 --- a/kernel/generic/trmm_lncopy_16.c +++ b/kernel/generic/trmm_lncopy_16.c @@ -661,7 +661,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 9] = ZERO; b[ 10] = ZERO; b[ 11] = ZERO; - b[ 11] = ZERO; + b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; From 58363542e73998250a6829e8aa4f4d4e8f94337f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Sep 2018 10:51:17 +0200 Subject: [PATCH 034/236] remove unused variable ldb_t Copied from Reference-LAPACK PR283 --- lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_2stage_work.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_2stage_work.c index 2cc7b9ad2..dbd6e9049 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_2stage_work.c @@ -50,7 +50,6 @@ lapack_int LAPACKE_dsytrf_aa_2stage_work( int matrix_layout, char uplo, lapack_i } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,n); - lapack_int ldb_t = MAX(1,n); double* a_t = NULL; double* tb_t = NULL; /* Check leading dimension(s) */ From 5cf090f516e7ea48316901fb3e1ea4ab086db25b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Sep 2018 10:52:30 +0200 Subject: [PATCH 035/236] remove unused variable ldb_t Copied from Reference-LAPACK PR283 --- lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_2stage_work.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_2stage_work.c index 5b8010d9e..b9ba0fb56 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_2stage_work.c @@ -50,7 +50,6 @@ lapack_int LAPACKE_zhetrf_aa_2stage_work( int matrix_layout, char uplo, lapack_i } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,n); - lapack_int ldb_t = MAX(1,n); lapack_complex_double* a_t = NULL; lapack_complex_double* tb_t = NULL; /* Check leading dimension(s) */ From 094f8c3b579468636cada39ead49c43532b91b62 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Sep 2018 10:53:47 +0200 Subject: [PATCH 036/236] remove unused variable ldb_t Copied from Reference-LAPACK PR283 --- lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_2stage_work.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_2stage_work.c index f91c42257..db27e2873 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_2stage_work.c @@ -50,7 +50,6 @@ lapack_int LAPACKE_zsytrf_aa_2stage_work( int matrix_layout, char uplo, 
lapack_i } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,n); - lapack_int ldb_t = MAX(1,n); lapack_complex_double* a_t = NULL; lapack_complex_double* tb_t = NULL; /* Check leading dimension(s) */ From 30f5a69ab858c0c110f8e188d924d5fb117d3f81 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Sep 2018 14:23:31 +0200 Subject: [PATCH 037/236] Add explicit cast to silence a warning for #1710 --- interface/lapack/laswp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/lapack/laswp.c b/interface/lapack/laswp.c index ebeb103e7..0dde33ae3 100644 --- a/interface/lapack/laswp.c +++ b/interface/lapack/laswp.c @@ -97,7 +97,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint * blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, - laswp[flag], nthreads); + (int(*)())laswp[flag], nthreads); } #endif From f3c262156e88b204731c46221400d77c7b4f0c49 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Sep 2018 14:24:29 +0200 Subject: [PATCH 038/236] Add an explicit cast to silence a warning for #1710 --- interface/lapack/zlaswp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/lapack/zlaswp.c b/interface/lapack/zlaswp.c index 31e08451d..b77a40985 100644 --- a/interface/lapack/zlaswp.c +++ b/interface/lapack/zlaswp.c @@ -96,7 +96,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint * mode = BLAS_SINGLE | BLAS_COMPLEX; #endif - blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, laswp[flag], nthreads); + blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, (int(*)())laswp[flag], nthreads); } #endif From 2349e151497dc4686413d65954d5418519dfc320 Mon Sep 17 00:00:00 2001 From: Yuri Date: Sat, 15 Sep 2018 19:59:17 -0700 Subject: [PATCH 039/236] Allow to install the 'interfare64' version concurrently with the regular version --- CMakeLists.txt | 30 ++++++++++++++++++------------ cmake/fc.cmake | 5 +++++ cmake/openblas.pc.in | 3 ++- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 97c3b7777..9513488c0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,8 +15,6 @@ include(GNUInstallDirs) include(CMakePackageConfigHelpers) -set(OpenBLAS_LIBNAME openblas) - ####### if(MSVC) option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) @@ -43,6 +41,8 @@ message(WARNING "CMake support is experimental. 
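
[Editor's note] On the two laswp casts above (patches 037/038): the threaded dispatcher expects a more generic function-pointer type than the concretely typed kernel pointers stored in the laswp table (hence the cast target int(*)()), and the explicit cast at the call site is what silences the incompatible-pointer-type warning. A self-contained sketch with illustrative names, not the real prototypes:

static int swap_kernel(long n, double *a) { (void)n; (void)a; return 0; }

static int enqueue(int (*routine)(), int nthreads) {
  (void)routine; (void)nthreads;   /* a real dispatcher would store the pointer and call it later */
  return 0;
}

int main(void) {
  return enqueue((int (*)())swap_kernel, 1);
}
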
This will not produce the same M include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") +set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE}) + set(BLASDIRS interface driver/level2 driver/level3 driver/others) if (NOT DYNAMIC_ARCH) @@ -214,11 +214,15 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES # Install libraries install(TARGETS ${OpenBLAS_LIBNAME} - EXPORT "OpenBLASTargets" + EXPORT "OpenBLAS${SUFFIX64}Targets" RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) +# Install headers +set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) +set(CMAKE_INSTALL_FULL_INCLUDEDIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}) + message(STATUS "Generating openblas_config.h in ${CMAKE_INSTALL_INCLUDEDIR}") set(OPENBLAS_CONFIG_H ${CMAKE_BINARY_DIR}/openblas_config.h) @@ -266,29 +270,31 @@ if(NOT NO_LAPACKE) ADD_CUSTOM_TARGET(genlapacke COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" ) - install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) endif() include(FindPkgConfig QUIET) if(PKG_CONFIG_FOUND) - configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY) - install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) + configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY) + install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) endif() # GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". set(PN OpenBLAS) -set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}") +set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}") configure_package_config_file(cmake/${PN}Config.cmake.in - "${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake" INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR}) write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake VERSION ${${PN}_VERSION} COMPATIBILITY AnyNewerVersion) -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake - ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake DESTINATION ${CMAKECONFIG_INSTALL_DIR}) -install(EXPORT "${PN}Targets" - NAMESPACE "${PN}::" +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake + RENAME ${PN}${SUFFIX64}ConfigVersion.cmake + DESTINATION ${CMAKECONFIG_INSTALL_DIR}) +install(EXPORT "${PN}${SUFFIX64}Targets" + NAMESPACE "${PN}${SUFFIX64}::" DESTINATION ${CMAKECONFIG_INSTALL_DIR}) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 1446a900d..38d59f956 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -3,6 +3,11 @@ ## Description: Ported from portion of OpenBLAS/Makefile.system ## Sets Fortran related variables. 
+if (INTERFACE64) + set(SUFFIX64 64) + set(SUFFIX64_UNDERSCORE _64) +endif() + if (${F_COMPILER} STREQUAL "FLANG") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") if (BINARY64 AND INTERFACE64) diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index ca88a6d5f..df4b2ab06 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -1,4 +1,5 @@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@ +libsuffix=@SUFFIX64_UNDERSCORE@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ @@ -6,5 +7,5 @@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: @OPENBLAS_VERSION@ URL: https://github.com/xianyi/OpenBLAS -Libs: -L${libdir} -lopenblas +Libs: -L${libdir} -lopenblas${libsuffix} Cflags: -I${includedir} From b402626509070764b2c6e0302e19c7b779372fe0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Sep 2018 12:43:36 +0200 Subject: [PATCH 040/236] Do not use the new TLS code for non-threaded builds even if USE_TLS is set Workaround for #1761 as that exposed a problem in the new code (which was intended to speed up multithreaded code only anyway). --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 9d4ab19f5..e73d53fa2 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(USE_TLS) +#if defined(USE_TLS) && defined(SMP) #define COMPILE_TLS #if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2,20) From 1ad1e79062d40cc9445e5c2098e15b8c45081a75 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Sep 2018 18:03:43 +0200 Subject: [PATCH 041/236] Catch inadvertent USE_TLS=0 declaration for #1766 --- driver/others/memory.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/driver/others/memory.c b/driver/others/memory.c index e73d53fa2..0019253c0 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -75,6 +75,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(USE_TLS) && defined(SMP) #define COMPILE_TLS + +#if USE_TLS != 1 +#undef COMPILE_TLS +#endif + #if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2,20) #undef COMPILE_TLS From 288aeea8a285da8551c465681c7b9330a5486e7e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Sep 2018 18:08:31 +0200 Subject: [PATCH 042/236] Fix default settings - USE_TLS and USE_SIMPLE_THREADED_LEVEL3 should both be off --- Makefile.rule | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 25ed0357d..8c651412e 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -107,13 +107,13 @@ BUILD_LAPACK_DEPRECATED = 1 # BUILD_RELAPACK = 1 # If you want to use legacy threaded Level 3 implementation. -USE_SIMPLE_THREADED_LEVEL3 = 1 +# USE_SIMPLE_THREADED_LEVEL3 = 1 # If you want to use the new, still somewhat experimental code that uses # thread-local storage instead of a central memory buffer in memory.c # Note that if your system uses GLIBC, it needs to have at least glibc 2.21 # for this to work. -USE_TLS = 1 +# USE_TLS = 1 # If you want to drive whole 64bit region by BLAS. Not all Fortran # compiler supports this. 
It's safe to keep comment it out if you

From 6f77af2eef8a6ea2c5e32c66528849c319d4fb6d Mon Sep 17 00:00:00 2001
From: Elliot Saba
Date: Fri, 21 Sep 2018 09:19:51 +0000
Subject: [PATCH 043/236] Add `$(LDFLAGS)` to `$(CC)` and `$(FC)` invocations within `exports/Makefile`

---
 exports/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/exports/Makefile b/exports/Makefile
index 29075a9c2..3a5f77db3 100644
--- a/exports/Makefile
+++ b/exports/Makefile
@@ -114,9 +114,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
 endif
 ifneq (,$(filter 1 2,$(NOFORTRAN))) #only build without Fortran
- $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
+ $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
 else
- $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
+ $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
 endif
 dllinit.$(SUFFIX) : dllinit.c

From cf6df9464c4e30d844726e986fbb8834fcdb8dc8 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 22 Sep 2018 12:31:37 +0200
Subject: [PATCH 044/236] Document the stub status of the QUAD_PRECISION code (#1772)

* Document the stub status of the QUAD_PRECISION code inherited from GotoBLAS2 in response to #1769
---
 Makefile.rule | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Makefile.rule b/Makefile.rule
index 8c651412e..6522b0777 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -152,6 +152,9 @@ NO_AFFINITY = 1
 # FUNCTION_PROFILE = 1
 # Support for IEEE quad precision(it's *real* REAL*16)( under testing)
+# This option should not be used - it is a holdover from unfinished code present
+# in the original GotoBLAS2 library that may be usable as a starting point but
+# is not even expected to compile in its present form. 
# QUAD_PRECISION = 1 # Theads are still working for a while after finishing BLAS operation From 28aa94bf4be41324a46558d979e428bb4ca19a33 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Sep 2018 14:00:15 +0200 Subject: [PATCH 045/236] Include thread numbers in failure message from blas_thread_init to aid in debugging cases like #1767 --- driver/others/blas_server.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 1d7f570d8..6a25e2d07 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -582,7 +582,7 @@ int blas_thread_init(void){ if(ret!=0){ struct rlimit rlim; const char *msg = strerror(ret); - fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create: %s\n", msg); + fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %ld: %s\n", i+1,blas_num_threads,msg); #ifdef RLIMIT_NPROC if(0 == getrlimit(RLIMIT_NPROC, &rlim)) { fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC " From 7e5df34e6afede4bcdaa20866353c96ae2512052 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 25 Sep 2018 09:41:58 +0200 Subject: [PATCH 046/236] Convert fldmia/fstmia instructions to UAL syntax for clang7 fixes #1774 --- kernel/arm/asum_vfp.S | 76 +++++----- kernel/arm/axpy_vfp.S | 124 +++++++-------- kernel/arm/ccopy_vfp.S | 28 ++-- kernel/arm/cdot_vfp.S | 40 ++--- kernel/arm/cgemm_kernel_2x2_vfp.S | 44 +++--- kernel/arm/cgemm_kernel_2x2_vfpv3.S | 64 ++++---- kernel/arm/cgemm_tcopy_2_vfp.S | 20 +-- kernel/arm/cgemv_n_vfp.S | 32 ++-- kernel/arm/cgemv_t_vfp.S | 40 ++--- kernel/arm/ctrmm_kernel_2x2_vfp.S | 32 ++-- kernel/arm/ctrmm_kernel_2x2_vfpv3.S | 52 +++---- kernel/arm/dcopy_vfp.S | 28 ++-- kernel/arm/ddot_vfp.S | 40 ++--- kernel/arm/dgemm_kernel_4x4_vfpv3.S | 8 +- kernel/arm/dgemm_tcopy_4_vfp.S | 60 ++++---- kernel/arm/dtrmm_kernel_4x4_vfpv3.S | 26 ++-- kernel/arm/gemv_n_vfp.S | 100 ++++++------- kernel/arm/gemv_n_vfpv3.S | 120 +++++++-------- kernel/arm/gemv_t_vfp.S | 168 ++++++++++----------- kernel/arm/gemv_t_vfpv3.S | 168 ++++++++++----------- kernel/arm/iamax_vfp.S | 32 ++-- kernel/arm/nrm2_vfp.S | 16 +- kernel/arm/nrm2_vfpv3.S | 16 +- kernel/arm/rot_vfp.S | 224 ++++++++++++++-------------- kernel/arm/scal_vfp.S | 76 +++++----- kernel/arm/scopy_vfp.S | 32 ++-- kernel/arm/sdot_vfp.S | 72 ++++----- kernel/arm/sgemm_kernel_4x2_vfp.S | 4 +- kernel/arm/sgemm_kernel_4x4_vfpv3.S | 40 ++--- kernel/arm/sgemm_tcopy_4_vfp.S | 70 ++++----- kernel/arm/strmm_kernel_4x2_vfp.S | 4 +- kernel/arm/strmm_kernel_4x4_vfpv3.S | 34 ++--- kernel/arm/swap_vfp.S | 112 +++++++------- kernel/arm/zcopy_vfp.S | 28 ++-- kernel/arm/zdot_vfp.S | 40 ++--- kernel/arm/zgemm_kernel_2x2_vfp.S | 24 +-- kernel/arm/zgemm_kernel_2x2_vfpv3.S | 24 +-- kernel/arm/zgemm_tcopy_2_vfp.S | 20 +-- kernel/arm/zgemv_n_vfp.S | 32 ++-- kernel/arm/zgemv_t_vfp.S | 40 ++--- 40 files changed, 1105 insertions(+), 1105 deletions(-) diff --git a/kernel/arm/asum_vfp.S b/kernel/arm/asum_vfp.S index 5b08e5028..9a75885a2 100644 --- a/kernel/arm/asum_vfp.S +++ b/kernel/arm/asum_vfp.S @@ -58,11 +58,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 - fldmiad X!, { d6 - d7 } + vldmia.f64 X!, { d6 - d7 } vabs.f64 d6, d6 vadd.f64 d1 , d1, d5 vabs.f64 d7, d7 @@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F1 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 @@ -82,22 +82,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X @@ -107,7 +107,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X @@ -118,11 +118,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 - fldmias X!, { s6 - s7 } + vldmia.f32 X!, { s6 - s7 } vabs.f32 s6, s6 vadd.f32 s1 , s1, s5 vabs.f32 s7, s7 @@ -133,7 +133,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 @@ -142,22 +142,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4 - fldmias X, { s4 } + vldmia.f32 X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X - fldmias X, { s4 } + vldmia.f32 X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X - fldmias X, { s4 } + vldmia.f32 X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X - fldmias X, { s4 } + vldmia.f32 X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X @@ -167,7 +167,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s4 } + vldmia.f32 X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X @@ -184,11 +184,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 - fldmiad X!, { d6 - d7 } + vldmia.f64 X!, { d6 - d7 } vabs.f64 d6, d6 vadd.f64 d1 , d1, d5 vabs.f64 d7, d7 @@ -196,11 +196,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vadd.f64 d1 , d1, d7 pld [ X, #X_PRE ] - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 - fldmiad X!, { d6 - d7 } + vldmia.f64 X!, { d6 - d7 } vabs.f64 d6, d6 vadd.f64 d1 , d1, d5 vabs.f64 d7, d7 @@ -212,11 +212,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 @@ -226,28 +226,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S4 - fldmiad X, { d4 -d5 } + vldmia.f64 X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 vadd.f64 d0 , d0, d5 add X, X, INC_X - fldmiad X, { d4 -d5 } + vldmia.f64 X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 vadd.f64 d0 , d0, d5 add X, X, INC_X - fldmiad X, { d4 -d5 } + vldmia.f64 X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 vadd.f64 d0 , d0, d5 add X, X, INC_X - fldmiad X, { d4 -d5 } + vldmia.f64 X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 @@ -259,7 +259,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X, { d4 -d5 } + vldmia.f64 X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 @@ -273,22 +273,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 - fldmias X!, { s6 - s7 } + vldmia.f32 X!, { s6 - s7 } vabs.f32 s6, s6 vadd.f32 s1 , s1, s5 vabs.f32 s7, s7 vadd.f32 s0 , s0, s6 vadd.f32 s1 , s1, s7 - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 - fldmias X!, { s6 - s7 } + vldmia.f32 X!, { s6 - s7 } vabs.f32 s6, s6 vadd.f32 s1 , s1, s5 vabs.f32 s7, s7 @@ -300,11 +300,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 @@ -313,28 +313,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4 - fldmias X, { s4 -s5 } + vldmia.f32 X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 vadd.f32 s0 , s0, s5 add X, X, INC_X - fldmias X, { s4 -s5 } + vldmia.f32 X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 vadd.f32 s0 , s0, s5 add X, X, INC_X - fldmias X, { s4 -s5 } + vldmia.f32 X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 vadd.f32 s0 , s0, s5 add X, X, INC_X - fldmias X, { s4 -s5 } + vldmia.f32 X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 @@ -346,7 +346,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s4 -s5 } + vldmia.f32 X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 diff --git a/kernel/arm/axpy_vfp.S b/kernel/arm/axpy_vfp.S index c35b8aece..39c9ac233 100644 --- a/kernel/arm/axpy_vfp.S +++ b/kernel/arm/axpy_vfp.S @@ -146,17 +146,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X!, { d4 - d7 } + vldmia.f64 X!, { d4 - d7 } pld [ Y, #X_PRE ] - fldmiad Y , { d8 - d11 } + vldmia.f64 Y , { d8 - d11 } fmacd d8 , d0, d4 - fstmiad Y!, { d8 } + vstmia.f64 Y!, { d8 } fmacd d9 , d0, d5 - fstmiad Y!, { d9 } + vstmia.f64 Y!, { d9 } fmacd d10, d0, d6 - fstmiad Y!, { d10 } + vstmia.f64 Y!, { d10 } fmacd d11, d0, d7 - fstmiad Y!, { d11 } + vstmia.f64 Y!, { d11 } .endm @@ -164,19 +164,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F1 - fldmiad X!, { d4 } - fldmiad Y , { d8 } + vldmia.f64 X!, { d4 } + vldmia.f64 Y , { d8 } fmacd d8 , d0, d4 - fstmiad Y!, { d8 } + vstmia.f64 Y!, { d8 } .endm .macro KERNEL_S1 - fldmiad X , { d4 } - fldmiad Y , { d8 } + vldmia.f64 X , { d4 } + vldmia.f64 Y , { d8 } fmacd d8 , d0, d4 - fstmiad Y , { d8 } + vstmia.f64 Y , { d8 } add X, X, INC_X add Y, Y, INC_Y @@ -186,16 +186,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 - fldmias X!, { s4 - s7 } - fldmias Y , { s8 - s11 } + vldmia.f32 X!, { s4 - s7 } + vldmia.f32 Y , { s8 - s11 } fmacs s8 , s0, s4 - fstmias Y!, { s8 } + vstmia.f32 Y!, { s8 } fmacs s9 , s0, s5 - fstmias Y!, { s9 } + vstmia.f32 Y!, { s9 } fmacs s10, s0, s6 - fstmias Y!, { s10 } + vstmia.f32 Y!, { s10 } fmacs s11, s0, s7 - fstmias Y!, { s11 } + vstmia.f32 Y!, { s11 } .endm @@ -203,19 +203,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 } - fldmias Y , { s8 } + vldmia.f32 X!, { s4 } + vldmia.f32 Y , { s8 } fmacs s8 , s0, s4 - fstmias Y!, { s8 } + vstmia.f32 Y!, { s8 } .endm .macro KERNEL_S1 - fldmias X , { s4 } - fldmias Y , { s8 } + vldmia.f32 X , { s4 } + vldmia.f32 Y , { s8 } fmacs s8 , s0, s4 - fstmias Y , { s8 } + vstmia.f32 Y , { s8 } add X, X, INC_X add Y, Y, INC_Y @@ -231,42 +231,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X!, { d4 - d7 } + vldmia.f64 X!, { d4 - d7 } pld [ Y, #X_PRE ] - fldmiad Y , { d8 - d11 } + vldmia.f64 Y , { d8 - d11 } FMAC_R1 d8 , d0, d4 FMAC_R2 d8 , d1, d5 FMAC_I1 d9 , d0, d5 FMAC_I2 d9 , d1, d4 - fstmiad Y!, { d8 } - fstmiad Y!, { d9 } + vstmia.f64 Y!, { d8 } + vstmia.f64 Y!, { d9 } FMAC_R1 d10, d0, d6 FMAC_R2 d10, d1, d7 FMAC_I1 d11, d0, d7 FMAC_I2 d11, d1, d6 - fstmiad Y!, { d10 } - fstmiad Y!, { d11 } + vstmia.f64 Y!, { d10 } + vstmia.f64 Y!, { d11 } pld [ X, #X_PRE ] - fldmiad X!, { d4 - d7 } + vldmia.f64 X!, { d4 - d7 } pld [ Y, #X_PRE ] - fldmiad Y , { d8 - d11 } + vldmia.f64 Y , { d8 - d11 } FMAC_R1 d8 , d0, d4 FMAC_R2 d8 , d1, d5 FMAC_I1 d9 , d0, d5 FMAC_I2 d9 , d1, d4 - fstmiad Y!, { d8 } - fstmiad Y!, { d9 } + vstmia.f64 Y!, { d8 } + vstmia.f64 Y!, { d9 } FMAC_R1 d10, d0, d6 FMAC_R2 d10, d1, d7 FMAC_I1 d11, d0, d7 FMAC_I2 d11, d1, d6 - fstmiad Y!, { d10 } - fstmiad Y!, { d11 } + vstmia.f64 Y!, { d10 } + vstmia.f64 Y!, { d11 } @@ -277,15 +277,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 - d5 } - fldmiad Y , { d8 - d9 } + vldmia.f64 X!, { d4 - d5 } + vldmia.f64 Y , { d8 - d9 } FMAC_R1 d8 , d0, d4 FMAC_R2 d8 , d1, d5 FMAC_I1 d9 , d0, d5 FMAC_I2 d9 , d1, d4 - fstmiad Y!, { d8 } - fstmiad Y!, { d9 } + vstmia.f64 Y!, { d8 } + vstmia.f64 Y!, { d9 } @@ -293,14 +293,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X , { d4 - d5 } - fldmiad Y , { d8 - d9 } + vldmia.f64 X , { d4 - d5 } + vldmia.f64 Y , { d8 - d9 } FMAC_R1 d8 , d0, d4 FMAC_R2 d8 , d1, d5 FMAC_I1 d9 , d0, d5 FMAC_I2 d9 , d1, d4 - fstmiad Y , { d8 - d9 } + vstmia.f64 Y , { d8 - d9 } add X, X, INC_X add Y, Y, INC_Y @@ -314,40 +314,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F4 pld [ X, #X_PRE ] - fldmias X!, { s4 - s7 } + vldmia.f32 X!, { s4 - s7 } pld [ Y, #X_PRE ] - fldmias Y , { s8 - s11 } + vldmia.f32 Y , { s8 - s11 } FMAC_R1 s8 , s0, s4 FMAC_R2 s8 , s1, s5 FMAC_I1 s9 , s0, s5 FMAC_I2 s9 , s1, s4 - fstmias Y!, { s8 } - fstmias Y!, { s9 } + vstmia.f32 Y!, { s8 } + vstmia.f32 Y!, { s9 } FMAC_R1 s10, s0, s6 FMAC_R2 s10, s1, s7 FMAC_I1 s11, s0, s7 FMAC_I2 s11, s1, s6 - fstmias Y!, { s10 } - fstmias Y!, { s11 } + vstmia.f32 Y!, { s10 } + vstmia.f32 Y!, { s11 } - fldmias X!, { s4 - s7 } - fldmias Y , { s8 - s11 } + vldmia.f32 X!, { s4 - s7 } + vldmia.f32 Y , { s8 - s11 } FMAC_R1 s8 , s0, s4 FMAC_R2 s8 , s1, s5 FMAC_I1 s9 , s0, s5 FMAC_I2 s9 , s1, s4 - fstmias Y!, { s8 } - fstmias Y!, { s9 } + vstmia.f32 Y!, { s8 } + vstmia.f32 Y!, { s9 } FMAC_R1 s10, s0, s6 FMAC_R2 s10, s1, s7 FMAC_I1 s11, s0, s7 FMAC_I2 s11, s1, s6 - fstmias Y!, { s10 } - fstmias Y!, { s11 } + vstmia.f32 Y!, { s10 } + vstmia.f32 Y!, { s11 } @@ -358,15 +358,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 - s5 } - fldmias Y , { s8 - s9 } + vldmia.f32 X!, { s4 - s5 } + vldmia.f32 Y , { s8 - s9 } FMAC_R1 s8 , s0, s4 FMAC_R2 s8 , s1, s5 FMAC_I1 s9 , s0, s5 FMAC_I2 s9 , s1, s4 - fstmias Y!, { s8 } - fstmias Y!, { s9 } + vstmia.f32 Y!, { s8 } + vstmia.f32 Y!, { s9 } @@ -374,14 +374,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X , { s4 - s5 } - fldmias Y , { s8 - s9 } + vldmia.f32 X , { s4 - s5 } + vldmia.f32 Y , { s8 - s9 } FMAC_R1 s8 , s0, s4 FMAC_R2 s8 , s1, s5 FMAC_I1 s9 , s0, s5 FMAC_I2 s9 , s1, s4 - fstmias Y , { s8 - s9 } + vstmia.f32 Y , { s8 - s9 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/ccopy_vfp.S b/kernel/arm/ccopy_vfp.S index 874fcab9c..fbb32b43c 100644 --- a/kernel/arm/ccopy_vfp.S +++ b/kernel/arm/ccopy_vfp.S @@ -65,15 +65,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_F4 pld [ X, #X_PRE ] - fldmias X!, { s0 - s7 } - fstmias Y!, { s0 - s7 } + vldmia.f32 X!, { s0 - s7 } + vstmia.f32 Y!, { s0 - s7 } .endm .macro COPY_F1 - fldmias X!, { s0 - s1 } - fstmias Y!, { s0 - s1 } + vldmia.f32 X!, { s0 - s1 } + vstmia.f32 Y!, { s0 - s1 } .endm @@ -83,23 +83,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S4 nop - fldmias X, { s0 - s1 } - fstmias Y, { s0 - s1 } + vldmia.f32 X, { s0 - s1 } + vstmia.f32 Y, { s0 - s1 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s2 - s3 } - fstmias Y, { s2 - s3 } + vldmia.f32 X, { s2 - s3 } + vstmia.f32 Y, { s2 - s3 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s0 - s1 } - fstmias Y, { s0 - s1 } + vldmia.f32 X, { s0 - s1 } + vstmia.f32 Y, { s0 - s1 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s2 - s3 } - fstmias Y, { s2 - s3 } + vldmia.f32 X, { s2 - s3 } + vstmia.f32 Y, { s2 - s3 } add X, X, INC_X add Y, Y, INC_Y @@ -108,8 +108,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S1 - fldmias X, { s0 - s1 } - fstmias Y, { s0 - s1 } + vldmia.f32 X, { s0 - s1 } + vstmia.f32 Y, { s0 - s1 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/cdot_vfp.S b/kernel/arm/cdot_vfp.S index fd86a37b0..85246d734 100644 --- a/kernel/arm/cdot_vfp.S +++ b/kernel/arm/cdot_vfp.S @@ -76,30 +76,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmias X!, { s4 - s5 } - fldmias Y!, { s8 - s9 } + vldmia.f32 X!, { s4 - s5 } + vldmia.f32 Y!, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 - fldmias X!, { s6 - s7 } + vldmia.f32 X!, { s6 - s7 } fmacs s2 , s5, s9 fmacs s3 , s5, s8 - fldmias Y!, { s10 - s11 } + vldmia.f32 Y!, { s10 - s11 } fmacs s0 , s6, s10 fmacs s1 , s6, s11 fmacs s2 , s7, s11 fmacs s3 , s7, s10 - fldmias X!, { s4 - s5 } - fldmias Y!, { s8 - s9 } + vldmia.f32 X!, { s4 - s5 } + vldmia.f32 Y!, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 - fldmias X!, { s6 - s7 } + vldmia.f32 X!, { s6 - s7 } fmacs s2 , s5, s9 fmacs s3 , s5, s8 - fldmias Y!, { s10 - s11 } + vldmia.f32 Y!, { s10 - s11 } fmacs s0 , s6, s10 fmacs s1 , s6, s11 fmacs s2 , s7, s11 @@ -109,8 +109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 - s5 } - fldmias Y!, { s8 - s9 } + vldmia.f32 X!, { s4 - s5 } + vldmia.f32 Y!, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 @@ -125,8 +125,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. nop - fldmias X, { s4 - s5 } - fldmias Y, { s8 - s9 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 @@ -134,8 +134,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s4 - s5 } - fldmias Y, { s8 - s9 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 @@ -143,8 +143,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s4 - s5 } - fldmias Y, { s8 - s9 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 @@ -152,8 +152,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s4 - s5 } - fldmias Y, { s8 - s9 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 @@ -166,8 +166,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s4 - s5 } - fldmias Y, { s8 - s9 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 diff --git a/kernel/arm/cgemm_kernel_2x2_vfp.S b/kernel/arm/cgemm_kernel_2x2_vfp.S index 71bc50efd..d2591919e 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfp.S +++ b/kernel/arm/cgemm_kernel_2x2_vfp.S @@ -165,9 +165,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_I pld [ AO, #A_PRE ] - fldmias AO!, { s0 - s3 } + vldmia.f32 AO!, { s0 - s3 } pld [ BO, #B_PRE ] - fldmias BO!, { s4 - s7 } + vldmia.f32 BO!, { s4 - s7 } fmuls s8 , s0, s4 @@ -197,9 +197,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M1 pld [ AO, #A_PRE ] - fldmias AO!, { s0 - s3 } + vldmia.f32 AO!, { s0 - s3 } pld [ BO, #B_PRE ] - fldmias BO!, { s4 - s7 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -225,8 +225,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M2 - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -254,8 +254,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL2x2_E - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -282,8 +282,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_SUB - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -317,7 +317,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s7 } + vldmia.f32 CO1, { s4 - s7 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 @@ -329,9 +329,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } - fldmias CO2, { s4 - s7 } + vldmia.f32 CO2, { s4 - s7 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 @@ -343,7 +343,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias CO2, { s4 - s7 } + vstmia.f32 CO2, { s4 - s7 } add CO1, CO1, #16 @@ -500,23 +500,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s5 } + vldmia.f32 CO1, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } - fldmias CO2, { s4 - s5 } + vldmia.f32 CO2, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias CO2, { s4 - s5 } + vstmia.f32 CO2, { s4 - s5 } add CO1, CO1, #8 @@ -671,7 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s7 } + vldmia.f32 CO1, { s4 - s7 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 @@ -683,7 +683,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } add CO1, CO1, #16 @@ -800,14 +800,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s5 } + vldmia.f32 CO1, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } add CO1, CO1, #8 diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S index 9d473ad78..5ebc904ac 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -182,30 +182,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL2x2_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] - fldmias AO!, { s0 - s1 } - fldmias BO!, { s8 - s9 } + vldmia.f32 AO!, { s0 - s1 } + vldmia.f32 BO!, { s8 - s9 } fmuls s16 , s0, s8 fmuls s24 , s1, s9 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmuls s17 , s0, s9 fmuls s25 , s1, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmuls s18 , s2, s8 fmuls s26 , s3, s9 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmuls s19 , s2, s9 fmuls s27 , s3, s8 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmuls s20 , s0, s10 fmuls s28 , s1, s11 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmuls s21 , s0, s11 fmuls s29 , s1, s10 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmuls s22 , s2, s10 fmuls s30 , s3, s11 fmuls s23 , s2, s11 @@ -218,17 +218,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M1 fmacs s16 , s0, s8 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmacs s24 , s1, s9 fmacs s17 , s0, s9 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmacs s25 , s1, s8 fmacs s18 , s2, s8 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmacs s26 , s3, s9 fmacs s19 , s2, s9 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmacs s27 , s3, s8 fmacs s20 , s0, s10 @@ -250,19 +250,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ BO , #B_PRE ] fmacs s24 , s5, s13 fmacs s17 , s4, s13 - fldmias AO!, { s0 - s1 } + vldmia.f32 AO!, { s0 - s1 } fmacs s25 , s5, s12 fmacs s18 , s6, s12 fmacs s26 , s7, s13 - fldmias BO!, { s8 - s9 } + vldmia.f32 BO!, { s8 - s9 } fmacs s19 , s6, s13 fmacs s27 , s7, s12 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmacs s20 , s4, s14 fmacs s28 , s5, s15 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmacs s21 , s4, s15 fmacs s29 , s5, s14 @@ -300,16 +300,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_SUB - fldmias AO!, { s0 - s1 } - fldmias BO!, { s8 - s9 } + vldmia.f32 AO!, { s0 - s1 } + vldmia.f32 BO!, { s8 - s9 } fmacs s16 , s0, s8 fmacs s24 , s1, s9 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmacs s17 , s0, s9 fmacs s25 , s1, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmacs s18 , s2, s8 fmacs s26 , s3, s9 fmacs s19 , s2, s9 @@ -338,8 +338,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s7 } - fldmias CO2, { s8 - s11 } + vldmia.f32 CO1, { s4 - s7 } + vldmia.f32 CO2, { s8 - s11 } FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 @@ -370,8 +370,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s10, s1 , s23 FMAC_I2 s11, s1 , s22 - fstmias CO1, { s4 - s7 } - fstmias CO2, { s8 - s11 } + vstmia.f32 CO1, { s4 - s7 } + vstmia.f32 CO2, { s8 - s11 } add CO1, CO1, #16 @@ -534,8 +534,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s5 } - fldmias CO2, { s8 - s9 } + vldmia.f32 CO1, { s4 - s5 } + vldmia.f32 CO2, { s8 - s9 } FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 @@ -552,8 +552,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 s8 , s1 , s21 FMAC_I2 s9 , s1 , s20 - fstmias CO1, { s4 - s5 } - fstmias CO2, { s8 - s9 } + vstmia.f32 CO1, { s4 - s5 } + vstmia.f32 CO2, { s8 - s9 } add CO1, CO1, #8 @@ -716,7 +716,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s7 } + vldmia.f32 CO1, { s4 - s7 } FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 @@ -733,7 +733,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s19 FMAC_I2 s7 , s1 , s18 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } add CO1, CO1, #16 @@ -851,7 +851,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s5 } + vldmia.f32 CO1, { s4 - s5 } FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 @@ -861,7 +861,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s4 , s1 , s17 FMAC_I2 s5 , s1 , s16 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } add CO1, CO1, #8 diff --git a/kernel/arm/cgemm_tcopy_2_vfp.S b/kernel/arm/cgemm_tcopy_2_vfp.S index 9036b994d..7b3ae18d4 100644 --- a/kernel/arm/cgemm_tcopy_2_vfp.S +++ b/kernel/arm/cgemm_tcopy_2_vfp.S @@ -73,12 +73,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **************************************************************************************/ .macro COPY2x2 - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } add r3, AO1, LDA - fldmias r3, { s4 - s7 } + vldmia.f32 r3, { s4 - s7 } - fstmias BO1, { s0 - s7 } + vstmia.f32 BO1, { s0 - s7 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -86,12 +86,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x2 - fldmias AO1, { s0 -s1 } + vldmia.f32 AO1, { s0 -s1 } add r3, AO1, LDA - fldmias r3, { s2 - s3 } + vldmia.f32 r3, { s2 - s3 } - fstmias BO2, { s0 - s3 } + vstmia.f32 BO2, { s0 - s3 } add AO1, AO1, #8 add BO2, BO2, #16 @@ -100,9 +100,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*************************************************************************************************************************/ .macro COPY2x1 - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } - fstmias BO1, { s0 - s3 } + vstmia.f32 BO1, { s0 - s3 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -110,9 +110,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x1 - fldmias AO1, { s0 - s1 } + vldmia.f32 AO1, { s0 - s1 } - fstmias BO2, { s0 - s1 } + vstmia.f32 BO2, { s0 - s1 } add AO1, AO1, #8 add BO2, BO2, #8 diff --git a/kernel/arm/cgemv_n_vfp.S b/kernel/arm/cgemv_n_vfp.S index 62ee33bb9..d6b18c796 100644 --- a/kernel/arm/cgemv_n_vfp.S +++ b/kernel/arm/cgemv_n_vfp.S @@ -201,7 +201,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias YO, { s4 - s7 } + vldmia.f32 YO, { s4 - s7 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 @@ -213,9 +213,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias YO!, { s4 - s7 } + vstmia.f32 YO!, { s4 - s7 } - fldmias YO, { s4 - s7 } + vldmia.f32 YO, { s4 - s7 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 @@ -227,7 +227,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias YO!, { s4 - s7 } + vstmia.f32 YO!, { s4 - s7 } .endm @@ -266,14 +266,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, #8 @@ -349,47 +349,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, INC_Y - fldmias YO, { s6 - s7 } + vldmia.f32 YO, { s6 - s7 } FMAC_R1 s6 , s0 , s10 FMAC_I1 s7 , s0 , s11 FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias YO, { s6 - s7 } + vstmia.f32 YO, { s6 - s7 } add YO, YO, INC_Y - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, INC_Y - fldmias YO, { s6 - s7 } + vldmia.f32 YO, { s6 - s7 } FMAC_R1 s6 , s0 , s14 FMAC_I1 s7 , s0 , s15 FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias YO, { s6 - s7 } + vstmia.f32 YO, { s6 - s7 } add YO, YO, INC_Y @@ -430,14 +430,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, INC_Y diff --git a/kernel/arm/cgemv_t_vfp.S b/kernel/arm/cgemv_t_vfp.S index c07b6d6f8..6833df7d1 100644 --- a/kernel/arm/cgemv_t_vfp.S +++ b/kernel/arm/cgemv_t_vfp.S @@ -150,9 +150,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmias XO! , { s2 - s3 } - fldmias AO1!, { s4 - s5 } - fldmias AO2!, { s8 - s9 } + vldmia.f32 XO! , { s2 - s3 } + vldmia.f32 AO1!, { s4 - s5 } + vldmia.f32 AO2!, { s8 - s9 } fmacs s12 , s4 , s2 fmacs s13 , s4 , s3 @@ -168,7 +168,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F2 - fldmias YO, { s4 - s7 } + vldmia.f32 YO, { s4 - s7 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 @@ -180,7 +180,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias YO!, { s4 - s7 } + vstmia.f32 YO!, { s4 - s7 } .endm @@ -204,8 +204,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmias XO! , { s2 - s3 } - fldmias AO1!, { s4 - s5 } + vldmia.f32 XO! , { s2 - s3 } + vldmia.f32 AO1!, { s4 - s5 } fmacs s12 , s4 , s2 fmacs s13 , s4 , s3 @@ -216,14 +216,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias YO!, { s4 - s5 } + vstmia.f32 YO!, { s4 - s5 } .endm @@ -249,9 +249,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S2X1 - fldmias XO , { s2 - s3 } - fldmias AO1!, { s4 - s5 } - fldmias AO2!, { s8 - s9 } + vldmia.f32 XO , { s2 - s3 } + vldmia.f32 AO1!, { s4 - s5 } + vldmia.f32 AO2!, { s8 - s9 } fmacs s12 , s4 , s2 fmacs s13 , s4 , s3 @@ -269,25 +269,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S2 - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, INC_Y - fldmias YO, { s6 - s7 } + vldmia.f32 YO, { s6 - s7 } FMAC_R1 s6 , s0 , s14 FMAC_I1 s7 , s0 , s15 FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias YO, { s6 - s7 } + vstmia.f32 YO, { s6 - s7 } add YO, YO, INC_Y @@ -313,8 +313,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmias XO , { s2 - s3 } - fldmias AO1!, { s4 - s5 } + vldmia.f32 XO , { s2 - s3 } + vldmia.f32 AO1!, { s4 - s5 } fmacs s12 , s4 , s2 fmacs s13 , s4 , s3 @@ -327,14 +327,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, INC_Y diff --git a/kernel/arm/ctrmm_kernel_2x2_vfp.S b/kernel/arm/ctrmm_kernel_2x2_vfp.S index aae890ea9..ca1a512fb 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S @@ -165,9 +165,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_I pld [ AO, #A_PRE ] - fldmias AO!, { s0 - s3 } + vldmia.f32 AO!, { s0 - s3 } pld [ BO, #B_PRE ] - fldmias BO!, { s4 - s7 } + vldmia.f32 BO!, { s4 - s7 } fmuls s8 , s0, s4 @@ -197,9 +197,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M1 pld [ AO, #A_PRE ] - fldmias AO!, { s0 - s3 } + vldmia.f32 AO!, { s0 - s3 } pld [ BO, #B_PRE ] - fldmias BO!, { s4 - s7 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -225,8 +225,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M2 - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -254,8 +254,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_E - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -282,8 +282,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_SUB - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -331,7 +331,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } flds s4, FP_ZERO vmov.f32 s5, s4 @@ -348,7 +348,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias CO2, { s4 - s7 } + vstmia.f32 CO2, { s4 - s7 } add CO1, CO1, #16 @@ -513,7 +513,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } flds s4, FP_ZERO vmov.f32 s5, s4 @@ -523,7 +523,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias CO2, { s4 - s5 } + vstmia.f32 CO2, { s4 - s5 } add CO1, CO1, #8 @@ -693,7 +693,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } add CO1, CO1, #16 @@ -818,7 +818,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } add CO1, CO1, #8 diff --git a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S index 79e7ed07f..d75fb7735 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S @@ -170,30 +170,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] - fldmias AO!, { s0 - s1 } - fldmias BO!, { s8 - s9 } + vldmia.f32 AO!, { s0 - s1 } + vldmia.f32 BO!, { s8 - s9 } fmuls s16 , s0, s8 fmuls s24 , s1, s9 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmuls s17 , s0, s9 fmuls s25 , s1, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmuls s18 , s2, s8 fmuls s26 , s3, s9 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmuls s19 , s2, s9 fmuls s27 , s3, s8 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmuls s20 , s0, s10 fmuls s28 , s1, s11 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmuls s21 , s0, s11 fmuls s29 , s1, s10 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmuls s22 , s2, s10 fmuls s30 , s3, s11 fmuls s23 , s2, s11 @@ -206,17 +206,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M1 fmacs s16 , s0, s8 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmacs s24 , s1, s9 fmacs s17 , s0, s9 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmacs s25 , s1, s8 fmacs s18 , s2, s8 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmacs s26 , s3, s9 fmacs s19 , s2, s9 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmacs s27 , s3, s8 fmacs s20 , s0, s10 @@ -238,19 +238,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ BO , #B_PRE ] fmacs s24 , s5, s13 fmacs s17 , s4, s13 - fldmias AO!, { s0 - s1 } + vldmia.f32 AO!, { s0 - s1 } fmacs s25 , s5, s12 fmacs s18 , s6, s12 fmacs s26 , s7, s13 - fldmias BO!, { s8 - s9 } + vldmia.f32 BO!, { s8 - s9 } fmacs s19 , s6, s13 fmacs s27 , s7, s12 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmacs s20 , s4, s14 fmacs s28 , s5, s15 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmacs s21 , s4, s15 fmacs s29 , s5, s14 @@ -288,16 +288,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_SUB - fldmias AO!, { s0 - s1 } - fldmias BO!, { s8 - s9 } + vldmia.f32 AO!, { s0 - s1 } + vldmia.f32 BO!, { s8 - s9 } fmacs s16 , s0, s8 fmacs s24 , s1, s9 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmacs s17 , s0, s9 fmacs s25 , s1, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmacs s18 , s2, s8 fmacs s26 , s3, s9 fmacs s19 , s2, s9 @@ -354,8 +354,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 s10, s1 , s23 FMAC_I2 s11, s1 , s22 - fstmias CO1, { s4 - s7 } - fstmias CO2, { s8 - s11 } + vstmia.f32 CO1, { s4 - s7 } + vstmia.f32 CO2, { s8 - s11 } add CO1, CO1, #16 @@ -532,8 +532,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s8 , s1 , s21 FMAC_I2 s9 , s1 , s20 - fstmias CO1, { s4 - s5 } - fstmias CO2, { s8 - s9 } + vstmia.f32 CO1, { s4 - s5 } + vstmia.f32 CO2, { s8 - s9 } add CO1, CO1, #8 @@ -710,7 +710,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s19 FMAC_I2 s7 , s1 , s18 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } add CO1, CO1, #16 @@ -835,7 +835,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s4 , s1 , s17 FMAC_I2 s5 , s1 , s16 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } add CO1, CO1, #8 diff --git a/kernel/arm/dcopy_vfp.S b/kernel/arm/dcopy_vfp.S index da239924a..7ee52af88 100644 --- a/kernel/arm/dcopy_vfp.S +++ b/kernel/arm/dcopy_vfp.S @@ -65,15 +65,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_F4 pld [ X, #X_PRE ] - fldmiad X!, { d0 - d3 } - fstmiad Y!, { d0 - d3 } + vldmia.f64 X!, { d0 - d3 } + vstmia.f64 Y!, { d0 - d3 } .endm .macro COPY_F1 - fldmiad X!, { d0 } - fstmiad Y!, { d0 } + vldmia.f64 X!, { d0 } + vstmia.f64 Y!, { d0 } .endm @@ -83,23 +83,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S4 nop - fldmiad X, { d0 } - fstmiad Y, { d0 } + vldmia.f64 X, { d0 } + vstmia.f64 Y, { d0 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d1 } - fstmiad Y, { d1 } + vldmia.f64 X, { d1 } + vstmia.f64 Y, { d1 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d0 } - fstmiad Y, { d0 } + vldmia.f64 X, { d0 } + vstmia.f64 Y, { d0 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d1 } - fstmiad Y, { d1 } + vldmia.f64 X, { d1 } + vstmia.f64 Y, { d1 } add X, X, INC_X add Y, Y, INC_Y @@ -108,8 +108,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S1 - fldmiad X, { d0 } - fstmiad Y, { d0 } + vldmia.f64 X, { d0 } + vstmia.f64 Y, { d0 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/ddot_vfp.S b/kernel/arm/ddot_vfp.S index cc2e485b7..4dff5a3e1 100644 --- a/kernel/arm/ddot_vfp.S +++ b/kernel/arm/ddot_vfp.S @@ -67,26 +67,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X!, { d8 } + vldmia.f64 X!, { d8 } pld [ Y, #X_PRE ] - fldmiad Y!, { d4 } - fldmiad Y!, { d5 } + vldmia.f64 Y!, { d4 } + vldmia.f64 Y!, { d5 } fmacd d0 , d4, d8 - fldmiad X!, { d9 } - fldmiad Y!, { d6 } + vldmia.f64 X!, { d9 } + vldmia.f64 Y!, { d6 } fmacd d1 , d5, d9 - fldmiad X!, { d10 } - fldmiad X!, { d11 } + vldmia.f64 X!, { d10 } + vldmia.f64 X!, { d11 } fmacd d0 , d6, d10 - fldmiad Y!, { d7 } + vldmia.f64 Y!, { d7 } fmacd d1 , d7, d11 .endm .macro KERNEL_F1 - fldmiad X!, { d4 } - fldmiad Y!, { d8 } + vldmia.f64 X!, { d4 } + vldmia.f64 Y!, { d8 } fmacd d0 , d4, d8 .endm @@ -97,26 +97,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S4 nop - fldmiad X, { d4 } - fldmiad Y, { d8 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d8 } add X, X, INC_X add Y, Y, INC_Y fmacd d0 , d4, d8 - fldmiad X, { d5 } - fldmiad Y, { d9 } + vldmia.f64 X, { d5 } + vldmia.f64 Y, { d9 } add X, X, INC_X add Y, Y, INC_Y fmacd d1 , d5, d9 - fldmiad X, { d6 } - fldmiad Y, { d10 } + vldmia.f64 X, { d6 } + vldmia.f64 Y, { d10 } add X, X, INC_X add Y, Y, INC_Y fmacd d0 , d6, d10 - fldmiad X, { d7 } - fldmiad Y, { d11 } + vldmia.f64 X, { d7 } + vldmia.f64 Y, { d11 } add X, X, INC_X add Y, Y, INC_Y fmacd d1 , d7, d11 @@ -126,8 +126,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X, { d4 } - fldmiad Y, { d8 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d8 } add X, X, INC_X fmacd d0 , d4, d8 add Y, Y, INC_Y diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S index 1744b54d8..d852c2dad 100644 --- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -331,7 +331,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add r4 , CO2, r3 pld [ CO2 , #C_PRE ] - fldmiad CO1, { d8 - d11 } + vldmia.f64 CO1, { d8 - d11 } pld [ r4 , #C_PRE ] fmacd d8 , d0 , d16 @@ -352,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacd d15, d0 , d23 fstd d11, [CO1, #24 ] - fldmiad r4, { d8 - d11 } + vldmia.f64 r4, { d8 - d11 } fmacd d8 , d0 , d24 fstd d12, [CO2] @@ -367,7 +367,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ CO2 , #C_PRE ] - fldmiad CO2, { d12 - d15 } + vldmia.f64 CO2, { d12 - d15 } fstd d8 , [r4 ] fmacd d12, d0 , d28 @@ -378,7 +378,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fstd d11, [r4 , #24 ] fmacd d15, d0 , d31 - fstmiad CO2, { d12 - d15 } + vstmia.f64 CO2, { d12 - d15 } add CO1, CO1, #32 diff --git a/kernel/arm/dgemm_tcopy_4_vfp.S b/kernel/arm/dgemm_tcopy_4_vfp.S index 937f43957..8335de27c 100644 --- a/kernel/arm/dgemm_tcopy_4_vfp.S +++ b/kernel/arm/dgemm_tcopy_4_vfp.S @@ -76,21 +76,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x4 pld [ AO1, #A_PRE ] - fldmiad AO1, { d0 - d3 } + vldmia.f64 AO1, { d0 - d3 } add r3, AO1, LDA pld [ r3, #A_PRE ] - fldmiad r3, { d4 - d7 } + vldmia.f64 r3, { d4 - d7 } add r3, r3, LDA pld [ r3, #A_PRE ] - fldmiad r3, { d8 - d11 } + vldmia.f64 r3, { d8 - d11 } add r3, r3, LDA pld [ r3, #A_PRE ] - fldmiad r3, { d12 - d15 } + vldmia.f64 r3, { d12 - d15 } - fstmiad BO1, { d0 - d15 } + vstmia.f64 BO1, { d0 - d15 } add AO1, AO1, #32 add BO1, BO1, M4 @@ -98,18 +98,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x4 - fldmiad AO1, { d0 - d1 } + vldmia.f64 AO1, { d0 - d1 } add r3, AO1, LDA - fldmiad r3, { d2 - d3 } + vldmia.f64 r3, { d2 - d3 } add r3, r3, LDA - fldmiad r3, { d4 - d5 } + vldmia.f64 r3, { d4 - d5 } add r3, r3, LDA - fldmiad r3, { d6 - d7 } + vldmia.f64 r3, { d6 - d7 } - fstmiad BO2, { d0 - d7 } + vstmia.f64 BO2, { d0 - d7 } add AO1, AO1, #16 add BO2, BO2, #64 @@ -117,18 +117,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro COPY1x4 - fldmiad AO1, { d0 } + vldmia.f64 AO1, { d0 } add r3, AO1, LDA - fldmiad r3, { d1 } + vldmia.f64 r3, { d1 } add r3, r3, LDA - fldmiad r3, { d2 } + vldmia.f64 r3, { d2 } add r3, r3, LDA - fldmiad r3, { d3 } + vldmia.f64 r3, { d3 } - fstmiad BO3, { d0 - d3 } + vstmia.f64 BO3, { d0 - d3 } add AO1, AO1, #8 add BO3, BO3, #32 @@ -139,13 +139,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x2 pld [ AO1, #A_PRE ] - fldmiad AO1, { d0 - d3 } + vldmia.f64 AO1, { d0 - d3 } add r3, AO1, LDA pld [ r3, #A_PRE ] - fldmiad r3, { d4 - d7 } + vldmia.f64 r3, { d4 - d7 } - fstmiad BO1, { d0 - d7 } + vstmia.f64 BO1, { d0 - d7 } add AO1, AO1, #32 add BO1, BO1, M4 @@ -153,12 +153,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x2 - fldmiad AO1, { d0 - d1 } + vldmia.f64 AO1, { d0 - d1 } add r3, AO1, LDA - fldmiad r3, { d2 - d3 } + vldmia.f64 r3, { d2 - d3 } - fstmiad BO2, { d0 - d3 } + vstmia.f64 BO2, { d0 - d3 } add AO1, AO1, #16 add BO2, BO2, #32 @@ -166,12 +166,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x2 - fldmiad AO1, { d0 } + vldmia.f64 AO1, { d0 } add r3, AO1, LDA - fldmiad r3, { d1 } + vldmia.f64 r3, { d1 } - fstmiad BO3, { d0 - d1 } + vstmia.f64 BO3, { d0 - d1 } add AO1, AO1, #8 add BO3, BO3, #16 @@ -182,9 +182,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x1 pld [ AO1, #A_PRE ] - fldmiad AO1, { d0 - d3 } + vldmia.f64 AO1, { d0 - d3 } - fstmiad BO1, { d0 - d3 } + vstmia.f64 BO1, { d0 - d3 } add AO1, AO1, #32 add BO1, BO1, M4 @@ -192,9 +192,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x1 - fldmiad AO1, { d0 - d1 } + vldmia.f64 AO1, { d0 - d1 } - fstmiad BO2, { d0 - d1 } + vstmia.f64 BO2, { d0 - d1 } add AO1, AO1, #16 add BO2, BO2, #16 @@ -202,9 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x1 - fldmiad AO1, { d0 } + vldmia.f64 AO1, { d0 } - fstmiad BO3, { d0 } + vstmia.f64 BO3, { d0 } add AO1, AO1, #8 add BO3, BO3, #8 diff --git a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S index c0c6a1677..e73936cdd 100644 --- a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S @@ -128,10 +128,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d8 , [ BO ] pld [ AO , #A_PRE ] - fldmiad AO!, { d0 - d1} + vldmia.f64 AO!, { d0 - d1} fmuld d16 , d0, d8 - fldmiad AO!, { d2 - d3} + vldmia.f64 AO!, { d2 - d3} fmuld d17 , d1, d8 fldd d9 , [ BO, #8 ] fmuld d18 , d2, d8 @@ -148,10 +148,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmuld d23 , d3, d9 fmuld d24 , d0, d10 - fldmiad AO!, { d4 - d5 } + vldmia.f64 AO!, { d4 - d5 } fmuld d25 , d1, d10 fmuld d26 , d2, d10 - fldmiad AO!, { d6 - d7 } + vldmia.f64 AO!, { d6 - d7 } fmuld d27 , d3, d10 fldd d13, [ BO, #8 ] @@ -173,10 +173,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d8 , [ BO ] pld [ AO , #A_PRE ] - fldmiad AO!, { d0 - d1} + vldmia.f64 AO!, { d0 - d1} fmacd d16 , d0, d8 - fldmiad AO!, { d2 - d3} + vldmia.f64 AO!, { d2 - d3} fmacd d17 , d1, d8 fldd d9 , [ BO, #8 ] fmacd d18 , d2, d8 @@ -193,10 +193,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmacd d23 , d3, d9 fmacd d24 , d0, d10 - fldmiad AO!, { d4 - d5 } + vldmia.f64 AO!, { d4 - d5 } fmacd d25 , d1, d10 fmacd d26 , d2, d10 - fldmiad AO!, { d6 - d7 } + vldmia.f64 AO!, { d6 - d7 } fmacd d27 , d3, d10 fldd d13, [ BO, #8 ] @@ -225,11 +225,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d8 , [ BO ] fmacd d21 , d5, d13 fmacd d22 , d6, d13 - fldmiad AO!, { d0 - d1 } + vldmia.f64 AO!, { d0 - d1 } fmacd d23 , d7, d13 fmacd d24 , d4, d14 - fldmiad AO!, { d2 - d3 } + vldmia.f64 AO!, { d2 - d3 } fmacd d25 , d5, d14 fldd d9 , [ BO, #8 ] fmacd d26 , d6, d14 @@ -257,10 +257,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacd d19 , d3, d8 fmacd d20 , d0, d9 - fldmiad AO!, { d4 - d5 } + vldmia.f64 AO!, { d4 - d5 } fmacd d21 , d1, d9 fmacd d22 , d2, d9 - fldmiad AO!, { d6 - d7 } + vldmia.f64 AO!, { d6 - d7 } fmacd d23 , d3, d9 fmacd d24 , d0, d10 @@ -390,7 +390,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fstd d11, [r4 , #24 ] fmuld d15, d0 , d31 - fstmiad CO2, { d12 - d15 } + vstmia.f64 CO2, { d12 - d15 } add CO1, CO1, #32 diff --git a/kernel/arm/gemv_n_vfp.S b/kernel/arm/gemv_n_vfp.S index 7c154d741..753ac27c6 100644 --- a/kernel/arm/gemv_n_vfp.S +++ b/kernel/arm/gemv_n_vfp.S @@ -139,8 +139,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F8X1 pld [ AO2 , #A_PRE ] - fldmiad XO! , { d2 } - fldmiad AO1 , { d4 - d7 } + vldmia.f64 XO! , { d2 } + vldmia.f64 AO1 , { d4 - d7 } vmla.f64 d8 , d2 , d4 pld [ AO2 , #4*SIZE ] @@ -150,7 +150,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmla.f64 d11 , d2 , d7 - fldmiad r3 , { d4 - d7 } + vldmia.f64 r3 , { d4 - d7 } vmla.f64 d12 , d2 , d4 vmla.f64 d13 , d2 , d5 @@ -164,23 +164,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F8 - fldmiad YO, { d4 - d7 } + vldmia.f64 YO, { d4 - d7 } vmla.f64 d4 , d0, d8 vmla.f64 d5 , d0, d9 vmla.f64 d6 , d0, d10 vmla.f64 d7 , d0, d11 - fstmiad YO!, { d4 - d7 } + vstmia.f64 YO!, { d4 - d7 } - fldmiad YO, { d4 - d7 } + vldmia.f64 YO, { d4 - d7 } vmla.f64 d4 , d0, d12 vmla.f64 d5 , d0, d13 vmla.f64 d6 , d0, d14 vmla.f64 d7 , d0, d15 - fstmiad YO!, { d4 - d7 } + vstmia.f64 YO!, { d4 - d7 } .endm @@ -195,8 +195,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmiad XO! , { d2 } - fldmiad AO1 , { d8 } + vldmia.f64 XO! , { d2 } + vldmia.f64 AO1 , { d8 } vmla.f64 d12 , d2 , d8 add AO1, AO1, LDA @@ -204,9 +204,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4, d0, d12 - fstmiad YO!, { d4 } + vstmia.f64 YO!, { d4 } .endm @@ -234,8 +234,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4X1 pld [ AO2 , #A_PRE ] - fldmiad XO , { d2 } - fldmiad AO1 , { d8 - d11 } + vldmia.f64 XO , { d2 } + vldmia.f64 AO1 , { d8 - d11 } vmla.f64 d12 , d2 , d8 add AO1, AO1, LDA @@ -249,24 +249,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE_S4 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4 , d0, d12 - fstmiad YO, { d4 } + vstmia.f64 YO, { d4 } add YO, YO, INC_Y - fldmiad YO, { d5 } + vldmia.f64 YO, { d5 } vmla.f64 d5 , d0, d13 - fstmiad YO, { d5 } + vstmia.f64 YO, { d5 } add YO, YO, INC_Y - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4 , d0, d14 - fstmiad YO, { d4 } + vstmia.f64 YO, { d4 } add YO, YO, INC_Y - fldmiad YO, { d5 } + vldmia.f64 YO, { d5 } vmla.f64 d5 , d0, d15 - fstmiad YO, { d5 } + vstmia.f64 YO, { d5 } add YO, YO, INC_Y .endm @@ -282,8 +282,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmiad XO , { d2 } - fldmiad AO1 , { d8 } + vldmia.f64 XO , { d2 } + vldmia.f64 AO1 , { d8 } vmla.f64 d12 , d2 , d8 add AO1, AO1, LDA add XO, XO , INC_X @@ -292,9 +292,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4, d0, d12 - fstmiad YO , { d4 } + vstmia.f64 YO , { d4 } add YO, YO, INC_Y .endm @@ -338,8 +338,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F8X1 pld [ AO2, #A_PRE ] - fldmias XO! , { s2 } - fldmias AO1 , { s4 - s7 } + vldmia.f32 XO! , { s2 } + vldmia.f32 AO1 , { s4 - s7 } vmla.f32 s8 , s2 , s4 vmla.f32 s9 , s2 , s5 @@ -348,7 +348,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add r3, AO1, #4*SIZE - fldmias r3 , { s4 - s7 } + vldmia.f32 r3 , { s4 - s7 } vmla.f32 s12 , s2 , s4 vmla.f32 s13 , s2 , s5 @@ -362,24 +362,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F8 - fldmias YO, { s4 - s7 } + vldmia.f32 YO, { s4 - s7 } vmla.f32 s4 , s0, s8 vmla.f32 s5 , s0, s9 vmla.f32 s6 , s0, s10 vmla.f32 s7 , s0, s11 - fstmias YO!, { s4 - s7 } + vstmia.f32 YO!, { s4 - s7 } - fldmias YO, { s4 - s7 } + vldmia.f32 YO, { s4 - s7 } vmla.f32 s4 , s0, s12 vmla.f32 s5 , s0, s13 vmla.f32 s6 , s0, s14 vmla.f32 s7 , s0, s15 - fstmias YO!, { s4 - s7 } + vstmia.f32 YO!, { s4 - s7 } .endm @@ -394,8 +394,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmias XO! , { s2 } - fldmias AO1 , { s8 } + vldmia.f32 XO! , { s2 } + vldmia.f32 AO1 , { s8 } vmla.f32 s12 , s2 , s8 add AO1, AO1, LDA @@ -403,9 +403,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4, s0, s12 - fstmias YO!, { s4 } + vstmia.f32 YO!, { s4 } .endm @@ -434,8 +434,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4X1 - fldmias XO , { s2 } - fldmias AO1 , { s8 - s11 } + vldmia.f32 XO , { s2 } + vldmia.f32 AO1 , { s8 - s11 } vmla.f32 s12 , s2 , s8 vmla.f32 s13 , s2 , s9 @@ -449,24 +449,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S4 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4 , s0, s12 - fstmias YO, { s4 } + vstmia.f32 YO, { s4 } add YO, YO, INC_Y - fldmias YO, { s5 } + vldmia.f32 YO, { s5 } vmla.f32 s5 , s0, s13 - fstmias YO, { s5 } + vstmia.f32 YO, { s5 } add YO, YO, INC_Y - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4 , s0, s14 - fstmias YO, { s4 } + vstmia.f32 YO, { s4 } add YO, YO, INC_Y - fldmias YO, { s5 } + vldmia.f32 YO, { s5 } vmla.f32 s5 , s0, s15 - fstmias YO, { s5 } + vstmia.f32 YO, { s5 } add YO, YO, INC_Y .endm @@ -482,8 +482,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S1X1 - fldmias XO , { s2 } - fldmias AO1 , { s8 } + vldmia.f32 XO , { s2 } + vldmia.f32 AO1 , { s8 } vmla.f32 s12 , s2 , s8 add AO1, AO1, LDA add XO, XO , INC_X @@ -492,9 +492,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4, s0, s12 - fstmias YO , { s4 } + vstmia.f32 YO , { s4 } add YO, YO, INC_Y .endm diff --git a/kernel/arm/gemv_n_vfpv3.S b/kernel/arm/gemv_n_vfpv3.S index 54f958b7b..e80dc1458 100644 --- a/kernel/arm/gemv_n_vfpv3.S +++ b/kernel/arm/gemv_n_vfpv3.S @@ -138,8 +138,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F8X1 - fldmiad XO! , { d4 } - fldmiad AO1 , { d8 - d15 } + vldmia.f64 XO! , { d4 } + vldmia.f64 AO1 , { d8 - d15 } vmla.f64 d24 , d4 , d8 pld [ AO2 , #A_PRE ] @@ -158,7 +158,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F8 - fldmiad YO, { d16 - d23 } + vldmia.f64 YO, { d16 - d23 } vmla.f64 d16, d0, d24 vmla.f64 d17, d0, d25 @@ -169,7 +169,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmla.f64 d22, d0, d30 vmla.f64 d23, d0, d31 - fstmiad YO!, { d16 - d23 } + vstmia.f64 YO!, { d16 - d23 } .endm @@ -184,8 +184,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmiad XO! , { d4 } - fldmiad AO1 , { d8 } + vldmia.f64 XO! , { d4 } + vldmia.f64 AO1 , { d8 } vmla.f64 d24 , d4 , d8 add AO1, AO1, LDA @@ -193,9 +193,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmiad YO, { d16 } + vldmia.f64 YO, { d16 } vmla.f64 d16, d0, d24 - fstmiad YO!, { d16 } + vstmia.f64 YO!, { d16 } .endm @@ -234,8 +234,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ AO2 , #A_PRE ] pld [ AO2 , #A_PRE+32 ] - fldmiad XO , { d4 } - fldmiad AO1 , { d8 - d15 } + vldmia.f64 XO , { d4 } + vldmia.f64 AO1 , { d8 - d15 } vmla.f64 d24 , d4 , d8 vmla.f64 d25 , d4 , d9 @@ -253,44 +253,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S8 - fldmiad YO, { d16 } + vldmia.f64 YO, { d16 } vmla.f64 d16, d0, d24 - fstmiad YO, { d16 } + vstmia.f64 YO, { d16 } add YO, YO, INC_Y - fldmiad YO, { d17 } + vldmia.f64 YO, { d17 } vmla.f64 d17, d0, d25 - fstmiad YO, { d17 } + vstmia.f64 YO, { d17 } add YO, YO, INC_Y - fldmiad YO, { d18 } + vldmia.f64 YO, { d18 } vmla.f64 d18, d0, d26 - fstmiad YO, { d18 } + vstmia.f64 YO, { d18 } add YO, YO, INC_Y - fldmiad YO, { d19 } + vldmia.f64 YO, { d19 } vmla.f64 d19, d0, d27 - fstmiad YO, { d19 } + vstmia.f64 YO, { d19 } add YO, YO, INC_Y - fldmiad YO, { d20 } + vldmia.f64 YO, { d20 } vmla.f64 d20, d0, d28 - fstmiad YO, { d20 } + vstmia.f64 YO, { d20 } add YO, YO, INC_Y - fldmiad YO, { d21 } + vldmia.f64 YO, { d21 } vmla.f64 d21, d0, d29 - fstmiad YO, { d21 } + vstmia.f64 YO, { d21 } add YO, YO, INC_Y - fldmiad YO, { d22 } + vldmia.f64 YO, { d22 } vmla.f64 d22, d0, d30 - fstmiad YO, { d22 } + vstmia.f64 YO, { d22 } add YO, YO, INC_Y - fldmiad YO, { d23 } + vldmia.f64 YO, { d23 } vmla.f64 d23, d0, d31 - fstmiad YO, { d23 } + vstmia.f64 YO, { d23 } add YO, YO, INC_Y .endm @@ -306,8 +306,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S1X1 - fldmiad XO , { d4 } - fldmiad AO1 , { d8 } + vldmia.f64 XO , { d4 } + vldmia.f64 AO1 , { d8 } vmla.f64 d24 , d4 , d8 add AO1, AO1, LDA add XO, XO, INC_X @@ -316,9 +316,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmiad YO, { d16 } + vldmia.f64 YO, { d16 } vmla.f64 d16, d0, d24 - fstmiad YO, { d16 } + vstmia.f64 YO, { d16 } add YO, YO, INC_Y .endm @@ -361,8 +361,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F8X1 pld [ AO2 , #A_PRE ] - fldmias XO! , { s4 } - fldmias AO1 , { s8 - s15 } + vldmia.f32 XO! , { s4 } + vldmia.f32 AO1 , { s8 - s15 } vmla.f32 s24 , s4 , s8 vmla.f32 s25 , s4 , s9 @@ -379,7 +379,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F8 - fldmias YO, { s16 - s23 } + vldmia.f32 YO, { s16 - s23 } vmla.f32 s16, s0, s24 vmla.f32 s17, s0, s25 @@ -390,7 +390,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmla.f32 s22, s0, s30 vmla.f32 s23, s0, s31 - fstmias YO!, { s16 - s23 } + vstmia.f32 YO!, { s16 - s23 } .endm @@ -405,8 +405,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmias XO! , { s4 } - fldmias AO1 , { s8 } + vldmia.f32 XO! , { s4 } + vldmia.f32 AO1 , { s8 } vmla.f32 s24 , s4 , s8 add AO1, AO1, LDA @@ -414,9 +414,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmias YO, { s16 } + vldmia.f32 YO, { s16 } vmla.f32 s16, s0, s24 - fstmias YO!, { s16 } + vstmia.f32 YO!, { s16 } .endm @@ -454,8 +454,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S8X1 pld [ AO2 , #A_PRE ] - fldmias XO , { s4 } - fldmias AO1 , { s8 - s15 } + vldmia.f32 XO , { s4 } + vldmia.f32 AO1 , { s8 - s15 } vmla.f32 s24 , s4 , s8 vmla.f32 s25 , s4 , s9 @@ -473,44 +473,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S8 - fldmias YO, { s16 } + vldmia.f32 YO, { s16 } vmla.f32 s16, s0, s24 - fstmias YO, { s16 } + vstmia.f32 YO, { s16 } add YO, YO, INC_Y - fldmias YO, { s17 } + vldmia.f32 YO, { s17 } vmla.f32 s17, s0, s25 - fstmias YO, { s17 } + vstmia.f32 YO, { s17 } add YO, YO, INC_Y - fldmias YO, { s18 } + vldmia.f32 YO, { s18 } vmla.f32 s18, s0, s26 - fstmias YO, { s18 } + vstmia.f32 YO, { s18 } add YO, YO, INC_Y - fldmias YO, { s19 } + vldmia.f32 YO, { s19 } vmla.f32 s19, s0, s27 - fstmias YO, { s19 } + vstmia.f32 YO, { s19 } add YO, YO, INC_Y - fldmias YO, { s20 } + vldmia.f32 YO, { s20 } vmla.f32 s20, s0, s28 - fstmias YO, { s20 } + vstmia.f32 YO, { s20 } add YO, YO, INC_Y - fldmias YO, { s21 } + vldmia.f32 YO, { s21 } vmla.f32 s21, s0, s29 - fstmias YO, { s21 } + vstmia.f32 YO, { s21 } add YO, YO, INC_Y - fldmias YO, { s22 } + vldmia.f32 YO, { s22 } vmla.f32 s22, s0, s30 - fstmias YO, { s22 } + vstmia.f32 YO, { s22 } add YO, YO, INC_Y - fldmias YO, { s23 } + vldmia.f32 YO, { s23 } vmla.f32 s23, s0, s31 - fstmias YO, { s23 } + vstmia.f32 YO, { s23 } add YO, YO, INC_Y .endm @@ -526,8 +526,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmias XO , { s4 } - fldmias AO1 , { s8 } + vldmia.f32 XO , { s4 } + vldmia.f32 AO1 , { s8 } vmla.f32 s24 , s4 , s8 add AO1, AO1, LDA add XO, XO, INC_X @@ -536,9 +536,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE_S1 - fldmias YO, { s16 } + vldmia.f32 YO, { s16 } vmla.f32 s16, s0, s24 - fstmias YO, { s16 } + vstmia.f32 YO, { s16 } add YO, YO, INC_Y .endm diff --git a/kernel/arm/gemv_t_vfp.S b/kernel/arm/gemv_t_vfp.S index 9559d1829..fbe51cc8c 100644 --- a/kernel/arm/gemv_t_vfp.S +++ b/kernel/arm/gemv_t_vfp.S @@ -112,13 +112,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X4 pld [ XO , #X_PRE ] - fldmiad XO! , { d12 - d15 } + vldmia.f64 XO! , { d12 - d15 } pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } pld [ AO2 , #A_PRE ] - fldmiad AO2!, { d4 - d5 } - fldmiad AO1!, { d10 - d11 } - fldmiad AO2!, { d6 - d7 } + vldmia.f64 AO2!, { d4 - d5 } + vldmia.f64 AO1!, { d10 - d11 } + vldmia.f64 AO2!, { d6 - d7 } vmla.f64 d2 , d12 , d8 vmla.f64 d3 , d12 , d4 @@ -133,9 +133,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmiad XO! , { d1 } - fldmiad AO1!, { d8 } - fldmiad AO2!, { d4 } + vldmia.f64 XO! , { d1 } + vldmia.f64 AO1!, { d8 } + vldmia.f64 AO2!, { d4 } vmla.f64 d2 , d1 , d8 vmla.f64 d3 , d1 , d4 @@ -143,10 +143,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F2 - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } vmla.f64 d4, d0, d2 vmla.f64 d5, d0, d3 - fstmiad YO!, { d4 - d5 } + vstmia.f64 YO!, { d4 - d5 } .endm @@ -160,10 +160,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X4 pld [ XO , #X_PRE ] - fldmiad XO! , { d12 - d15 } + vldmia.f64 XO! , { d12 - d15 } pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d10 - d11 } vmla.f64 d2 , d12 , d8 vmla.f64 d2 , d13 , d9 vmla.f64 d2 , d14, d10 @@ -173,17 +173,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmiad XO! , { d1 } - fldmiad AO1!, { d8 } + vldmia.f64 XO! , { d1 } + vldmia.f64 AO1!, { d8 } vmla.f64 d2 , d1 , d8 .endm .macro SAVE_F1 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4, d0, d2 - fstmiad YO!, { d4 } + vstmia.f64 YO!, { d4 } .endm @@ -197,23 +197,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X4 - fldmiad XO , { d12 } + vldmia.f64 XO , { d12 } add XO, XO, INC_X pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } pld [ AO2 , #A_PRE ] - fldmiad AO2!, { d4 - d5 } + vldmia.f64 AO2!, { d4 - d5 } - fldmiad XO , { d13 } + vldmia.f64 XO , { d13 } add XO, XO, INC_X - fldmiad AO1!, { d10 - d11 } - fldmiad AO2!, { d6 - d7 } + vldmia.f64 AO1!, { d10 - d11 } + vldmia.f64 AO2!, { d6 - d7 } - fldmiad XO , { d14 } + vldmia.f64 XO , { d14 } add XO, XO, INC_X - fldmiad XO , { d15 } + vldmia.f64 XO , { d15 } add XO, XO, INC_X vmla.f64 d2 , d12 , d8 @@ -229,9 +229,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X1 - fldmiad XO , { d1 } - fldmiad AO1!, { d8 } - fldmiad AO2!, { d4 } + vldmia.f64 XO , { d1 } + vldmia.f64 AO1!, { d8 } + vldmia.f64 AO2!, { d4 } vmla.f64 d2 , d1 , d8 add XO, XO, INC_X vmla.f64 d3 , d1 , d4 @@ -240,14 +240,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE_S2 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4, d0, d2 - fstmiad YO, { d4 } + vstmia.f64 YO, { d4 } add YO, YO, INC_Y - fldmiad YO, { d5 } + vldmia.f64 YO, { d5 } vmla.f64 d5, d0, d3 - fstmiad YO, { d5 } + vstmia.f64 YO, { d5 } add YO, YO, INC_Y .endm @@ -261,20 +261,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X4 - fldmiad XO , { d12 } + vldmia.f64 XO , { d12 } add XO, XO, INC_X pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } - fldmiad XO , { d13 } + vldmia.f64 XO , { d13 } add XO, XO, INC_X - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d10 - d11 } - fldmiad XO , { d14 } + vldmia.f64 XO , { d14 } add XO, XO, INC_X - fldmiad XO , { d15 } + vldmia.f64 XO , { d15 } add XO, XO, INC_X vmla.f64 d2 , d12 , d8 @@ -286,8 +286,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmiad XO , { d1 } - fldmiad AO1!, { d8 } + vldmia.f64 XO , { d1 } + vldmia.f64 AO1!, { d8 } vmla.f64 d2 , d1 , d8 add XO, XO, INC_X @@ -295,9 +295,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4, d0, d2 - fstmiad YO, { d4 } + vstmia.f64 YO, { d4 } add YO, YO, INC_Y .endm @@ -315,11 +315,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X4 - fldmias XO! , { s12 - s15 } - fldmias AO1!, { s8 - s9 } - fldmias AO2!, { s4 - s5 } - fldmias AO1!, { s10 - s11 } - fldmias AO2!, { s6 - s7 } + vldmia.f32 XO! , { s12 - s15 } + vldmia.f32 AO1!, { s8 - s9 } + vldmia.f32 AO2!, { s4 - s5 } + vldmia.f32 AO1!, { s10 - s11 } + vldmia.f32 AO2!, { s6 - s7 } vmla.f32 s2 , s12 , s8 vmla.f32 s3 , s12 , s4 @@ -334,9 +334,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmias XO! , { s1 } - fldmias AO1!, { s8 } - fldmias AO2!, { s4 } + vldmia.f32 XO! , { s1 } + vldmia.f32 AO1!, { s8 } + vldmia.f32 AO2!, { s4 } vmla.f32 s2 , s1 , s8 vmla.f32 s3 , s1 , s4 @@ -344,10 +344,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F2 - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } vmla.f32 s4, s0, s2 vmla.f32 s5, s0, s3 - fstmias YO!, { s4 - s5 } + vstmia.f32 YO!, { s4 - s5 } .endm @@ -359,9 +359,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X4 - fldmias XO! , { s12 - s15 } - fldmias AO1!, { s8 - s9 } - fldmias AO1!, { s10 - s11 } + vldmia.f32 XO! , { s12 - s15 } + vldmia.f32 AO1!, { s8 - s9 } + vldmia.f32 AO1!, { s10 - s11 } vmla.f32 s2 , s12 , s8 vmla.f32 s2 , s13 , s9 vmla.f32 s2 , s14, s10 @@ -371,17 +371,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmias XO! , { s1 } - fldmias AO1!, { s8 } + vldmia.f32 XO! , { s1 } + vldmia.f32 AO1!, { s8 } vmla.f32 s2 , s1 , s8 .endm .macro SAVE_F1 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4, s0, s2 - fstmias YO!, { s4 } + vstmia.f32 YO!, { s4 } .endm @@ -395,21 +395,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S2X4 - fldmias XO , { s12 } + vldmia.f32 XO , { s12 } add XO, XO, INC_X - fldmias AO1!, { s8 - s9 } - fldmias AO2!, { s4 - s5 } + vldmia.f32 AO1!, { s8 - s9 } + vldmia.f32 AO2!, { s4 - s5 } - fldmias XO , { s13 } + vldmia.f32 XO , { s13 } add XO, XO, INC_X - fldmias AO1!, { s10 - s11 } - fldmias AO2!, { s6 - s7 } + vldmia.f32 AO1!, { s10 - s11 } + vldmia.f32 AO2!, { s6 - s7 } - fldmias XO , { s14 } + vldmia.f32 XO , { s14 } add XO, XO, INC_X - fldmias XO , { s15 } + vldmia.f32 XO , { s15 } add XO, XO, INC_X vmla.f32 s2 , s12 , s8 @@ -425,9 +425,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X1 - fldmias XO , { s1 } - fldmias AO1!, { s8 } - fldmias AO2!, { s4 } + vldmia.f32 XO , { s1 } + vldmia.f32 AO1!, { s8 } + vldmia.f32 AO2!, { s4 } vmla.f32 s2 , s1 , s8 add XO, XO, INC_X vmla.f32 s3 , s1 , s4 @@ -436,14 +436,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S2 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4, s0, s2 - fstmias YO, { s4 } + vstmia.f32 YO, { s4 } add YO, YO, INC_Y - fldmias YO, { s5 } + vldmia.f32 YO, { s5 } vmla.f32 s5, s0, s3 - fstmias YO, { s5 } + vstmia.f32 YO, { s5 } add YO, YO, INC_Y .endm @@ -456,20 +456,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X4 - fldmias XO , { s12 } + vldmia.f32 XO , { s12 } add XO, XO, INC_X pld [ AO1 , #A_PRE ] - fldmias AO1!, { s8 - s9 } + vldmia.f32 AO1!, { s8 - s9 } - fldmias XO , { s13 } + vldmia.f32 XO , { s13 } add XO, XO, INC_X - fldmias AO1!, { s10 - s11 } + vldmia.f32 AO1!, { s10 - s11 } - fldmias XO , { s14 } + vldmia.f32 XO , { s14 } add XO, XO, INC_X - fldmias XO , { s15 } + vldmia.f32 XO , { s15 } add XO, XO, INC_X vmla.f32 s2 , s12 , s8 @@ -481,8 +481,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmias XO , { s1 } - fldmias AO1!, { s8 } + vldmia.f32 XO , { s1 } + vldmia.f32 AO1!, { s8 } vmla.f32 s2 , s1 , s8 add XO, XO, INC_X @@ -490,9 +490,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4, s0, s2 - fstmias YO, { s4 } + vstmia.f32 YO, { s4 } add YO, YO, INC_Y .endm diff --git a/kernel/arm/gemv_t_vfpv3.S b/kernel/arm/gemv_t_vfpv3.S index b1d3dadf1..a88d70016 100644 --- a/kernel/arm/gemv_t_vfpv3.S +++ b/kernel/arm/gemv_t_vfpv3.S @@ -108,17 +108,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X4 pld [ XO , #X_PRE ] - fldmiad XO! , { d28 - d31 } + vldmia.f64 XO! , { d28 - d31 } pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } pld [ AO2 , #A_PRE ] - fldmiad AO2!, { d16 - d17 } + vldmia.f64 AO2!, { d16 - d17 } vmla.f64 d4 , d28 , d8 vmla.f64 d5 , d28 , d16 - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d10 - d11 } vmla.f64 d4 , d29 , d9 vmla.f64 d5 , d29 , d17 - fldmiad AO2!, { d18 - d19 } + vldmia.f64 AO2!, { d18 - d19 } vmla.f64 d4 , d30, d10 vmla.f64 d5 , d30, d18 vmla.f64 d4 , d31, d11 @@ -129,9 +129,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmiad XO! , { d2 } - fldmiad AO1!, { d8 } - fldmiad AO2!, { d16 } + vldmia.f64 XO! , { d2 } + vldmia.f64 AO1!, { d8 } + vldmia.f64 AO2!, { d16 } vmla.f64 d4 , d2 , d8 vmla.f64 d5 , d2 , d16 @@ -139,10 +139,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE_F2 - fldmiad YO, { d24 - d25 } + vldmia.f64 YO, { d24 - d25 } vmla.f64 d24, d0, d4 vmla.f64 d25, d0, d5 - fstmiad YO!, { d24 - d25 } + vstmia.f64 YO!, { d24 - d25 } .endm @@ -156,23 +156,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X4 pld [ AO1 , #A_PRE ] - fldmiad XO , { d28 } + vldmia.f64 XO , { d28 } add XO, XO, INC_X - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } pld [ AO2 , #A_PRE ] - fldmiad AO2!, { d16 - d17 } + vldmia.f64 AO2!, { d16 - d17 } vmla.f64 d4 , d28 , d8 - fldmiad XO , { d29 } + vldmia.f64 XO , { d29 } add XO, XO, INC_X vmla.f64 d5 , d28 , d16 - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d10 - d11 } vmla.f64 d4 , d29 , d9 - fldmiad XO , { d30 } + vldmia.f64 XO , { d30 } add XO, XO, INC_X vmla.f64 d5 , d29 , d17 - fldmiad AO2!, { d18 - d19 } + vldmia.f64 AO2!, { d18 - d19 } vmla.f64 d4 , d30, d10 - fldmiad XO , { d31 } + vldmia.f64 XO , { d31 } add XO, XO, INC_X vmla.f64 d5 , d30, d18 vmla.f64 d4 , d31, d11 @@ -183,10 +183,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X1 - fldmiad XO , { d2 } - fldmiad AO1!, { d8 } + vldmia.f64 XO , { d2 } + vldmia.f64 AO1!, { d8 } add XO, XO, INC_X - fldmiad AO2!, { d16 } + vldmia.f64 AO2!, { d16 } vmla.f64 d4 , d2 , d8 vmla.f64 d5 , d2 , d16 @@ -194,14 +194,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S2 - fldmiad YO, { d24 } + vldmia.f64 YO, { d24 } vmla.f64 d24, d0, d4 - fstmiad YO, { d24 } + vstmia.f64 YO, { d24 } add YO, YO, INC_Y - fldmiad YO, { d24 } + vldmia.f64 YO, { d24 } vmla.f64 d24, d0, d5 - fstmiad YO, { d24 } + vstmia.f64 YO, { d24 } add YO, YO, INC_Y .endm @@ -215,11 +215,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X4 pld [ XO , #X_PRE ] - fldmiad XO! , { d28 - d31 } + vldmia.f64 XO! , { d28 - d31 } pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } vmla.f64 d4 , d28 , d8 - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d10 - d11 } vmla.f64 d4 , d29 , d9 vmla.f64 d4 , d30, d10 vmla.f64 d4 , d31, d11 @@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmiad XO! , { d2 } - fldmiad AO1!, { d8 } + vldmia.f64 XO! , { d2 } + vldmia.f64 AO1!, { d8 } vmla.f64 d4 , d2 , d8 .endm .macro SAVE_F1 - fldmiad YO, { d24 } + vldmia.f64 YO, { d24 } vmla.f64 d24, d0, d4 - fstmiad YO!, { d24 } + vstmia.f64 YO!, { d24 } .endm @@ -252,18 +252,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X4 pld [ AO1 , #A_PRE ] - fldmiad XO , { d28 } + vldmia.f64 XO , { d28 } add XO, XO, INC_X - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } vmla.f64 d4 , d28 , d8 - fldmiad XO , { d29 } + vldmia.f64 XO , { d29 } add XO, XO, INC_X - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d10 - d11 } vmla.f64 d4 , d29 , d9 - fldmiad XO , { d30 } + vldmia.f64 XO , { d30 } add XO, XO, INC_X vmla.f64 d4 , d30, d10 - fldmiad XO , { d31 } + vldmia.f64 XO , { d31 } add XO, XO, INC_X vmla.f64 d4 , d31, d11 @@ -272,8 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmiad XO , { d2 } - fldmiad AO1!, { d8 } + vldmia.f64 XO , { d2 } + vldmia.f64 AO1!, { d8 } add XO, XO, INC_X vmla.f64 d4 , d2 , d8 @@ -281,9 +281,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE_S1 - fldmiad YO, { d24 } + vldmia.f64 YO, { d24 } vmla.f64 d24, d0, d4 - fstmiad YO, { d24 } + vstmia.f64 YO, { d24 } add YO, YO, INC_Y .endm @@ -300,15 +300,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X4 - fldmias XO! , { s28 - s31 } - fldmias AO1!, { s8 - s9 } - fldmias AO2!, { s16 - s17 } + vldmia.f32 XO! , { s28 - s31 } + vldmia.f32 AO1!, { s8 - s9 } + vldmia.f32 AO2!, { s16 - s17 } vmla.f32 s4 , s28 , s8 vmla.f32 s5 , s28 , s16 - fldmias AO1!, { s10 - s11 } + vldmia.f32 AO1!, { s10 - s11 } vmla.f32 s4 , s29 , s9 vmla.f32 s5 , s29 , s17 - fldmias AO2!, { s18 - s19 } + vldmia.f32 AO2!, { s18 - s19 } vmla.f32 s4 , s30, s10 vmla.f32 s5 , s30, s18 vmla.f32 s4 , s31, s11 @@ -319,9 +319,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmias XO! , { s2 } - fldmias AO1!, { s8 } - fldmias AO2!, { s16 } + vldmia.f32 XO! , { s2 } + vldmia.f32 AO1!, { s8 } + vldmia.f32 AO2!, { s16 } vmla.f32 s4 , s2 , s8 vmla.f32 s5 , s2 , s16 @@ -329,10 +329,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F2 - fldmias YO, { s24 - s25 } + vldmia.f32 YO, { s24 - s25 } vmla.f32 s24, s0, s4 vmla.f32 s25, s0, s5 - fstmias YO!, { s24 - s25 } + vstmia.f32 YO!, { s24 - s25 } .endm @@ -345,22 +345,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X4 - fldmias XO , { s28 } + vldmia.f32 XO , { s28 } add XO, XO, INC_X - fldmias AO1!, { s8 - s9 } - fldmias AO2!, { s16 - s17 } + vldmia.f32 AO1!, { s8 - s9 } + vldmia.f32 AO2!, { s16 - s17 } vmla.f32 s4 , s28 , s8 - fldmias XO , { s29 } + vldmia.f32 XO , { s29 } add XO, XO, INC_X vmla.f32 s5 , s28 , s16 - fldmias AO1!, { s10 - s11 } + vldmia.f32 AO1!, { s10 - s11 } vmla.f32 s4 , s29 , s9 - fldmias XO , { s30 } + vldmia.f32 XO , { s30 } add XO, XO, INC_X vmla.f32 s5 , s29 , s17 - fldmias AO2!, { s18 - s19 } + vldmia.f32 AO2!, { s18 - s19 } vmla.f32 s4 , s30, s10 - fldmias XO , { s31 } + vldmia.f32 XO , { s31 } add XO, XO, INC_X vmla.f32 s5 , s30, s18 vmla.f32 s4 , s31, s11 @@ -371,10 +371,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X1 - fldmias XO , { s2 } - fldmias AO1!, { s8 } + vldmia.f32 XO , { s2 } + vldmia.f32 AO1!, { s8 } add XO, XO, INC_X - fldmias AO2!, { s16 } + vldmia.f32 AO2!, { s16 } vmla.f32 s4 , s2 , s8 vmla.f32 s5 , s2 , s16 @@ -382,14 +382,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S2 - fldmias YO, { s24 } + vldmia.f32 YO, { s24 } vmla.f32 s24, s0, s4 - fstmias YO, { s24 } + vstmia.f32 YO, { s24 } add YO, YO, INC_Y - fldmias YO, { s24 } + vldmia.f32 YO, { s24 } vmla.f32 s24, s0, s5 - fstmias YO, { s24 } + vstmia.f32 YO, { s24 } add YO, YO, INC_Y .endm @@ -402,10 +402,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X4 - fldmias XO! , { s28 - s31 } - fldmias AO1!, { s8 - s9 } + vldmia.f32 XO! , { s28 - s31 } + vldmia.f32 AO1!, { s8 - s9 } vmla.f32 s4 , s28 , s8 - fldmias AO1!, { s10 - s11 } + vldmia.f32 AO1!, { s10 - s11 } vmla.f32 s4 , s29 , s9 vmla.f32 s4 , s30, s10 vmla.f32 s4 , s31, s11 @@ -415,17 +415,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmias XO! , { s2 } - fldmias AO1!, { s8 } + vldmia.f32 XO! 
, { s2 } + vldmia.f32 AO1!, { s8 } vmla.f32 s4 , s2 , s8 .endm .macro SAVE_F1 - fldmias YO, { s24 } + vldmia.f32 YO, { s24 } vmla.f32 s24, s0, s4 - fstmias YO!, { s24 } + vstmia.f32 YO!, { s24 } .endm @@ -437,18 +437,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X4 - fldmias XO , { s28 } + vldmia.f32 XO , { s28 } add XO, XO, INC_X - fldmias AO1!, { s8 - s9 } + vldmia.f32 AO1!, { s8 - s9 } vmla.f32 s4 , s28 , s8 - fldmias XO , { s29 } + vldmia.f32 XO , { s29 } add XO, XO, INC_X - fldmias AO1!, { s10 - s11 } + vldmia.f32 AO1!, { s10 - s11 } vmla.f32 s4 , s29 , s9 - fldmias XO , { s30 } + vldmia.f32 XO , { s30 } add XO, XO, INC_X vmla.f32 s4 , s30, s10 - fldmias XO , { s31 } + vldmia.f32 XO , { s31 } add XO, XO, INC_X vmla.f32 s4 , s31, s11 @@ -457,8 +457,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmias XO , { s2 } - fldmias AO1!, { s8 } + vldmia.f32 XO , { s2 } + vldmia.f32 AO1!, { s8 } add XO, XO, INC_X vmla.f32 s4 , s2 , s8 @@ -466,9 +466,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmias YO, { s24 } + vldmia.f32 YO, { s24 } vmla.f32 s24, s0, s4 - fstmias YO, { s24 } + vstmia.f32 YO, { s24 } add YO, YO, INC_Y .endm diff --git a/kernel/arm/iamax_vfp.S b/kernel/arm/iamax_vfp.S index fab05c9c8..fd43b15b1 100644 --- a/kernel/arm/iamax_vfp.S +++ b/kernel/arm/iamax_vfp.S @@ -114,7 +114,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F - fldmiad X!, { d0 } + vldmia.f64 X!, { d0 } VABS( d0, d0 ) mov Z, #1 mov INDEX, Z @@ -123,7 +123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } add Z, Z, #1 VABS( d4, d4 ) vcmpe.f64 d4, d0 @@ -135,7 +135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S - fldmiad X, { d0 } + vldmia.f64 X, { d0 } VABS( d0, d0 ) mov Z, #1 mov INDEX, Z @@ -146,7 +146,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } add Z, Z, #1 VABS( d4, d4 ) vcmpe.f64 d4, d0 @@ -161,7 +161,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F - fldmias X!, { s0 } + vldmia.f32 X!, { s0 } VABS( s0, s0 ) mov Z, #1 mov INDEX, Z @@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } add Z, Z, #1 VABS( s4, s4 ) vcmpe.f32 s4, s0 @@ -182,7 +182,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S - fldmias X, { s0 } + vldmia.f32 X, { s0 } VABS( s0, s0 ) mov Z, #1 mov INDEX, Z @@ -193,7 +193,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s4 } + vldmia.f32 X, { s4 } add Z, Z, #1 VABS( s4, s4 ) vcmpe.f32 s4, s0 @@ -215,7 +215,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F - fldmiad X!, { d0 -d1 } + vldmia.f64 X!, { d0 -d1 } vabs.f64 d0, d0 vabs.f64 d1, d1 vadd.f64 d0 , d0, d1 @@ -227,7 +227,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } add Z, Z, #1 vabs.f64 d4, d4 vabs.f64 d5, d5 @@ -241,7 +241,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro INIT_S - fldmiad X, { d0 -d1 } + vldmia.f64 X, { d0 -d1 } vabs.f64 d0, d0 vabs.f64 d1, d1 vadd.f64 d0 , d0, d1 @@ -255,7 +255,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } add Z, Z, #1 vabs.f64 d4, d4 vabs.f64 d5, d5 @@ -272,7 +272,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F - fldmias X!, { s0 -s1 } + vldmia.f32 X!, { s0 -s1 } vabs.f32 s0, s0 vabs.f32 s1, s1 vadd.f32 s0 , s0, s1 @@ -284,7 +284,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } add Z, Z, #1 vabs.f32 s4, s4 vabs.f32 s5, s5 @@ -298,7 +298,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S - fldmias X, { s0 -s1 } + vldmia.f32 X, { s0 -s1 } vabs.f32 s0, s0 vabs.f32 s1, s1 vadd.f32 s0 , s0, s1 @@ -312,7 +312,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } add Z, Z, #1 vabs.f32 s4, s4 vabs.f32 s5, s5 diff --git a/kernel/arm/nrm2_vfp.S b/kernel/arm/nrm2_vfp.S index 16ac5a632..8e0937851 100644 --- a/kernel/arm/nrm2_vfp.S +++ b/kernel/arm/nrm2_vfp.S @@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ @@ -95,7 +95,7 @@ KERNEL_F1_NEXT_\@: .macro KERNEL_S1 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT @@ -121,7 +121,7 @@ KERNEL_S1_NEXT: .macro KERNEL_F1 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ @@ -158,7 +158,7 @@ KERNEL_F1_NEXT_\@: .macro KERNEL_S1 - fldmias X, { s4 } + vldmia.f32 X, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT @@ -191,7 +191,7 @@ KERNEL_S1_NEXT: .macro KERNEL_F1 - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -249,7 +249,7 @@ KERNEL_F1_END_\@: .macro KERNEL_S1 - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -294,7 +294,7 @@ KERNEL_S1_END_\@: .macro KERNEL_F1 - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -350,7 +350,7 @@ KERNEL_F1_END_\@: .macro KERNEL_S1 - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr diff --git a/kernel/arm/nrm2_vfpv3.S b/kernel/arm/nrm2_vfpv3.S index 84977901d..7be1e977e 100644 --- a/kernel/arm/nrm2_vfpv3.S +++ b/kernel/arm/nrm2_vfpv3.S @@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F1 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ @@ -95,7 +95,7 @@ KERNEL_F1_NEXT_\@: .macro KERNEL_S1 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT @@ -121,7 +121,7 @@ KERNEL_S1_NEXT: .macro KERNEL_F1 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ @@ -158,7 +158,7 @@ KERNEL_F1_NEXT_\@: .macro KERNEL_S1 - fldmias X, { s4 } + vldmia.f32 X, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT @@ -191,7 +191,7 @@ KERNEL_S1_NEXT: .macro KERNEL_F1 - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -249,7 +249,7 @@ KERNEL_F1_END_\@: .macro KERNEL_S1 - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -294,7 +294,7 @@ KERNEL_S1_END_\@: .macro KERNEL_F1 - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -350,7 +350,7 @@ KERNEL_F1_END_\@: .macro KERNEL_S1 - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr diff --git a/kernel/arm/rot_vfp.S b/kernel/arm/rot_vfp.S index ea296dbc5..6aec06205 100644 --- a/kernel/arm/rot_vfp.S +++ b/kernel/arm/rot_vfp.S @@ -77,68 +77,68 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } .endm .macro KERNEL_F1 - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } .endm .macro KERNEL_S1 - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X, { d2 } - fstmiad Y, { d3 } + vstmia.f64 X, { d2 } + vstmia.f64 Y, { d3 } add X, X, INC_X add Y, Y, INC_Y @@ -149,68 +149,68 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F4 - fldmias X, { s4 } - fldmias Y, { s5 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s5 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } - fldmias X, { s4 } - fldmias Y, { s5 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s5 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } - fldmias X, { s4 } - fldmias Y, { s5 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s5 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } - fldmias X, { s4 } - fldmias Y, { s5 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s5 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } .endm .macro KERNEL_F1 - fldmias X, { s4 } - fldmias Y, { s5 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s5 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } .endm .macro KERNEL_S1 - fldmias X, { s4 } - fldmias Y, { s5 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s5 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 vmls.f32 s3 , s1, s4 - fstmias X, { s2 } - fstmias Y, { s3 } + vstmia.f32 X, { s2 } + vstmia.f32 Y, { s3 } add X, X, INC_X add Y, Y, INC_Y @@ -230,96 +230,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d6 - d7 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 vmls.f64 d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d6 - d7 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 vmls.f64 d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d6 - d7 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 vmls.f64 d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d6 - d7 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 vmls.f64 d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + 
vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } .endm .macro KERNEL_F1 - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d6 - d7 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 vmls.f64 d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } .endm .macro KERNEL_S1 - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d6 - d7 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 @@ -347,96 +347,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s6 - s7 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 vmls.f32 s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s6 - s7 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 vmls.f32 s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s6 - s7 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 vmls.f32 s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s6 - s7 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 vmls.f32 s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } .endm .macro KERNEL_F1 - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s6 - s7 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 vmls.f32 s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } .endm .macro KERNEL_S1 - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s6 - s7 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 diff --git a/kernel/arm/scal_vfp.S b/kernel/arm/scal_vfp.S index cc3e3b98d..8992c35a8 100644 --- a/kernel/arm/scal_vfp.S +++ b/kernel/arm/scal_vfp.S @@ -64,30 +64,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 
SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X, { d4 - d7 } + vldmia.f64 X, { d4 - d7 } vmul.f64 d4, d4, d0 vmul.f64 d5, d5, d0 vmul.f64 d6, d6, d0 - fstmiad X!, { d4 - d5 } + vstmia.f64 X!, { d4 - d5 } vmul.f64 d7, d7, d0 - fstmiad X!, { d6 - d7 } + vstmia.f64 X!, { d6 - d7 } .endm .macro KERNEL_F1 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vmul.f64 d4, d4, d0 - fstmiad X!, { d4 } + vstmia.f64 X!, { d4 } .endm .macro KERNEL_S1 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vmul.f64 d4, d4, d0 - fstmiad X, { d4 } + vstmia.f64 X, { d4 } add X, X, INC_X .endm @@ -96,30 +96,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 - fldmias X, { s4 - s7 } + vldmia.f32 X, { s4 - s7 } vmul.f32 s4, s4, s0 vmul.f32 s5, s5, s0 vmul.f32 s6, s6, s0 - fstmias X!, { s4 - s5 } + vstmia.f32 X!, { s4 - s5 } vmul.f32 s7, s7, s0 - fstmias X!, { s6 - s7 } + vstmia.f32 X!, { s6 - s7 } .endm .macro KERNEL_F1 - fldmias X, { s4 } + vldmia.f32 X, { s4 } vmul.f32 s4, s4, s0 - fstmias X!, { s4 } + vstmia.f32 X!, { s4 } .endm .macro KERNEL_S1 - fldmias X, { s4 } + vldmia.f32 X, { s4 } vmul.f32 s4, s4, s0 - fstmias X, { s4 } + vstmia.f32 X, { s4 } add X, X, INC_X .endm @@ -136,58 +136,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vmul.f64 d2, d0, d4 vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 - fstmiad X!, { d2 - d3 } + vstmia.f64 X!, { d2 - d3 } - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vmul.f64 d2, d0, d4 vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 - fstmiad X!, { d2 - d3 } + vstmia.f64 X!, { d2 - d3 } pld [ X, #X_PRE ] - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vmul.f64 d2, d0, d4 vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 - fstmiad X!, { d2 - d3 } + vstmia.f64 X!, { d2 - d3 } - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vmul.f64 d2, d0, d4 vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 - fstmiad X!, { d2 - d3 } + vstmia.f64 X!, { d2 - d3 } .endm .macro KERNEL_F1 - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vmul.f64 d2, d0, d4 vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 - fstmiad X!, { d2 - d3 } + vstmia.f64 X!, { d2 - d3 } .endm .macro KERNEL_S1 - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vmul.f64 d2, d0, d4 vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 - fstmiad X, { d2 - d3 } + vstmia.f64 X, { d2 - d3 } add X, X, INC_X .endm @@ -199,56 +199,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ X, #X_PRE ] - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X!, { s2 - s3 } + vstmia.f32 X!, { s2 - s3 } - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X!, { s2 - s3 } + vstmia.f32 X!, { s2 - s3 } - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X!, { s2 - s3 } + vstmia.f32 X!, { s2 - s3 } - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X!, { s2 - s3 } + vstmia.f32 X!, { s2 - s3 } .endm .macro KERNEL_F1 - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X!, { s2 - s3 } + vstmia.f32 X!, { s2 - s3 } .endm .macro KERNEL_S1 - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X, { s2 - s3 } + vstmia.f32 X, { s2 - s3 } add X, X, INC_X .endm diff --git a/kernel/arm/scopy_vfp.S b/kernel/arm/scopy_vfp.S index 0fd815db8..1ccd29c95 100644 --- a/kernel/arm/scopy_vfp.S +++ b/kernel/arm/scopy_vfp.S @@ -65,17 +65,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_F8 pld [ X, #X_PRE ] - fldmias X!, { s0 - s3 } - fldmias X!, { s4 - s7 } - fstmias Y!, { s0 - s3 } - fstmias Y!, { s4 - s7 } + vldmia.f32 X!, { s0 - s3 } + vldmia.f32 X!, { s4 - s7 } + vstmia.f32 Y!, { s0 - s3 } + vstmia.f32 Y!, { s4 - s7 } .endm .macro COPY_F1 - fldmias X!, { s0 } - fstmias Y!, { s0 } + vldmia.f32 X!, { s0 } + vstmia.f32 Y!, { s0 } .endm @@ -85,23 +85,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S4 nop - fldmias X, { s0 } - fstmias Y, { s0 } + vldmia.f32 X, { s0 } + vstmia.f32 Y, { s0 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s1 } - fstmias Y, { s1 } + vldmia.f32 X, { s1 } + vstmia.f32 Y, { s1 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s0 } - fstmias Y, { s0 } + vldmia.f32 X, { s0 } + vstmia.f32 Y, { s0 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s1 } - fstmias Y, { s1 } + vldmia.f32 X, { s1 } + vstmia.f32 Y, { s1 } add X, X, INC_X add Y, Y, INC_Y @@ -110,8 +110,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S1 - fldmias X, { s0 } - fstmias Y, { s0 } + vldmia.f32 X, { s0 } + vstmia.f32 Y, { s0 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/sdot_vfp.S b/kernel/arm/sdot_vfp.S index 544846258..bb374b5ee 100644 --- a/kernel/arm/sdot_vfp.S +++ b/kernel/arm/sdot_vfp.S @@ -68,26 +68,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F4 - fldmias X!, { s14 } - fldmias Y!, { s15 } + vldmia.f32 X!, { s14 } + vldmia.f32 Y!, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 - fldmias X!, { s14 } - fldmias Y!, { s15 } + vldmia.f32 X!, { s14 } + vldmia.f32 Y!, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 - fldmias X!, { s14 } - fldmias Y!, { s15 } + vldmia.f32 X!, { s14 } + vldmia.f32 Y!, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 - fldmias X!, { s14 } - fldmias Y!, { s15 } + vldmia.f32 X!, { s14 } + vldmia.f32 Y!, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 @@ -96,8 +96,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s14 } - fldmias Y!, { s15 } + vldmia.f32 X!, { s14 } + vldmia.f32 Y!, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 @@ -109,32 +109,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. nop - fldmias X, { s14 } - fldmias Y, { s15 } + vldmia.f32 X, { s14 } + vldmia.f32 Y, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s14 } - fldmias Y, { s15 } + vldmia.f32 X, { s14 } + vldmia.f32 Y, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s14 } - fldmias Y, { s15 } + vldmia.f32 X, { s14 } + vldmia.f32 Y, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s14 } - fldmias Y, { s15 } + vldmia.f32 X, { s14 } + vldmia.f32 Y, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 @@ -146,8 +146,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s14 } - fldmias Y, { s15 } + vldmia.f32 X, { s14 } + vldmia.f32 Y, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 @@ -162,12 +162,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 - fldmias X!, { s8 - s9 } - fldmias Y!, { s4 - s5} + vldmia.f32 X!, { s8 - s9 } + vldmia.f32 Y!, { s4 - s5} fmacs s0 , s4, s8 - fldmias X!, { s10 - s11 } + vldmia.f32 X!, { s10 - s11 } fmacs s1 , s5, s9 - fldmias Y!, { s6 - s7 } + vldmia.f32 Y!, { s6 - s7 } fmacs s0 , s6, s10 fmacs s1 , s7, s11 @@ -175,8 +175,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 } - fldmias Y!, { s8 } + vldmia.f32 X!, { s4 } + vldmia.f32 Y!, { s8 } fmacs s0 , s4, s8 .endm @@ -185,26 +185,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4 nop - fldmias X, { s4 } - fldmias Y, { s8 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s8 } add X, X, INC_X add Y, Y, INC_Y fmacs s0 , s4, s8 - fldmias X, { s5 } - fldmias Y, { s9 } + vldmia.f32 X, { s5 } + vldmia.f32 Y, { s9 } add X, X, INC_X add Y, Y, INC_Y fmacs s1 , s5, s9 - fldmias X, { s6 } - fldmias Y, { s10 } + vldmia.f32 X, { s6 } + vldmia.f32 Y, { s10 } add X, X, INC_X add Y, Y, INC_Y fmacs s0 , s6, s10 - fldmias X, { s7 } - fldmias Y, { s11 } + vldmia.f32 X, { s7 } + vldmia.f32 Y, { s11 } add X, X, INC_X add Y, Y, INC_Y fmacs s1 , s7, s11 @@ -214,8 +214,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S1 - fldmias X, { s4 } - fldmias Y, { s8 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s8 } add X, X, INC_X fmacs s0 , s4, s8 add Y, Y, INC_Y diff --git a/kernel/arm/sgemm_kernel_4x2_vfp.S b/kernel/arm/sgemm_kernel_4x2_vfp.S index 1f21e5a1f..c072f4126 100644 --- a/kernel/arm/sgemm_kernel_4x2_vfp.S +++ b/kernel/arm/sgemm_kernel_4x2_vfp.S @@ -112,8 +112,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x2_SUB - fldmias AO! , { s0 - s3 } - fldmias BO! , { s4 - s5 } + vldmia.f32 AO! , { s0 - s3 } + vldmia.f32 BO! , { s4 - s5 } fmacs s8 , s0, s4 fmacs s9 , s1, s4 diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S index 6491d3571..789643f56 100644 --- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -136,29 +136,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I pld [ AO , #A_PRE ] - fldmias AO!, { s0 - s1 } + vldmia.f32 AO!, { s0 - s1 } pld [ BO , #B_PRE ] - fldmias BO!, { s8 - s9 } + vldmia.f32 BO!, { s8 - s9 } fmuls s16 , s0, s8 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmuls s17 , s1, s8 fmuls s18 , s2, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmuls s19 , s3, s8 fmuls s20 , s0, s9 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmuls s21 , s1, s9 fmuls s22 , s2, s9 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmuls s23 , s3, s9 fmuls s24 , s0, s10 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmuls s25 , s1, s10 fmuls s26 , s2, s10 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmuls s27 , s3, s10 fmuls s28 , s0, s11 @@ -174,20 +174,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ AO , #A_PRE ] fmacs s16 , s4, s12 fmacs s17 , s5, s12 - fldmias AO!, { s0 - s3 } + vldmia.f32 AO!, { s0 - s3 } fmacs s18 , s6, s12 pld [ BO , #B_PRE ] fmacs s19 , s7, s12 fmacs s20 , s4, s13 - fldmias BO!, { s8 - s11 } + vldmia.f32 BO!, { s8 - s11 } fmacs s21 , s5, s13 fmacs s22 , s6, s13 - //fldmias AO!, { s2 - s3 } + //vldmia.f32 AO!, { s2 - s3 } fmacs s23 , s7, s13 fmacs s24 , s4, s14 - //fldmias BO!, { s10 - s11 } + //vldmia.f32 BO!, { s10 - s11 } fmacs s25 , s5, s14 fmacs s26 , s6, s14 fmacs s27 , s7, s14 @@ -203,17 +203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_M1 fmacs s16 , s0, s8 - fldmias AO!, { s4 - s7 } + vldmia.f32 AO!, { s4 - s7 } fmacs s17 , s1, s8 fmacs s18 , s2, s8 - fldmias BO!, { s12 - s15 } - //fldmias AO!, { s6 - s7 } + vldmia.f32 BO!, { s12 - s15 } + //vldmia.f32 AO!, { s6 - s7 } fmacs s19 , s3, s8 fmacs s20 , s0, s9 fmacs s21 , s1, s9 fmacs s22 , s2, s9 - //fldmias BO!, { s14 - s15 } + //vldmia.f32 BO!, { s14 - s15 } fmacs s23 , s3, s9 fmacs s24 , s0, s10 @@ -300,7 +300,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA add r4 , CO2, r3 - fldmias CO1, { s8 - s11 } + vldmia.f32 CO1, { s8 - s11 } fmacs s8 , s0 , s16 flds s12, [CO2] @@ -322,7 +322,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ CO1 , #C_PRE ] - fldmias r4, { s8 - s11 } + vldmia.f32 r4, { s8 - s11 } fmacs s8 , s0 , s24 fsts s12, [CO2] @@ -338,7 +338,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add CO2, r4 , r3 - fldmias CO2, { s12 - s15 } + vldmia.f32 CO2, { s12 - s15 } fsts s8 , [r4 ] fmacs s12, s0 , s28 @@ -350,7 +350,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacs s15, s0 , s31 pld [ r4 , #C_PRE ] - fstmias CO2, { s12 - s15 } + vstmia.f32 CO2, { s12 - s15 } pld [ CO2 , #C_PRE ] add CO1, CO1, #16 diff --git a/kernel/arm/sgemm_tcopy_4_vfp.S b/kernel/arm/sgemm_tcopy_4_vfp.S index 9bb0e46b1..e61613c5c 100644 --- a/kernel/arm/sgemm_tcopy_4_vfp.S +++ b/kernel/arm/sgemm_tcopy_4_vfp.S @@ -76,21 +76,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x4_1 pld [ AO1, #A_PRE ] - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } add r3, AO1, LDA pld [ r3, #A_PRE ] - fldmias r3, { s4 - s7 } + vldmia.f32 r3, { s4 - s7 } add r3, r3, LDA pld [ r3, #A_PRE ] - fldmias r3, { s8 - s11 } + vldmia.f32 r3, { s8 - s11 } add r3, r3, LDA pld [ r3, #A_PRE ] - fldmias r3, { s12 - s15 } + vldmia.f32 r3, { s12 - s15 } - fstmias BO1, { s0 - s15 } + vstmia.f32 BO1, { s0 - s15 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -98,18 +98,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x4_2 - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } add r3, AO1, LDA - fldmias r3, { s4 - s7 } + vldmia.f32 r3, { s4 - s7 } add r3, r3, LDA - fldmias r3, { s8 - s11 } + vldmia.f32 r3, { s8 - s11 } add r3, r3, LDA - fldmias r3, { s12 - s15 } + vldmia.f32 r3, { s12 - s15 } - fstmias BO1, { s0 - s15 } + vstmia.f32 BO1, { s0 - s15 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -118,18 +118,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x4 - fldmias AO1, { s0 - s1 } + vldmia.f32 AO1, { s0 - s1 } add r3, AO1, LDA - fldmias r3, { s2 - s3 } + vldmia.f32 r3, { s2 - s3 } add r3, r3, LDA - fldmias r3, { s4 - s5 } + vldmia.f32 r3, { s4 - s5 } add r3, r3, LDA - fldmias r3, { s6 - s7 } + vldmia.f32 r3, { s6 - s7 } - fstmias BO2, { s0 - s7 } + vstmia.f32 BO2, { s0 - s7 } add AO1, AO1, #8 add BO2, BO2, #32 @@ -137,18 +137,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x4 - fldmias AO1, { s0 } + vldmia.f32 AO1, { s0 } add r3, AO1, LDA - fldmias r3, { s1 } + vldmia.f32 r3, { s1 } add r3, r3, LDA - fldmias r3, { s2 } + vldmia.f32 r3, { s2 } add r3, r3, LDA - fldmias r3, { s3 } + vldmia.f32 r3, { s3 } - fstmias BO3, { s0 - s3 } + vstmia.f32 BO3, { s0 - s3 } add AO1, AO1, #4 add BO3, BO3, #16 @@ -158,12 +158,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x2 - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } add r3, AO1, LDA - fldmias r3, { s4 - s7 } + vldmia.f32 r3, { s4 - s7 } - fstmias BO1, { s0 - s7 } + vstmia.f32 BO1, { s0 - s7 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -171,12 +171,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x2 - fldmias AO1, { s0 - s1 } + vldmia.f32 AO1, { s0 - s1 } add r3, AO1, LDA - fldmias r3, { s2 - s3 } + vldmia.f32 r3, { s2 - s3 } - fstmias BO2, { s0 - s3 } + vstmia.f32 BO2, { s0 - s3 } add AO1, AO1, #8 add BO2, BO2, #16 @@ -184,12 +184,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x2 - fldmias AO1, { s0 } + vldmia.f32 AO1, { s0 } add r3, AO1, LDA - fldmias r3, { s1 } + vldmia.f32 r3, { s1 } - fstmias BO3, { s0 - s1 } + vstmia.f32 BO3, { s0 - s1 } add AO1, AO1, #4 add BO3, BO3, #8 @@ -199,9 +199,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
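 // COPY4x1, COPY2x1 and COPY1x1 below are the tail cases of this packing routine:
 // each loads a run of 4, 2 or 1 consecutive floats with vldmia.f32 and stores it
 // to the packed buffer with vstmia.f32 before advancing the pointers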
.macro COPY4x1 - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } - fstmias BO1, { s0 - s3 } + vstmia.f32 BO1, { s0 - s3 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -209,9 +209,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x1 - fldmias AO1, { s0 - s1 } + vldmia.f32 AO1, { s0 - s1 } - fstmias BO2, { s0 - s1 } + vstmia.f32 BO2, { s0 - s1 } add AO1, AO1, #8 add BO2, BO2, #8 @@ -219,9 +219,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x1 - fldmias AO1, { s0 } + vldmia.f32 AO1, { s0 } - fstmias BO3, { s0 } + vstmia.f32 BO3, { s0 } add AO1, AO1, #4 add BO3, BO3, #4 diff --git a/kernel/arm/strmm_kernel_4x2_vfp.S b/kernel/arm/strmm_kernel_4x2_vfp.S index 635b1dd13..34fa0ee39 100644 --- a/kernel/arm/strmm_kernel_4x2_vfp.S +++ b/kernel/arm/strmm_kernel_4x2_vfp.S @@ -118,8 +118,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x2_SUB - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s5 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s5 } fmacs s8 , s0, s4 fmacs s9 , s1, s4 diff --git a/kernel/arm/strmm_kernel_4x4_vfpv3.S b/kernel/arm/strmm_kernel_4x4_vfpv3.S index e24d24eba..0f601d5b8 100644 --- a/kernel/arm/strmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/strmm_kernel_4x4_vfpv3.S @@ -122,30 +122,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I - fldmias AO!, { s0 - s1 } + vldmia.f32 AO!, { s0 - s1 } pld [ AO , #A_PRE-8 ] - fldmias BO!, { s8 - s9 } + vldmia.f32 BO!, { s8 - s9 } pld [ BO , #B_PRE-8 ] fmuls s16 , s0, s8 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmuls s17 , s1, s8 fmuls s18 , s2, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmuls s19 , s3, s8 fmuls s20 , s0, s9 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmuls s21 , s1, s9 fmuls s22 , s2, s9 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmuls s23 , s3, s9 fmuls s24 , s0, s10 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmuls s25 , s1, s10 fmuls s26 , s2, s10 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmuls s27 , s3, s10 fmuls s28 , s0, s11 @@ -161,20 +161,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ AO , #A_PRE ] fmacs s16 , s4, s12 fmacs s17 , s5, s12 - fldmias AO!, { s0 - s1 } + vldmia.f32 AO!, { s0 - s1 } fmacs s18 , s6, s12 pld [ BO , #B_PRE ] fmacs s19 , s7, s12 fmacs s20 , s4, s13 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmacs s21 , s5, s13 fmacs s22 , s6, s13 - fldmias BO!, { s8 - s9 } + vldmia.f32 BO!, { s8 - s9 } fmacs s23 , s7, s13 fmacs s24 , s4, s14 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmacs s25 , s5, s14 fmacs s26 , s6, s14 fmacs s27 , s7, s14 @@ -190,17 +190,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_M1 fmacs s16 , s0, s8 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmacs s17 , s1, s8 fmacs s18 , s2, s8 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmacs s19 , s3, s8 fmacs s20 , s0, s9 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmacs s21 , s1, s9 fmacs s22 , s2, s9 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmacs s23 , s3, s9 fmacs s24 , s0, s10 @@ -325,7 +325,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fsts s11, [r4 , #12 ] fmuls s15, s0 , s31 - fstmias CO2, { s12 - s15 } + vstmia.f32 CO2, { s12 - s15 } add CO1, CO1, #16 diff --git a/kernel/arm/swap_vfp.S b/kernel/arm/swap_vfp.S index 76661da79..0b3d98912 100644 --- a/kernel/arm/swap_vfp.S +++ b/kernel/arm/swap_vfp.S @@ -103,29 +103,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X, { d0 - d3 } - fldmiad Y, { d4 - d7 } - fstmiad Y!, { d0 - d3 } - fstmiad X!, { d4 - d7} + vldmia.f64 X, { d0 - d3 } + vldmia.f64 Y, { d4 - d7 } + vstmia.f64 Y!, { d0 - d3 } + vstmia.f64 X!, { d4 - d7} .endm .macro KERNEL_F1 - fldmiad X, { d0 } - fldmiad Y, { d4 } - fstmiad Y!, { d0 } - fstmiad X!, { d4 } + vldmia.f64 X, { d0 } + vldmia.f64 Y, { d4 } + vstmia.f64 Y!, { d0 } + vstmia.f64 X!, { d4 } .endm .macro KERNEL_S1 - fldmiad X, { d0 } - fldmiad Y, { d4 } - fstmiad Y, { d0 } - fstmiad X, { d4 } + vldmia.f64 X, { d0 } + vldmia.f64 Y, { d4 } + vstmia.f64 Y, { d0 } + vstmia.f64 X, { d4 } add X, X, INC_X add Y, Y, INC_Y @@ -135,29 +135,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 - fldmias X, { s0 - s3 } - fldmias Y, { s4 - s7 } - fstmias Y!, { s0 - s3 } - fstmias X!, { s4 - s7} + vldmia.f32 X, { s0 - s3 } + vldmia.f32 Y, { s4 - s7 } + vstmia.f32 Y!, { s0 - s3 } + vstmia.f32 X!, { s4 - s7} .endm .macro KERNEL_F1 - fldmias X, { s0 } - fldmias Y, { s4 } - fstmias Y!, { s0 } - fstmias X!, { s4 } + vldmia.f32 X, { s0 } + vldmia.f32 Y, { s4 } + vstmia.f32 Y!, { s0 } + vstmia.f32 X!, { s4 } .endm .macro KERNEL_S1 - fldmias X, { s0 } - fldmias Y, { s4 } - fstmias Y, { s0 } - fstmias X, { s4 } + vldmia.f32 X, { s0 } + vldmia.f32 Y, { s4 } + vstmia.f32 Y, { s0 } + vstmia.f32 X, { s4 } add X, X, INC_X add Y, Y, INC_Y @@ -174,35 +174,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X, { d0 - d3 } - fldmiad Y, { d4 - d7 } - fstmiad Y!, { d0 - d3 } - fstmiad X!, { d4 - d7} + vldmia.f64 X, { d0 - d3 } + vldmia.f64 Y, { d4 - d7 } + vstmia.f64 Y!, { d0 - d3 } + vstmia.f64 X!, { d4 - d7} pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X, { d0 - d3 } - fldmiad Y, { d4 - d7 } - fstmiad Y!, { d0 - d3 } - fstmiad X!, { d4 - d7} + vldmia.f64 X, { d0 - d3 } + vldmia.f64 Y, { d4 - d7 } + vstmia.f64 Y!, { d0 - d3 } + vstmia.f64 X!, { d4 - d7} .endm .macro KERNEL_F1 - fldmiad X, { d0 - d1 } - fldmiad Y, { d4 - d5 } - fstmiad Y!, { d0 - d1 } - fstmiad X!, { d4 - d5 } + vldmia.f64 X, { d0 - d1 } + vldmia.f64 Y, { d4 - d5 } + vstmia.f64 Y!, { d0 - d1 } + vstmia.f64 X!, { d4 - d5 } .endm .macro KERNEL_S1 - fldmiad X, { d0 - d1 } - fldmiad Y, { d4 - d5 } - fstmiad Y, { d0 - d1 } - fstmiad X, { d4 - d5 } + vldmia.f64 X, { d0 - d1 } + vldmia.f64 Y, { d4 - d5 } + vstmia.f64 Y, { d0 - d1 } + vstmia.f64 X, { d4 - d5 } add X, X, INC_X add Y, Y, INC_Y @@ -215,33 +215,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmias X, { s0 - s3 } - fldmias Y, { s4 - s7 } - fstmias Y!, { s0 - s3 } - fstmias X!, { s4 - s7} + vldmia.f32 X, { s0 - s3 } + vldmia.f32 Y, { s4 - s7 } + vstmia.f32 Y!, { s0 - s3 } + vstmia.f32 X!, { s4 - s7} - fldmias X, { s0 - s3 } - fldmias Y, { s4 - s7 } - fstmias Y!, { s0 - s3 } - fstmias X!, { s4 - s7} + vldmia.f32 X, { s0 - s3 } + vldmia.f32 Y, { s4 - s7 } + vstmia.f32 Y!, { s0 - s3 } + vstmia.f32 X!, { s4 - s7} .endm .macro KERNEL_F1 - fldmias X, { s0 - s1 } - fldmias Y, { s4 - s5 } - fstmias Y!, { s0 - s1 } - fstmias X!, { s4 - s5 } + vldmia.f32 X, { s0 - s1 } + vldmia.f32 Y, { s4 - s5 } + vstmia.f32 Y!, { s0 - s1 } + vstmia.f32 X!, { s4 - s5 } .endm .macro KERNEL_S1 - fldmias X, { s0 - s1 } - fldmias Y, { s4 - s5 } - fstmias Y, { s0 - s1 } - fstmias X, { s4 - s5 } + vldmia.f32 X, { s0 - s1 } + vldmia.f32 Y, { s4 - s5 } + vstmia.f32 Y, { s0 - s1 } + vstmia.f32 X, { s4 - s5 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/zcopy_vfp.S b/kernel/arm/zcopy_vfp.S index 48aee4ce0..899dd1e36 100644 --- a/kernel/arm/zcopy_vfp.S +++ b/kernel/arm/zcopy_vfp.S @@ -66,15 +66,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] pld [ X, #X_PRE+32 ] - fldmiad X!, { d0 - d7 } - fstmiad Y!, { d0 - d7 } + vldmia.f64 X!, { d0 - d7 } + vstmia.f64 Y!, { d0 - d7 } .endm .macro COPY_F1 - fldmiad X!, { d0 - d1 } - fstmiad Y!, { d0 - d1 } + vldmia.f64 X!, { d0 - d1 } + vstmia.f64 Y!, { d0 - d1 } .endm @@ -84,23 +84,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S4 nop - fldmiad X, { d0 - d1 } - fstmiad Y, { d0 - d1 } + vldmia.f64 X, { d0 - d1 } + vstmia.f64 Y, { d0 - d1 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d2 - d3 } - fstmiad Y, { d2 - d3 } + vldmia.f64 X, { d2 - d3 } + vstmia.f64 Y, { d2 - d3 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d0 - d1 } - fstmiad Y, { d0 - d1 } + vldmia.f64 X, { d0 - d1 } + vstmia.f64 Y, { d0 - d1 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d2 - d3 } - fstmiad Y, { d2 - d3 } + vldmia.f64 X, { d2 - d3 } + vstmia.f64 Y, { d2 - d3 } add X, X, INC_X add Y, Y, INC_Y @@ -109,8 +109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S1 - fldmiad X, { d0 - d1 } - fstmiad Y, { d0 - d1 } + vldmia.f64 X, { d0 - d1 } + vstmia.f64 Y, { d0 - d1 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/zdot_vfp.S b/kernel/arm/zdot_vfp.S index c0cd92d3c..5ef9f16a9 100644 --- a/kernel/arm/zdot_vfp.S +++ b/kernel/arm/zdot_vfp.S @@ -76,15 +76,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X!, { d4 - d5 } - fldmiad Y!, { d8 - d9 } + vldmia.f64 X!, { d4 - d5 } + vldmia.f64 Y!, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 - fldmiad X!, { d6 - d7 } + vldmia.f64 X!, { d6 - d7 } fmacd d2 , d5, d9 fmacd d3 , d5, d8 - fldmiad Y!, { d10 - d11 } + vldmia.f64 Y!, { d10 - d11 } fmacd d0 , d6, d10 fmacd d1 , d6, d11 pld [ X, #X_PRE ] @@ -93,15 +93,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
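 // second half of KERNEL_F4: two more complex elements are read from X and Y, and the
 // four real/imaginary product combinations are accumulated into d0 - d3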
pld [ Y, #X_PRE ] - fldmiad X!, { d4 - d5 } - fldmiad Y!, { d8 - d9 } + vldmia.f64 X!, { d4 - d5 } + vldmia.f64 Y!, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 - fldmiad X!, { d6 - d7 } + vldmia.f64 X!, { d6 - d7 } fmacd d2 , d5, d9 fmacd d3 , d5, d8 - fldmiad Y!, { d10 - d11 } + vldmia.f64 Y!, { d10 - d11 } fmacd d0 , d6, d10 fmacd d1 , d6, d11 fmacd d2 , d7, d11 @@ -111,8 +111,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 - d5 } - fldmiad Y!, { d8 - d9 } + vldmia.f64 X!, { d4 - d5 } + vldmia.f64 Y!, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 @@ -127,8 +127,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. nop - fldmiad X, { d4 - d5 } - fldmiad Y, { d8 - d9 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 @@ -136,8 +136,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d4 - d5 } - fldmiad Y, { d8 - d9 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 @@ -145,8 +145,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d4 - d5 } - fldmiad Y, { d8 - d9 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 @@ -154,8 +154,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d4 - d5 } - fldmiad Y, { d8 - d9 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 @@ -168,8 +168,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X, { d4 - d5 } - fldmiad Y, { d8 - d9 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S index 53d18b07b..7934a500e 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfp.S +++ b/kernel/arm/zgemm_kernel_2x2_vfp.S @@ -360,7 +360,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d7 } + vldmia.f64 CO1, { d4 - d7 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 @@ -372,9 +372,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } - fldmiad CO2, { d4 - d7 } + vldmia.f64 CO2, { d4 - d7 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 @@ -386,7 +386,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad CO2, { d4 - d7 } + vstmia.f64 CO2, { d4 - d7 } add CO1, CO1, #32 @@ -543,23 +543,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d5 } + vldmia.f64 CO1, { d4 - d5 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } - fldmiad CO2, { d4 - d5 } + vldmia.f64 CO2, { d4 - d5 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad CO2, { d4 - d5 } + vstmia.f64 CO2, { d4 - d5 } add CO1, CO1, #16 @@ -714,7 +714,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d7 } + vldmia.f64 CO1, { d4 - d7 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 @@ -726,7 +726,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } add CO1, CO1, #32 @@ -843,14 +843,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d5 } + vldmia.f64 CO1, { d4 - d5 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } add CO1, CO1, #16 diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S index a9d4eddeb..cbb10f342 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -374,8 +374,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d7 } - fldmiad CO2, { d8 - d11 } + vldmia.f64 CO1, { d4 - d7 } + vldmia.f64 CO2, { d8 - d11 } FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 @@ -406,8 +406,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d10, d1 , d23 FMAC_I2 d11, d1 , d22 - fstmiad CO1, { d4 - d7 } - fstmiad CO2, { d8 - d11 } + vstmia.f64 CO1, { d4 - d7 } + vstmia.f64 CO2, { d8 - d11 } add CO1, CO1, #32 @@ -570,8 +570,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d5 } - fldmiad CO2, { d8 - d9 } + vldmia.f64 CO1, { d4 - d5 } + vldmia.f64 CO2, { d8 - d9 } FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 @@ -588,8 +588,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d8 , d1 , d21 FMAC_I2 d9 , d1 , d20 - fstmiad CO1, { d4 - d5 } - fstmiad CO2, { d8 - d9 } + vstmia.f64 CO1, { d4 - d5 } + vstmia.f64 CO2, { d8 - d9 } add CO1, CO1, #16 @@ -752,7 +752,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d7 } + vldmia.f64 CO1, { d4 - d7 } FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 @@ -769,7 +769,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d19 FMAC_I2 d7 , d1 , d18 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } add CO1, CO1, #32 @@ -887,7 +887,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d5 } + vldmia.f64 CO1, { d4 - d5 } FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 @@ -897,7 +897,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 d4 , d1 , d17 FMAC_I2 d5 , d1 , d16 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } add CO1, CO1, #16 diff --git a/kernel/arm/zgemm_tcopy_2_vfp.S b/kernel/arm/zgemm_tcopy_2_vfp.S index 7e27ca6a6..5e1a384b1 100644 --- a/kernel/arm/zgemm_tcopy_2_vfp.S +++ b/kernel/arm/zgemm_tcopy_2_vfp.S @@ -74,13 +74,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x2 pld [ AO1, #A_PRE ] - fldmiad AO1, { d0 - d3 } + vldmia.f64 AO1, { d0 - d3 } add r3, AO1, LDA pld [ r3, #A_PRE ] - fldmiad r3, { d4 - d7 } + vldmia.f64 r3, { d4 - d7 } - fstmiad BO1, { d0 - d7 } + vstmia.f64 BO1, { d0 - d7 } add AO1, AO1, #32 add BO1, BO1, M4 @@ -88,12 +88,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x2 - fldmiad AO1, { d0 -d1 } + vldmia.f64 AO1, { d0 -d1 } add r3, AO1, LDA - fldmiad r3, { d2 - d3 } + vldmia.f64 r3, { d2 - d3 } - fstmiad BO2, { d0 - d3 } + vstmia.f64 BO2, { d0 - d3 } add AO1, AO1, #16 add BO2, BO2, #32 @@ -102,9 +102,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*************************************************************************************************************************/ .macro COPY2x1 - fldmiad AO1, { d0 - d3 } + vldmia.f64 AO1, { d0 - d3 } - fstmiad BO1, { d0 - d3 } + vstmia.f64 BO1, { d0 - d3 } add AO1, AO1, #32 add BO1, BO1, M4 @@ -112,9 +112,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x1 - fldmiad AO1, { d0 - d1 } + vldmia.f64 AO1, { d0 - d1 } - fstmiad BO2, { d0 - d1 } + vstmia.f64 BO2, { d0 - d1 } add AO1, AO1, #16 add BO2, BO2, #16 diff --git a/kernel/arm/zgemv_n_vfp.S b/kernel/arm/zgemv_n_vfp.S index 3e3a1bc07..4e64d8785 100644 --- a/kernel/arm/zgemv_n_vfp.S +++ b/kernel/arm/zgemv_n_vfp.S @@ -204,7 +204,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad YO, { d4 - d7 } + vldmia.f64 YO, { d4 - d7 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 @@ -216,9 +216,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad YO!, { d4 - d7 } + vstmia.f64 YO!, { d4 - d7 } - fldmiad YO, { d4 - d7 } + vldmia.f64 YO, { d4 - d7 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 @@ -230,7 +230,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad YO!, { d4 - d7 } + vstmia.f64 YO!, { d4 - d7 } .endm @@ -269,14 +269,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, #16 @@ -352,47 +352,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
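 // strided write-back: each complex element of y is loaded, updated with alpha times the
 // accumulated result, stored back, and YO then advances by INC_Y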
fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, INC_Y - fldmiad YO, { d6 - d7 } + vldmia.f64 YO, { d6 - d7 } FMAC_R1 d6 , d0 , d10 FMAC_I1 d7 , d0 , d11 FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad YO, { d6 - d7 } + vstmia.f64 YO, { d6 - d7 } add YO, YO, INC_Y - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, INC_Y - fldmiad YO, { d6 - d7 } + vldmia.f64 YO, { d6 - d7 } FMAC_R1 d6 , d0 , d14 FMAC_I1 d7 , d0 , d15 FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad YO, { d6 - d7 } + vstmia.f64 YO, { d6 - d7 } add YO, YO, INC_Y @@ -433,14 +433,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, INC_Y diff --git a/kernel/arm/zgemv_t_vfp.S b/kernel/arm/zgemv_t_vfp.S index 2193083af..c66fa4fb8 100644 --- a/kernel/arm/zgemv_t_vfp.S +++ b/kernel/arm/zgemv_t_vfp.S @@ -151,12 +151,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmiad XO! , { d2 - d3 } - fldmiad AO1!, { d4 - d5 } + vldmia.f64 XO! , { d2 - d3 } + vldmia.f64 AO1!, { d4 - d5 } fmacd d12 , d4 , d2 fmacd d13 , d4 , d3 - fldmiad AO2!, { d8 - d9 } + vldmia.f64 AO2!, { d8 - d9 } KMAC_R d12 , d5 , d3 KMAC_I d13 , d5 , d2 @@ -169,7 +169,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F2 - fldmiad YO, { d4 - d7 } + vldmia.f64 YO, { d4 - d7 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 @@ -181,7 +181,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad YO!, { d4 - d7 } + vstmia.f64 YO!, { d4 - d7 } .endm @@ -205,8 +205,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmiad XO! , { d2 - d3 } - fldmiad AO1!, { d4 - d5 } + vldmia.f64 XO! , { d2 - d3 } + vldmia.f64 AO1!, { d4 - d5 } fmacd d12 , d4 , d2 fmacd d13 , d4 , d3 @@ -217,14 +217,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad YO!, { d4 - d5 } + vstmia.f64 YO!, { d4 - d5 } .endm @@ -250,9 +250,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X1 - fldmiad XO , { d2 - d3 } - fldmiad AO1!, { d4 - d5 } - fldmiad AO2!, { d8 - d9 } + vldmia.f64 XO , { d2 - d3 } + vldmia.f64 AO1!, { d4 - d5 } + vldmia.f64 AO2!, { d8 - d9 } fmacd d12 , d4 , d2 fmacd d13 , d4 , d3 @@ -270,25 +270,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE_S2 - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, INC_Y - fldmiad YO, { d6 - d7 } + vldmia.f64 YO, { d6 - d7 } FMAC_R1 d6 , d0 , d14 FMAC_I1 d7 , d0 , d15 FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad YO, { d6 - d7 } + vstmia.f64 YO, { d6 - d7 } add YO, YO, INC_Y @@ -314,8 +314,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmiad XO , { d2 - d3 } - fldmiad AO1!, { d4 - d5 } + vldmia.f64 XO , { d2 - d3 } + vldmia.f64 AO1!, { d4 - d5 } fmacd d12 , d4 , d2 fmacd d13 , d4 , d3 @@ -328,14 +328,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, INC_Y From 6fc85a63596bd1fe85f066f35c358b5815d38fe1 Mon Sep 17 00:00:00 2001 From: fengruilin Date: Wed, 26 Sep 2018 15:14:04 +0800 Subject: [PATCH 047/236] test_axpy work error on LOONGSON3A platform #1777 --- kernel/mips64/axpy_loongson3a.S | 14 ++++++++++++++ kernel/mips64/daxpy_loongson3a_simd.S | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/kernel/mips64/axpy_loongson3a.S b/kernel/mips64/axpy_loongson3a.S index 5904bc580..765e5ebbb 100644 --- a/kernel/mips64/axpy_loongson3a.S +++ b/kernel/mips64/axpy_loongson3a.S @@ -270,6 +270,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 5 .L20: + beqz INCY, .L27 dsra I, N, 3 move YY, Y @@ -450,5 +451,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. j $31 NOP + .align 3 +.L27: + LD b1, 0 * SIZE(Y) +.L28: + daddiu N, N, -1 + LD a1, 0 * SIZE(X) + daddu X, X, INCX + bgtz N, .L28 + MADD b1, b1, ALPHA, a1 + + j .L999 + ST b1, 0 * SIZE(Y) + EPILOGUE diff --git a/kernel/mips64/daxpy_loongson3a_simd.S b/kernel/mips64/daxpy_loongson3a_simd.S index f54008bc2..23225770a 100644 --- a/kernel/mips64/daxpy_loongson3a_simd.S +++ b/kernel/mips64/daxpy_loongson3a_simd.S @@ -562,6 +562,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //INCX!=1 or INCY != 1 .L20: + beq INCY, $0, .L27 dsra I, N, 3 move YY, Y @@ -754,5 +755,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
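 // fix for the INCY == 0 case: every store would target y[0], so the added .L27/.L28 loop
 // keeps the running alpha * x[i] sum in register b1 and writes it back to y[0] once at the end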
j $31 NOP + .align 3 +.L27: + LD b1, 0 * SIZE(Y) +.L28: + daddiu N, N, -1 + LD a1, 0 * SIZE(X) + daddu X, X, INCX + bgtz N, .L28 + MADD b1, b1, ALPHA, a1 + + j .L999 + ST b1, 0 * SIZE(Y) + EPILOGUE From 9b2a7ad40d22e08f7d3a2e1443aa3f8a10c7b77f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 28 Sep 2018 23:05:15 +0200 Subject: [PATCH 048/236] Convert fldmia/fstmia instructions to UAL syntax for clang7 second part of fix for #1774, containing files missed in #1775 --- kernel/arm/cgemm_ncopy_2_vfp.S | 8 ++++---- kernel/arm/dgemm_ncopy_2_vfp.S | 8 ++++---- kernel/arm/dgemm_ncopy_4_vfp.S | 16 ++++++++-------- kernel/arm/sgemm_ncopy_2_vfp.S | 8 ++++---- kernel/arm/sgemm_ncopy_4_vfp.S | 16 ++++++++-------- kernel/arm/zgemm_ncopy_2_vfp.S | 8 ++++---- kernel/arm/ztrmm_kernel_2x2_vfp.S | 12 ++++++------ kernel/arm/ztrmm_kernel_2x2_vfpv3.S | 12 ++++++------ 8 files changed, 44 insertions(+), 44 deletions(-) diff --git a/kernel/arm/cgemm_ncopy_2_vfp.S b/kernel/arm/cgemm_ncopy_2_vfp.S index 29eeab492..fe4959988 100644 --- a/kernel/arm/cgemm_ncopy_2_vfp.S +++ b/kernel/arm/cgemm_ncopy_2_vfp.S @@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s6 , [ AO2, #8 ] flds s7 , [ AO2, #12 ] - fstmias BO!, { s0 - s7 } + vstmia.f32 BO!, { s0 - s7 } add AO2, AO2, #16 .endm @@ -99,7 +99,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s3 , [ AO2, #4 ] add AO1, AO1, #8 - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO2, AO2, #8 .endm @@ -111,7 +111,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s2 , [ AO1, #8 ] flds s3 , [ AO1, #12 ] - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO1, AO1, #16 .endm @@ -122,7 +122,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0 , [ AO1, #0 ] flds s1 , [ AO1, #4 ] - fstmias BO!, { s0 - s1 } + vstmia.f32 BO!, { s0 - s1 } add AO1, AO1, #8 .endm diff --git a/kernel/arm/dgemm_ncopy_2_vfp.S b/kernel/arm/dgemm_ncopy_2_vfp.S index 6266c61d2..9642b6478 100644 --- a/kernel/arm/dgemm_ncopy_2_vfp.S +++ b/kernel/arm/dgemm_ncopy_2_vfp.S @@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d3 , [ AO2, #8 ] add AO1, AO1, #16 - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO2, AO2, #16 .endm @@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d1 , [ AO2, #0 ] add AO1, AO1, #8 - fstmiad BO!, { d0 - d1 } + vstmia.f64 BO!, { d0 - d1 } add AO2, AO2, #8 .endm @@ -95,7 +95,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0 , [ AO1, #0 ] fldd d1 , [ AO1, #8 ] - fstmiad BO!, { d0 - d1 } + vstmia.f64 BO!, { d0 - d1 } add AO1, AO1, #16 .endm @@ -105,7 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0 , [ AO1, #0 ] - fstmiad BO!, { d0 } + vstmia.f64 BO!, { d0 } add AO1, AO1, #8 .endm diff --git a/kernel/arm/dgemm_ncopy_4_vfp.S b/kernel/arm/dgemm_ncopy_4_vfp.S index ffc19a9cc..5760cbd8a 100644 --- a/kernel/arm/dgemm_ncopy_4_vfp.S +++ b/kernel/arm/dgemm_ncopy_4_vfp.S @@ -105,10 +105,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
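 // note: fldmiad/fstmiad are the pre-UAL spellings of vldmia.f64/vstmia.f64; they are the
 // same instructions, renamed here to the UAL syntax that clang 7 requires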
fldd d11, [ AO4, #16 ] fldd d15, [ AO4, #24 ] - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO4, AO4, #32 - fstmiad BO!, { d4 - d7 } - fstmiad BO!, { d8 - d15 } + vstmia.f64 BO!, { d4 - d7 } + vstmia.f64 BO!, { d8 - d15 } .endm @@ -122,7 +122,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d3 , [ AO4, #0 ] add AO3, AO3, #8 - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO4, AO4, #8 .endm @@ -140,7 +140,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d5 , [ AO2, #16 ] fldd d7 , [ AO2, #24 ] - fstmiad BO!, { d0 - d7 } + vstmia.f64 BO!, { d0 - d7 } add AO2, AO2, #32 .endm @@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d1 , [ AO2, #0 ] add AO1, AO1, #8 - fstmiad BO!, { d0 - d1 } + vstmia.f64 BO!, { d0 - d1 } add AO2, AO2, #8 .endm @@ -164,7 +164,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d2 , [ AO1, #16 ] fldd d3 , [ AO1, #24 ] - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO1, AO1, #32 .endm @@ -174,7 +174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0 , [ AO1, #0 ] - fstmiad BO!, { d0 } + vstmia.f64 BO!, { d0 } add AO1, AO1, #8 .endm diff --git a/kernel/arm/sgemm_ncopy_2_vfp.S b/kernel/arm/sgemm_ncopy_2_vfp.S index ff4ff0845..dd4596602 100644 --- a/kernel/arm/sgemm_ncopy_2_vfp.S +++ b/kernel/arm/sgemm_ncopy_2_vfp.S @@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s3 , [ AO2, #4 ] add AO1, AO1, #8 - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO2, AO2, #8 .endm @@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s1 , [ AO2, #0 ] add AO1, AO1, #4 - fstmias BO!, { s0 - s1 } + vstmia.f32 BO!, { s0 - s1 } add AO2, AO2, #4 .endm @@ -95,7 +95,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0 , [ AO1, #0 ] flds s1 , [ AO1, #4 ] - fstmias BO!, { s0 - s1 } + vstmia.f32 BO!, { s0 - s1 } add AO1, AO1, #8 .endm @@ -105,7 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0 , [ AO1, #0 ] - fstmias BO!, { s0 } + vstmia.f32 BO!, { s0 } add AO1, AO1, #4 .endm diff --git a/kernel/arm/sgemm_ncopy_4_vfp.S b/kernel/arm/sgemm_ncopy_4_vfp.S index ab013134e..dbcea5961 100644 --- a/kernel/arm/sgemm_ncopy_4_vfp.S +++ b/kernel/arm/sgemm_ncopy_4_vfp.S @@ -100,10 +100,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s11, [ AO4, #8 ] flds s15, [ AO4, #12 ] - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO4, AO4, #16 - fstmias BO!, { s4 - s7 } - fstmias BO!, { s8 - s15 } + vstmia.f32 BO!, { s4 - s7 } + vstmia.f32 BO!, { s8 - s15 } .endm @@ -117,7 +117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s3 , [ AO4, #0 ] add AO3, AO3, #4 - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO4, AO4, #4 .endm @@ -135,7 +135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s5 , [ AO2, #8 ] flds s7 , [ AO2, #12 ] - fstmias BO!, { s0 - s7 } + vstmia.f32 BO!, { s0 - s7 } add AO2, AO2, #16 .endm @@ -147,7 +147,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s1 , [ AO2, #0 ] add AO1, AO1, #4 - fstmias BO!, { s0 - s1 } + vstmia.f32 BO!, { s0 - s1 } add AO2, AO2, #4 .endm @@ -159,7 +159,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
flds s2 , [ AO1, #8 ] flds s3 , [ AO1, #12 ] - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO1, AO1, #16 .endm @@ -169,7 +169,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0 , [ AO1, #0 ] - fstmias BO!, { s0 } + vstmia.f32 BO!, { s0 } add AO1, AO1, #4 .endm diff --git a/kernel/arm/zgemm_ncopy_2_vfp.S b/kernel/arm/zgemm_ncopy_2_vfp.S index b3fa225bb..d0661da2a 100644 --- a/kernel/arm/zgemm_ncopy_2_vfp.S +++ b/kernel/arm/zgemm_ncopy_2_vfp.S @@ -87,7 +87,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d6 , [ AO2, #16 ] fldd d7 , [ AO2, #24 ] - fstmiad BO!, { d0 - d7 } + vstmia.f64 BO!, { d0 - d7 } add AO2, AO2, #32 .endm @@ -101,7 +101,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d3 , [ AO2, #8 ] add AO1, AO1, #16 - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO2, AO2, #16 .endm @@ -113,7 +113,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d2 , [ AO1, #16 ] fldd d3 , [ AO1, #24 ] - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO1, AO1, #32 .endm @@ -124,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0 , [ AO1, #0 ] fldd d1 , [ AO1, #8 ] - fstmiad BO!, { d0 - d1 } + vstmia.f64 BO!, { d0 - d1 } add AO1, AO1, #16 .endm diff --git a/kernel/arm/ztrmm_kernel_2x2_vfp.S b/kernel/arm/ztrmm_kernel_2x2_vfp.S index cb6bc050e..4393bc9f6 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfp.S @@ -385,7 +385,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } fldd d4 , FP_ZERO vmov.f64 d5 , d4 @@ -402,7 +402,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad CO2, { d4 - d7 } + vstmia.f64 CO2, { d4 - d7 } add CO1, CO1, #32 @@ -567,7 +567,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } fldd d4 , FP_ZERO vmov.f64 d5 , d4 @@ -577,7 +577,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad CO2, { d4 - d5 } + vstmia.f64 CO2, { d4 - d5 } add CO1, CO1, #16 @@ -747,7 +747,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } add CO1, CO1, #32 @@ -872,7 +872,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } add CO1, CO1, #16 diff --git a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S index 3e6962f06..39b12caa0 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S @@ -391,8 +391,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d10, d1 , d23 FMAC_I2 d11, d1 , d22 - fstmiad CO1, { d4 - d7 } - fstmiad CO2, { d8 - d11 } + vstmia.f64 CO1, { d4 - d7 } + vstmia.f64 CO2, { d8 - d11 } add CO1, CO1, #32 @@ -569,8 +569,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 d8 , d1 , d21 FMAC_I2 d9 , d1 , d20 - fstmiad CO1, { d4 - d5 } - fstmiad CO2, { d8 - d9 } + vstmia.f64 CO1, { d4 - d5 } + vstmia.f64 CO2, { d8 - d9 } add CO1, CO1, #16 @@ -747,7 +747,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d19 FMAC_I2 d7 , d1 , d18 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } add CO1, CO1, #32 @@ -872,7 +872,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d4 , d1 , d17 FMAC_I2 d5 , d1 , d16 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } add CO1, CO1, #16 From 45fe8cb0c5d06f890913e86078cb48ac379c65dc Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 3 Oct 2018 14:45:25 +0000 Subject: [PATCH 049/236] Create a AVX512 enabled version of DGEMM This patch adds dgemm_kernel_4x8_skylakex.c which is * dgemm_kernel_4x8_haswell.s converted to C + intrinsics * 8x8 support added * 8x8 kernel implemented using AVX512 Performance is a work in progress, but already shows a 10% - 20% increase for a wide range of matrix sizes. --- kernel/x86_64/KERNEL.SKYLAKEX | 16 +- kernel/x86_64/dgemm_kernel_4x8_skylakex.c | 1288 +++++++++++++++++++++ 2 files changed, 1293 insertions(+), 11 deletions(-) create mode 100644 kernel/x86_64/dgemm_kernel_4x8_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 1256f4c3c..ba149512d 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -2,18 +2,12 @@ include $(KERNELDIR)/KERNEL.HASWELL SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S +DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c -#DTRMMKERNEL = ../generic/trmmkernel_16x2.c -#DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S -#DGEMMINCOPY = ../generic/gemm_ncopy_16.c -#DGEMMITCOPY = ../generic/gemm_tcopy_16.c -#DGEMMONCOPY = ../generic/gemm_ncopy_2.c -#DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -#DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -#DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -#DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -#DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - +DGEMMINCOPY = ../generic/gemm_ncopy_8.c +DGEMMITCOPY = ../generic/gemm_tcopy_8.c +DGEMMONCOPY = ../generic/gemm_ncopy_8.c +DGEMMOTCOPY = ../generic/gemm_tcopy_8.c SGEMM_BETA = ../generic/gemm_beta.c DGEMM_BETA = ../generic/gemm_beta.c diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c new file mode 100644 index 000000000..4162611ff --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c @@ -0,0 +1,1288 @@ +/********************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/* + * This file is based on dgemm_kernel_4x8_haswell.s (original copyright above). + * The content got translated from ASM to C+intrinsics, significantly simplified, + * and AVX512 support added by Arjan van de Ven + */ + + +#include "common.h" +#include + + +/******************************************************************************************* +* Macro definitions +*******************************************************************************************/ + + +/******************************************************************************************/ + + +#define INIT4x8() \ + ymm4 = _mm256_setzero_pd(); \ + ymm5 = _mm256_setzero_pd(); \ + ymm6 = _mm256_setzero_pd(); \ + ymm7 = _mm256_setzero_pd(); \ + ymm8 = _mm256_setzero_pd(); \ + ymm9 = _mm256_setzero_pd(); \ + ymm10 = _mm256_setzero_pd(); \ + ymm11 = _mm256_setzero_pd(); \ + + +#define KERNEL4x8_SUB() \ + ymm0 = _mm256_loadu_pd(AO - 16); \ +/* ymm0 [ A B C D ] */ \ + ymm1 = _mm256_loadu_pd(BO - 12); \ + ymm2 = _mm256_loadu_pd(BO - 8); \ +/* ymm1 [ 1 2 3 4 ] */ \ +/* ymm2 [ 5 6 7 8 ] */ \ + \ + ymm4 += ymm0 * ymm1; \ +/* ymm4 += [ A*1 | B*2 | C*3 | D*4 ] */ \ + ymm8 += ymm0 * ymm2; \ +/* ymm8 += [ A*5 | B*6 | C*7 | D*8 ] */ \ + \ + ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \ +/* ymm0 [ B A D C ] */ \ + ymm5 += ymm0 * ymm1; \ +/* ymm5 += [ B*1 | A*2 | D*3 | C*4 ] */ \ + ymm9 += ymm0 * ymm2; \ +/* ymm9 += [ B*5 | A*6 | D*7 | C*8 ] */ \ + \ + ymm0 = _mm256_permute4x64_pd(ymm0, 0x1b); \ +/* ymm0 [ C D A B ]] */ \ + ymm6 += ymm0 * ymm1; \ +/* ymm6 += [ C*1 | D*2 | A*3 | B*4 ] */ \ + ymm10+= ymm0 * ymm2; \ +/* ymm10 += [ C*5 | D*6 | A*7 | B*8 ] */ \ + \ + ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \ +/* ymm0 [ D C B A ] */ \ + ymm7 += ymm0 * ymm1; \ +/* ymm7 += [ D*1 | C*2 | B*3 | A*4 ] */ \ + ymm11+= ymm0 * ymm2; \ +/* ymm11 += [ D*5 | C*6 | B*7 | A*8 ] */ \ + AO += 4; \ + BO += 8; + + +#define SAVE4x8(ALPHA) \ + ymm0 = _mm256_set1_pd(ALPHA); \ + ymm4 *= ymm0; \ + ymm5 *= ymm0; \ + ymm6 *= ymm0; \ + ymm7 *= ymm0; \ + ymm8 *= ymm0; \ + ymm9 *= ymm0; \ + ymm10 *= ymm0; \ + ymm11 *= ymm0; \ + \ +/* Entry values: */ \ +/* ymm4 = a [ A*1 | B*2 | C*3 | D*4 ] */ \ +/* ymm5 = a [ B*1 | A*2 | D*3 | C*4 ] */ \ +/* ymm6 = a [ C*1 | D*2 | A*3 | B*4 ] */ \ +/* ymm7 = a [ D*1 | C*2 | B*3 | A*4 ] */ \ +/* ymm8 = a [ A*5 | B*6 | C*7 | D*8 ] */ \ +/* ymm9 = a [ B*5 | A*6 | D*7 | C*8 ] */ \ +/* ymm10 = a [ C*5 | D*6 | A*7 | B*8 ] */ \ +/* ymm11 = a [ D*5 | C*6 | B*7 | A*8 ] */ \ + \ + ymm5 = _mm256_permute4x64_pd(ymm5, 0xb1); \ +/* ymm5 = a [ A*2 | B*1 | C*4 | D*3 ] */ \ + ymm7 = _mm256_permute4x64_pd(ymm7, 0xb1); \ +/* ymm7 = a [ C*2 | D*1 | A*4 | B*3 ] */ \ + \ + ymm0 = _mm256_blend_pd(ymm4, ymm5, 0x0a); \ + 
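+/* blend mask 0x0a (binary 1010) takes lanes 1 and 3 from ymm5 and lanes 0 and 2 from ymm4; */ \
+/* the 0x05 mask on the next line selects the complementary lanes */ \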
ymm1 = _mm256_blend_pd(ymm4, ymm5, 0x05); \ +/* ymm0 = a [ A*1 | B*1 | C*3 | D*3 ] */ \ +/* ymm1 = a [ A*2 | B*2 | C*4 | D*4 ] */ \ + ymm2 = _mm256_blend_pd(ymm6, ymm7, 0x0a); \ + ymm3 = _mm256_blend_pd(ymm6, ymm7, 0x05); \ +/* ymm2 = a [ C*1 | D*1 | A*3 | B*3 ] */ \ +/* ymm3 = a [ C*2 | D*2 | A*4 | B*4 ] */ \ + \ + ymm2 = _mm256_permute4x64_pd(ymm2, 0x1b); \ + ymm3 = _mm256_permute4x64_pd(ymm3, 0x1b); \ +/* ymm2 = a [ B*3 | A*3 | D*1 | C*1 ] */ \ +/* ymm3 = a [ B*4 | A*4 | D*2 | C*2 ] */ \ + ymm2 = _mm256_permute4x64_pd(ymm2, 0xb1); \ + ymm3 = _mm256_permute4x64_pd(ymm3, 0xb1); \ +/* ymm2 = a [ A*3 | B*3 | C*1 | D*1 ] */ \ +/* ymm3 = a [ A*4 | B*4 | C*2 | D*2 ] */ \ + \ + ymm4 = _mm256_blend_pd(ymm2, ymm0, 0x03); \ + ymm5 = _mm256_blend_pd(ymm3, ymm1, 0x03); \ +/* ymm4 = a [ A*1 | B*1 | C*1 | D*1 ] */ \ +/* ymm5 = a [ A*2 | B*2 | C*2 | D*2 ] */ \ + ymm6 = _mm256_blend_pd(ymm0, ymm2, 0x03); \ + ymm7 = _mm256_blend_pd(ymm1, ymm3, 0x03); \ +/* ymm5 = a [ A*3 | B*3 | C*3 | D*3 ] */ \ +/* ymm7 = a [ A*4 | B*4 | C*4 | D*4 ] */ \ + \ + ymm4 += _mm256_loadu_pd(CO1 + (0 * ldc)); \ + ymm5 += _mm256_loadu_pd(CO1 + (1 * ldc)); \ + ymm6 += _mm256_loadu_pd(CO1 + (2 * ldc)); \ + ymm7 += _mm256_loadu_pd(CO1 + (3 * ldc)); \ + _mm256_storeu_pd(CO1 + (0 * ldc), ymm4); \ + _mm256_storeu_pd(CO1 + (1 * ldc), ymm5); \ + _mm256_storeu_pd(CO1 + (2 * ldc), ymm6); \ + _mm256_storeu_pd(CO1 + (3 * ldc), ymm7); \ + \ + ymm9 = _mm256_permute4x64_pd(ymm9, 0xb1); \ + ymm11 = _mm256_permute4x64_pd(ymm11, 0xb1); \ + \ + ymm0 = _mm256_blend_pd(ymm8, ymm9, 0x0a); \ + ymm1 = _mm256_blend_pd(ymm8, ymm9, 0x05); \ + ymm2 = _mm256_blend_pd(ymm10, ymm11, 0x0a); \ + ymm3 = _mm256_blend_pd(ymm10, ymm11, 0x05); \ + \ + ymm2 = _mm256_permute4x64_pd(ymm2, 0x1b); \ + ymm3 = _mm256_permute4x64_pd(ymm3, 0x1b); \ + ymm2 = _mm256_permute4x64_pd(ymm2, 0xb1); \ + ymm3 = _mm256_permute4x64_pd(ymm3, 0xb1); \ + \ + ymm4 = _mm256_blend_pd(ymm2, ymm0, 0x03); \ + ymm5 = _mm256_blend_pd(ymm3, ymm1, 0x03); \ + ymm6 = _mm256_blend_pd(ymm0, ymm2, 0x03); \ + ymm7 = _mm256_blend_pd(ymm1, ymm3, 0x03); \ + \ + ymm4 += _mm256_loadu_pd(CO1 + (4 * ldc)); \ + ymm5 += _mm256_loadu_pd(CO1 + (5 * ldc)); \ + ymm6 += _mm256_loadu_pd(CO1 + (6 * ldc)); \ + ymm7 += _mm256_loadu_pd(CO1 + (7 * ldc)); \ + _mm256_storeu_pd(CO1 + (4 * ldc), ymm4); \ + _mm256_storeu_pd(CO1 + (5 * ldc), ymm5); \ + _mm256_storeu_pd(CO1 + (6 * ldc), ymm6); \ + _mm256_storeu_pd(CO1 + (7 * ldc), ymm7); \ + \ + CO1 += 4; + +/******************************************************************************************/ + +#define INIT2x8() \ + xmm4 = _mm_setzero_pd(); \ + xmm5 = _mm_setzero_pd(); \ + xmm6 = _mm_setzero_pd(); \ + xmm7 = _mm_setzero_pd(); \ + xmm8 = _mm_setzero_pd(); \ + xmm9 = _mm_setzero_pd(); \ + xmm10 = _mm_setzero_pd(); \ + xmm11 = _mm_setzero_pd(); \ + + +#define KERNEL2x8_SUB() \ + xmm0 = _mm_loadu_pd(AO - 16); \ + xmm1 = _mm_set1_pd(*(BO - 12)); \ + xmm2 = _mm_set1_pd(*(BO - 11)); \ + xmm3 = _mm_set1_pd(*(BO - 10)); \ + xmm4 += xmm0 * xmm1; \ + xmm1 = _mm_set1_pd(*(BO - 9)); \ + xmm5 += xmm0 * xmm2; \ + xmm2 = _mm_set1_pd(*(BO - 8)); \ + xmm6 += xmm0 * xmm3; \ + xmm3 = _mm_set1_pd(*(BO - 7)); \ + xmm7 += xmm0 * xmm1; \ + xmm1 = _mm_set1_pd(*(BO - 6)); \ + xmm8 += xmm0 * xmm2; \ + xmm2 = _mm_set1_pd(*(BO - 5)); \ + xmm9 += xmm0 * xmm3; \ + xmm10 += xmm0 * xmm1; \ + xmm11 += xmm0 * xmm2; \ + BO += 8; \ + AO += 2; + +#define SAVE2x8(ALPHA) \ + xmm0 = _mm_set1_pd(ALPHA); \ + xmm4 *= xmm0; \ + xmm5 *= xmm0; \ + xmm6 *= xmm0; \ + xmm7 *= xmm0; \ + xmm8 *= xmm0; \ + xmm9 *= xmm0; \ + 
xmm10 *= xmm0; \ + xmm11 *= xmm0; \ + \ + xmm4 += _mm_loadu_pd(CO1 + (0 * ldc)); \ + xmm5 += _mm_loadu_pd(CO1 + (1 * ldc)); \ + xmm6 += _mm_loadu_pd(CO1 + (2 * ldc)); \ + xmm7 += _mm_loadu_pd(CO1 + (3 * ldc)); \ + \ + _mm_storeu_pd(CO1 + (0 * ldc), xmm4); \ + _mm_storeu_pd(CO1 + (1 * ldc), xmm5); \ + _mm_storeu_pd(CO1 + (2 * ldc), xmm6); \ + _mm_storeu_pd(CO1 + (3 * ldc), xmm7); \ + \ + xmm8 += _mm_loadu_pd(CO1 + (4 * ldc)); \ + xmm9 += _mm_loadu_pd(CO1 + (5 * ldc)); \ + xmm10+= _mm_loadu_pd(CO1 + (6 * ldc)); \ + xmm11+= _mm_loadu_pd(CO1 + (7 * ldc)); \ + _mm_storeu_pd(CO1 + (4 * ldc), xmm8); \ + _mm_storeu_pd(CO1 + (5 * ldc), xmm9); \ + _mm_storeu_pd(CO1 + (6 * ldc), xmm10); \ + _mm_storeu_pd(CO1 + (7 * ldc), xmm11); \ + CO1 += 2; + + + + +/******************************************************************************************/ + +#define INIT1x8() \ + dbl4 = 0; \ + dbl5 = 0; \ + dbl6 = 0; \ + dbl7 = 0; \ + dbl8 = 0; \ + dbl9 = 0; \ + dbl10 = 0; \ + dbl11 = 0; + + +#define KERNEL1x8_SUB() \ + dbl0 = *(AO - 16); \ + dbl1 = *(BO - 12); \ + dbl2 = *(BO - 11); \ + dbl3 = *(BO - 10); \ + dbl4 += dbl0 * dbl1; \ + dbl1 = *(BO - 9); \ + dbl5 += dbl0 * dbl2; \ + dbl2 = *(BO - 8); \ + dbl6 += dbl0 * dbl3; \ + dbl3 = *(BO - 7); \ + dbl7 += dbl0 * dbl1; \ + dbl1 = *(BO - 6); \ + dbl8 += dbl0 * dbl2; \ + dbl2 = *(BO - 5); \ + dbl9 += dbl0 * dbl3; \ + dbl10 += dbl0 * dbl1; \ + dbl11 += dbl0 * dbl2; \ + BO += 8; \ + AO += 1; + + +#define SAVE1x8(ALPHA) \ + dbl0 = ALPHA; \ + dbl4 *= dbl0; \ + dbl5 *= dbl0; \ + dbl6 *= dbl0; \ + dbl7 *= dbl0; \ + dbl8 *= dbl0; \ + dbl9 *= dbl0; \ + dbl10 *= dbl0; \ + dbl11 *= dbl0; \ + \ + dbl4 += *(CO1 + (0 * ldc)); \ + dbl5 += *(CO1 + (1 * ldc)); \ + dbl6 += *(CO1 + (2 * ldc)); \ + dbl7 += *(CO1 + (3 * ldc)); \ + *(CO1 + (0 * ldc)) = dbl4; \ + *(CO1 + (1 * ldc)) = dbl5; \ + *(CO1 + (2 * ldc)) = dbl6; \ + *(CO1 + (3 * ldc)) = dbl7; \ + \ + dbl8 += *(CO1 + (4 * ldc)); \ + dbl9 += *(CO1 + (5 * ldc)); \ + dbl10 += *(CO1 + (6 * ldc)); \ + dbl11 += *(CO1 + (7 * ldc)); \ + *(CO1 + (4 * ldc)) = dbl8; \ + *(CO1 + (5 * ldc)) = dbl9; \ + *(CO1 + (6 * ldc)) = dbl10; \ + *(CO1 + (7 * ldc)) = dbl11; \ + \ + CO1 += 1; + + + + + + +/******************************************************************************************/ + +#define INIT4x4() \ + ymm4 = _mm256_setzero_pd(); \ + ymm5 = _mm256_setzero_pd(); \ + ymm6 = _mm256_setzero_pd(); \ + ymm7 = _mm256_setzero_pd(); \ + + +#define KERNEL4x4_SUB() \ + ymm0 = _mm256_loadu_pd(AO - 16); \ + ymm1 = _mm256_loadu_pd(BO - 12); \ + \ + ymm4 += ymm0 * ymm1; \ + \ + ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \ + ymm5 += ymm0 * ymm1; \ + \ + ymm0 = _mm256_permute4x64_pd(ymm0, 0x1b); \ + ymm6 += ymm0 * ymm1; \ + \ + ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \ + ymm7 += ymm0 * ymm1; \ + AO += 4; \ + BO += 4; + + +#define SAVE4x4(ALPHA) \ + ymm0 = _mm256_set1_pd(ALPHA); \ + ymm4 *= ymm0; \ + ymm5 *= ymm0; \ + ymm6 *= ymm0; \ + ymm7 *= ymm0; \ + \ + ymm5 = _mm256_permute4x64_pd(ymm5, 0xb1); \ + ymm7 = _mm256_permute4x64_pd(ymm7, 0xb1); \ + \ + ymm0 = _mm256_blend_pd(ymm4, ymm5, 0x0a); \ + ymm1 = _mm256_blend_pd(ymm4, ymm5, 0x05); \ + ymm2 = _mm256_blend_pd(ymm6, ymm7, 0x0a); \ + ymm3 = _mm256_blend_pd(ymm6, ymm7, 0x05); \ + \ + ymm2 = _mm256_permute4x64_pd(ymm2, 0x1b); \ + ymm3 = _mm256_permute4x64_pd(ymm3, 0x1b); \ + ymm2 = _mm256_permute4x64_pd(ymm2, 0xb1); \ + ymm3 = _mm256_permute4x64_pd(ymm3, 0xb1); \ + \ + ymm4 = _mm256_blend_pd(ymm2, ymm0, 0x03); \ + ymm5 = _mm256_blend_pd(ymm3, ymm1, 0x03); \ + ymm6 = _mm256_blend_pd(ymm0, ymm2, 0x03); \ 
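+/* same lane shuffle as in SAVE4x8: after these four blends each ymm register lines up with */ \
+/* four consecutive elements of one column of C, ready for the read-modify-write below */ \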
+ ymm7 = _mm256_blend_pd(ymm1, ymm3, 0x03); \ + \ + ymm4 += _mm256_loadu_pd(CO1 + (0 * ldc)); \ + ymm5 += _mm256_loadu_pd(CO1 + (1 * ldc)); \ + ymm6 += _mm256_loadu_pd(CO1 + (2 * ldc)); \ + ymm7 += _mm256_loadu_pd(CO1 + (3 * ldc)); \ + _mm256_storeu_pd(CO1 + (0 * ldc), ymm4); \ + _mm256_storeu_pd(CO1 + (1 * ldc), ymm5); \ + _mm256_storeu_pd(CO1 + (2 * ldc), ymm6); \ + _mm256_storeu_pd(CO1 + (3 * ldc), ymm7); \ + \ + CO1 += 4; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT2x4() \ + xmm4 = _mm_setzero_pd(); \ + xmm5 = _mm_setzero_pd(); \ + xmm6 = _mm_setzero_pd(); \ + xmm7 = _mm_setzero_pd(); \ + + + +#define KERNEL2x4_SUB() \ + xmm0 = _mm_loadu_pd(AO - 16); \ + xmm1 = _mm_set1_pd(*(BO - 12)); \ + xmm2 = _mm_set1_pd(*(BO - 11)); \ + xmm3 = _mm_set1_pd(*(BO - 10)); \ + xmm4 += xmm0 * xmm1; \ + xmm1 = _mm_set1_pd(*(BO - 9)); \ + xmm5 += xmm0 * xmm2; \ + xmm6 += xmm0 * xmm3; \ + xmm7 += xmm0 * xmm1; \ + BO += 4; \ + AO += 2; + + + +#define SAVE2x4(ALPHA) \ + xmm0 = _mm_set1_pd(ALPHA); \ + xmm4 *= xmm0; \ + xmm5 *= xmm0; \ + xmm6 *= xmm0; \ + xmm7 *= xmm0; \ + \ + xmm4 += _mm_loadu_pd(CO1 + (0 * ldc)); \ + xmm5 += _mm_loadu_pd(CO1 + (1 * ldc)); \ + xmm6 += _mm_loadu_pd(CO1 + (2 * ldc)); \ + xmm7 += _mm_loadu_pd(CO1 + (3 * ldc)); \ + \ + _mm_storeu_pd(CO1 + (0 * ldc), xmm4); \ + _mm_storeu_pd(CO1 + (1 * ldc), xmm5); \ + _mm_storeu_pd(CO1 + (2 * ldc), xmm6); \ + _mm_storeu_pd(CO1 + (3 * ldc), xmm7); \ + \ + CO1 += 2; + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT1x4() \ + dbl4 = 0; \ + dbl5 = 0; \ + dbl6 = 0; \ + dbl7 = 0; \ + +#define KERNEL1x4_SUB() \ + dbl0 = *(AO - 16); \ + dbl1 = *(BO - 12); \ + dbl2 = *(BO - 11); \ + dbl3 = *(BO - 10); \ + dbl8 = *(BO - 9); \ + \ + dbl4 += dbl0 * dbl1; \ + dbl5 += dbl0 * dbl2; \ + dbl6 += dbl0 * dbl3; \ + dbl7 += dbl0 * dbl8; \ + BO += 4; \ + AO += 1; + + +#define SAVE1x4(ALPHA) \ + dbl0 = ALPHA; \ + dbl4 *= dbl0; \ + dbl5 *= dbl0; \ + dbl6 *= dbl0; \ + dbl7 *= dbl0; \ + \ + dbl4 += *(CO1 + (0 * ldc)); \ + dbl5 += *(CO1 + (1 * ldc)); \ + dbl6 += *(CO1 + (2 * ldc)); \ + dbl7 += *(CO1 + (3 * ldc)); \ + *(CO1 + (0 * ldc)) = dbl4; \ + *(CO1 + (1 * ldc)) = dbl5; \ + *(CO1 + (2 * ldc)) = dbl6; \ + *(CO1 + (3 * ldc)) = dbl7; \ + \ + \ + CO1 += 1; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT8x4() \ + ymm10 = _mm256_setzero_pd(); \ + ymm11 = _mm256_setzero_pd(); \ + ymm12 = _mm256_setzero_pd(); \ + ymm13 = _mm256_setzero_pd(); \ + ymm14 = _mm256_setzero_pd(); \ + ymm15 = _mm256_setzero_pd(); \ + ymm16 = _mm256_setzero_pd(); \ + ymm17 = _mm256_setzero_pd(); \ + + +#define KERNEL8x4_SUB() \ + ymm0 = _mm256_loadu_pd(AO - 16); \ + ymm1 = _mm256_loadu_pd(AO - 12); \ + ymm2 = _mm256_set1_pd(*(BO - 12)); \ + ymm3 = _mm256_set1_pd(*(BO - 11)); \ + ymm4 = _mm256_set1_pd(*(BO - 10)); \ + ymm5 = _mm256_set1_pd(*(BO - 9)); \ + ymm10 += ymm0 * ymm2; \ + ymm11 += ymm1 * ymm2; \ + ymm12 += ymm0 * ymm3; \ + ymm13 += ymm1 * ymm3; \ + ymm14 += ymm0 * ymm4; \ + ymm15 += ymm1 * ymm4; \ + ymm16 += ymm0 * ymm5; \ + ymm17 += ymm1 * ymm5; \ + BO += 4; \ + AO += 8; + + + +#define SAVE8x4(ALPHA) \ + ymm0 = 
_mm256_set1_pd(ALPHA); \ + ymm10 *= ymm0; \ + ymm11 *= ymm0; \ + ymm12 *= ymm0; \ + ymm13 *= ymm0; \ + ymm14 *= ymm0; \ + ymm15 *= ymm0; \ + ymm16 *= ymm0; \ + ymm17 *= ymm0; \ + \ + ymm10 += _mm256_loadu_pd(CO1); \ + ymm11 += _mm256_loadu_pd(CO1 + 4); \ + ymm12 += _mm256_loadu_pd(CO1 + (ldc)); \ + ymm13 += _mm256_loadu_pd(CO1 + (ldc) + 4); \ + ymm14 += _mm256_loadu_pd(CO1 + (ldc*2)); \ + ymm15 += _mm256_loadu_pd(CO1 + (ldc*2) + 4); \ + ymm16 += _mm256_loadu_pd(CO1 + (ldc*3)); \ + ymm17 += _mm256_loadu_pd(CO1 + (ldc*3) + 4); \ + \ + _mm256_storeu_pd(CO1, ymm10); \ + _mm256_storeu_pd(CO1 + 4, ymm11); \ + _mm256_storeu_pd(CO1 + ldc, ymm12); \ + _mm256_storeu_pd(CO1 + ldc + 4, ymm13); \ + _mm256_storeu_pd(CO1 + ldc*2, ymm14); \ + _mm256_storeu_pd(CO1 + ldc*2 + 4, ymm15); \ + _mm256_storeu_pd(CO1 + ldc*3, ymm16); \ + _mm256_storeu_pd(CO1 + ldc*3 + 4, ymm17); \ + \ + CO1 += 8; + + +/******************************************************************************************/ +/******************************************************************************************/ +#define INIT8x2() \ + ymm4 = _mm256_setzero_pd(); \ + ymm5 = _mm256_setzero_pd(); \ + ymm6 = _mm256_setzero_pd(); \ + ymm7 = _mm256_setzero_pd(); \ + + +#define KERNEL8x2_SUB() \ + ymm0 = _mm256_loadu_pd(AO - 16); \ + ymm1 = _mm256_loadu_pd(AO - 12); \ + ymm2 = _mm256_set1_pd(*(BO - 12)); \ + ymm3 = _mm256_set1_pd(*(BO - 11)); \ + ymm4 += ymm0 * ymm2; \ + ymm5 += ymm1 * ymm2; \ + ymm6 += ymm0 * ymm3; \ + ymm7 += ymm1 * ymm3; \ + BO += 2; \ + AO += 8; + + + +#define SAVE8x2(ALPHA) \ + ymm0 = _mm256_set1_pd(ALPHA); \ + ymm4 *= ymm0; \ + ymm5 *= ymm0; \ + ymm6 *= ymm0; \ + ymm7 *= ymm0; \ + \ + ymm4 += _mm256_loadu_pd(CO1); \ + ymm5 += _mm256_loadu_pd(CO1 + 4); \ + ymm6 += _mm256_loadu_pd(CO1 + (ldc)); \ + ymm7 += _mm256_loadu_pd(CO1 + (ldc) + 4); \ + \ + _mm256_storeu_pd(CO1, ymm4); \ + _mm256_storeu_pd(CO1 + 4, ymm5); \ + _mm256_storeu_pd(CO1 + ldc, ymm6); \ + _mm256_storeu_pd(CO1 + ldc + 4, ymm7); \ + \ + CO1 += 8; + + +/******************************************************************************************/ +/******************************************************************************************/ +#define INIT4x2() \ + xmm4 = _mm_setzero_pd(); \ + xmm5 = _mm_setzero_pd(); \ + xmm6 = _mm_setzero_pd(); \ + xmm7 = _mm_setzero_pd(); \ + + +#define KERNEL4x2_SUB() \ + xmm0 = _mm_loadu_pd(AO - 16); \ + xmm1 = _mm_loadu_pd(AO - 14); \ + xmm2 = _mm_set1_pd(*(BO - 12)); \ + xmm3 = _mm_set1_pd(*(BO - 11)); \ + xmm4 += xmm0 * xmm2; \ + xmm5 += xmm1 * xmm2; \ + xmm6 += xmm0 * xmm3; \ + xmm7 += xmm1 * xmm3; \ + BO += 2; \ + AO += 4; + + + +#define SAVE4x2(ALPHA) \ + xmm0 = _mm_set1_pd(ALPHA); \ + xmm4 *= xmm0; \ + xmm5 *= xmm0; \ + xmm6 *= xmm0; \ + xmm7 *= xmm0; \ + \ + xmm4 += _mm_loadu_pd(CO1); \ + xmm5 += _mm_loadu_pd(CO1 + 2); \ + xmm6 += _mm_loadu_pd(CO1 + (ldc)); \ + xmm7 += _mm_loadu_pd(CO1 + (ldc) + 2); \ + \ + _mm_storeu_pd(CO1, xmm4); \ + _mm_storeu_pd(CO1 + 2, xmm5); \ + _mm_storeu_pd(CO1 + ldc, xmm6); \ + _mm_storeu_pd(CO1 + ldc + 2, xmm7); \ + \ + CO1 += 4; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT2x2() \ + xmm4 = _mm_setzero_pd(); \ + xmm6 = _mm_setzero_pd(); \ + + + +#define KERNEL2x2_SUB() \ + xmm2 = _mm_set1_pd(*(BO - 12)); \ + xmm0 = _mm_loadu_pd(AO - 16); \ + xmm3 = _mm_set1_pd(*(BO - 11)); \ + xmm4 += xmm0 * xmm2; \ + xmm6 += xmm0 * xmm3; \ + BO += 2; \ + 
AO += 2; + + +#define SAVE2x2(ALPHA) \ + if (ALPHA != 1.0) { \ + xmm0 = _mm_set1_pd(ALPHA); \ + xmm4 *= xmm0; \ + xmm6 *= xmm0; \ + } \ + \ + xmm4 += _mm_loadu_pd(CO1); \ + xmm6 += _mm_loadu_pd(CO1 + ldc); \ + \ + _mm_storeu_pd(CO1, xmm4); \ + _mm_storeu_pd(CO1 + ldc, xmm6); \ + \ + CO1 += 2; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT1x2() \ + dbl4 = 0; \ + dbl5 = 0; + + +#define KERNEL1x2_SUB() \ + dbl0 = *(AO - 16); \ + dbl1 = *(BO - 12); \ + dbl2 = *(BO - 11); \ + dbl4 += dbl0 * dbl1; \ + dbl5 += dbl0 * dbl2; \ + BO += 2; \ + AO += 1; + + +#define SAVE1x2(ALPHA) \ + dbl0 = ALPHA; \ + dbl4 *= dbl0; \ + dbl5 *= dbl0; \ + \ + dbl4 += *(CO1 + (0 * ldc)); \ + dbl5 += *(CO1 + (1 * ldc)); \ + *(CO1 + (0 * ldc)) = dbl4; \ + *(CO1 + (1 * ldc)) = dbl5; \ + \ + \ + CO1 += 1; + + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT4x1() \ + ymm4 = _mm256_setzero_pd(); \ + ymm5 = _mm256_setzero_pd(); \ + ymm6 = _mm256_setzero_pd(); \ + ymm7 = _mm256_setzero_pd(); + + +#define KERNEL4x1() \ + ymm0 = _mm256_set1_pd(*(BO - 12)); \ + ymm1 = _mm256_set1_pd(*(BO - 11)); \ + ymm2 = _mm256_set1_pd(*(BO - 10)); \ + ymm3 = _mm256_set1_pd(*(BO - 9)); \ + \ + ymm4 += _mm256_loadu_pd(AO - 16) * ymm0; \ + ymm5 += _mm256_loadu_pd(AO - 12) * ymm1; \ + \ + ymm0 = _mm256_set1_pd(*(BO - 8)); \ + ymm1 = _mm256_set1_pd(*(BO - 7)); \ + \ + ymm6 += _mm256_loadu_pd(AO - 8) * ymm2; \ + ymm7 += _mm256_loadu_pd(AO - 4) * ymm3; \ + \ + ymm2 = _mm256_set1_pd(*(BO - 6)); \ + ymm3 = _mm256_set1_pd(*(BO - 5)); \ + \ + ymm4 += _mm256_loadu_pd(AO + 0) * ymm0; \ + ymm5 += _mm256_loadu_pd(AO + 4) * ymm1; \ + ymm6 += _mm256_loadu_pd(AO + 8) * ymm2; \ + ymm7 += _mm256_loadu_pd(AO + 12) * ymm3; \ + \ + BO += 8; \ + AO += 32; + + +#define INIT8x1() \ + zmm4 = _mm512_setzero_pd(); \ + + +#define KERNEL8x1_SUB() \ + zmm2 = _mm512_set1_pd(*(BO - 12)); \ + zmm0 = _mm512_loadu_pd(AO - 16); \ + zmm4 += zmm0 * zmm2; \ + BO += 1; \ + AO += 8; + + +#define SAVE8x1(ALPHA) \ + zmm0 = _mm512_set1_pd(ALPHA); \ + zmm4 *= zmm0; \ + \ + zmm4 += _mm512_loadu_pd(CO1); \ + _mm512_storeu_pd(CO1, zmm4); \ + CO1 += 8; + +#define KERNEL4x1_SUB() \ + ymm2 = _mm256_set1_pd(*(BO - 12)); \ + ymm0 = _mm256_loadu_pd(AO - 16); \ + ymm4 += ymm0 * ymm2; \ + BO += 1; \ + AO += 4; + + +#define SAVE4x1(ALPHA) \ + ymm0 = _mm256_set1_pd(ALPHA); \ + ymm4 += ymm5; \ + ymm6 += ymm7; \ + ymm4 += ymm6; \ + ymm4 *= ymm0; \ + \ + ymm4 += _mm256_loadu_pd(CO1); \ + _mm256_storeu_pd(CO1, ymm4); \ + CO1 += 4; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT2x1() \ + xmm4 = _mm_setzero_pd(); + + +#define KERNEL2x1_SUB() \ + xmm2 = _mm_set1_pd(*(BO - 12)); \ + xmm0 = _mm_loadu_pd(AO - 16); \ + xmm4 += xmm0 * xmm2; \ + BO += 1; \ + AO += 2; + + +#define SAVE2x1(ALPHA) \ + xmm0 = _mm_set1_pd(ALPHA); \ + xmm4 *= xmm0; \ + \ + xmm4 += _mm_loadu_pd(CO1); \ + \ + _mm_storeu_pd(CO1, xmm4); \ + \ + CO1 += 2; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT1x1() \ + dbl4 = 
0; + +#define KERNEL1x1_SUB() \ + dbl1 = *(BO - 12); \ + dbl0 = *(AO - 16); \ + dbl4 += dbl0 * dbl1; \ + BO += 1; \ + AO += 1; + +#define SAVE1x1(ALPHA) \ + dbl0 = ALPHA; \ + dbl4 *= dbl0; \ + dbl4 += *CO1; \ + *CO1 = dbl4; \ + CO1 += 1; + + +/*******************************************************************************************/ + +/* START */ + + +int __attribute__ ((noinline)) +dgemm_kernel(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc) +{ + unsigned long M=m, N=n, K=k; + + + if (M == 0) + return 0; + if (N == 0) + return 0; + if (K == 0) + return 0; + + while (N >= 8) { + double *CO1; + double *AO; + int i; + + CO1 = C; + C += 8 * ldc; + + AO = A + 16; + + i = m; + + while (i >= 8) { + double *BO; + int kloop = K; + + BO = B + 12; + /* + * This is the inner loop for the hot hot path + * Written in inline asm because compilers like GCC 8 and earlier + * struggle with register allocation and are not good at using + * the AVX512 built in broadcast ability (1to8) + */ + asm( + "vxorpd %%zmm1, %%zmm1, %%zmm1\n" + "vmovapd %%zmm1, %%zmm2\n" + "vmovapd %%zmm1, %%zmm3\n" + "vmovapd %%zmm1, %%zmm4\n" + "vmovapd %%zmm1, %%zmm5\n" + "vmovapd %%zmm1, %%zmm6\n" + "vmovapd %%zmm1, %%zmm7\n" + "vmovapd %%zmm1, %%zmm8\n" + "vbroadcastsd (%[alpha]), %%zmm9\n" + "jmp .label1\n" + ".align 32\n" + /* Inner math loop */ + ".label1:\n" + "vmovupd -128(%[AO]),%%zmm0\n" + "vfmadd231pd -96(%[BO])%{1to8%}, %%zmm0, %%zmm1\n" + "vfmadd231pd -88(%[BO])%{1to8%}, %%zmm0, %%zmm2\n" + "vfmadd231pd -80(%[BO])%{1to8%}, %%zmm0, %%zmm3\n" + "vfmadd231pd -72(%[BO])%{1to8%}, %%zmm0, %%zmm4\n" + "vfmadd231pd -64(%[BO])%{1to8%}, %%zmm0, %%zmm5\n" + "vfmadd231pd -56(%[BO])%{1to8%}, %%zmm0, %%zmm6\n" + "vfmadd231pd -48(%[BO])%{1to8%}, %%zmm0, %%zmm7\n" + "vfmadd231pd -40(%[BO])%{1to8%}, %%zmm0, %%zmm8\n" + "add $64, %[AO]\n" + "add $64, %[BO]\n" + "subl $1, %[kloop]\n" + "jg .label1\n" + /* multiply the result by alpha */ + "vmulpd %%zmm9, %%zmm1, %%zmm1\n" + "vmulpd %%zmm9, %%zmm2, %%zmm2\n" + "vmulpd %%zmm9, %%zmm3, %%zmm3\n" + "vmulpd %%zmm9, %%zmm4, %%zmm4\n" + "vmulpd %%zmm9, %%zmm5, %%zmm5\n" + "vmulpd %%zmm9, %%zmm6, %%zmm6\n" + "vmulpd %%zmm9, %%zmm7, %%zmm7\n" + "vmulpd %%zmm9, %%zmm8, %%zmm8\n" + /* And store additively in C */ + "vaddpd (%[C0]), %%zmm1, %%zmm1\n" + "vaddpd (%[C1]), %%zmm2, %%zmm2\n" + "vaddpd (%[C2]), %%zmm3, %%zmm3\n" + "vaddpd (%[C3]), %%zmm4, %%zmm4\n" + "vaddpd (%[C4]), %%zmm5, %%zmm5\n" + "vaddpd (%[C5]), %%zmm6, %%zmm6\n" + "vaddpd (%[C6]), %%zmm7, %%zmm7\n" + "vaddpd (%[C7]), %%zmm8, %%zmm8\n" + "vmovupd %%zmm1, (%[C0])\n" + "vmovupd %%zmm2, (%[C1])\n" + "vmovupd %%zmm3, (%[C2])\n" + "vmovupd %%zmm4, (%[C3])\n" + "vmovupd %%zmm5, (%[C4])\n" + "vmovupd %%zmm6, (%[C5])\n" + "vmovupd %%zmm7, (%[C6])\n" + "vmovupd %%zmm8, (%[C7])\n" + "prefetchw 64(%[C0])\n" + "prefetchw 64(%[C1])\n" + "prefetchw 64(%[C2])\n" + "prefetchw 64(%[C3])\n" + "prefetchw 64(%[C4])\n" + "prefetchw 64(%[C5])\n" + "prefetchw 64(%[C6])\n" + "prefetchw 64(%[C7])\n" + : + [AO] "+r" (AO), + [BO] "+r" (BO), + [C0] "+r" (CO1), + [kloop] "+r" (kloop) + : + [alpha] "r" (&alpha), + [C1] "r" (CO1 + 1 * ldc), + [C2] "r" (CO1 + 2 * ldc), + [C3] "r" (CO1 + 3 * ldc), + [C4] "r" (CO1 + 4 * ldc), + [C5] "r" (CO1 + 5 * ldc), + [C6] "r" (CO1 + 6 * ldc), + [C7] "r" (CO1 + 7 * ldc) + + : "memory", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9" + ); + CO1 += 8; + i-= 8; + } + + + + while (i >= 4) { + double *BO; + 
__m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11; + int kloop = K; + + BO = B + 12; + INIT4x8() + + while (kloop > 0) { + KERNEL4x8_SUB() + kloop--; + } + SAVE4x8(alpha) + i-= 4; + } + + + while (i >= 2) { + double *BO; + __m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; + int kloop = K; + + BO = B + 12; + INIT2x8() + + while (kloop > 0) { + KERNEL2x8_SUB() + kloop--; + } + SAVE2x8(alpha) + i -= 2; + } + + while (i >= 1) { + double *BO; + double dbl0, dbl1, dbl2, dbl3, dbl4, dbl5, dbl6, dbl7, dbl8, dbl9, dbl10, dbl11; + int kloop = K; + + BO = B + 12; + INIT1x8() + + while (kloop > 0) { + KERNEL1x8_SUB() + kloop--; + } + SAVE1x8(alpha) + i -= 1; + } + B += K * 8; + N -= 8; + } + + if (N == 0) + return 0; + + + + // L8_0 + while (N >= 4) { + double *CO1; + double *AO; + int i; + // L8_10 + CO1 = C; + C += 4 * ldc; + + AO = A + 16; + + i = m; + while (i >= 8) { + double *BO; + // L8_11 + __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm10, ymm11,ymm12,ymm13,ymm14,ymm15,ymm16,ymm17; + BO = B + 12; + int kloop = K; + + INIT8x4() + + while (kloop > 0) { + // L12_17 + KERNEL8x4_SUB() + kloop--; + } + // L8_19 + SAVE8x4(alpha) + + i -= 8; + } + while (i >= 4) { + // L8_11 + double *BO; + __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7; + BO = B + 12; + int kloop = K; + + INIT4x4() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x4_SUB() + kloop--; + } + // L8_19 + SAVE4x4(alpha) + + i -= 4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + double *BO; + __m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + BO = B; + BO += 12; + + INIT2x4() + int kloop = K; + + while (kloop > 0) { + KERNEL2x4_SUB() + kloop--; + } + SAVE2x4(alpha) + i -= 2; + } + // L13_40 + while (i >= 1) { + double *BO; + double dbl0, dbl1, dbl2, dbl3, dbl4, dbl5, dbl6, dbl7, dbl8; + int kloop = K; + BO = B + 12; + INIT1x4() + + while (kloop > 0) { + KERNEL1x4_SUB() + kloop--; + } + SAVE1x4(alpha) + i -= 1; + } + + B += K * 4; + N -= 4; + } + +/**************************************************************************************************/ + + // L8_0 + while (N >= 2) { + double *CO1; + double *AO; + int i; + // L8_10 + CO1 = C; + C += 2 * ldc; + + AO = A + 16; + + i = m; + while (i >= 8) { + double *BO; + __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7; + // L8_11 + BO = B + 12; + int kloop = K; + + INIT8x2() + + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL8x2_SUB() + kloop--; + } + // L8_19 + SAVE8x2(alpha) + + i-=8; + } + + while (i >= 4) { + double *BO; + __m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + // L8_11 + BO = B + 12; + int kloop = K; + + INIT4x2() + + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x2_SUB() + kloop--; + } + // L8_19 + SAVE4x2(alpha) + + i-=4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + double *BO; + __m128d xmm0, xmm2, xmm3, xmm4, xmm6; + int kloop = K; + BO = B + 12; + + INIT2x2() + + while (kloop > 0) { + KERNEL2x2_SUB() + kloop--; + } + SAVE2x2(alpha) + i -= 2; + } + // L13_40 + while (i >= 1) { + double *BO; + double dbl0, dbl1, dbl2, dbl4, dbl5; + int kloop = K; + BO = B + 12; + + INIT1x2() + + while (kloop > 0) { + KERNEL1x2_SUB() + kloop--; + } + SAVE1x2(alpha) + i -= 1; + } + + B += K * 2; 
+ N -= 2; + } + + // L8_0 + while (N >= 1) { + // L8_10 + double *CO1; + double *AO; + int i; + + CO1 = C; + C += ldc; + + AO = A + 16; + + i = m; + while (i >= 8) { + double *BO; + __m512d zmm0, zmm2, zmm4; + // L8_11 + BO = B + 12; + int kloop = K; + + INIT8x1() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL8x1_SUB() + kloop--; + } + // L8_19 + SAVE8x1(alpha) + + i-= 8; + } + while (i >= 4) { + double *BO; + __m256d ymm0, ymm2, ymm4, ymm5, ymm6, ymm7; + // L8_11 + BO = B + 12; + int kloop = K; + + INIT4x1() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x1_SUB() + kloop--; + } + // L8_19 + SAVE4x1(alpha) + + i-= 4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + double *BO; + __m128d xmm0, xmm2, xmm4; + int kloop = K; + BO = B; + BO += 12; + + INIT2x1() + + while (kloop > 0) { + KERNEL2x1_SUB() + kloop--; + } + SAVE2x1(alpha) + i -= 2; + } + // L13_40 + while (i >= 1) { + double *BO; + double dbl0, dbl1, dbl4; + int kloop = K; + + BO = B; + BO += 12; + INIT1x1() + + + while (kloop > 0) { + KERNEL1x1_SUB() + kloop--; + } + SAVE1x1(alpha) + i -= 1; + } + + B += K * 1; + N -= 1; + } + + + return 0; +} From 3439158dea277d132b3804c245cba1f09b4329dd Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 3 Oct 2018 21:20:50 +0200 Subject: [PATCH 050/236] address #1782 2nd loop --- driver/others/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 0019253c0..4a8e6c067 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2587,20 +2587,20 @@ void *blas_memory_alloc(int procpos){ position = 0; + LOCK_COMMAND(&alloc_lock); do { /* if (!memory[position].used) { */ - LOCK_COMMAND(&alloc_lock); /* blas_lock(&memory[position].lock);*/ if (!memory[position].used) goto allocation; - UNLOCK_COMMAND(&alloc_lock); /* blas_unlock(&memory[position].lock);*/ /* } */ position ++; } while (position < NUM_BUFFERS); + UNLOCK_COMMAND(&alloc_lock); goto error; From 591cca7cb05486320230ff8f09255a8d300c20ad Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 4 Oct 2018 07:35:30 +0200 Subject: [PATCH 051/236] Check availability of immintrin.h in the AVX512 compatibility test --- c_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_check b/c_check index 66acf1cad..9dc237beb 100644 --- a/c_check +++ b/c_check @@ -205,7 +205,7 @@ $binformat = bin64 if ($data =~ /BINARY_64/); $no_avx512= 0; if (($architecture eq "x86") || ($architecture eq "x86_64")) { $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; - print $tmpf "int main(void){ __asm__ volatile($code); }\n"; + print $tmpf "#include \n\nint main(void){ __asm__ volatile($code); }\n"; $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf"; my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); system(@cmd) == 0; From 4c3643ed7f50f13df5efe637f05ffbc705e1860a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 4 Oct 2018 07:36:49 +0200 Subject: [PATCH 052/236] Check availability of immintrin.h in the AVX512 compatibility test --- cmake/system_check.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index d339a755f..fe30c7600 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -67,7 +67,7 @@ else() endif() if (X86_64 OR X86) - file(WRITE 
${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
+  file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
   execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
   if (NO_AVX512 EQUAL 1)
     set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")

From b095f2fad651d3134b5760d25acda686f3a831b7 Mon Sep 17 00:00:00 2001
From: Jerome Robert
Date: Thu, 4 Oct 2018 12:27:44 +0200
Subject: [PATCH 053/236] Fix unknown type name __WAIT_STATUS on RHEL5

With glibc 2.5 one must have #define _XOPEN_SOURCE >= 500 to use wait. But
reading the glibc code this is actually needed only if stdlib.h was included
before sys/wait.h. This was the case here through openblas_utest.h. So
changing the include order fixes compilation on RHEL5 and should not hurt
with more recent distros.

* Problem found when building with gcc 5.5 and 4.7.2 on RHEL5/CENTOS5
* Fix #1519
---
 utest/test_fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utest/test_fork.c b/utest/test_fork.c
index 9fc51287c..0b90407b1 100644
--- a/utest/test_fork.c
+++ b/utest/test_fork.c
@@ -31,10 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **********************************************************************************/

-#include "openblas_utest.h"
 #include 
 #include 
 #include 
+#include "openblas_utest.h"

 void* xmalloc(size_t n)
 {

From b7496c36384a681428e60993c2cd7c721ca4dfe5 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 4 Oct 2018 19:14:59 +0200
Subject: [PATCH 054/236] Function name needs to be CNAME, set from outside to
 allow suffixing for dynamic_arch

---
 kernel/x86_64/dgemm_kernel_4x8_skylakex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c
index 4162611ff..8d0205c5a 100644
--- a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c
+++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c
@@ -825,7 +825,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
int __attribute__ ((noinline)) -dgemm_kernel(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc) { unsigned long M=m, N=n, K=k; From c3e0f0eb3865c372b112a2449fc04d84a1f36515 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Fri, 5 Oct 2018 15:41:52 +0300 Subject: [PATCH 055/236] update travis alpine chroot with avx512 intrinsics headers --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 4a25e7121..6e27a6fe4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -85,8 +85,8 @@ jobs: sudo: true language: minimal before_install: - - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.6.0/alpine-chroot-install' \ - && echo 'a827a4ba3d0817e7c88bae17fe34e50204983d1e alpine-chroot-install' | sha1sum -c || exit 1" + - "wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install \ + && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } install: - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' From bda3dbe2eb8fb837330d9b5f501ad1eaed81d437 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Fri, 5 Oct 2018 15:47:55 +0300 Subject: [PATCH 056/236] update travis alpine chroot with avx512 intrinsics headers --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 6e27a6fe4..a0af0472e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -85,7 +85,7 @@ jobs: sudo: true language: minimal before_install: - - "wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install \ + - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } install: From 1938819c25d7dd4ba995900797f5123e4cfd6fa4 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 5 Oct 2018 11:49:43 +0000 Subject: [PATCH 057/236] skylake dgemm: Add a 16x8 kernel The next step for the avx512 dgemm code is adding a 16x8 kernel. In the 8x8 kernel, each FMA has a matching load (the broadcast); in the 16x8 kernel we can reuse this load for 2 FMAs, which in turn reduces pressure on the load ports of the CPU and gives a nice performance boost (in the 25% range). 
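
To make the load-reuse idea concrete, here is a rough, self-contained sketch in
plain C intrinsics. It only illustrates the technique and is not the kernel
itself: the function name, array sizes and data layout are invented for the
example, and it assumes a compiler with AVX512F support (e.g. gcc -O2 -mavx512f).

#include <immintrin.h>
#include <stdio.h>

/* One broadcast load of b[k] feeds two FMAs, one per 8-wide strip of A,
 * which is the reuse the 16x8 kernel exploits. */
static void fma_reuse_sketch(int K, const double *a0, const double *a1,
                             const double *b, double *c0, double *c1)
{
    __m512d acc0 = _mm512_loadu_pd(c0);
    __m512d acc1 = _mm512_loadu_pd(c1);

    for (int k = 0; k < K; k++) {
        __m512d bk = _mm512_set1_pd(b[k]);                             /* single broadcast */
        acc0 = _mm512_fmadd_pd(bk, _mm512_loadu_pd(a0 + 8 * k), acc0); /* reused here...   */
        acc1 = _mm512_fmadd_pd(bk, _mm512_loadu_pd(a1 + 8 * k), acc1); /* ...and here      */
    }
    _mm512_storeu_pd(c0, acc0);
    _mm512_storeu_pd(c1, acc1);
}

int main(void)
{
    double a0[16] = {0}, a1[16] = {0}, b[2] = {1.0, 1.0};
    double c0[8] = {0}, c1[8] = {0};
    a0[0] = 2.0;
    a1[0] = 3.0;
    fma_reuse_sketch(2, a0, a1, b, c0, c1);
    printf("%f %f\n", c0[0], c1[0]); /* prints 2.000000 3.000000 */
    return 0;
}
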
--- kernel/x86_64/dgemm_kernel_4x8_skylakex.c | 157 +++++++++++++++++++++- 1 file changed, 155 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c index 8d0205c5a..09d48f99a 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c @@ -849,16 +849,169 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, i = m; + while (i >= 16) { + double *BO; + double *A1; + int kloop = K; + + BO = B + 12; + A1 = AO + 8 * K; + /* + * This is the inner loop for the hot hot path + * Written in inline asm because compilers like GCC 8 and earlier + * struggle with register allocation and are not good at using + * the AVX512 built in broadcast ability (1to8) + */ + asm( + "vxorpd %%zmm1, %%zmm1, %%zmm1\n" + "vmovapd %%zmm1, %%zmm2\n" + "vmovapd %%zmm1, %%zmm3\n" + "vmovapd %%zmm1, %%zmm4\n" + "vmovapd %%zmm1, %%zmm5\n" + "vmovapd %%zmm1, %%zmm6\n" + "vmovapd %%zmm1, %%zmm7\n" + "vmovapd %%zmm1, %%zmm8\n" + "vmovapd %%zmm1, %%zmm11\n" + "vmovapd %%zmm1, %%zmm12\n" + "vmovapd %%zmm1, %%zmm13\n" + "vmovapd %%zmm1, %%zmm14\n" + "vmovapd %%zmm1, %%zmm15\n" + "vmovapd %%zmm1, %%zmm16\n" + "vmovapd %%zmm1, %%zmm17\n" + "vmovapd %%zmm1, %%zmm18\n" + "jmp .label16\n" + ".align 32\n" + /* Inner math loop */ + ".label16:\n" + "vmovupd -128(%[AO]),%%zmm0\n" + "vmovupd -128(%[A1]),%%zmm10\n" + + "vbroadcastsd -96(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm1\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm11\n" + + "vbroadcastsd -88(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm2\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm12\n" + + "vbroadcastsd -80(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm3\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm13\n" + + "vbroadcastsd -72(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm4\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm14\n" + + "vbroadcastsd -64(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm5\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm15\n" + + "vbroadcastsd -56(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm6\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm16\n" + + "vbroadcastsd -48(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm7\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm17\n" + + "vbroadcastsd -40(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm8\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm18\n" + "add $64, %[AO]\n" + "add $64, %[A1]\n" + "add $64, %[BO]\n" + "prefetch 512(%[AO])\n" + "prefetch 512(%[A1])\n" + "prefetch 512(%[BO])\n" + "subl $1, %[kloop]\n" + "jg .label16\n" + /* multiply the result by alpha */ + "vbroadcastsd (%[alpha]), %%zmm9\n" + "vmulpd %%zmm9, %%zmm1, %%zmm1\n" + "vmulpd %%zmm9, %%zmm2, %%zmm2\n" + "vmulpd %%zmm9, %%zmm3, %%zmm3\n" + "vmulpd %%zmm9, %%zmm4, %%zmm4\n" + "vmulpd %%zmm9, %%zmm5, %%zmm5\n" + "vmulpd %%zmm9, %%zmm6, %%zmm6\n" + "vmulpd %%zmm9, %%zmm7, %%zmm7\n" + "vmulpd %%zmm9, %%zmm8, %%zmm8\n" + "vmulpd %%zmm9, %%zmm11, %%zmm11\n" + "vmulpd %%zmm9, %%zmm12, %%zmm12\n" + "vmulpd %%zmm9, %%zmm13, %%zmm13\n" + "vmulpd %%zmm9, %%zmm14, %%zmm14\n" + "vmulpd %%zmm9, %%zmm15, %%zmm15\n" + "vmulpd %%zmm9, %%zmm16, %%zmm16\n" + "vmulpd %%zmm9, %%zmm17, %%zmm17\n" + "vmulpd %%zmm9, %%zmm18, %%zmm18\n" + /* And store additively in C */ + "vaddpd (%[C0]), %%zmm1, %%zmm1\n" + "vaddpd (%[C1]), %%zmm2, %%zmm2\n" + "vaddpd (%[C2]), %%zmm3, %%zmm3\n" + "vaddpd (%[C3]), %%zmm4, %%zmm4\n" + "vaddpd (%[C4]), %%zmm5, %%zmm5\n" + "vaddpd (%[C5]), %%zmm6, %%zmm6\n" + "vaddpd (%[C6]), 
%%zmm7, %%zmm7\n" + "vaddpd (%[C7]), %%zmm8, %%zmm8\n" + "vmovupd %%zmm1, (%[C0])\n" + "vmovupd %%zmm2, (%[C1])\n" + "vmovupd %%zmm3, (%[C2])\n" + "vmovupd %%zmm4, (%[C3])\n" + "vmovupd %%zmm5, (%[C4])\n" + "vmovupd %%zmm6, (%[C5])\n" + "vmovupd %%zmm7, (%[C6])\n" + "vmovupd %%zmm8, (%[C7])\n" + + "vaddpd 64(%[C0]), %%zmm11, %%zmm11\n" + "vaddpd 64(%[C1]), %%zmm12, %%zmm12\n" + "vaddpd 64(%[C2]), %%zmm13, %%zmm13\n" + "vaddpd 64(%[C3]), %%zmm14, %%zmm14\n" + "vaddpd 64(%[C4]), %%zmm15, %%zmm15\n" + "vaddpd 64(%[C5]), %%zmm16, %%zmm16\n" + "vaddpd 64(%[C6]), %%zmm17, %%zmm17\n" + "vaddpd 64(%[C7]), %%zmm18, %%zmm18\n" + "vmovupd %%zmm11, 64(%[C0])\n" + "vmovupd %%zmm12, 64(%[C1])\n" + "vmovupd %%zmm13, 64(%[C2])\n" + "vmovupd %%zmm14, 64(%[C3])\n" + "vmovupd %%zmm15, 64(%[C4])\n" + "vmovupd %%zmm16, 64(%[C5])\n" + "vmovupd %%zmm17, 64(%[C6])\n" + "vmovupd %%zmm18, 64(%[C7])\n" + + : + [AO] "+r" (AO), + [A1] "+r" (A1), + [BO] "+r" (BO), + [C0] "+r" (CO1), + [kloop] "+r" (kloop) + : + [alpha] "r" (&alpha), + [C1] "r" (CO1 + 1 * ldc), + [C2] "r" (CO1 + 2 * ldc), + [C3] "r" (CO1 + 3 * ldc), + [C4] "r" (CO1 + 4 * ldc), + [C5] "r" (CO1 + 5 * ldc), + [C6] "r" (CO1 + 6 * ldc), + [C7] "r" (CO1 + 7 * ldc) + + : "memory", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", + "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18" + ); + CO1 += 16; + AO += 8 * K; + i-= 16; + } + while (i >= 8) { double *BO; int kloop = K; BO = B + 12; /* - * This is the inner loop for the hot hot path + * This is the inner loop for the hot hot path * Written in inline asm because compilers like GCC 8 and earlier * struggle with register allocation and are not good at using - * the AVX512 built in broadcast ability (1to8) + * the AVX512 built in broadcast ability (1to8) */ asm( "vxorpd %%zmm1, %%zmm1, %%zmm1\n" From 66b43affbc24a69e841930d18c30758542aa381c Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 5 Oct 2018 13:22:21 +0000 Subject: [PATCH 058/236] Add a 24x8 kernel to the skylakex dgemm implementation Minor gains for small matrixes, but at 512x512 and above the gain gets more significant. 
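
As a rough register-budget note (inferred from the asm below, not stated in
the commit), the choice of a 24-row tile falls out of the 32 zmm registers
available:

/*    8 columns of B x 3 eight-wide strips of A = 24 zmm accumulators
 *  +  3 zmm registers for the current A vectors (zmm0, zmm10, zmm20)
 *  +  1 zmm register for the broadcast B value (zmm9)
 *  = 28 of the 32 zmm registers, so a still wider M tile would start
 *    spilling accumulators to memory.
 */
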
--- kernel/x86_64/dgemm_kernel_4x8_skylakex.c | 201 ++++++++++++++++++++++ 1 file changed, 201 insertions(+) diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c index 09d48f99a..293bd4a99 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c @@ -849,6 +849,207 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, i = m; + while (i >= 24) { + double *BO; + double *A1, *A2; + int kloop = K; + + BO = B + 12; + A1 = AO + 8 * K; + A2 = AO + 16 * K; + /* + * This is the inner loop for the hot hot path + * Written in inline asm because compilers like GCC 8 and earlier + * struggle with register allocation and are not good at using + * the AVX512 built in broadcast ability (1to8) + */ + asm( + "vxorpd %%zmm1, %%zmm1, %%zmm1\n" + "vmovapd %%zmm1, %%zmm2\n" + "vmovapd %%zmm1, %%zmm3\n" + "vmovapd %%zmm1, %%zmm4\n" + "vmovapd %%zmm1, %%zmm5\n" + "vmovapd %%zmm1, %%zmm6\n" + "vmovapd %%zmm1, %%zmm7\n" + "vmovapd %%zmm1, %%zmm8\n" + "vmovapd %%zmm1, %%zmm11\n" + "vmovapd %%zmm1, %%zmm12\n" + "vmovapd %%zmm1, %%zmm13\n" + "vmovapd %%zmm1, %%zmm14\n" + "vmovapd %%zmm1, %%zmm15\n" + "vmovapd %%zmm1, %%zmm16\n" + "vmovapd %%zmm1, %%zmm17\n" + "vmovapd %%zmm1, %%zmm18\n" + "vmovapd %%zmm1, %%zmm21\n" + "vmovapd %%zmm1, %%zmm22\n" + "vmovapd %%zmm1, %%zmm23\n" + "vmovapd %%zmm1, %%zmm24\n" + "vmovapd %%zmm1, %%zmm25\n" + "vmovapd %%zmm1, %%zmm26\n" + "vmovapd %%zmm1, %%zmm27\n" + "vmovapd %%zmm1, %%zmm28\n" + "jmp .label24\n" + ".align 32\n" + /* Inner math loop */ + ".label24:\n" + "vmovupd -128(%[AO]),%%zmm0\n" + "vmovupd -128(%[A1]),%%zmm10\n" + "vmovupd -128(%[A2]),%%zmm20\n" + + "vbroadcastsd -96(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm1\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm11\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm21\n" + + "vbroadcastsd -88(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm2\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm12\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm22\n" + + "vbroadcastsd -80(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm3\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm13\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm23\n" + + "vbroadcastsd -72(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm4\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm14\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm24\n" + + "vbroadcastsd -64(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm5\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm15\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm25\n" + + "vbroadcastsd -56(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm6\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm16\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm26\n" + + "vbroadcastsd -48(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm7\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm17\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm27\n" + + "vbroadcastsd -40(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm8\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm18\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm28\n" + "add $64, %[AO]\n" + "add $64, %[A1]\n" + "add $64, %[A2]\n" + "add $64, %[BO]\n" + "prefetch 512(%[AO])\n" + "prefetch 512(%[A1])\n" + "prefetch 512(%[A2])\n" + "prefetch 512(%[BO])\n" + "subl $1, %[kloop]\n" + "jg .label24\n" + /* multiply the result by alpha */ + "vbroadcastsd (%[alpha]), %%zmm9\n" + "vmulpd %%zmm9, %%zmm1, %%zmm1\n" + "vmulpd %%zmm9, %%zmm2, %%zmm2\n" + "vmulpd %%zmm9, %%zmm3, %%zmm3\n" + "vmulpd %%zmm9, %%zmm4, %%zmm4\n" + "vmulpd %%zmm9, %%zmm5, %%zmm5\n" + "vmulpd %%zmm9, 
%%zmm6, %%zmm6\n" + "vmulpd %%zmm9, %%zmm7, %%zmm7\n" + "vmulpd %%zmm9, %%zmm8, %%zmm8\n" + "vmulpd %%zmm9, %%zmm11, %%zmm11\n" + "vmulpd %%zmm9, %%zmm12, %%zmm12\n" + "vmulpd %%zmm9, %%zmm13, %%zmm13\n" + "vmulpd %%zmm9, %%zmm14, %%zmm14\n" + "vmulpd %%zmm9, %%zmm15, %%zmm15\n" + "vmulpd %%zmm9, %%zmm16, %%zmm16\n" + "vmulpd %%zmm9, %%zmm17, %%zmm17\n" + "vmulpd %%zmm9, %%zmm18, %%zmm18\n" + "vmulpd %%zmm9, %%zmm21, %%zmm21\n" + "vmulpd %%zmm9, %%zmm22, %%zmm22\n" + "vmulpd %%zmm9, %%zmm23, %%zmm23\n" + "vmulpd %%zmm9, %%zmm24, %%zmm24\n" + "vmulpd %%zmm9, %%zmm25, %%zmm25\n" + "vmulpd %%zmm9, %%zmm26, %%zmm26\n" + "vmulpd %%zmm9, %%zmm27, %%zmm27\n" + "vmulpd %%zmm9, %%zmm28, %%zmm28\n" + /* And store additively in C */ + "vaddpd (%[C0]), %%zmm1, %%zmm1\n" + "vaddpd (%[C1]), %%zmm2, %%zmm2\n" + "vaddpd (%[C2]), %%zmm3, %%zmm3\n" + "vaddpd (%[C3]), %%zmm4, %%zmm4\n" + "vaddpd (%[C4]), %%zmm5, %%zmm5\n" + "vaddpd (%[C5]), %%zmm6, %%zmm6\n" + "vaddpd (%[C6]), %%zmm7, %%zmm7\n" + "vaddpd (%[C7]), %%zmm8, %%zmm8\n" + "vmovupd %%zmm1, (%[C0])\n" + "vmovupd %%zmm2, (%[C1])\n" + "vmovupd %%zmm3, (%[C2])\n" + "vmovupd %%zmm4, (%[C3])\n" + "vmovupd %%zmm5, (%[C4])\n" + "vmovupd %%zmm6, (%[C5])\n" + "vmovupd %%zmm7, (%[C6])\n" + "vmovupd %%zmm8, (%[C7])\n" + + "vaddpd 64(%[C0]), %%zmm11, %%zmm11\n" + "vaddpd 64(%[C1]), %%zmm12, %%zmm12\n" + "vaddpd 64(%[C2]), %%zmm13, %%zmm13\n" + "vaddpd 64(%[C3]), %%zmm14, %%zmm14\n" + "vaddpd 64(%[C4]), %%zmm15, %%zmm15\n" + "vaddpd 64(%[C5]), %%zmm16, %%zmm16\n" + "vaddpd 64(%[C6]), %%zmm17, %%zmm17\n" + "vaddpd 64(%[C7]), %%zmm18, %%zmm18\n" + "vmovupd %%zmm11, 64(%[C0])\n" + "vmovupd %%zmm12, 64(%[C1])\n" + "vmovupd %%zmm13, 64(%[C2])\n" + "vmovupd %%zmm14, 64(%[C3])\n" + "vmovupd %%zmm15, 64(%[C4])\n" + "vmovupd %%zmm16, 64(%[C5])\n" + "vmovupd %%zmm17, 64(%[C6])\n" + "vmovupd %%zmm18, 64(%[C7])\n" + + "vaddpd 128(%[C0]), %%zmm21, %%zmm21\n" + "vaddpd 128(%[C1]), %%zmm22, %%zmm22\n" + "vaddpd 128(%[C2]), %%zmm23, %%zmm23\n" + "vaddpd 128(%[C3]), %%zmm24, %%zmm24\n" + "vaddpd 128(%[C4]), %%zmm25, %%zmm25\n" + "vaddpd 128(%[C5]), %%zmm26, %%zmm26\n" + "vaddpd 128(%[C6]), %%zmm27, %%zmm27\n" + "vaddpd 128(%[C7]), %%zmm28, %%zmm28\n" + "vmovupd %%zmm21, 128(%[C0])\n" + "vmovupd %%zmm22, 128(%[C1])\n" + "vmovupd %%zmm23, 128(%[C2])\n" + "vmovupd %%zmm24, 128(%[C3])\n" + "vmovupd %%zmm25, 128(%[C4])\n" + "vmovupd %%zmm26, 128(%[C5])\n" + "vmovupd %%zmm27, 128(%[C6])\n" + "vmovupd %%zmm28, 128(%[C7])\n" + + : + [AO] "+r" (AO), + [A1] "+r" (A1), + [A2] "+r" (A2), + [BO] "+r" (BO), + [C0] "+r" (CO1), + [kloop] "+r" (kloop) + : + [alpha] "r" (&alpha), + [C1] "r" (CO1 + 1 * ldc), + [C2] "r" (CO1 + 2 * ldc), + [C3] "r" (CO1 + 3 * ldc), + [C4] "r" (CO1 + 4 * ldc), + [C5] "r" (CO1 + 5 * ldc), + [C6] "r" (CO1 + 6 * ldc), + [C7] "r" (CO1 + 7 * ldc) + + : "memory", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", + "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", + "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28" + ); + CO1 += 24; + AO += 16 * K; + i-= 24; + } + + while (i >= 16) { double *BO; double *A1; From 79ea839b635d1fd84b6ce8a47e086f01d64198e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tiziano=20M=C3=BCller?= Date: Sat, 6 Oct 2018 14:10:02 +0200 Subject: [PATCH 059/236] fix parallel build issues with APFS/HFS+/ext2/3 in netlib-lapack The problem is that OpenBLAS sets the LAPACKE_LIB and the TMGLIB to the same object and uses the `ar` feature to update the archive file. 
If the underlying filesystem does not have sub-second timestamp resolution and the system is fast enough (or `ccache` is used), the timestamp of the builds which should be added to the previously generated archive is the same as the archive file itself and therefore `make` does not update the archive. Since OpenBLAS takes care to not run the different targets updating the archive in parallel, the easiest solution is to declare the respective targets `.PHONY`, forcing `make` to always update them. fixes #1682 --- lapack-netlib/LAPACKE/src/Makefile | 2 ++ lapack-netlib/SRC/Makefile | 2 ++ lapack-netlib/TESTING/MATGEN/Makefile | 2 ++ 3 files changed, 6 insertions(+) diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index 44884d4a5..7672f9f73 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -2454,6 +2454,8 @@ endif all: ../../$(LAPACKELIB) +.PHONY: ../../$(LAPACKELIB) + ../../$(LAPACKELIB): $(OBJ_A) $(OBJ_B) $(DEPRECATED) $(EXTENDED) $(MATGEN) $(ARCH) $(ARCHFLAGS) $@ $(OBJ_A) $(ARCH) $(ARCHFLAGS) $@ $(OBJ_B) diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 531cb51fc..87a8f51e4 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -552,6 +552,8 @@ endif all: ../$(LAPACKLIB) +.PHONY: ../$(LAPACKLIB) + ../$(LAPACKLIB): $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) $(RANLIB) $@ diff --git a/lapack-netlib/TESTING/MATGEN/Makefile b/lapack-netlib/TESTING/MATGEN/Makefile index e20004c2f..a1d784fa5 100644 --- a/lapack-netlib/TESTING/MATGEN/Makefile +++ b/lapack-netlib/TESTING/MATGEN/Makefile @@ -57,6 +57,8 @@ all: ../../$(TMGLIB) ALLOBJ = $(SMATGEN) $(CMATGEN) $(SCATGEN) $(DMATGEN) $(ZMATGEN) \ $(DZATGEN) +.PHONY: ../../$(TMGLIB) + ../../$(TMGLIB): $(ALLOBJ) $(ARCH) $(ARCHFLAGS) $@ $^ $(RANLIB) $@ From 474f7e9583a85630345458abb71b7246def3f10f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Oct 2018 14:28:04 +0200 Subject: [PATCH 060/236] Add SYMBOLPREFIX and -SUFFIX options and improve help output --- CMakeLists.txt | 114 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 96 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 97c3b7777..ca951d401 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,16 +15,21 @@ include(GNUInstallDirs) include(CMakePackageConfigHelpers) -set(OpenBLAS_LIBNAME openblas) - ####### if(MSVC) -option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) +option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) endif() -option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF) -option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF) -option(DYNAMIC_OLDER "Support older cpus with DYNAMIC_ARCH" OFF) -option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF) +option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) +option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64 only)" OFF) +option(DYNAMIC_OLDER "Include specific support for older cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) +option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) + +# Add a prefix or suffix to all exported symbol names in the shared library. 
+# Avoids conflicts with other BLAS libraries, especially when using +# 64 bit integer interfaces in OpenBLAS. + +set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in the shared library to avoid conflicts with other BLAS libraries" ) +set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" ) ####### if(BUILD_WITHOUT_LAPACK) set(NO_LAPACK 1) @@ -38,11 +43,13 @@ endif() ####### -message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.") +message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") +set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE}) + set(BLASDIRS interface driver/level2 driver/level3 driver/others) if (NOT DYNAMIC_ARCH) @@ -210,15 +217,84 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES SOVERSION ${OpenBLAS_MAJOR_VERSION} ) +if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "") +if (NOT DEFINED ARCH) + set(ARCH_IN "x86_64") +else() + set(ARCH_IN ${ARCH}) +endif() + +if (${CORE} STREQUAL "generic") + set(ARCH_IN "GENERIC") +endif () + +if (NOT DEFINED EXPRECISION) + set(EXPRECISION_IN 0) +else() + set(EXPRECISION_IN ${EXPRECISION}) +endif() + +if (NOT DEFINED NO_CBLAS) + set(NO_CBLAS_IN 0) +else() + set(NO_CBLAS_IN ${NO_CBLAS}) +endif() + +if (NOT DEFINED NO_LAPACK) + set(NO_LAPACK_IN 0) +else() + set(NO_LAPACK_IN ${NO_LAPACK}) +endif() + +if (NOT DEFINED NO_LAPACKE) + set(NO_LAPACKE_IN 0) +else() + set(NO_LAPACKE_IN ${NO_LAPACKE}) +endif() + +if (NOT DEFINED NEED2UNDERSCORES) + set(NEED2UNDERSCORES_IN 0) +else() + set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES}) +endif() + +if (NOT DEFINED ONLY_CBLAS) + set(ONLY_CBLAS_IN 0) +else() + set(ONLY_CBLAS_IN ${ONLY_CBLAS}) +endif() + +if (NOT DEFINED BU) + set(BU _) +endif() + +if (NOT ${SYMBOLPREFIX} STREQUAL "") +message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") +endif() +if (NOT ${SYMBOLSUFFIX} STREQUAL "") +message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") +endif() + add_custom_command(TARGET ${OpenBLAS_LIBNAME} POST_BUILD + COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def + COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so + COMMENT "renaming symbols" + ) +endif() + + # Install project # Install libraries install(TARGETS ${OpenBLAS_LIBNAME} - EXPORT "OpenBLASTargets" + EXPORT "OpenBLAS${SUFFIX64}Targets" RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) +# Install headers +set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) +set(CMAKE_INSTALL_FULL_INCLUDEDIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}) + message(STATUS "Generating openblas_config.h in ${CMAKE_INSTALL_INCLUDEDIR}") set(OPENBLAS_CONFIG_H ${CMAKE_BINARY_DIR}/openblas_config.h) @@ -266,29 +342,31 @@ if(NOT 
NO_LAPACKE) ADD_CUSTOM_TARGET(genlapacke COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" ) - install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) endif() include(FindPkgConfig QUIET) if(PKG_CONFIG_FOUND) - configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY) - install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) + configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY) + install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) endif() # GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". set(PN OpenBLAS) -set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}") +set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}") configure_package_config_file(cmake/${PN}Config.cmake.in - "${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake" INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR}) write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake VERSION ${${PN}_VERSION} COMPATIBILITY AnyNewerVersion) -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake - ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake DESTINATION ${CMAKECONFIG_INSTALL_DIR}) -install(EXPORT "${PN}Targets" - NAMESPACE "${PN}::" +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake + RENAME ${PN}${SUFFIX64}ConfigVersion.cmake + DESTINATION ${CMAKECONFIG_INSTALL_DIR}) +install(EXPORT "${PN}${SUFFIX64}Targets" + NAMESPACE "${PN}${SUFFIX64}::" DESTINATION ${CMAKECONFIG_INSTALL_DIR}) From d74dc39b0faeebb7aeb97e4099dcb50a1fcc7533 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 6 Oct 2018 13:47:20 +0000 Subject: [PATCH 061/236] Add optimized *copy versions for skylakex Add optimized n/t copy versions for skylakex; in the patch the tcopy is also rewritten using intrinsics; the ncopy file will be worked on in a future commit --- kernel/x86_64/KERNEL.SKYLAKEX | 8 +- kernel/x86_64/dgemm_ncopy_8_skylakex.c | 422 +++++++++++++++++++++++++ kernel/x86_64/dgemm_tcopy_8_skylakex.c | 417 ++++++++++++++++++++++++ 3 files changed, 843 insertions(+), 4 deletions(-) create mode 100644 kernel/x86_64/dgemm_ncopy_8_skylakex.c create mode 100644 kernel/x86_64/dgemm_tcopy_8_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index ba149512d..e34cda770 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -4,10 +4,10 @@ SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c -DGEMMINCOPY = ../generic/gemm_ncopy_8.c -DGEMMITCOPY = ../generic/gemm_tcopy_8.c -DGEMMONCOPY = ../generic/gemm_ncopy_8.c -DGEMMOTCOPY = ../generic/gemm_tcopy_8.c +DGEMMINCOPY = dgemm_ncopy_8_skylakex.c +DGEMMITCOPY = dgemm_tcopy_8_skylakex.c +DGEMMONCOPY = dgemm_ncopy_8_skylakex.c +DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c SGEMM_BETA = ../generic/gemm_beta.c DGEMM_BETA = ../generic/gemm_beta.c diff --git a/kernel/x86_64/dgemm_ncopy_8_skylakex.c b/kernel/x86_64/dgemm_ncopy_8_skylakex.c new file mode 100644 index 000000000..3bc55b8cc --- 
/dev/null +++ b/kernel/x86_64/dgemm_ncopy_8_skylakex.c @@ -0,0 +1,422 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __restrict b){ + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + + FLOAT *boffset; + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + FLOAT ctemp33, ctemp34, ctemp35, ctemp36; + FLOAT ctemp37, ctemp38, ctemp39, ctemp40; + FLOAT ctemp41, ctemp42, ctemp43, ctemp44; + FLOAT ctemp45, ctemp46, ctemp47, ctemp48; + FLOAT ctemp49, ctemp50, ctemp51, ctemp52; + FLOAT ctemp53, ctemp54, ctemp55, ctemp56; + FLOAT ctemp57, ctemp58, ctemp59, ctemp60; + FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + + + aoffset = a; + boffset = b; + + j = (n >> 3); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + i = (m >> 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + ctemp17 = *(aoffset3 + 0); + ctemp18 = *(aoffset3 + 1); + ctemp19 = *(aoffset3 + 2); + ctemp20 = *(aoffset3 + 3); + ctemp21 = *(aoffset3 + 4); + ctemp22 = *(aoffset3 + 5); + ctemp23 = *(aoffset3 + 6); + ctemp24 = *(aoffset3 + 7); + + ctemp25 = *(aoffset4 + 0); + ctemp26 = *(aoffset4 + 1); + ctemp27 = *(aoffset4 + 2); + ctemp28 = *(aoffset4 + 3); + ctemp29 = *(aoffset4 + 4); + ctemp30 = *(aoffset4 + 5); + ctemp31 = *(aoffset4 + 6); + ctemp32 = *(aoffset4 + 7); + + ctemp33 = *(aoffset5 + 0); + ctemp34 = *(aoffset5 + 1); + ctemp35 = *(aoffset5 + 2); + ctemp36 = *(aoffset5 + 3); + ctemp37 = *(aoffset5 + 4); + ctemp38 = *(aoffset5 + 5); + ctemp39 = *(aoffset5 + 6); + ctemp40 = *(aoffset5 + 7); + + ctemp41 = *(aoffset6 + 0); + ctemp42 = *(aoffset6 + 1); + ctemp43 = *(aoffset6 + 2); + ctemp44 = *(aoffset6 + 3); + ctemp45 = *(aoffset6 + 4); + ctemp46 = *(aoffset6 + 5); + ctemp47 = *(aoffset6 + 6); + ctemp48 = *(aoffset6 + 7); + + ctemp49 = *(aoffset7 + 0); + ctemp50 = *(aoffset7 + 1); + ctemp51 = *(aoffset7 + 2); + ctemp52 = *(aoffset7 + 3); + ctemp53 = *(aoffset7 + 4); + ctemp54 = *(aoffset7 + 5); + ctemp55 = *(aoffset7 + 6); + ctemp56 = *(aoffset7 + 7); + + ctemp57 = *(aoffset8 + 0); + ctemp58 = *(aoffset8 + 1); + ctemp59 = *(aoffset8 + 2); + ctemp60 = *(aoffset8 + 3); + ctemp61 = *(aoffset8 + 4); + ctemp62 = *(aoffset8 + 5); + ctemp63 = *(aoffset8 + 6); + ctemp64 = *(aoffset8 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp09; + *(boffset + 2) = ctemp17; + *(boffset + 3) = ctemp25; + *(boffset + 4) = ctemp33; + *(boffset + 5) = ctemp41; + *(boffset + 6) = ctemp49; + *(boffset + 7) = ctemp57; + + *(boffset + 8) = ctemp02; + 
*(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp18; + *(boffset + 11) = ctemp26; + *(boffset + 12) = ctemp34; + *(boffset + 13) = ctemp42; + *(boffset + 14) = ctemp50; + *(boffset + 15) = ctemp58; + + *(boffset + 16) = ctemp03; + *(boffset + 17) = ctemp11; + *(boffset + 18) = ctemp19; + *(boffset + 19) = ctemp27; + *(boffset + 20) = ctemp35; + *(boffset + 21) = ctemp43; + *(boffset + 22) = ctemp51; + *(boffset + 23) = ctemp59; + + *(boffset + 24) = ctemp04; + *(boffset + 25) = ctemp12; + *(boffset + 26) = ctemp20; + *(boffset + 27) = ctemp28; + *(boffset + 28) = ctemp36; + *(boffset + 29) = ctemp44; + *(boffset + 30) = ctemp52; + *(boffset + 31) = ctemp60; + + *(boffset + 32) = ctemp05; + *(boffset + 33) = ctemp13; + *(boffset + 34) = ctemp21; + *(boffset + 35) = ctemp29; + *(boffset + 36) = ctemp37; + *(boffset + 37) = ctemp45; + *(boffset + 38) = ctemp53; + *(boffset + 39) = ctemp61; + + *(boffset + 40) = ctemp06; + *(boffset + 41) = ctemp14; + *(boffset + 42) = ctemp22; + *(boffset + 43) = ctemp30; + *(boffset + 44) = ctemp38; + *(boffset + 45) = ctemp46; + *(boffset + 46) = ctemp54; + *(boffset + 47) = ctemp62; + + *(boffset + 48) = ctemp07; + *(boffset + 49) = ctemp15; + *(boffset + 50) = ctemp23; + *(boffset + 51) = ctemp31; + *(boffset + 52) = ctemp39; + *(boffset + 53) = ctemp47; + *(boffset + 54) = ctemp55; + *(boffset + 55) = ctemp63; + + *(boffset + 56) = ctemp08; + *(boffset + 57) = ctemp16; + *(boffset + 58) = ctemp24; + *(boffset + 59) = ctemp32; + *(boffset + 60) = ctemp40; + *(boffset + 61) = ctemp48; + *(boffset + 62) = ctemp56; + *(boffset + 63) = ctemp64; + + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + aoffset5 += 8; + aoffset6 += 8; + aoffset7 += 8; + aoffset8 += 8; + boffset += 64; + i --; + }while(i > 0); + } + + i = (m & 7); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp09 = *(aoffset2 + 0); + ctemp17 = *(aoffset3 + 0); + ctemp25 = *(aoffset4 + 0); + ctemp33 = *(aoffset5 + 0); + ctemp41 = *(aoffset6 + 0); + ctemp49 = *(aoffset7 + 0); + ctemp57 = *(aoffset8 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp09; + *(boffset + 2) = ctemp17; + *(boffset + 3) = ctemp25; + *(boffset + 4) = ctemp33; + *(boffset + 5) = ctemp41; + *(boffset + 6) = ctemp49; + *(boffset + 7) = ctemp57; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + aoffset5 ++; + aoffset6 ++; + aoffset7 ++; + aoffset8 ++; + + boffset += 8; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp05; + *(boffset + 2) = ctemp09; + *(boffset + 3) = ctemp13; + + *(boffset + 4) = ctemp02; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp10; + *(boffset + 7) = ctemp14; + + *(boffset + 8) = ctemp03; + *(boffset + 9) = ctemp07; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp15; + + *(boffset + 12) = ctemp04; + *(boffset + 13) = 
ctemp08; + *(boffset + 14) = ctemp12; + *(boffset + 15) = ctemp16; + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + boffset += 16; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + ctemp03 = *(aoffset3 + 0); + ctemp04 = *(aoffset4 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + + boffset += 4; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp02; + *(boffset + 3) = ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 ++; + aoffset2 ++; + boffset += 2; + } + } /* end of if(j > 0) */ + + if (n & 1){ + aoffset1 = aoffset; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + + *(boffset + 0) = ctemp01; + + aoffset1 ++; + boffset ++; + i --; + }while(i > 0); + } + + } /* end of if(j > 0) */ + + return 0; +} diff --git a/kernel/x86_64/dgemm_tcopy_8_skylakex.c b/kernel/x86_64/dgemm_tcopy_8_skylakex.c new file mode 100644 index 000000000..472ad6349 --- /dev/null +++ b/kernel/x86_64/dgemm_tcopy_8_skylakex.c @@ -0,0 +1,417 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __restrict b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + + FLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + + aoffset = a; + boffset = b; + +#if 0 + fprintf(stderr, "M = %d N = %d\n", m, n); +#endif + + boffset2 = b + m * (n & ~7); + boffset3 = b + m * (n & ~3); + boffset4 = b + m * (n & ~1); + + j = (m >> 3); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + boffset1 = boffset; + boffset += 64; + + i = (n >> 3); + if (i > 0){ + do{ + __m512d row1, row2, row3, row4, row5, row6, row7, row8; + row1 = _mm512_loadu_pd(aoffset1); + aoffset1 += 8; + row2 = _mm512_loadu_pd(aoffset2); + aoffset2 += 8; + row3 = _mm512_loadu_pd(aoffset3); + aoffset3 += 8; + row4 = _mm512_loadu_pd(aoffset4); + aoffset4 += 8; + row5 = _mm512_loadu_pd(aoffset5); + aoffset5 += 8; + row6 = _mm512_loadu_pd(aoffset6); + aoffset6 += 8; + row7 = _mm512_loadu_pd(aoffset7); + aoffset7 += 8; + row8 = _mm512_loadu_pd(aoffset8); + aoffset8 += 8; + + _mm512_storeu_pd(boffset1 + 0, row1); + _mm512_storeu_pd(boffset1 + 8, row2); + _mm512_storeu_pd(boffset1 + 16, row3); + _mm512_storeu_pd(boffset1 + 24, row4); + _mm512_storeu_pd(boffset1 + 32, row5); + _mm512_storeu_pd(boffset1 + 40, row6); + _mm512_storeu_pd(boffset1 + 48, row7); + _mm512_storeu_pd(boffset1 + 56, row8); + boffset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 4){ + __m256d row1, row2, row3, row4, row5, row6, row7, row8; + row1 = _mm256_loadu_pd(aoffset1); + aoffset1 += 4; + row2 = _mm256_loadu_pd(aoffset2); + aoffset2 += 4; + row3 = _mm256_loadu_pd(aoffset3); + aoffset3 += 4; + row4 = _mm256_loadu_pd(aoffset4); + aoffset4 += 4; + row5 = _mm256_loadu_pd(aoffset5); + aoffset5 += 4; + row6 = _mm256_loadu_pd(aoffset6); + aoffset6 += 4; + row7 = _mm256_loadu_pd(aoffset7); + aoffset7 += 4; + row8 = _mm256_loadu_pd(aoffset8); + aoffset8 += 4; + + _mm256_storeu_pd(boffset2 + 0, row1); + _mm256_storeu_pd(boffset2 + 4, row2); + _mm256_storeu_pd(boffset2 + 8, row3); + _mm256_storeu_pd(boffset2 + 12, row4); + _mm256_storeu_pd(boffset2 + 16, row5); + _mm256_storeu_pd(boffset2 + 20, row6); + _mm256_storeu_pd(boffset2 + 24, row7); + _mm256_storeu_pd(boffset2 + 28, row8); + boffset2 += 32; + } + + if (n & 2){ + __m128d row1, row2, row3, row4, row5, row6, row7, row8; + row1 = _mm_loadu_pd(aoffset1); + aoffset1 += 2; + + row2 = _mm_loadu_pd(aoffset2); + aoffset2 += 2; + + row3 = _mm_loadu_pd(aoffset3); + aoffset3 += 2; + + row4 = _mm_loadu_pd(aoffset4); + aoffset4 += 2; + + row5 = _mm_loadu_pd(aoffset5); + aoffset5 += 2; + + row6 = _mm_loadu_pd(aoffset6); + aoffset6 += 2; + + row7 = _mm_loadu_pd(aoffset7); + aoffset7 += 2; + + row8 = _mm_loadu_pd(aoffset8); + aoffset8 += 2; + + _mm_storeu_pd(boffset3 + 0, row1); + _mm_storeu_pd(boffset3 + 2, row2); + _mm_storeu_pd(boffset3 + 4, row3); + _mm_storeu_pd(boffset3 + 6, row4); + _mm_storeu_pd(boffset3 + 8, row5); + _mm_storeu_pd(boffset3 + 10, row6); + _mm_storeu_pd(boffset3 + 12, row7); + _mm_storeu_pd(boffset3 + 14, 
row8); + boffset3 += 16; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + ctemp03 = *(aoffset3 + 0); + aoffset3 ++; + ctemp04 = *(aoffset4 + 0); + aoffset4 ++; + ctemp05 = *(aoffset5 + 0); + aoffset5 ++; + ctemp06 = *(aoffset6 + 0); + aoffset6 ++; + ctemp07 = *(aoffset7 + 0); + aoffset7 ++; + ctemp08 = *(aoffset8 + 0); + aoffset8 ++; + + *(boffset4 + 0) = ctemp01; + *(boffset4 + 1) = ctemp02; + *(boffset4 + 2) = ctemp03; + *(boffset4 + 3) = ctemp04; + *(boffset4 + 4) = ctemp05; + *(boffset4 + 5) = ctemp06; + *(boffset4 + 6) = ctemp07; + *(boffset4 + 7) = ctemp08; + boffset4 += 8; + } + + j--; + }while(j > 0); + } + + if (m & 4){ + + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 32; + + i = (n >> 3); + if (i > 0){ + + do{ + __m512d row1, row2, row3, row4; + row1 = _mm512_loadu_pd(aoffset1); + aoffset1 += 8; + row2 = _mm512_loadu_pd(aoffset2); + aoffset2 += 8; + row3 = _mm512_loadu_pd(aoffset3); + aoffset3 += 8; + row4 = _mm512_loadu_pd(aoffset4); + aoffset4 += 8; + + _mm512_storeu_pd(boffset1 + 0, row1); + _mm512_storeu_pd(boffset1 + 8, row2); + _mm512_storeu_pd(boffset1 + 16, row3); + _mm512_storeu_pd(boffset1 + 24, row4); + + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4) { + __m256d row1, row2, row3, row4; + row1 = _mm256_loadu_pd(aoffset1); + aoffset1 += 4; + row2 = _mm256_loadu_pd(aoffset2); + aoffset2 += 4; + row3 = _mm256_loadu_pd(aoffset3); + aoffset3 += 4; + row4 = _mm256_loadu_pd(aoffset4); + aoffset4 += 4; + _mm256_storeu_pd(boffset2 + 0, row1); + _mm256_storeu_pd(boffset2 + 4, row2); + _mm256_storeu_pd(boffset2 + 8, row3); + _mm256_storeu_pd(boffset2 + 12, row4); + boffset2 += 16; + } + + if (n & 2){ + __m128d row1, row2, row3, row4; + row1 = _mm_loadu_pd(aoffset1); + aoffset1 += 2; + + row2 = _mm_loadu_pd(aoffset2); + aoffset2 += 2; + + row3 = _mm_loadu_pd(aoffset3); + aoffset3 += 2; + + row4 = _mm_loadu_pd(aoffset4); + aoffset4 += 2; + + + _mm_storeu_pd(boffset3 + 0, row1); + _mm_storeu_pd(boffset3 + 2, row2); + _mm_storeu_pd(boffset3 + 4, row3); + _mm_storeu_pd(boffset3 + 6, row4); + boffset3 += 8; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + ctemp03 = *(aoffset3 + 0); + aoffset3 ++; + ctemp04 = *(aoffset4 + 0); + aoffset4 ++; + + *(boffset4 + 0) = ctemp01; + *(boffset4 + 1) = ctemp02; + *(boffset4 + 2) = ctemp03; + *(boffset4 + 3) = ctemp04; + boffset4 += 4; + } + } + + if (m & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + boffset1 = boffset; + boffset += 16; + + i = (n >> 3); + if (i > 0){ + do{ + __m512d row1, row2; + row1 = _mm512_loadu_pd(aoffset1); + aoffset1 += 8; + row2 = _mm512_loadu_pd(aoffset2); + aoffset2 += 8; + + _mm512_storeu_pd(boffset1 + 0, row1); + _mm512_storeu_pd(boffset1 + 8, row2); + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4){ + __m256d row1, row2; + row1 = _mm256_loadu_pd(aoffset1); + aoffset1 += 4; + row2 = _mm256_loadu_pd(aoffset2); + aoffset2 += 4; + _mm256_storeu_pd(boffset2 + 0, row1); + _mm256_storeu_pd(boffset2 + 4, row2); + boffset2 += 8; + } + + if (n & 2){ + __m128d row1, row2; + row1 = _mm_loadu_pd(aoffset1); + aoffset1 += 2; + + row2 = _mm_loadu_pd(aoffset2); + aoffset2 += 2; + + + _mm_storeu_pd(boffset3 + 0, row1); + _mm_storeu_pd(boffset3 + 2, row2); + boffset3 += 4; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 
++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + + *(boffset4 + 0) = ctemp01; + *(boffset4 + 1) = ctemp02; + boffset4 += 2; + } + } + + if (m & 1){ + aoffset1 = aoffset; + // aoffset += lda; + + boffset1 = boffset; + // boffset += 8; + + i = (n >> 3); + if (i > 0){ + do{ + __m512d row1; + row1 = _mm512_loadu_pd(aoffset1); + aoffset1 += 8; + + _mm512_storeu_pd(boffset1 + 0, row1); + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4){ + __m256d row1; + row1 = _mm256_loadu_pd(aoffset1); + aoffset1 += 4; + _mm256_storeu_pd(boffset2 + 0, row1); + // boffset2 += 4; + } + + if (n & 2){ + __m128d row1; + row1 = _mm_loadu_pd(aoffset1); + aoffset1 += 2; + + _mm_storeu_pd(boffset3 + 0, row1); + + // boffset3 += 2; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + *(boffset4 + 0) = ctemp01; + boffset4 ++; + } + } + + return 0; +} From 6d43c51ccf7de3d0f41c2e2b382ada07159cf599 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 6 Oct 2018 14:00:37 +0000 Subject: [PATCH 062/236] undo slow dgemm/skylake microoptimization the compare is more costly than the work --- kernel/x86_64/dgemm_kernel_4x8_skylakex.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c index 293bd4a99..b5693ea2c 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c @@ -647,11 +647,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SAVE2x2(ALPHA) \ - if (ALPHA != 1.0) { \ - xmm0 = _mm_set1_pd(ALPHA); \ - xmm4 *= xmm0; \ - xmm6 *= xmm0; \ - } \ + xmm0 = _mm_set1_pd(ALPHA); \ + xmm4 *= xmm0; \ + xmm6 *= xmm0; \ \ xmm4 += _mm_loadu_pd(CO1); \ xmm6 += _mm_loadu_pd(CO1 + ldc); \ From 20c5d668fe316d6f431a34f8734600194644e736 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 6 Oct 2018 14:12:32 +0000 Subject: [PATCH 063/236] dgemm/avx512 simplify and speed up the 4x4 kernel --- kernel/x86_64/dgemm_kernel_4x8_skylakex.c | 26 ++++------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c index b5693ea2c..bb121ca69 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c @@ -333,17 +333,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define KERNEL4x4_SUB() \ ymm0 = _mm256_loadu_pd(AO - 16); \ - ymm1 = _mm256_loadu_pd(BO - 12); \ + ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 12)); \ \ ymm4 += ymm0 * ymm1; \ \ - ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \ + ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 11)); \ ymm5 += ymm0 * ymm1; \ \ - ymm0 = _mm256_permute4x64_pd(ymm0, 0x1b); \ + ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 10)); \ ymm6 += ymm0 * ymm1; \ \ - ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \ + ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 9)); \ ymm7 += ymm0 * ymm1; \ AO += 4; \ BO += 4; @@ -356,24 +356,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ymm6 *= ymm0; \ ymm7 *= ymm0; \ \ - ymm5 = _mm256_permute4x64_pd(ymm5, 0xb1); \ - ymm7 = _mm256_permute4x64_pd(ymm7, 0xb1); \ - \ - ymm0 = _mm256_blend_pd(ymm4, ymm5, 0x0a); \ - ymm1 = _mm256_blend_pd(ymm4, ymm5, 0x05); \ - ymm2 = _mm256_blend_pd(ymm6, ymm7, 0x0a); \ - ymm3 = _mm256_blend_pd(ymm6, ymm7, 0x05); \ - \ - ymm2 = _mm256_permute4x64_pd(ymm2, 0x1b); \ - ymm3 = _mm256_permute4x64_pd(ymm3, 0x1b); \ - ymm2 = _mm256_permute4x64_pd(ymm2, 0xb1); \ - ymm3 = _mm256_permute4x64_pd(ymm3, 0xb1); \ - \ - ymm4 = _mm256_blend_pd(ymm2, ymm0, 0x03); \ - ymm5 = _mm256_blend_pd(ymm3, ymm1, 0x03); \ - ymm6 = _mm256_blend_pd(ymm0, ymm2, 0x03); \ - ymm7 = _mm256_blend_pd(ymm1, ymm3, 0x03); \ - \ ymm4 += _mm256_loadu_pd(CO1 + (0 * ldc)); \ ymm5 += _mm256_loadu_pd(CO1 + (1 * ldc)); \ ymm6 += _mm256_loadu_pd(CO1 + (2 * ldc)); \ From 32bec8afbbdb94df4e5a4b127fa8aa5857fccc54 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 6 Oct 2018 16:36:26 +0000 Subject: [PATCH 064/236] add a skylakex optimized dgemm beta function --- kernel/x86_64/KERNEL.SKYLAKEX | 2 +- kernel/x86_64/dgemm_beta_skylakex.c | 150 ++++++++++++++++++++++++++++ 2 files changed, 151 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/dgemm_beta_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index e34cda770..48c81e80b 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -10,4 +10,4 @@ DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c SGEMM_BETA = ../generic/gemm_beta.c -DGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = dgemm_beta_skylakex.c diff --git a/kernel/x86_64/dgemm_beta_skylakex.c b/kernel/x86_64/dgemm_beta_skylakex.c new file mode 100644 index 000000000..384e9f60b --- /dev/null +++ b/kernel/x86_64/dgemm_beta_skylakex.c @@ -0,0 +1,150 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#include + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, + FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5, + FLOAT *c, BLASLONG ldc){ + + BLASLONG i, j; + FLOAT *c_offset1, *c_offset; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + + /* fast path.. just zero the whole matrix */ + if (m == ldc && (unsigned long)beta == (unsigned long)ZERO) { + memset(c, 0, m * n * sizeof(FLOAT)); + return 0; + } + + + c_offset = c; + + if (beta == ZERO){ + __m512d z_zero; + + z_zero = _mm512_setzero_pd(); + j = n; + do { + c_offset1 = c_offset; + c_offset += ldc; + + i = m; + + while (i > 32) { + _mm512_storeu_pd(c_offset1, z_zero); + _mm512_storeu_pd(c_offset1 + 8, z_zero); + _mm512_storeu_pd(c_offset1 + 16, z_zero); + _mm512_storeu_pd(c_offset1 + 24 , z_zero); + c_offset1 += 32; + i -= 32; + } + while (i > 8) { + _mm512_storeu_pd(c_offset1, z_zero); + c_offset1 += 8; + i -= 8; + } + + while (i > 0) { + *c_offset1 = ZERO; + c_offset1 ++; + i --; + } + j --; + } while (j > 0); + + } else { + + j = n; + do { + c_offset1 = c_offset; + c_offset += ldc; + + i = (m >> 3); + if (i > 0){ + do { + ctemp1 = *(c_offset1 + 0); + ctemp2 = *(c_offset1 + 1); + ctemp3 = *(c_offset1 + 2); + ctemp4 = *(c_offset1 + 3); + ctemp5 = *(c_offset1 + 4); + ctemp6 = *(c_offset1 + 5); + ctemp7 = *(c_offset1 + 6); + ctemp8 = *(c_offset1 + 7); + + ctemp1 *= beta; + ctemp2 *= beta; + ctemp3 *= beta; + ctemp4 *= beta; + ctemp5 *= beta; + ctemp6 *= beta; + ctemp7 *= beta; + ctemp8 *= beta; + + *(c_offset1 + 0) = ctemp1; + *(c_offset1 + 1) = ctemp2; + *(c_offset1 + 2) = ctemp3; + *(c_offset1 + 3) = ctemp4; + *(c_offset1 + 4) = ctemp5; + *(c_offset1 + 5) = ctemp6; + *(c_offset1 + 6) = ctemp7; + *(c_offset1 + 7) = ctemp8; + c_offset1 += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i > 0){ + do { + ctemp1 = *c_offset1; + ctemp1 *= beta; + *c_offset1 = ctemp1; + c_offset1 ++; + i --; + } while (i > 0); + } + j --; + } while (j > 0); + + } + return 0; +}; From adbf6afa25ca5383d48df296262bb4f2bfc0e311 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 6 Oct 2018 21:18:12 +0000 Subject: [PATCH 065/236] Add vector optimizations for ncopy as well for dgemm/skylakex --- kernel/x86_64/dgemm_ncopy_8_skylakex.c | 201 ++++++++++++------------- 1 file changed, 100 insertions(+), 101 deletions(-) diff --git a/kernel/x86_64/dgemm_ncopy_8_skylakex.c b/kernel/x86_64/dgemm_ncopy_8_skylakex.c index 3bc55b8cc..74b336f3d 100644 --- a/kernel/x86_64/dgemm_ncopy_8_skylakex.c +++ b/kernel/x86_64/dgemm_ncopy_8_skylakex.c @@ -38,6 +38,7 @@ #include #include "common.h" +#include int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __restrict b){ BLASLONG i, j; @@ -84,131 +85,129 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __ i = (m >> 3); if (i > 0){ do{ - ctemp01 = *(aoffset1 + 0); - ctemp02 = *(aoffset1 + 1); - ctemp03 = *(aoffset1 + 2); - ctemp04 = *(aoffset1 + 3); - ctemp05 = *(aoffset1 + 4); - ctemp06 = *(aoffset1 + 5); + __m128d xmm0, xmm1; + xmm0 = _mm_load_pd1(aoffset2 + 0); + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 0); + _mm_storeu_pd(boffset + 0, xmm0); + 
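+	  /*
+	   * The _mm_load_pd1 / _mm_loadl_pd pair above gathers one element from
+	   * two consecutive source rows into a single xmm register (low lane from
+	   * aoffset1, high lane from aoffset2), so each transposed pair is written
+	   * back with one 128-bit store instead of two scalar loads and two scalar
+	   * stores.  The same gather/store pattern is repeated below for the
+	   * remaining row pairs and column offsets.
+	   */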
ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); - ctemp09 = *(aoffset2 + 0); - ctemp10 = *(aoffset2 + 1); - ctemp11 = *(aoffset2 + 2); - ctemp12 = *(aoffset2 + 3); - ctemp13 = *(aoffset2 + 4); - ctemp14 = *(aoffset2 + 5); + xmm1 = _mm_load_pd1(aoffset4 + 0); + xmm1 = _mm_loadl_pd(xmm1, aoffset3 + 0); + _mm_storeu_pd(boffset + 2, xmm1); + + xmm0 = _mm_load_pd1(aoffset6 + 0); + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 0); + _mm_storeu_pd(boffset + 4, xmm0); + + xmm0 = _mm_load_pd1(aoffset8 + 0); + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 0); + _mm_storeu_pd(boffset + 6, xmm0); + ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); - ctemp17 = *(aoffset3 + 0); - ctemp18 = *(aoffset3 + 1); - ctemp19 = *(aoffset3 + 2); - ctemp20 = *(aoffset3 + 3); - ctemp21 = *(aoffset3 + 4); - ctemp22 = *(aoffset3 + 5); + xmm0 = _mm_load_pd1(aoffset2 + 1); + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 1); + _mm_storeu_pd(boffset + 8, xmm0); + + xmm0 = _mm_load_pd1(aoffset4 + 1); + xmm0 = _mm_loadl_pd(xmm0, aoffset3 + 1); + _mm_storeu_pd(boffset + 10, xmm0); + + xmm0 = _mm_load_pd1(aoffset6 + 1); + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 1); + _mm_storeu_pd(boffset + 12, xmm0); + + xmm0 = _mm_load_pd1(aoffset8 + 1); + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 1); + _mm_storeu_pd(boffset + 14, xmm0); + + xmm0 = _mm_load_pd1(aoffset2 + 2); + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 2); + _mm_storeu_pd(boffset + 16, xmm0); + + xmm0 = _mm_load_pd1(aoffset4 + 2); + xmm0 = _mm_loadl_pd(xmm0, aoffset3 + 2); + _mm_storeu_pd(boffset + 18, xmm0); + + xmm0 = _mm_load_pd1(aoffset6 + 2); + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 2); + _mm_storeu_pd(boffset + 20, xmm0); + + xmm0 = _mm_load_pd1(aoffset8 + 2); + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 2); + _mm_storeu_pd(boffset + 22, xmm0); + ctemp23 = *(aoffset3 + 6); ctemp24 = *(aoffset3 + 7); - ctemp25 = *(aoffset4 + 0); - ctemp26 = *(aoffset4 + 1); - ctemp27 = *(aoffset4 + 2); - ctemp28 = *(aoffset4 + 3); - ctemp29 = *(aoffset4 + 4); - ctemp30 = *(aoffset4 + 5); + xmm0 = _mm_load_pd1(aoffset2 + 3); + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 3); + _mm_storeu_pd(boffset + 24, xmm0); + + xmm0 = _mm_load_pd1(aoffset4 + 3); + xmm0 = _mm_loadl_pd(xmm0, aoffset3 + 3); + _mm_storeu_pd(boffset + 26, xmm0); + + xmm0 = _mm_load_pd1(aoffset6 + 3); + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 3); + _mm_storeu_pd(boffset + 28, xmm0); + + xmm0 = _mm_load_pd1(aoffset8 + 3); + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 3); + _mm_storeu_pd(boffset + 30, xmm0); + ctemp31 = *(aoffset4 + 6); ctemp32 = *(aoffset4 + 7); - ctemp33 = *(aoffset5 + 0); - ctemp34 = *(aoffset5 + 1); - ctemp35 = *(aoffset5 + 2); - ctemp36 = *(aoffset5 + 3); - ctemp37 = *(aoffset5 + 4); - ctemp38 = *(aoffset5 + 5); + + xmm0 = _mm_load_pd1(aoffset2 + 4); + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 4); + _mm_storeu_pd(boffset + 32, xmm0); + + xmm0 = _mm_load_pd1(aoffset4 + 4); + xmm0 = _mm_loadl_pd(xmm0, aoffset3 + 4); + _mm_storeu_pd(boffset + 34, xmm0); + + xmm0 = _mm_load_pd1(aoffset6 + 4); + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 4); + _mm_storeu_pd(boffset + 36, xmm0); + + xmm0 = _mm_load_pd1(aoffset8 + 4); + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 4); + _mm_storeu_pd(boffset + 38, xmm0); + ctemp39 = *(aoffset5 + 6); ctemp40 = *(aoffset5 + 7); - ctemp41 = *(aoffset6 + 0); - ctemp42 = *(aoffset6 + 1); - ctemp43 = *(aoffset6 + 2); - ctemp44 = *(aoffset6 + 3); - ctemp45 = *(aoffset6 + 4); - ctemp46 = *(aoffset6 + 5); + xmm0 = _mm_load_pd1(aoffset2 + 5); + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 5); + _mm_storeu_pd(boffset + 40, xmm0); + + xmm0 = _mm_load_pd1(aoffset4 + 5); 
+ xmm0 = _mm_loadl_pd(xmm0, aoffset3 + 5); + _mm_storeu_pd(boffset + 42, xmm0); + + xmm0 = _mm_load_pd1(aoffset6 + 5); + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 5); + _mm_storeu_pd(boffset + 44, xmm0); + + xmm0 = _mm_load_pd1(aoffset8 + 5); + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 5); + _mm_storeu_pd(boffset + 46, xmm0); + + ctemp47 = *(aoffset6 + 6); ctemp48 = *(aoffset6 + 7); - ctemp49 = *(aoffset7 + 0); - ctemp50 = *(aoffset7 + 1); - ctemp51 = *(aoffset7 + 2); - ctemp52 = *(aoffset7 + 3); - ctemp53 = *(aoffset7 + 4); - ctemp54 = *(aoffset7 + 5); ctemp55 = *(aoffset7 + 6); ctemp56 = *(aoffset7 + 7); - ctemp57 = *(aoffset8 + 0); - ctemp58 = *(aoffset8 + 1); - ctemp59 = *(aoffset8 + 2); - ctemp60 = *(aoffset8 + 3); - ctemp61 = *(aoffset8 + 4); - ctemp62 = *(aoffset8 + 5); ctemp63 = *(aoffset8 + 6); ctemp64 = *(aoffset8 + 7); - *(boffset + 0) = ctemp01; - *(boffset + 1) = ctemp09; - *(boffset + 2) = ctemp17; - *(boffset + 3) = ctemp25; - *(boffset + 4) = ctemp33; - *(boffset + 5) = ctemp41; - *(boffset + 6) = ctemp49; - *(boffset + 7) = ctemp57; - - *(boffset + 8) = ctemp02; - *(boffset + 9) = ctemp10; - *(boffset + 10) = ctemp18; - *(boffset + 11) = ctemp26; - *(boffset + 12) = ctemp34; - *(boffset + 13) = ctemp42; - *(boffset + 14) = ctemp50; - *(boffset + 15) = ctemp58; - - *(boffset + 16) = ctemp03; - *(boffset + 17) = ctemp11; - *(boffset + 18) = ctemp19; - *(boffset + 19) = ctemp27; - *(boffset + 20) = ctemp35; - *(boffset + 21) = ctemp43; - *(boffset + 22) = ctemp51; - *(boffset + 23) = ctemp59; - - *(boffset + 24) = ctemp04; - *(boffset + 25) = ctemp12; - *(boffset + 26) = ctemp20; - *(boffset + 27) = ctemp28; - *(boffset + 28) = ctemp36; - *(boffset + 29) = ctemp44; - *(boffset + 30) = ctemp52; - *(boffset + 31) = ctemp60; - - *(boffset + 32) = ctemp05; - *(boffset + 33) = ctemp13; - *(boffset + 34) = ctemp21; - *(boffset + 35) = ctemp29; - *(boffset + 36) = ctemp37; - *(boffset + 37) = ctemp45; - *(boffset + 38) = ctemp53; - *(boffset + 39) = ctemp61; - - *(boffset + 40) = ctemp06; - *(boffset + 41) = ctemp14; - *(boffset + 42) = ctemp22; - *(boffset + 43) = ctemp30; - *(boffset + 44) = ctemp38; - *(boffset + 45) = ctemp46; - *(boffset + 46) = ctemp54; - *(boffset + 47) = ctemp62; *(boffset + 48) = ctemp07; *(boffset + 49) = ctemp15; From 582c589727302938e99bf594bf072d3d9913575e Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 6 Oct 2018 23:13:26 +0000 Subject: [PATCH 066/236] dgemm/skylakex: replace discrete mul/add with fma very minor gains since it's not super hot code, but general principles --- kernel/x86_64/dgemm_kernel_4x8_skylakex.c | 155 +++++++--------------- 1 file changed, 49 insertions(+), 106 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c index bb121ca69..a83ca98fa 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c @@ -927,39 +927,15 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, "jg .label24\n" /* multiply the result by alpha */ "vbroadcastsd (%[alpha]), %%zmm9\n" - "vmulpd %%zmm9, %%zmm1, %%zmm1\n" - "vmulpd %%zmm9, %%zmm2, %%zmm2\n" - "vmulpd %%zmm9, %%zmm3, %%zmm3\n" - "vmulpd %%zmm9, %%zmm4, %%zmm4\n" - "vmulpd %%zmm9, %%zmm5, %%zmm5\n" - "vmulpd %%zmm9, %%zmm6, %%zmm6\n" - "vmulpd %%zmm9, %%zmm7, %%zmm7\n" - "vmulpd %%zmm9, %%zmm8, %%zmm8\n" - "vmulpd %%zmm9, %%zmm11, %%zmm11\n" - "vmulpd %%zmm9, %%zmm12, %%zmm12\n" - "vmulpd %%zmm9, %%zmm13, %%zmm13\n" - "vmulpd %%zmm9, %%zmm14, %%zmm14\n" - "vmulpd 
%%zmm9, %%zmm15, %%zmm15\n" - "vmulpd %%zmm9, %%zmm16, %%zmm16\n" - "vmulpd %%zmm9, %%zmm17, %%zmm17\n" - "vmulpd %%zmm9, %%zmm18, %%zmm18\n" - "vmulpd %%zmm9, %%zmm21, %%zmm21\n" - "vmulpd %%zmm9, %%zmm22, %%zmm22\n" - "vmulpd %%zmm9, %%zmm23, %%zmm23\n" - "vmulpd %%zmm9, %%zmm24, %%zmm24\n" - "vmulpd %%zmm9, %%zmm25, %%zmm25\n" - "vmulpd %%zmm9, %%zmm26, %%zmm26\n" - "vmulpd %%zmm9, %%zmm27, %%zmm27\n" - "vmulpd %%zmm9, %%zmm28, %%zmm28\n" /* And store additively in C */ - "vaddpd (%[C0]), %%zmm1, %%zmm1\n" - "vaddpd (%[C1]), %%zmm2, %%zmm2\n" - "vaddpd (%[C2]), %%zmm3, %%zmm3\n" - "vaddpd (%[C3]), %%zmm4, %%zmm4\n" - "vaddpd (%[C4]), %%zmm5, %%zmm5\n" - "vaddpd (%[C5]), %%zmm6, %%zmm6\n" - "vaddpd (%[C6]), %%zmm7, %%zmm7\n" - "vaddpd (%[C7]), %%zmm8, %%zmm8\n" + "vfmadd213pd (%[C0]), %%zmm9, %%zmm1\n" + "vfmadd213pd (%[C1]), %%zmm9, %%zmm2\n" + "vfmadd213pd (%[C2]), %%zmm9, %%zmm3\n" + "vfmadd213pd (%[C3]), %%zmm9, %%zmm4\n" + "vfmadd213pd (%[C4]), %%zmm9, %%zmm5\n" + "vfmadd213pd (%[C5]), %%zmm9, %%zmm6\n" + "vfmadd213pd (%[C6]), %%zmm9, %%zmm7\n" + "vfmadd213pd (%[C7]), %%zmm9, %%zmm8\n" "vmovupd %%zmm1, (%[C0])\n" "vmovupd %%zmm2, (%[C1])\n" "vmovupd %%zmm3, (%[C2])\n" @@ -969,14 +945,14 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, "vmovupd %%zmm7, (%[C6])\n" "vmovupd %%zmm8, (%[C7])\n" - "vaddpd 64(%[C0]), %%zmm11, %%zmm11\n" - "vaddpd 64(%[C1]), %%zmm12, %%zmm12\n" - "vaddpd 64(%[C2]), %%zmm13, %%zmm13\n" - "vaddpd 64(%[C3]), %%zmm14, %%zmm14\n" - "vaddpd 64(%[C4]), %%zmm15, %%zmm15\n" - "vaddpd 64(%[C5]), %%zmm16, %%zmm16\n" - "vaddpd 64(%[C6]), %%zmm17, %%zmm17\n" - "vaddpd 64(%[C7]), %%zmm18, %%zmm18\n" + "vfmadd213pd 64(%[C0]), %%zmm9, %%zmm11\n" + "vfmadd213pd 64(%[C1]), %%zmm9, %%zmm12\n" + "vfmadd213pd 64(%[C2]), %%zmm9, %%zmm13\n" + "vfmadd213pd 64(%[C3]), %%zmm9, %%zmm14\n" + "vfmadd213pd 64(%[C4]), %%zmm9, %%zmm15\n" + "vfmadd213pd 64(%[C5]), %%zmm9, %%zmm16\n" + "vfmadd213pd 64(%[C6]), %%zmm9, %%zmm17\n" + "vfmadd213pd 64(%[C7]), %%zmm9, %%zmm18\n" "vmovupd %%zmm11, 64(%[C0])\n" "vmovupd %%zmm12, 64(%[C1])\n" "vmovupd %%zmm13, 64(%[C2])\n" @@ -986,14 +962,14 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, "vmovupd %%zmm17, 64(%[C6])\n" "vmovupd %%zmm18, 64(%[C7])\n" - "vaddpd 128(%[C0]), %%zmm21, %%zmm21\n" - "vaddpd 128(%[C1]), %%zmm22, %%zmm22\n" - "vaddpd 128(%[C2]), %%zmm23, %%zmm23\n" - "vaddpd 128(%[C3]), %%zmm24, %%zmm24\n" - "vaddpd 128(%[C4]), %%zmm25, %%zmm25\n" - "vaddpd 128(%[C5]), %%zmm26, %%zmm26\n" - "vaddpd 128(%[C6]), %%zmm27, %%zmm27\n" - "vaddpd 128(%[C7]), %%zmm28, %%zmm28\n" + "vfmadd213pd 128(%[C0]), %%zmm9, %%zmm21\n" + "vfmadd213pd 128(%[C1]), %%zmm9, %%zmm22\n" + "vfmadd213pd 128(%[C2]), %%zmm9, %%zmm23\n" + "vfmadd213pd 128(%[C3]), %%zmm9, %%zmm24\n" + "vfmadd213pd 128(%[C4]), %%zmm9, %%zmm25\n" + "vfmadd213pd 128(%[C5]), %%zmm9, %%zmm26\n" + "vfmadd213pd 128(%[C6]), %%zmm9, %%zmm27\n" + "vfmadd213pd 128(%[C7]), %%zmm9, %%zmm28\n" "vmovupd %%zmm21, 128(%[C0])\n" "vmovupd %%zmm22, 128(%[C1])\n" "vmovupd %%zmm23, 128(%[C2])\n" @@ -1108,31 +1084,15 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, "jg .label16\n" /* multiply the result by alpha */ "vbroadcastsd (%[alpha]), %%zmm9\n" - "vmulpd %%zmm9, %%zmm1, %%zmm1\n" - "vmulpd %%zmm9, %%zmm2, %%zmm2\n" - "vmulpd %%zmm9, %%zmm3, %%zmm3\n" - "vmulpd %%zmm9, %%zmm4, %%zmm4\n" - "vmulpd %%zmm9, %%zmm5, %%zmm5\n" - "vmulpd %%zmm9, %%zmm6, %%zmm6\n" - "vmulpd %%zmm9, %%zmm7, %%zmm7\n" - "vmulpd 
%%zmm9, %%zmm8, %%zmm8\n" - "vmulpd %%zmm9, %%zmm11, %%zmm11\n" - "vmulpd %%zmm9, %%zmm12, %%zmm12\n" - "vmulpd %%zmm9, %%zmm13, %%zmm13\n" - "vmulpd %%zmm9, %%zmm14, %%zmm14\n" - "vmulpd %%zmm9, %%zmm15, %%zmm15\n" - "vmulpd %%zmm9, %%zmm16, %%zmm16\n" - "vmulpd %%zmm9, %%zmm17, %%zmm17\n" - "vmulpd %%zmm9, %%zmm18, %%zmm18\n" /* And store additively in C */ - "vaddpd (%[C0]), %%zmm1, %%zmm1\n" - "vaddpd (%[C1]), %%zmm2, %%zmm2\n" - "vaddpd (%[C2]), %%zmm3, %%zmm3\n" - "vaddpd (%[C3]), %%zmm4, %%zmm4\n" - "vaddpd (%[C4]), %%zmm5, %%zmm5\n" - "vaddpd (%[C5]), %%zmm6, %%zmm6\n" - "vaddpd (%[C6]), %%zmm7, %%zmm7\n" - "vaddpd (%[C7]), %%zmm8, %%zmm8\n" + "vfmadd213pd (%[C0]), %%zmm9, %%zmm1\n" + "vfmadd213pd (%[C1]), %%zmm9, %%zmm2\n" + "vfmadd213pd (%[C2]), %%zmm9, %%zmm3\n" + "vfmadd213pd (%[C3]), %%zmm9, %%zmm4\n" + "vfmadd213pd (%[C4]), %%zmm9, %%zmm5\n" + "vfmadd213pd (%[C5]), %%zmm9, %%zmm6\n" + "vfmadd213pd (%[C6]), %%zmm9, %%zmm7\n" + "vfmadd213pd (%[C7]), %%zmm9, %%zmm8\n" "vmovupd %%zmm1, (%[C0])\n" "vmovupd %%zmm2, (%[C1])\n" "vmovupd %%zmm3, (%[C2])\n" @@ -1142,14 +1102,14 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, "vmovupd %%zmm7, (%[C6])\n" "vmovupd %%zmm8, (%[C7])\n" - "vaddpd 64(%[C0]), %%zmm11, %%zmm11\n" - "vaddpd 64(%[C1]), %%zmm12, %%zmm12\n" - "vaddpd 64(%[C2]), %%zmm13, %%zmm13\n" - "vaddpd 64(%[C3]), %%zmm14, %%zmm14\n" - "vaddpd 64(%[C4]), %%zmm15, %%zmm15\n" - "vaddpd 64(%[C5]), %%zmm16, %%zmm16\n" - "vaddpd 64(%[C6]), %%zmm17, %%zmm17\n" - "vaddpd 64(%[C7]), %%zmm18, %%zmm18\n" + "vfmadd213pd 64(%[C0]), %%zmm9, %%zmm11\n" + "vfmadd213pd 64(%[C1]), %%zmm9, %%zmm12\n" + "vfmadd213pd 64(%[C2]), %%zmm9, %%zmm13\n" + "vfmadd213pd 64(%[C3]), %%zmm9, %%zmm14\n" + "vfmadd213pd 64(%[C4]), %%zmm9, %%zmm15\n" + "vfmadd213pd 64(%[C5]), %%zmm9, %%zmm16\n" + "vfmadd213pd 64(%[C6]), %%zmm9, %%zmm17\n" + "vfmadd213pd 64(%[C7]), %%zmm9, %%zmm18\n" "vmovupd %%zmm11, 64(%[C0])\n" "vmovupd %%zmm12, 64(%[C1])\n" "vmovupd %%zmm13, 64(%[C2])\n" @@ -1221,24 +1181,15 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, "add $64, %[BO]\n" "subl $1, %[kloop]\n" "jg .label1\n" - /* multiply the result by alpha */ - "vmulpd %%zmm9, %%zmm1, %%zmm1\n" - "vmulpd %%zmm9, %%zmm2, %%zmm2\n" - "vmulpd %%zmm9, %%zmm3, %%zmm3\n" - "vmulpd %%zmm9, %%zmm4, %%zmm4\n" - "vmulpd %%zmm9, %%zmm5, %%zmm5\n" - "vmulpd %%zmm9, %%zmm6, %%zmm6\n" - "vmulpd %%zmm9, %%zmm7, %%zmm7\n" - "vmulpd %%zmm9, %%zmm8, %%zmm8\n" - /* And store additively in C */ - "vaddpd (%[C0]), %%zmm1, %%zmm1\n" - "vaddpd (%[C1]), %%zmm2, %%zmm2\n" - "vaddpd (%[C2]), %%zmm3, %%zmm3\n" - "vaddpd (%[C3]), %%zmm4, %%zmm4\n" - "vaddpd (%[C4]), %%zmm5, %%zmm5\n" - "vaddpd (%[C5]), %%zmm6, %%zmm6\n" - "vaddpd (%[C6]), %%zmm7, %%zmm7\n" - "vaddpd (%[C7]), %%zmm8, %%zmm8\n" + /* multiply the result by alpha and add to the memory */ + "vfmadd213pd (%[C0]), %%zmm9, %%zmm1\n" + "vfmadd213pd (%[C1]), %%zmm9, %%zmm2\n" + "vfmadd213pd (%[C2]), %%zmm9, %%zmm3\n" + "vfmadd213pd (%[C3]), %%zmm9, %%zmm4\n" + "vfmadd213pd (%[C4]), %%zmm9, %%zmm5\n" + "vfmadd213pd (%[C5]), %%zmm9, %%zmm6\n" + "vfmadd213pd (%[C6]), %%zmm9, %%zmm7\n" + "vfmadd213pd (%[C7]), %%zmm9, %%zmm8\n" "vmovupd %%zmm1, (%[C0])\n" "vmovupd %%zmm2, (%[C1])\n" "vmovupd %%zmm3, (%[C2])\n" @@ -1247,14 +1198,6 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, "vmovupd %%zmm6, (%[C5])\n" "vmovupd %%zmm7, (%[C6])\n" "vmovupd %%zmm8, (%[C7])\n" - "prefetchw 64(%[C0])\n" - "prefetchw 64(%[C1])\n" 
- "prefetchw 64(%[C2])\n" - "prefetchw 64(%[C3])\n" - "prefetchw 64(%[C4])\n" - "prefetchw 64(%[C5])\n" - "prefetchw 64(%[C6])\n" - "prefetchw 64(%[C7])\n" : [AO] "+r" (AO), [BO] "+r" (BO), From eba394c711440ab515f80ea01bd4e72342e4719b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 8 Oct 2018 19:18:12 +0200 Subject: [PATCH 067/236] Add -march=skylake-avx512 when required fixes #1797 --- cmake/system_check.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index d339a755f..4ec4df416 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -71,6 +71,8 @@ if (X86_64 OR X86) execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) if (NO_AVX512 EQUAL 1) set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") +else() +set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") endif() file(REMOVE "avx512.tmp" "avx512.o") endif() From 697dc1baf8fe8f4c8ac0ee8a1f82ee7bad7395e5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 8 Oct 2018 22:26:59 +0200 Subject: [PATCH 068/236] Use override for ARCH in make.inc in case a conflicting setting of ARCH (for architecture) gets pulled in from the environment (originally suggested by dloghin in #1753) --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b947c1198..8ac77c729 100644 --- a/Makefile +++ b/Makefile @@ -251,7 +251,7 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "override ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc From d3d58f8ee538f240b14abb4a9e9beffb8a495415 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 8 Oct 2018 22:29:35 +0200 Subject: [PATCH 069/236] Catch conflicting usage of ARCH in at least some BSD environments fixes #1796 --- Makefile.system | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile.system b/Makefile.system index 4712d9525..53537eb09 100644 --- a/Makefile.system +++ b/Makefile.system @@ -9,6 +9,11 @@ ifndef TOPDIR TOPDIR = . 
endif +# Catch conflicting usage of ARCH in some BSD environments +ifeq ($(ARCH), amd64) +override ARCH=x86_64 +endif + NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib # Default C compiler From d4c8853a029175d6064a09341201f776c32440b3 Mon Sep 17 00:00:00 2001 From: fengrl <42458138+fengrl@users.noreply.github.com> Date: Tue, 9 Oct 2018 11:20:16 +0800 Subject: [PATCH 070/236] Update common_mips64.h --- common_mips64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_mips64.h b/common_mips64.h index 93bc7e519..1163413dc 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -94,7 +94,7 @@ static inline unsigned int rpcc(void){ #define RPCC_DEFINED #ifndef NO_AFFINITY -#define WHEREAMI +//#define WHEREAMI static inline int WhereAmI(void){ int ret=0; __asm__ __volatile__(".set push \n" From 6234a326569041cc2f3fa667c6f70402c056237f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 9 Oct 2018 10:31:59 +0200 Subject: [PATCH 071/236] Use cygwin compilation workaround for avx512 on msys2/mingw64 as well --- Makefile.x86_64 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index f831b5040..f2647fb7d 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -15,6 +15,11 @@ FCOMMON_OPT += -march=skylake-avx512 ifeq ($(OSNAME), CYGWIN_NT) CCOMMON_OPT += -fno-asynchronous-unwind-tables endif +ifeq ($(OSNAME), WINNT) +ifeq ($(C_COMPILER), GCC) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +endif +endif endif endif From d4bad73834a9e1abf23e3c0a8f4e9a84e9137881 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 10 Oct 2018 01:49:22 +0000 Subject: [PATCH 072/236] Add a C+intrinsics version of the SGEMM/skylakex kernel for most sizes this is 1.2x to 1.4x faster than the current code --- kernel/x86_64/sgemm_beta_skylakex.c | 150 ++ kernel/x86_64/sgemm_kernel_16x4_skylakex.c | 1726 ++++++++++++++++++++ kernel/x86_64/sgemm_ncopy_4_skylakex.c | 207 +++ kernel/x86_64/sgemm_tcopy_16_skylakex.c | 387 +++++ 4 files changed, 2470 insertions(+) create mode 100644 kernel/x86_64/sgemm_beta_skylakex.c create mode 100644 kernel/x86_64/sgemm_kernel_16x4_skylakex.c create mode 100644 kernel/x86_64/sgemm_ncopy_4_skylakex.c create mode 100644 kernel/x86_64/sgemm_tcopy_16_skylakex.c diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c new file mode 100644 index 000000000..b1bf4d77a --- /dev/null +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -0,0 +1,150 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#include + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, + FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5, + FLOAT *c, BLASLONG ldc){ + + BLASLONG i, j; + FLOAT *c_offset1, *c_offset; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + + /* fast path.. just zero the whole matrix */ + if (m == ldc && (unsigned long)beta == (unsigned long)ZERO) { + memset(c, 0, m * n * sizeof(FLOAT)); + return 0; + } + + + c_offset = c; + + if (beta == ZERO){ + __m512 z_zero; + + z_zero = _mm512_setzero_ps(); + j = n; + do { + c_offset1 = c_offset; + c_offset += ldc; + + i = m; + + while (i > 32) { + _mm512_storeu_ps(c_offset1, z_zero); + _mm512_storeu_ps(c_offset1 + 8, z_zero); + _mm512_storeu_ps(c_offset1 + 16, z_zero); + _mm512_storeu_ps(c_offset1 + 24 , z_zero); + c_offset1 += 32; + i -= 32; + } + while (i > 8) { + _mm512_storeu_ps(c_offset1, z_zero); + c_offset1 += 8; + i -= 8; + } + + while (i > 0) { + *c_offset1 = ZERO; + c_offset1 ++; + i --; + } + j --; + } while (j > 0); + + } else { + + j = n; + do { + c_offset1 = c_offset; + c_offset += ldc; + + i = (m >> 3); + if (i > 0){ + do { + ctemp1 = *(c_offset1 + 0); + ctemp2 = *(c_offset1 + 1); + ctemp3 = *(c_offset1 + 2); + ctemp4 = *(c_offset1 + 3); + ctemp5 = *(c_offset1 + 4); + ctemp6 = *(c_offset1 + 5); + ctemp7 = *(c_offset1 + 6); + ctemp8 = *(c_offset1 + 7); + + ctemp1 *= beta; + ctemp2 *= beta; + ctemp3 *= beta; + ctemp4 *= beta; + ctemp5 *= beta; + ctemp6 *= beta; + ctemp7 *= beta; + ctemp8 *= beta; + + *(c_offset1 + 0) = ctemp1; + *(c_offset1 + 1) = ctemp2; + *(c_offset1 + 2) = ctemp3; + *(c_offset1 + 3) = ctemp4; + *(c_offset1 + 4) = ctemp5; + *(c_offset1 + 5) = ctemp6; + *(c_offset1 + 6) = ctemp7; + *(c_offset1 + 7) = ctemp8; + c_offset1 += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i > 0){ + do { + ctemp1 = *c_offset1; + ctemp1 *= beta; + *c_offset1 = ctemp1; + c_offset1 ++; + i --; + } while (i > 0); + } + j --; + } while (j > 0); + + } + return 0; +}; diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c new file mode 100644 index 000000000..b2b1ab03f --- /dev/null +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c @@ -0,0 +1,1726 @@ +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + + +/* comment below left for history, data does not represent the implementation in this file */ + +/********************************************************************* +* 2014/07/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* SGEMM_DEFAULT_UNROLL_N 4 +* SGEMM_DEFAULT_UNROLL_M 16 +* SGEMM_DEFAULT_P 768 +* SGEMM_DEFAULT_Q 384 +* A_PR1 512 +* B_PR1 512 +* +* +* 2014/07/28 Saar +* Performance at 9216x9216x9216: +* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) +* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) +* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) +* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) +* +*********************************************************************/ + +#include "common.h" +#include + + + +/******************************************************************************************* +* 8 lines of N +*******************************************************************************************/ + + + +#define INIT32x8() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + row2 = _mm512_setzero_ps(); \ + row3 = _mm512_setzero_ps(); \ + row4 = _mm512_setzero_ps(); \ + row5 = _mm512_setzero_ps(); \ + row6 = _mm512_setzero_ps(); \ + row0b = _mm512_setzero_ps(); \ + row1b = _mm512_setzero_ps(); \ + row2b = _mm512_setzero_ps(); \ + row3b = _mm512_setzero_ps(); \ + row4b = _mm512_setzero_ps(); \ + row5b = _mm512_setzero_ps(); \ + row6b = _mm512_setzero_ps(); \ + row7b = _mm512_setzero_ps(); \ + +#define KERNEL32x8_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm0b = _mm512_loadu_ps(AOb); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 0)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + row0b += zmm0b * zmm2; \ + row1b += zmm0b * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 2)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 3)); \ + row2 += zmm0 * zmm2; \ + row3 += zmm0 * zmm3; \ + row2b += zmm0b * zmm2; \ + row3b += zmm0b * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 4)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 5)); \ + row4 += zmm0 * zmm2; \ + 
row5 += zmm0 * zmm3; \ + row4b += zmm0b * zmm2; \ + row5b += zmm0b * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 6)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 7)); \ + row6 += zmm0 * zmm2; \ + row7 += zmm0 * zmm3; \ + row6b += zmm0b * zmm2; \ + row7b += zmm0b * zmm3; \ + BO += 8; \ + AO += 16; \ + AOb += 16; + + +#define SAVE32x8(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row2 *= zmm0; \ + row3 *= zmm0; \ + row4 *= zmm0; \ + row5 *= zmm0; \ + row6 *= zmm0; \ + row7 *= zmm0; \ + row0b *= zmm0; \ + row1b *= zmm0; \ + row2b *= zmm0; \ + row3b *= zmm0; \ + row4b *= zmm0; \ + row5b *= zmm0; \ + row6b *= zmm0; \ + row7b *= zmm0; \ + row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \ + row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \ + row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \ + row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \ + row4 += _mm512_loadu_ps(CO1 + 4 * ldc); \ + row5 += _mm512_loadu_ps(CO1 + 5 * ldc); \ + row6 += _mm512_loadu_ps(CO1 + 6 * ldc); \ + row7 += _mm512_loadu_ps(CO1 + 7 * ldc); \ + _mm512_storeu_ps(CO1 + 0 * ldc, row0); \ + _mm512_storeu_ps(CO1 + 1 * ldc, row1); \ + _mm512_storeu_ps(CO1 + 2 * ldc, row2); \ + _mm512_storeu_ps(CO1 + 3 * ldc, row3); \ + _mm512_storeu_ps(CO1 + 4 * ldc, row4); \ + _mm512_storeu_ps(CO1 + 5 * ldc, row5); \ + _mm512_storeu_ps(CO1 + 6 * ldc, row6); \ + _mm512_storeu_ps(CO1 + 7 * ldc, row7); \ + row0b += _mm512_loadu_ps(CO1 + 0 * ldc + 16); \ + row1b += _mm512_loadu_ps(CO1 + 1 * ldc + 16); \ + row2b += _mm512_loadu_ps(CO1 + 2 * ldc + 16); \ + row3b += _mm512_loadu_ps(CO1 + 3 * ldc + 16); \ + row4b += _mm512_loadu_ps(CO1 + 4 * ldc + 16); \ + row5b += _mm512_loadu_ps(CO1 + 5 * ldc + 16); \ + row6b += _mm512_loadu_ps(CO1 + 6 * ldc + 16); \ + row7b += _mm512_loadu_ps(CO1 + 7 * ldc + 16); \ + _mm512_storeu_ps(CO1 + 0 * ldc + 16, row0b); \ + _mm512_storeu_ps(CO1 + 1 * ldc + 16, row1b); \ + _mm512_storeu_ps(CO1 + 2 * ldc + 16, row2b); \ + _mm512_storeu_ps(CO1 + 3 * ldc + 16, row3b); \ + _mm512_storeu_ps(CO1 + 4 * ldc + 16, row4b); \ + _mm512_storeu_ps(CO1 + 5 * ldc + 16, row5b); \ + _mm512_storeu_ps(CO1 + 6 * ldc + 16, row6b); \ + _mm512_storeu_ps(CO1 + 7 * ldc + 16, row7b); \ + + +#define INIT16x8() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + row2 = _mm512_setzero_ps(); \ + row3 = _mm512_setzero_ps(); \ + row4 = _mm512_setzero_ps(); \ + row5 = _mm512_setzero_ps(); \ + row6 = _mm512_setzero_ps(); \ + row7 = _mm512_setzero_ps(); \ + +#define KERNEL16x8_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 0)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 2)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 3)); \ + row2 += zmm0 * zmm2; \ + row3 += zmm0 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 4)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 5)); \ + row4 += zmm0 * zmm2; \ + row5 += zmm0 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 6)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 7)); \ + row6 += zmm0 * zmm2; \ + row7 += zmm0 * zmm3; \ + BO += 8; \ + AO += 16; + + +#define SAVE16x8(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row2 *= zmm0; \ + row3 *= zmm0; \ + row4 *= zmm0; \ + row5 *= zmm0; \ + row6 *= zmm0; \ + row7 *= zmm0; \ + row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \ + row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \ + row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \ + row3 += 
_mm512_loadu_ps(CO1 + 3 * ldc); \ + row4 += _mm512_loadu_ps(CO1 + 4 * ldc); \ + row5 += _mm512_loadu_ps(CO1 + 5 * ldc); \ + row6 += _mm512_loadu_ps(CO1 + 6 * ldc); \ + row7 += _mm512_loadu_ps(CO1 + 7 * ldc); \ + _mm512_storeu_ps(CO1 + 0 * ldc, row0); \ + _mm512_storeu_ps(CO1 + 1 * ldc, row1); \ + _mm512_storeu_ps(CO1 + 2 * ldc, row2); \ + _mm512_storeu_ps(CO1 + 3 * ldc, row3); \ + _mm512_storeu_ps(CO1 + 4 * ldc, row4); \ + _mm512_storeu_ps(CO1 + 5 * ldc, row5); \ + _mm512_storeu_ps(CO1 + 6 * ldc, row6); \ + _mm512_storeu_ps(CO1 + 7 * ldc, row7); + + + +/*******************************************************************************************/ + +#define INIT8x8() \ + row0 = _mm256_setzero_ps(); \ + row1 = _mm256_setzero_ps(); \ + row2 = _mm256_setzero_ps(); \ + row3 = _mm256_setzero_ps(); \ + row4 = _mm256_setzero_ps(); \ + row5 = _mm256_setzero_ps(); \ + row6 = _mm256_setzero_ps(); \ + row7 = _mm256_setzero_ps(); \ + +#define KERNEL8x8_SUB() \ + ymm0 = _mm256_loadu_ps(AO); \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 0)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += ymm0 * ymm2; \ + row1 += ymm0 * ymm3; \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 2)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 3)); \ + row2 += ymm0 * ymm2; \ + row3 += ymm0 * ymm3; \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 4)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 5)); \ + row4 += ymm0 * ymm2; \ + row5 += ymm0 * ymm3; \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 6)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 7)); \ + row6 += ymm0 * ymm2; \ + row7 += ymm0 * ymm3; \ + BO += 8; \ + AO += 8; + + +#define SAVE8x8(ALPHA) \ + ymm0 = _mm256_set1_ps(ALPHA); \ + row0 *= ymm0; \ + row1 *= ymm0; \ + row2 *= ymm0; \ + row3 *= ymm0; \ + row4 *= ymm0; \ + row5 *= ymm0; \ + row6 *= ymm0; \ + row7 *= ymm0; \ + row0 += _mm256_loadu_ps(CO1 + 0 * ldc); \ + row1 += _mm256_loadu_ps(CO1 + 1 * ldc); \ + row2 += _mm256_loadu_ps(CO1 + 2 * ldc); \ + row3 += _mm256_loadu_ps(CO1 + 3 * ldc); \ + row4 += _mm256_loadu_ps(CO1 + 4 * ldc); \ + row5 += _mm256_loadu_ps(CO1 + 5 * ldc); \ + row6 += _mm256_loadu_ps(CO1 + 6 * ldc); \ + row7 += _mm256_loadu_ps(CO1 + 7 * ldc); \ + _mm256_storeu_ps(CO1 + 0 * ldc, row0); \ + _mm256_storeu_ps(CO1 + 1 * ldc, row1); \ + _mm256_storeu_ps(CO1 + 2 * ldc, row2); \ + _mm256_storeu_ps(CO1 + 3 * ldc, row3); \ + _mm256_storeu_ps(CO1 + 4 * ldc, row4); \ + _mm256_storeu_ps(CO1 + 5 * ldc, row5); \ + _mm256_storeu_ps(CO1 + 6 * ldc, row6); \ + _mm256_storeu_ps(CO1 + 7 * ldc, row7); \ + + + +/*******************************************************************************************/ + +#define INIT4x8() \ + row0 = _mm_setzero_ps(); \ + row1 = _mm_setzero_ps(); \ + row2 = _mm_setzero_ps(); \ + row3 = _mm_setzero_ps(); \ + row4 = _mm_setzero_ps(); \ + row5 = _mm_setzero_ps(); \ + row6 = _mm_setzero_ps(); \ + row7 = _mm_setzero_ps(); \ + + +#define KERNEL4x8_SUB() \ + xmm0 = _mm_loadu_ps(AO); \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 0)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += xmm0 * xmm2; \ + row1 += xmm0 * xmm3; \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 2)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 3)); \ + row2 += xmm0 * xmm2; \ + row3 += xmm0 * xmm3; \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 4)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 5)); \ + row4 += xmm0 * xmm2; \ + row5 += xmm0 * xmm3; \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 6)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 7)); \ + row6 += 
xmm0 * xmm2; \ + row7 += xmm0 * xmm3; \ + BO += 8; \ + AO += 4; + + +#define SAVE4x8(ALPHA) \ + xmm0 = _mm_set1_ps(ALPHA); \ + row0 *= xmm0; \ + row1 *= xmm0; \ + row2 *= xmm0; \ + row3 *= xmm0; \ + row4 *= xmm0; \ + row5 *= xmm0; \ + row6 *= xmm0; \ + row7 *= xmm0; \ + row0 += _mm_loadu_ps(CO1 + 0 * ldc); \ + row1 += _mm_loadu_ps(CO1 + 1 * ldc); \ + row2 += _mm_loadu_ps(CO1 + 2 * ldc); \ + row3 += _mm_loadu_ps(CO1 + 3 * ldc); \ + row4 += _mm_loadu_ps(CO1 + 4 * ldc); \ + row5 += _mm_loadu_ps(CO1 + 5 * ldc); \ + row6 += _mm_loadu_ps(CO1 + 6 * ldc); \ + row7 += _mm_loadu_ps(CO1 + 7 * ldc); \ + _mm_storeu_ps(CO1 + 0 * ldc, row0); \ + _mm_storeu_ps(CO1 + 1 * ldc, row1); \ + _mm_storeu_ps(CO1 + 2 * ldc, row2); \ + _mm_storeu_ps(CO1 + 3 * ldc, row3); \ + _mm_storeu_ps(CO1 + 4 * ldc, row4); \ + _mm_storeu_ps(CO1 + 5 * ldc, row5); \ + _mm_storeu_ps(CO1 + 6 * ldc, row6); \ + _mm_storeu_ps(CO1 + 7 * ldc, row7); \ + + +/*******************************************************************************************/ + +#define INIT2x8() \ + row0a = row0b = 0; \ + row1a = row1b = 0; \ + row2a = row2b = 0; \ + row3a = row3b = 0; \ + row4a = row4b = 0; \ + row5a = row5b = 0; \ + row6a = row6b = 0; \ + row7a = row7b = 0; \ + +#define KERNEL2x8_SUB() \ + xmm0 = *(AO); \ + xmm1 = *(AO + 1); \ + xmm2 = *(BO + 0); \ + xmm3 = *(BO + 1); \ + row0a += xmm0 * xmm2; \ + row0b += xmm1 * xmm2; \ + row1a += xmm0 * xmm3; \ + row1b += xmm1 * xmm3; \ + xmm2 = *(BO + 2); \ + xmm3 = *(BO + 3); \ + row2a += xmm0 * xmm2; \ + row2b += xmm1 * xmm2; \ + row3a += xmm0 * xmm3; \ + row3b += xmm1 * xmm3; \ + xmm2 = *(BO + 4); \ + xmm3 = *(BO + 5); \ + row4a += xmm0 * xmm2; \ + row4b += xmm1 * xmm2; \ + row5a += xmm0 * xmm3; \ + row5b += xmm1 * xmm3; \ + xmm2 = *(BO + 6); \ + xmm3 = *(BO + 7); \ + row6a += xmm0 * xmm2; \ + row6b += xmm1 * xmm2; \ + row7a += xmm0 * xmm3; \ + row7b += xmm1 * xmm3; \ + BO += 8; \ + AO += 2; + + +#define SAVE2x8(ALPHA) \ + xmm0 = ALPHA; \ + row0a *= xmm0; \ + row0b *= xmm0; \ + row1a *= xmm0; \ + row1b *= xmm0; \ + row2a *= xmm0; \ + row2b *= xmm0; \ + row3a *= xmm0; \ + row3b *= xmm0; \ + row4a *= xmm0; \ + row4b *= xmm0; \ + row5a *= xmm0; \ + row5b *= xmm0; \ + row6a *= xmm0; \ + row6b *= xmm0; \ + row7a *= xmm0; \ + row7b *= xmm0; \ + *(CO1 + 0 * ldc + 0) += row0a; \ + *(CO1 + 0 * ldc + 1) += row0b; \ + *(CO1 + 1 * ldc + 0) += row1a; \ + *(CO1 + 1 * ldc + 1) += row1b; \ + *(CO1 + 2 * ldc + 0) += row2a; \ + *(CO1 + 2 * ldc + 1) += row2b; \ + *(CO1 + 3 * ldc + 0) += row3a; \ + *(CO1 + 3 * ldc + 1) += row3b; \ + *(CO1 + 4 * ldc + 0) += row4a; \ + *(CO1 + 4 * ldc + 1) += row4b; \ + *(CO1 + 5 * ldc + 0) += row5a; \ + *(CO1 + 5 * ldc + 1) += row5b; \ + *(CO1 + 6 * ldc + 0) += row6a; \ + *(CO1 + 6 * ldc + 1) += row6b; \ + *(CO1 + 7 * ldc + 0) += row7a; \ + *(CO1 + 7 * ldc + 1) += row7b; \ + + + +/*******************************************************************************************/ + +#define INIT1x8() \ + row0 = row1 = row2 = row3 = row4 = row5 = row6 = row7 = 0; + +#define KERNEL1x8_SUB() \ + xmm0 = *(AO ); \ + xmm2 = *(BO + 0); \ + xmm3 = *(BO + 1); \ + row0 += xmm0 * xmm2; \ + row1 += xmm0 * xmm3; \ + xmm2 = *(BO + 2); \ + xmm3 = *(BO + 3); \ + row2 += xmm0 * xmm2; \ + row3 += xmm0 * xmm3; \ + xmm2 = *(BO + 4); \ + xmm3 = *(BO + 5); \ + row4 += xmm0 * xmm2; \ + row5 += xmm0 * xmm3; \ + xmm2 = *(BO + 6); \ + xmm3 = *(BO + 7); \ + row6 += xmm0 * xmm2; \ + row7 += xmm0 * xmm3; \ + BO += 8; \ + AO += 1; + + +#define SAVE1x8(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row1 *= xmm0; \ + row2 *= xmm0; \ 
+ row3 *= xmm0; \ + row4 *= xmm0; \ + row5 *= xmm0; \ + row6 *= xmm0; \ + row7 *= xmm0; \ + *(CO1 + 0 * ldc) += row0; \ + *(CO1 + 1 * ldc) += row1; \ + *(CO1 + 2 * ldc) += row2; \ + *(CO1 + 3 * ldc) += row3; \ + *(CO1 + 4 * ldc) += row4; \ + *(CO1 + 5 * ldc) += row5; \ + *(CO1 + 6 * ldc) += row6; \ + *(CO1 + 7 * ldc) += row7; \ + + + +/******************************************************************************************* +* 4 lines of N +*******************************************************************************************/ + +#define INIT64x4() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + row2 = _mm512_setzero_ps(); \ + row3 = _mm512_setzero_ps(); \ + row0b = _mm512_setzero_ps(); \ + row1b = _mm512_setzero_ps(); \ + row2b = _mm512_setzero_ps(); \ + row3b = _mm512_setzero_ps(); \ + row0c = _mm512_setzero_ps(); \ + row1c = _mm512_setzero_ps(); \ + row2c = _mm512_setzero_ps(); \ + row3c = _mm512_setzero_ps(); \ + row0d = _mm512_setzero_ps(); \ + row1d = _mm512_setzero_ps(); \ + row2d = _mm512_setzero_ps(); \ + row3d = _mm512_setzero_ps(); \ + +#define KERNEL64x4_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm1 = _mm512_loadu_ps(A1); \ + zmm5 = _mm512_loadu_ps(A2); \ + zmm7 = _mm512_loadu_ps(A3); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + row0b += zmm1 * zmm2; \ + row1b += zmm1 * zmm3; \ + row0c += zmm5 * zmm2; \ + row1c += zmm5 * zmm3; \ + row0d += zmm7 * zmm2; \ + row1d += zmm7 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO+2)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+3)); \ + row2 += zmm0 * zmm2; \ + row3 += zmm0 * zmm3; \ + row2b += zmm1 * zmm2; \ + row3b += zmm1 * zmm3; \ + row2c += zmm5 * zmm2; \ + row3c += zmm5 * zmm3; \ + row2d += zmm7 * zmm2; \ + row3d += zmm7 * zmm3; \ + BO += 4; \ + AO += 16; \ + A1 += 16; \ + A2 += 16; \ + A3 += 16; \ + + +#define SAVE64x4(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row2 *= zmm0; \ + row3 *= zmm0; \ + row0b *= zmm0; \ + row1b *= zmm0; \ + row2b *= zmm0; \ + row3b *= zmm0; \ + row0c *= zmm0; \ + row1c *= zmm0; \ + row2c *= zmm0; \ + row3c *= zmm0; \ + row0d *= zmm0; \ + row1d *= zmm0; \ + row2d *= zmm0; \ + row3d *= zmm0; \ + row0 += _mm512_loadu_ps(CO1 + 0*ldc); \ + row1 += _mm512_loadu_ps(CO1 + 1*ldc); \ + row2 += _mm512_loadu_ps(CO1 + 2*ldc); \ + row3 += _mm512_loadu_ps(CO1 + 3*ldc); \ + _mm512_storeu_ps(CO1 + 0*ldc, row0); \ + _mm512_storeu_ps(CO1 + 1*ldc, row1); \ + _mm512_storeu_ps(CO1 + 2*ldc, row2); \ + _mm512_storeu_ps(CO1 + 3*ldc, row3); \ + row0b += _mm512_loadu_ps(CO1 + 0*ldc + 16); \ + row1b += _mm512_loadu_ps(CO1 + 1*ldc + 16); \ + row2b += _mm512_loadu_ps(CO1 + 2*ldc + 16); \ + row3b += _mm512_loadu_ps(CO1 + 3*ldc + 16); \ + _mm512_storeu_ps(CO1 + 0*ldc + 16, row0b); \ + _mm512_storeu_ps(CO1 + 1*ldc + 16, row1b); \ + _mm512_storeu_ps(CO1 + 2*ldc + 16, row2b); \ + _mm512_storeu_ps(CO1 + 3*ldc + 16, row3b); \ + row0c += _mm512_loadu_ps(CO1 + 0*ldc + 32); \ + row1c += _mm512_loadu_ps(CO1 + 1*ldc + 32); \ + row2c += _mm512_loadu_ps(CO1 + 2*ldc + 32); \ + row3c += _mm512_loadu_ps(CO1 + 3*ldc + 32); \ + _mm512_storeu_ps(CO1 + 0*ldc + 32, row0c); \ + _mm512_storeu_ps(CO1 + 1*ldc + 32, row1c); \ + _mm512_storeu_ps(CO1 + 2*ldc + 32, row2c); \ + _mm512_storeu_ps(CO1 + 3*ldc + 32, row3c); \ + row0d += _mm512_loadu_ps(CO1 + 0*ldc + 48); \ + row1d += _mm512_loadu_ps(CO1 + 1*ldc + 48); \ + row2d += _mm512_loadu_ps(CO1 + 2*ldc + 48); \ + row3d += 
_mm512_loadu_ps(CO1 + 3*ldc + 48); \ + _mm512_storeu_ps(CO1 + 0*ldc + 48, row0d); \ + _mm512_storeu_ps(CO1 + 1*ldc + 48, row1d); \ + _mm512_storeu_ps(CO1 + 2*ldc + 48, row2d); \ + _mm512_storeu_ps(CO1 + 3*ldc + 48, row3d); + + +#define INIT48x4() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + row2 = _mm512_setzero_ps(); \ + row3 = _mm512_setzero_ps(); \ + row0b = _mm512_setzero_ps(); \ + row1b = _mm512_setzero_ps(); \ + row2b = _mm512_setzero_ps(); \ + row3b = _mm512_setzero_ps(); \ + row0c = _mm512_setzero_ps(); \ + row1c = _mm512_setzero_ps(); \ + row2c = _mm512_setzero_ps(); \ + row3c = _mm512_setzero_ps(); \ + +#define KERNEL48x4_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm1 = _mm512_loadu_ps(A1); \ + zmm5 = _mm512_loadu_ps(A2); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + row0b += zmm1 * zmm2; \ + row1b += zmm1 * zmm3; \ + row0c += zmm5 * zmm2; \ + row1c += zmm5 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO+2)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+3)); \ + row2 += zmm0 * zmm2; \ + row3 += zmm0 * zmm3; \ + row2b += zmm1 * zmm2; \ + row3b += zmm1 * zmm3; \ + row2c += zmm5 * zmm2; \ + row3c += zmm5 * zmm3; \ + BO += 4; \ + AO += 16; \ + A1 += 16; \ + A2 += 16; + + +#define SAVE48x4(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row2 *= zmm0; \ + row3 *= zmm0; \ + row0b *= zmm0; \ + row1b *= zmm0; \ + row2b *= zmm0; \ + row3b *= zmm0; \ + row0c *= zmm0; \ + row1c *= zmm0; \ + row2c *= zmm0; \ + row3c *= zmm0; \ + row0 += _mm512_loadu_ps(CO1 + 0*ldc); \ + row1 += _mm512_loadu_ps(CO1 + 1*ldc); \ + row2 += _mm512_loadu_ps(CO1 + 2*ldc); \ + row3 += _mm512_loadu_ps(CO1 + 3*ldc); \ + _mm512_storeu_ps(CO1 + 0*ldc, row0); \ + _mm512_storeu_ps(CO1 + 1*ldc, row1); \ + _mm512_storeu_ps(CO1 + 2*ldc, row2); \ + _mm512_storeu_ps(CO1 + 3*ldc, row3); \ + row0b += _mm512_loadu_ps(CO1 + 0*ldc + 16); \ + row1b += _mm512_loadu_ps(CO1 + 1*ldc + 16); \ + row2b += _mm512_loadu_ps(CO1 + 2*ldc + 16); \ + row3b += _mm512_loadu_ps(CO1 + 3*ldc + 16); \ + _mm512_storeu_ps(CO1 + 0*ldc + 16, row0b); \ + _mm512_storeu_ps(CO1 + 1*ldc + 16, row1b); \ + _mm512_storeu_ps(CO1 + 2*ldc + 16, row2b); \ + _mm512_storeu_ps(CO1 + 3*ldc + 16, row3b); \ + row0c += _mm512_loadu_ps(CO1 + 0*ldc + 32); \ + row1c += _mm512_loadu_ps(CO1 + 1*ldc + 32); \ + row2c += _mm512_loadu_ps(CO1 + 2*ldc + 32); \ + row3c += _mm512_loadu_ps(CO1 + 3*ldc + 32); \ + _mm512_storeu_ps(CO1 + 0*ldc + 32, row0c); \ + _mm512_storeu_ps(CO1 + 1*ldc + 32, row1c); \ + _mm512_storeu_ps(CO1 + 2*ldc + 32, row2c); \ + _mm512_storeu_ps(CO1 + 3*ldc + 32, row3c); + + +#define INIT32x4() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + row2 = _mm512_setzero_ps(); \ + row3 = _mm512_setzero_ps(); \ + row0b = _mm512_setzero_ps(); \ + row1b = _mm512_setzero_ps(); \ + row2b = _mm512_setzero_ps(); \ + row3b = _mm512_setzero_ps(); \ + +#define KERNEL32x4_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm1 = _mm512_loadu_ps(A1); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + row0b += zmm1 * zmm2; \ + row1b += zmm1 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO+2)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+3)); \ + row2 += zmm0 * zmm2; \ + row3 += zmm0 * zmm3; \ + row2b += zmm1 * zmm2; \ + row3b += zmm1 * zmm3; \ + BO += 4; \ + AO += 16; \ + A1 += 16; + + 
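+/*
+ * KERNEL32x4_SUB above performs one rank-1 update of a 32x4 accumulator
+ * tile: two 16-float zmm loads cover 32 packed rows of A (via the AO and
+ * A1 pointers), each of the 4 packed B values is broadcast across a zmm
+ * register, and the multiply-adds can be fused by the compiler into
+ * vfmadd instructions.  The scalar equivalent of one call is roughly
+ * (acc, A and B are illustrative names only):
+ *
+ *   for (int c = 0; c < 4; c++)
+ *     for (int r = 0; r < 32; r++)
+ *       acc[c][r] += A[r] * B[c];
+ *
+ * SAVE32x4 below scales the eight accumulators by alpha, adds the existing
+ * contents of C and stores the result back.
+ */
+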
+#define SAVE32x4(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row2 *= zmm0; \ + row3 *= zmm0; \ + row0b *= zmm0; \ + row1b *= zmm0; \ + row2b *= zmm0; \ + row3b *= zmm0; \ + row0 += _mm512_loadu_ps(CO1 + 0*ldc); \ + row1 += _mm512_loadu_ps(CO1 + 1*ldc); \ + row2 += _mm512_loadu_ps(CO1 + 2*ldc); \ + row3 += _mm512_loadu_ps(CO1 + 3*ldc); \ + _mm512_storeu_ps(CO1 + 0*ldc, row0); \ + _mm512_storeu_ps(CO1 + 1*ldc, row1); \ + _mm512_storeu_ps(CO1 + 2*ldc, row2); \ + _mm512_storeu_ps(CO1 + 3*ldc, row3); \ + row0b += _mm512_loadu_ps(CO1 + 0*ldc + 16); \ + row1b += _mm512_loadu_ps(CO1 + 1*ldc + 16); \ + row2b += _mm512_loadu_ps(CO1 + 2*ldc + 16); \ + row3b += _mm512_loadu_ps(CO1 + 3*ldc + 16); \ + _mm512_storeu_ps(CO1 + 0*ldc + 16, row0b); \ + _mm512_storeu_ps(CO1 + 1*ldc + 16, row1b); \ + _mm512_storeu_ps(CO1 + 2*ldc + 16, row2b); \ + _mm512_storeu_ps(CO1 + 3*ldc + 16, row3b); + + + +#define INIT16x4() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + row2 = _mm512_setzero_ps(); \ + row3 = _mm512_setzero_ps(); \ + +#define KERNEL16x4_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO+2)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+3)); \ + row2 += zmm0 * zmm2; \ + row3 += zmm0 * zmm3; \ + BO += 4; \ + AO += 16; + + +#define SAVE16x4(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row2 *= zmm0; \ + row3 *= zmm0; \ + row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \ + row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \ + row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \ + row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \ + _mm512_storeu_ps(CO1 + 0 * ldc, row0); \ + _mm512_storeu_ps(CO1 + 1 * ldc, row1); \ + _mm512_storeu_ps(CO1 + 2 * ldc, row2); \ + _mm512_storeu_ps(CO1 + 3 * ldc, row3); + + + +/*******************************************************************************************/ + +#define INIT8x4() \ + ymm4 = _mm256_setzero_ps(); \ + ymm6 = _mm256_setzero_ps(); \ + ymm8 = _mm256_setzero_ps(); \ + ymm10 = _mm256_setzero_ps(); \ + +#define KERNEL8x4_SUB() \ + ymm0 = _mm256_loadu_ps(AO); \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 0)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 1)); \ + ymm4 += ymm0 * ymm2; \ + ymm6 += ymm0 * ymm3; \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 2)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 3)); \ + ymm8 += ymm0 * ymm2; \ + ymm10 += ymm0 * ymm3; \ + BO += 4; \ + AO += 8; + + +#define SAVE8x4(ALPHA) \ + ymm0 = _mm256_set1_ps(ALPHA); \ + ymm4 *= ymm0; \ + ymm6 *= ymm0; \ + ymm8 *= ymm0; \ + ymm10 *= ymm0; \ + ymm4 += _mm256_loadu_ps(CO1 + 0 * ldc); \ + ymm6 += _mm256_loadu_ps(CO1 + 1 * ldc); \ + ymm8 += _mm256_loadu_ps(CO1 + 2 * ldc); \ + ymm10 += _mm256_loadu_ps(CO1 + 3 * ldc); \ + _mm256_storeu_ps(CO1 + 0 * ldc, ymm4); \ + _mm256_storeu_ps(CO1 + 1 * ldc, ymm6); \ + _mm256_storeu_ps(CO1 + 2 * ldc, ymm8); \ + _mm256_storeu_ps(CO1 + 3 * ldc, ymm10); \ + + + +/*******************************************************************************************/ + +#define INIT4x4() \ + row0 = _mm_setzero_ps(); \ + row1 = _mm_setzero_ps(); \ + row2 = _mm_setzero_ps(); \ + row3 = _mm_setzero_ps(); \ + + +#define KERNEL4x4_SUB() \ + xmm0 = _mm_loadu_ps(AO); \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 0)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += xmm0 * xmm2; \ + row1 += xmm0 * xmm3; \ 
+ xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 2)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 3)); \ + row2 += xmm0 * xmm2; \ + row3 += xmm0 * xmm3; \ + BO += 4; \ + AO += 4; + + +#define SAVE4x4(ALPHA) \ + xmm0 = _mm_set1_ps(ALPHA); \ + row0 *= xmm0; \ + row1 *= xmm0; \ + row2 *= xmm0; \ + row3 *= xmm0; \ + row0 += _mm_loadu_ps(CO1 + 0 * ldc); \ + row1 += _mm_loadu_ps(CO1 + 1 * ldc); \ + row2 += _mm_loadu_ps(CO1 + 2 * ldc); \ + row3 += _mm_loadu_ps(CO1 + 3 * ldc); \ + _mm_storeu_ps(CO1 + 0 * ldc, row0); \ + _mm_storeu_ps(CO1 + 1 * ldc, row1); \ + _mm_storeu_ps(CO1 + 2 * ldc, row2); \ + _mm_storeu_ps(CO1 + 3 * ldc, row3); \ + + +/*******************************************************************************************/ + +#define INIT2x4() \ + row0 = 0; row0b = 0; row1 = 0; row1b = 0; \ + row2 = 0; row2b = 0; row3 = 0; row3b = 0; + +#define KERNEL2x4_SUB() \ + xmm0 = *(AO); \ + xmm1 = *(AO + 1); \ + xmm2 = *(BO + 0); \ + xmm3 = *(BO + 1); \ + row0 += xmm0 * xmm2; \ + row0b += xmm1 * xmm2; \ + row1 += xmm0 * xmm3; \ + row1b += xmm1 * xmm3; \ + xmm2 = *(BO + 2); \ + xmm3 = *(BO + 3); \ + row2 += xmm0 * xmm2; \ + row2b += xmm1 * xmm2; \ + row3 += xmm0 * xmm3; \ + row3b += xmm1 * xmm3; \ + BO += 4; \ + AO += 2; + + +#define SAVE2x4(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row0b *= xmm0; \ + row1 *= xmm0; \ + row1b *= xmm0; \ + row2 *= xmm0; \ + row2b *= xmm0; \ + row3 *= xmm0; \ + row3b *= xmm0; \ + *(CO1 + 0 * ldc + 0) += row0; \ + *(CO1 + 0 * ldc + 1) += row0b; \ + *(CO1 + 1 * ldc + 0) += row1; \ + *(CO1 + 1 * ldc + 1) += row1b; \ + *(CO1 + 2 * ldc + 0) += row2; \ + *(CO1 + 2 * ldc + 1) += row2b; \ + *(CO1 + 3 * ldc + 0) += row3; \ + *(CO1 + 3 * ldc + 1) += row3b; \ + + + +/*******************************************************************************************/ + +#define INIT1x4() \ + row0 = 0; row1 = 0; row2 = 0; row3 = 0; +#define KERNEL1x4_SUB() \ + xmm0 = *(AO ); \ + xmm2 = *(BO + 0); \ + xmm3 = *(BO + 1); \ + row0 += xmm0 * xmm2; \ + row1 += xmm0 * xmm3; \ + xmm2 = *(BO + 2); \ + xmm3 = *(BO + 3); \ + row2 += xmm0 * xmm2; \ + row3 += xmm0 * xmm3; \ + BO += 4; \ + AO += 1; + + +#define SAVE1x4(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row1 *= xmm0; \ + row2 *= xmm0; \ + row3 *= xmm0; \ + *(CO1 + 0 * ldc) += row0; \ + *(CO1 + 1 * ldc) += row1; \ + *(CO1 + 2 * ldc) += row2; \ + *(CO1 + 3 * ldc) += row3; \ + + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define INIT16x2() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + + +#define KERNEL16x2_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + BO += 2; \ + AO += 16; + + +#define SAVE16x2(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row0 += _mm512_loadu_ps(CO1); \ + row1 += _mm512_loadu_ps(CO1 + ldc); \ + _mm512_storeu_ps(CO1 , row0); \ + _mm512_storeu_ps(CO1 + ldc, row1); \ + + + + +/*******************************************************************************************/ + +#define INIT8x2() \ + ymm4 = _mm256_setzero_ps(); \ + ymm6 = _mm256_setzero_ps(); \ + +#define KERNEL8x2_SUB() \ + ymm0 = _mm256_loadu_ps(AO); \ + ymm2 = 
_mm256_broadcastss_ps(_mm_load_ss(BO)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 1)); \ + ymm4 += ymm0 * ymm2; \ + ymm6 += ymm0 * ymm3; \ + BO += 2; \ + AO += 8; + + +#define SAVE8x2(ALPHA) \ + ymm0 = _mm256_set1_ps(ALPHA); \ + ymm4 *= ymm0; \ + ymm6 *= ymm0; \ + ymm4 += _mm256_loadu_ps(CO1); \ + ymm6 += _mm256_loadu_ps(CO1 + ldc); \ + _mm256_storeu_ps(CO1 , ymm4); \ + _mm256_storeu_ps(CO1 + ldc, ymm6); \ + + + +/*******************************************************************************************/ + +#define INIT4x2() \ + row0 = _mm_setzero_ps(); \ + row1 = _mm_setzero_ps(); \ + +#define KERNEL4x2_SUB() \ + xmm0 = _mm_loadu_ps(AO); \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += xmm0 * xmm2; \ + row1 += xmm0 * xmm3; \ + BO += 2; \ + AO += 4; + + +#define SAVE4x2(ALPHA) \ + xmm0 = _mm_set1_ps(ALPHA); \ + row0 *= xmm0; \ + row1 *= xmm0; \ + row0 += _mm_loadu_ps(CO1); \ + row1 += _mm_loadu_ps(CO1 + ldc); \ + _mm_storeu_ps(CO1 , row0); \ + _mm_storeu_ps(CO1 + ldc, row1); \ + + + +/*******************************************************************************************/ + + +#define INIT2x2() \ + row0 = 0; row0b = 0; row1 = 0; row1b = 0; \ + +#define KERNEL2x2_SUB() \ + xmm0 = *(AO + 0); \ + xmm1 = *(AO + 1); \ + xmm2 = *(BO + 0); \ + xmm3 = *(BO + 1); \ + row0 += xmm0 * xmm2; \ + row0b += xmm1 * xmm2; \ + row1 += xmm0 * xmm3; \ + row1b += xmm1 * xmm3; \ + BO += 2; \ + AO += 2; \ + + +#define SAVE2x2(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row0b *= xmm0; \ + row1 *= xmm0; \ + row1b *= xmm0; \ + *(CO1 ) += row0; \ + *(CO1 +1 ) += row0b; \ + *(CO1 + ldc ) += row1; \ + *(CO1 + ldc +1) += row1b; \ + + +/*******************************************************************************************/ + +#define INIT1x2() \ + row0 = 0; row1 = 0; + +#define KERNEL1x2_SUB() \ + xmm0 = *(AO); \ + xmm2 = *(BO + 0); \ + xmm3 = *(BO + 1); \ + row0 += xmm0 * xmm2; \ + row1 += xmm0 * xmm3; \ + BO += 2; \ + AO += 1; + + +#define SAVE1x2(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row1 *= xmm0; \ + *(CO1 ) += row0; \ + *(CO1 + ldc ) += row1; \ + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define INIT16x1() \ + row0 = _mm512_setzero_ps(); \ + +#define KERNEL16x1_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + row0 += zmm0 * zmm2; \ + BO += 1; \ + AO += 16; + + +#define SAVE16x1(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row0 += _mm512_loadu_ps(CO1); \ + _mm512_storeu_ps(CO1 , row0); \ + + +/*******************************************************************************************/ + +#define INIT8x1() \ + ymm4 = _mm256_setzero_ps(); + +#define KERNEL8x1_SUB() \ + ymm0 = _mm256_loadu_ps(AO); \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO)); \ + ymm4 += ymm0 * ymm2; \ + BO += 1; \ + AO += 8; + + +#define SAVE8x1(ALPHA) \ + ymm0 = _mm256_set1_ps(ALPHA); \ + ymm4 *= ymm0; \ + ymm4 += _mm256_loadu_ps(CO1); \ + _mm256_storeu_ps(CO1 , ymm4); \ + + +/*******************************************************************************************/ + +#define INIT4x1() \ + row0 = _mm_setzero_ps(); \ + +#define KERNEL4x1_SUB() \ + xmm0 = _mm_loadu_ps(AO); \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO)); \ + row0 += 
xmm0 * xmm2; \ + BO += 1; \ + AO += 4; + + +#define SAVE4x1(ALPHA) \ + xmm0 = _mm_set1_ps(ALPHA); \ + row0 *= xmm0; \ + row0 += _mm_loadu_ps(CO1); \ + _mm_storeu_ps(CO1 , row0); \ + + + +/*******************************************************************************************/ + +#define INIT2x1() \ + row0 = 0; row0b = 0; + +#define KERNEL2x1_SUB() \ + xmm0 = *(AO + 0); \ + xmm1 = *(AO + 1); \ + xmm2 = *(BO); \ + row0 += xmm0 * xmm2; \ + row0b += xmm1 * xmm2; \ + BO += 1; \ + AO += 2; + + +#define SAVE2x1(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row0b *= xmm0; \ + *(CO1 ) += row0; \ + *(CO1 +1 ) += row0b; \ + + +/*******************************************************************************************/ + +#define INIT1x1() \ + row0 = 0; + +#define KERNEL1x1_SUB() \ + xmm0 = *(AO); \ + xmm2 = *(BO); \ + row0 += xmm0 * xmm2; \ + BO += 1; \ + AO += 1; + + +#define SAVE1x1(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + *(CO1 ) += row0; \ + + +/*******************************************************************************************/ + + +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ + +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG ldc) +{ + unsigned long M = m, N = n, K = k; + if (M == 0) + return 0; + if (N == 0) + return 0; + if (K == 0) + return 0; + + + + // L8_0 + while (N >= 8 && 0) { + float *CO1; + float *AO; + int i; + // L8_10 + CO1 = C; + C += 8 * ldc; + + AO = A; + + i = m; + + while (i >= 32 && 0) { + float *BO, *AOb; + // L8_11 + __m512 zmm0, zmm0b, zmm2, zmm3, row0, row1, row2, row3, row4, row5, row6, row7, row0b, row1b, row2b, row3b, row4b, row5b, row6b, row7b; + BO = B; + int kloop = K; + AOb = AO + 16 * K; + + INIT32x8() + + while (kloop > 0) { + // L12_17 + KERNEL32x8_SUB() + kloop--; + } + // L8_19 + SAVE32x8(alpha) + CO1 += 32; + AO += 16 * K; + + i -= 32; + } + while (i >= 16) { + float *BO; + // L8_11 + __m512 zmm0, zmm2, zmm3, row0, row1, row2, row3, row4, row5, row6, row7; + BO = B; + int kloop = K; + + INIT16x8() + + while (kloop > 0) { + KERNEL16x8_SUB() + kloop--; + } + SAVE16x8(alpha) + CO1 += 16; + + i -= 16; + } + while (i >= 8) { + float *BO; + // L8_11 + __m256 ymm0, ymm2, ymm3, row0, row1, row2, row3, row4, row5, row6, row7; + BO = B; + int kloop = K; + + INIT8x8() + + while (kloop > 0) { + // L12_17 + KERNEL8x8_SUB() + kloop--; + } + // L8_19 + SAVE8x8(alpha) + CO1 += 8; + + i -= 8; + } + while (i >= 4) { + // L8_11 + float *BO; + __m128 xmm0, xmm2, xmm3, row0, row1, row2, row3, row4, row5, row6, row7; + BO = B; + int kloop = K; + + INIT4x8() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x8_SUB() + kloop--; + } + // L8_19 + SAVE4x8(alpha) + CO1 += 4; + + i -= 4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + float *BO; + float xmm0, xmm1, xmm2, xmm3, row0a, row1a, row2a, row3a, row4a, row5a, row6a, row7a, row0b, row1b, row2b, row3b, row4b, row5b, row6b, row7b; + BO = B; + + INIT2x8() + int kloop = K; + + while (kloop > 0) { + KERNEL2x8_SUB() + kloop--; + } + SAVE2x8(alpha) + CO1 += 2; + i -= 2; + } + // L13_40 + while (i >= 1) { + float *BO; + float xmm0, xmm2, xmm3, row0, row1, row2, row3, row4, row5, row6, row7; + int kloop = K; 
+ BO = B; + INIT1x8() + + while (kloop > 0) { + KERNEL1x8_SUB() + kloop--; + } + SAVE1x8(alpha) + CO1 += 1; + i -= 1; + } + + B += K * 8; + N -= 8; + } + + while (N >= 4) { + float *CO1; + float *AO; + int i; + // L8_10 + CO1 = C; + C += 4 * ldc; + + AO = A; + + i = m; + while (i >= 64) { + float *BO; + float *A1, *A2, *A3; + // L8_11 + __m512 zmm0, zmm1, zmm2, zmm3, row0, zmm5, row1, zmm7, row2, row3, row0b, row1b, row2b, row3b, row0c, row1c, row2c, row3c, row0d, row1d, row2d, row3d; + BO = B; + int kloop = K; + + A1 = AO + 16 * K; + A2 = A1 + 16 * K; + A3 = A2 + 16 * K; + + INIT64x4() + + while (kloop > 0) { + // L12_17 + KERNEL64x4_SUB() + kloop--; + } + // L8_19 + SAVE64x4(alpha) + CO1 += 64; + AO += 48 * K; + + i -= 64; + } + while (i >= 32) { + float *BO; + float *A1; + // L8_11 + __m512 zmm0, zmm1, zmm2, zmm3, row0, row1, row2, row3, row0b, row1b, row2b, row3b; + BO = B; + int kloop = K; + + A1 = AO + 16 * K; + + INIT32x4() + + while (kloop > 0) { + // L12_17 + KERNEL32x4_SUB() + kloop--; + } + // L8_19 + SAVE32x4(alpha) + CO1 += 32; + AO += 16 * K; + + i -= 32; + } + while (i >= 16) { + float *BO; + // L8_11 + __m512 zmm0, zmm2, zmm3, row0, row1, row2, row3; + BO = B; + int kloop = K; + + INIT16x4() + + while (kloop > 0) { + // L12_17 + KERNEL16x4_SUB() + kloop--; + } + // L8_19 + SAVE16x4(alpha) + CO1 += 16; + + i -= 16; + } + while (i >= 8) { + float *BO; + // L8_11 + __m256 ymm0, ymm2, ymm3, ymm4, ymm6,ymm8,ymm10; + BO = B; + int kloop = K; + + INIT8x4() + + while (kloop > 0) { + // L12_17 + KERNEL8x4_SUB() + kloop--; + } + // L8_19 + SAVE8x4(alpha) + CO1 += 8; + + i -= 8; + } + while (i >= 4) { + // L8_11 + float *BO; + __m128 xmm0, xmm2, xmm3, row0, row1, row2, row3; + BO = B; + int kloop = K; + + INIT4x4() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x4_SUB() + kloop--; + } + // L8_19 + SAVE4x4(alpha) + CO1 += 4; + + i -= 4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + float *BO; + float xmm0, xmm1, xmm2, xmm3, row0, row0b, row1, row1b, row2, row2b, row3, row3b; + BO = B; + + INIT2x4() + int kloop = K; + + while (kloop > 0) { + KERNEL2x4_SUB() + kloop--; + } + SAVE2x4(alpha) + CO1 += 2; + i -= 2; + } + // L13_40 + while (i >= 1) { + float *BO; + float xmm0, xmm2, xmm3, row0, row1, row2, row3; + int kloop = K; + BO = B; + INIT1x4() + + while (kloop > 0) { + KERNEL1x4_SUB() + kloop--; + } + SAVE1x4(alpha) + CO1 += 1; + i -= 1; + } + + B += K * 4; + N -= 4; + } + +/**************************************************************************************************/ + + // L8_0 + while (N >= 2) { + float *CO1; + float *AO; + int i; + // L8_10 + CO1 = C; + C += 2 * ldc; + + AO = A; + + i = m; + while (i >= 16) { + float *BO; + + // L8_11 + __m512 zmm0, zmm2, zmm3, row0, row1; + BO = B; + int kloop = K; + + INIT16x2() + + while (kloop > 0) { + // L12_17 + KERNEL16x2_SUB() + kloop--; + } + // L8_19 + SAVE16x2(alpha) + CO1 += 16; + + i -= 16; + } + while (i >= 8) { + float *BO; + __m256 ymm0, ymm2, ymm3, ymm4, ymm6; + // L8_11 + BO = B; + int kloop = K; + + INIT8x2() + + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL8x2_SUB() + kloop--; + } + // L8_19 + SAVE8x2(alpha) + CO1 += 8; + + i-=8; + } + + while (i >= 4) { + float *BO; + __m128 xmm0, xmm2, xmm3, row0, row1; + // L8_11 + BO = B; + int kloop = K; + + INIT4x2() + + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x2_SUB() + kloop--; + } + // L8_19 + SAVE4x2(alpha) 
+ CO1 += 4; + + i-=4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + float *BO; + float xmm0, xmm1, xmm2, xmm3, row0, row0b, row1, row1b; + int kloop = K; + BO = B; + + INIT2x2() + + while (kloop > 0) { + KERNEL2x2_SUB() + kloop--; + } + SAVE2x2(alpha) + CO1 += 2; + i -= 2; + } + // L13_40 + while (i >= 1) { + float *BO; + float xmm0, xmm2, xmm3, row0, row1; + int kloop = K; + BO = B; + + INIT1x2() + + while (kloop > 0) { + KERNEL1x2_SUB() + kloop--; + } + SAVE1x2(alpha) + CO1 += 1; + i -= 1; + } + + B += K * 2; + N -= 2; + } + + // L8_0 + while (N >= 1) { + // L8_10 + float *CO1; + float *AO; + int i; + + CO1 = C; + C += ldc; + + AO = A; + + i = m; + while (i >= 16) { + float *BO; + __m512 zmm0, zmm2, row0; + // L8_11 + BO = B; + int kloop = K; + + INIT16x1() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL16x1_SUB() + kloop--; + } + // L8_19 + SAVE16x1(alpha) + CO1 += 16; + + i-= 16; + } + while (i >= 8) { + float *BO; + __m256 ymm0, ymm2, ymm4; + // L8_11 + BO = B; + int kloop = K; + + INIT8x1() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL8x1_SUB() + kloop--; + } + // L8_19 + SAVE8x1(alpha) + CO1 += 8; + + i-= 8; + } + while (i >= 4) { + float *BO; + __m128 xmm0, xmm2, row0; + // L8_11 + BO = B; + int kloop = K; + + INIT4x1() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x1_SUB() + kloop--; + } + // L8_19 + SAVE4x1(alpha) + CO1 += 4; + + i-= 4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + float *BO; + float xmm0, xmm1, xmm2, row0, row0b; + int kloop = K; + BO = B; + + INIT2x1() + + while (kloop > 0) { + KERNEL2x1_SUB() + kloop--; + } + SAVE2x1(alpha) + CO1 += 2; + i -= 2; + } + // L13_40 + while (i >= 1) { + float *BO; + float xmm0, xmm2, row0; + int kloop = K; + + BO = B; + INIT1x1() + + + while (kloop > 0) { + KERNEL1x1_SUB() + kloop--; + } + SAVE1x1(alpha) + CO1 += 1; + i -= 1; + } + + B += K * 1; + N -= 1; + } + + + return 0; +} diff --git a/kernel/x86_64/sgemm_ncopy_4_skylakex.c b/kernel/x86_64/sgemm_ncopy_4_skylakex.c new file mode 100644 index 000000000..8577e3b38 --- /dev/null +++ b/kernel/x86_64/sgemm_ncopy_4_skylakex.c @@ -0,0 +1,207 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#include + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __restrict b){ + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + a_offset = a; + b_offset = b; + + j = (n >> 2); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + __m128 row0, row1, row2, row3; + + row0 = _mm_loadu_ps(a_offset1); + row1 = _mm_loadu_ps(a_offset2); + row2 = _mm_loadu_ps(a_offset3); + row3 = _mm_loadu_ps(a_offset4); + + _MM_TRANSPOSE4_PS(row0, row1, row2, row3); + + _mm_storeu_ps(b_offset + 0, row0); + _mm_storeu_ps(b_offset + 4, row1); + _mm_storeu_ps(b_offset + 8, row2); + _mm_storeu_ps(b_offset + 12, row3); + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + + b_offset += 16; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp5 = *(a_offset2 + 0); + ctemp9 = *(a_offset3 + 0); + ctemp13 = *(a_offset4 + 0); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + *(b_offset + 2) = ctemp9; + *(b_offset + 3) = ctemp13; + + a_offset1 ++; + a_offset2 ++; + a_offset3 ++; + a_offset4 ++; + + b_offset += 4; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + *(b_offset + 2) = ctemp2; + *(b_offset + 3) = ctemp6; + + *(b_offset + 4) = ctemp3; + *(b_offset + 5) = ctemp7; + *(b_offset + 6) = ctemp4; + *(b_offset + 7) = ctemp8; + + a_offset1 += 4; + a_offset2 += 4; + b_offset += 8; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp5 = *(a_offset2 + 0); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + + a_offset1 ++; + a_offset2 ++; + b_offset += 2; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + if (n & 1){ + a_offset1 = a_offset; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp1 = 
*(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + *(b_offset + 2) = ctemp3; + *(b_offset + 3) = ctemp4; + + a_offset1 += 4; + b_offset += 4; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + *(b_offset + 0) = ctemp1; + a_offset1 ++; + b_offset += 1; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + return 0; +} diff --git a/kernel/x86_64/sgemm_tcopy_16_skylakex.c b/kernel/x86_64/sgemm_tcopy_16_skylakex.c new file mode 100644 index 000000000..dbacc5081 --- /dev/null +++ b/kernel/x86_64/sgemm_tcopy_16_skylakex.c @@ -0,0 +1,387 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __restrict b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2; + FLOAT *boffset; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + aoffset = a; + boffset = b; + +#if 0 + fprintf(stderr, "m = %d n = %d\n", m, n); +#endif + + j = (n >> 4); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 16; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + ctemp17 = *(aoffset2 + 0); + ctemp18 = *(aoffset2 + 1); + ctemp19 = *(aoffset2 + 2); + ctemp20 = *(aoffset2 + 3); + ctemp21 = *(aoffset2 + 4); + ctemp22 = *(aoffset2 + 5); + ctemp23 = *(aoffset2 + 6); + ctemp24 = *(aoffset2 + 7); + ctemp25 = *(aoffset2 + 8); + ctemp26 = *(aoffset2 + 9); + ctemp27 = *(aoffset2 + 10); + ctemp28 = *(aoffset2 + 11); + ctemp29 = *(aoffset2 + 12); + ctemp30 = *(aoffset2 + 13); + ctemp31 = *(aoffset2 + 14); + ctemp32 = *(aoffset2 + 15); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + *(boffset + 16) = ctemp17; + *(boffset + 17) = ctemp18; + *(boffset + 18) = ctemp19; + *(boffset + 19) = ctemp20; + *(boffset + 20) = ctemp21; + *(boffset + 21) = ctemp22; + *(boffset + 22) = ctemp23; + *(boffset + 23) = ctemp24; + + *(boffset + 24) = ctemp25; + *(boffset + 25) = ctemp26; + *(boffset + 26) = ctemp27; + *(boffset + 27) = ctemp28; + *(boffset + 28) = ctemp29; + *(boffset + 29) = ctemp30; + *(boffset + 30) = ctemp31; + *(boffset + 31) = ctemp32; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 32; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = 
ctemp08; + + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + boffset += 16; + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 8; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 16; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + boffset += 8; + } + } + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 4; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 8; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + boffset += 4; + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 2; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + boffset += 2; + } + } + + if (n & 1){ + aoffset1 = aoffset; + aoffset2 
= aoffset + lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 2; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + *(boffset + 0) = ctemp01; + // boffset += 1; + } + } + + return 0; +} From 84bcdf9c661fb7484fd9a95c292115234213497a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 10 Oct 2018 19:15:32 +0200 Subject: [PATCH 073/236] Revert "Add -march=skylake-avx512 when required" --- cmake/system_check.cmake | 2 -- 1 file changed, 2 deletions(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 9b8a3d39d..fe30c7600 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -71,8 +71,6 @@ if (X86_64 OR X86) execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) if (NO_AVX512 EQUAL 1) set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") -else() -set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") endif() file(REMOVE "avx512.tmp" "avx512.o") endif() From fa53b903db657b0d5f5bfe5554c7218442c539c9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 10 Oct 2018 19:22:01 +0200 Subject: [PATCH 074/236] Add -march=skylake-avx512 to CFLAGS when the target is Skylake Should fix 1806 and #1801 --- cmake/system.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index 18b2c3b87..4dc50e64f 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -41,6 +41,11 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) endif () endif () +if (DEFINED TARGET AND ${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") + set (FCOMMON_OPT "${FCOMMON_OPT} -march=skylake-avx512") +endif() + if (DEFINED TARGET) message(STATUS "Targeting the ${TARGET} architecture.") set(GETARCH_FLAGS "-DFORCE_${TARGET}") From 8a11ec19d1e4b5b8693f90b1932fb363e56c1200 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 10 Oct 2018 23:47:35 +0200 Subject: [PATCH 075/236] Syntax fix --- cmake/system.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 4dc50e64f..097e1cd5e 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -41,10 +41,12 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) endif () endif () -if (DEFINED TARGET AND ${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) +if (DEFINED TARGET) +if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") set (FCOMMON_OPT "${FCOMMON_OPT} -march=skylake-avx512") endif() +endif() if (DEFINED TARGET) message(STATUS "Targeting the ${TARGET} architecture.") From 81c9985c3ad1a7a42c1ef5d7277050ecba470def Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 11 Oct 2018 11:03:27 +0200 Subject: [PATCH 076/236] Use KERNEL_DEFINITIONS rather than COMMON_OPTS to pass -march=skylake-avx512 --- cmake/system.cmake | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 097e1cd5e..61f96edb0 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -43,8 +43,7 @@ endif () if (DEFINED TARGET) if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) - set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") - set (FCOMMON_OPT 
"${FCOMMON_OPT} -march=skylake-avx512") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") endif() endif() From 55b244ca0da907b27c4e0306df0a1a90a2238c6a Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 12 Oct 2018 09:30:35 +0000 Subject: [PATCH 077/236] enable the SGEMM/SKX C based kernel In QA the final bug was found so now the sklyakex sgemm C based kernel can be activated.... --- kernel/x86_64/KERNEL.SKYLAKEX | 9 +- kernel/x86_64/sgemm_beta_skylakex.c | 6 +- kernel/x86_64/sgemm_kernel_16x4_skylakex.c | 549 --------------------- 3 files changed, 10 insertions(+), 554 deletions(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 48c81e80b..acc6356d6 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -1,6 +1,11 @@ include $(KERNELDIR)/KERNEL.HASWELL -SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S +SGEMMKERNEL = sgemm_kernel_16x4_skylakex.c + +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = sgemm_tcopy_16_skylakex.c +SGEMMONCOPY = sgemm_ncopy_4_skylakex.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c @@ -9,5 +14,5 @@ DGEMMITCOPY = dgemm_tcopy_8_skylakex.c DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c -SGEMM_BETA = ../generic/gemm_beta.c +SGEMM_BETA = sgemm_beta_skylakex.c DGEMM_BETA = dgemm_beta_skylakex.c diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index b1bf4d77a..54f9664e9 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -60,8 +60,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, if (beta == ZERO){ __m512 z_zero; + __m256 y_zero; z_zero = _mm512_setzero_ps(); + y_zero = _mm256_setzero_ps(); j = n; do { c_offset1 = c_offset; @@ -71,14 +73,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, while (i > 32) { _mm512_storeu_ps(c_offset1, z_zero); - _mm512_storeu_ps(c_offset1 + 8, z_zero); _mm512_storeu_ps(c_offset1 + 16, z_zero); - _mm512_storeu_ps(c_offset1 + 24 , z_zero); c_offset1 += 32; i -= 32; } while (i > 8) { - _mm512_storeu_ps(c_offset1, z_zero); + _mm256_storeu_ps(c_offset1, y_zero); c_offset1 += 8; i -= 8; } diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c index b2b1ab03f..10d3d22ed 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c @@ -64,419 +64,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-#define INIT32x8() \ - row0 = _mm512_setzero_ps(); \ - row1 = _mm512_setzero_ps(); \ - row2 = _mm512_setzero_ps(); \ - row3 = _mm512_setzero_ps(); \ - row4 = _mm512_setzero_ps(); \ - row5 = _mm512_setzero_ps(); \ - row6 = _mm512_setzero_ps(); \ - row0b = _mm512_setzero_ps(); \ - row1b = _mm512_setzero_ps(); \ - row2b = _mm512_setzero_ps(); \ - row3b = _mm512_setzero_ps(); \ - row4b = _mm512_setzero_ps(); \ - row5b = _mm512_setzero_ps(); \ - row6b = _mm512_setzero_ps(); \ - row7b = _mm512_setzero_ps(); \ - -#define KERNEL32x8_SUB() \ - zmm0 = _mm512_loadu_ps(AO); \ - zmm0b = _mm512_loadu_ps(AOb); \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 0)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \ - row0 += zmm0 * zmm2; \ - row1 += zmm0 * zmm3; \ - row0b += zmm0b * zmm2; \ - row1b += zmm0b * zmm3; \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 2)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 3)); \ - row2 += zmm0 * zmm2; \ - row3 += zmm0 * zmm3; \ - row2b += zmm0b * zmm2; \ - row3b += zmm0b * zmm3; \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 4)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 5)); \ - row4 += zmm0 * zmm2; \ - row5 += zmm0 * zmm3; \ - row4b += zmm0b * zmm2; \ - row5b += zmm0b * zmm3; \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 6)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 7)); \ - row6 += zmm0 * zmm2; \ - row7 += zmm0 * zmm3; \ - row6b += zmm0b * zmm2; \ - row7b += zmm0b * zmm3; \ - BO += 8; \ - AO += 16; \ - AOb += 16; - - -#define SAVE32x8(ALPHA) \ - zmm0 = _mm512_set1_ps(ALPHA); \ - row0 *= zmm0; \ - row1 *= zmm0; \ - row2 *= zmm0; \ - row3 *= zmm0; \ - row4 *= zmm0; \ - row5 *= zmm0; \ - row6 *= zmm0; \ - row7 *= zmm0; \ - row0b *= zmm0; \ - row1b *= zmm0; \ - row2b *= zmm0; \ - row3b *= zmm0; \ - row4b *= zmm0; \ - row5b *= zmm0; \ - row6b *= zmm0; \ - row7b *= zmm0; \ - row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \ - row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \ - row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \ - row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \ - row4 += _mm512_loadu_ps(CO1 + 4 * ldc); \ - row5 += _mm512_loadu_ps(CO1 + 5 * ldc); \ - row6 += _mm512_loadu_ps(CO1 + 6 * ldc); \ - row7 += _mm512_loadu_ps(CO1 + 7 * ldc); \ - _mm512_storeu_ps(CO1 + 0 * ldc, row0); \ - _mm512_storeu_ps(CO1 + 1 * ldc, row1); \ - _mm512_storeu_ps(CO1 + 2 * ldc, row2); \ - _mm512_storeu_ps(CO1 + 3 * ldc, row3); \ - _mm512_storeu_ps(CO1 + 4 * ldc, row4); \ - _mm512_storeu_ps(CO1 + 5 * ldc, row5); \ - _mm512_storeu_ps(CO1 + 6 * ldc, row6); \ - _mm512_storeu_ps(CO1 + 7 * ldc, row7); \ - row0b += _mm512_loadu_ps(CO1 + 0 * ldc + 16); \ - row1b += _mm512_loadu_ps(CO1 + 1 * ldc + 16); \ - row2b += _mm512_loadu_ps(CO1 + 2 * ldc + 16); \ - row3b += _mm512_loadu_ps(CO1 + 3 * ldc + 16); \ - row4b += _mm512_loadu_ps(CO1 + 4 * ldc + 16); \ - row5b += _mm512_loadu_ps(CO1 + 5 * ldc + 16); \ - row6b += _mm512_loadu_ps(CO1 + 6 * ldc + 16); \ - row7b += _mm512_loadu_ps(CO1 + 7 * ldc + 16); \ - _mm512_storeu_ps(CO1 + 0 * ldc + 16, row0b); \ - _mm512_storeu_ps(CO1 + 1 * ldc + 16, row1b); \ - _mm512_storeu_ps(CO1 + 2 * ldc + 16, row2b); \ - _mm512_storeu_ps(CO1 + 3 * ldc + 16, row3b); \ - _mm512_storeu_ps(CO1 + 4 * ldc + 16, row4b); \ - _mm512_storeu_ps(CO1 + 5 * ldc + 16, row5b); \ - _mm512_storeu_ps(CO1 + 6 * ldc + 16, row6b); \ - _mm512_storeu_ps(CO1 + 7 * ldc + 16, row7b); \ - - -#define INIT16x8() \ - row0 = _mm512_setzero_ps(); \ - row1 = _mm512_setzero_ps(); \ - row2 = _mm512_setzero_ps(); \ - row3 = _mm512_setzero_ps(); \ - row4 = _mm512_setzero_ps(); \ - row5 = 
_mm512_setzero_ps(); \ - row6 = _mm512_setzero_ps(); \ - row7 = _mm512_setzero_ps(); \ - -#define KERNEL16x8_SUB() \ - zmm0 = _mm512_loadu_ps(AO); \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 0)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \ - row0 += zmm0 * zmm2; \ - row1 += zmm0 * zmm3; \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 2)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 3)); \ - row2 += zmm0 * zmm2; \ - row3 += zmm0 * zmm3; \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 4)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 5)); \ - row4 += zmm0 * zmm2; \ - row5 += zmm0 * zmm3; \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 6)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 7)); \ - row6 += zmm0 * zmm2; \ - row7 += zmm0 * zmm3; \ - BO += 8; \ - AO += 16; - - -#define SAVE16x8(ALPHA) \ - zmm0 = _mm512_set1_ps(ALPHA); \ - row0 *= zmm0; \ - row1 *= zmm0; \ - row2 *= zmm0; \ - row3 *= zmm0; \ - row4 *= zmm0; \ - row5 *= zmm0; \ - row6 *= zmm0; \ - row7 *= zmm0; \ - row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \ - row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \ - row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \ - row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \ - row4 += _mm512_loadu_ps(CO1 + 4 * ldc); \ - row5 += _mm512_loadu_ps(CO1 + 5 * ldc); \ - row6 += _mm512_loadu_ps(CO1 + 6 * ldc); \ - row7 += _mm512_loadu_ps(CO1 + 7 * ldc); \ - _mm512_storeu_ps(CO1 + 0 * ldc, row0); \ - _mm512_storeu_ps(CO1 + 1 * ldc, row1); \ - _mm512_storeu_ps(CO1 + 2 * ldc, row2); \ - _mm512_storeu_ps(CO1 + 3 * ldc, row3); \ - _mm512_storeu_ps(CO1 + 4 * ldc, row4); \ - _mm512_storeu_ps(CO1 + 5 * ldc, row5); \ - _mm512_storeu_ps(CO1 + 6 * ldc, row6); \ - _mm512_storeu_ps(CO1 + 7 * ldc, row7); - - - -/*******************************************************************************************/ - -#define INIT8x8() \ - row0 = _mm256_setzero_ps(); \ - row1 = _mm256_setzero_ps(); \ - row2 = _mm256_setzero_ps(); \ - row3 = _mm256_setzero_ps(); \ - row4 = _mm256_setzero_ps(); \ - row5 = _mm256_setzero_ps(); \ - row6 = _mm256_setzero_ps(); \ - row7 = _mm256_setzero_ps(); \ - -#define KERNEL8x8_SUB() \ - ymm0 = _mm256_loadu_ps(AO); \ - ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 0)); \ - ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 1)); \ - row0 += ymm0 * ymm2; \ - row1 += ymm0 * ymm3; \ - ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 2)); \ - ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 3)); \ - row2 += ymm0 * ymm2; \ - row3 += ymm0 * ymm3; \ - ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 4)); \ - ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 5)); \ - row4 += ymm0 * ymm2; \ - row5 += ymm0 * ymm3; \ - ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 6)); \ - ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 7)); \ - row6 += ymm0 * ymm2; \ - row7 += ymm0 * ymm3; \ - BO += 8; \ - AO += 8; - - -#define SAVE8x8(ALPHA) \ - ymm0 = _mm256_set1_ps(ALPHA); \ - row0 *= ymm0; \ - row1 *= ymm0; \ - row2 *= ymm0; \ - row3 *= ymm0; \ - row4 *= ymm0; \ - row5 *= ymm0; \ - row6 *= ymm0; \ - row7 *= ymm0; \ - row0 += _mm256_loadu_ps(CO1 + 0 * ldc); \ - row1 += _mm256_loadu_ps(CO1 + 1 * ldc); \ - row2 += _mm256_loadu_ps(CO1 + 2 * ldc); \ - row3 += _mm256_loadu_ps(CO1 + 3 * ldc); \ - row4 += _mm256_loadu_ps(CO1 + 4 * ldc); \ - row5 += _mm256_loadu_ps(CO1 + 5 * ldc); \ - row6 += _mm256_loadu_ps(CO1 + 6 * ldc); \ - row7 += _mm256_loadu_ps(CO1 + 7 * ldc); \ - _mm256_storeu_ps(CO1 + 0 * ldc, row0); \ - _mm256_storeu_ps(CO1 + 1 * ldc, row1); \ - _mm256_storeu_ps(CO1 + 2 * ldc, row2); \ - _mm256_storeu_ps(CO1 + 3 * ldc, row3); \ 
- _mm256_storeu_ps(CO1 + 4 * ldc, row4); \ - _mm256_storeu_ps(CO1 + 5 * ldc, row5); \ - _mm256_storeu_ps(CO1 + 6 * ldc, row6); \ - _mm256_storeu_ps(CO1 + 7 * ldc, row7); \ - - - -/*******************************************************************************************/ - -#define INIT4x8() \ - row0 = _mm_setzero_ps(); \ - row1 = _mm_setzero_ps(); \ - row2 = _mm_setzero_ps(); \ - row3 = _mm_setzero_ps(); \ - row4 = _mm_setzero_ps(); \ - row5 = _mm_setzero_ps(); \ - row6 = _mm_setzero_ps(); \ - row7 = _mm_setzero_ps(); \ - - -#define KERNEL4x8_SUB() \ - xmm0 = _mm_loadu_ps(AO); \ - xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 0)); \ - xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 1)); \ - row0 += xmm0 * xmm2; \ - row1 += xmm0 * xmm3; \ - xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 2)); \ - xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 3)); \ - row2 += xmm0 * xmm2; \ - row3 += xmm0 * xmm3; \ - xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 4)); \ - xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 5)); \ - row4 += xmm0 * xmm2; \ - row5 += xmm0 * xmm3; \ - xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 6)); \ - xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 7)); \ - row6 += xmm0 * xmm2; \ - row7 += xmm0 * xmm3; \ - BO += 8; \ - AO += 4; - - -#define SAVE4x8(ALPHA) \ - xmm0 = _mm_set1_ps(ALPHA); \ - row0 *= xmm0; \ - row1 *= xmm0; \ - row2 *= xmm0; \ - row3 *= xmm0; \ - row4 *= xmm0; \ - row5 *= xmm0; \ - row6 *= xmm0; \ - row7 *= xmm0; \ - row0 += _mm_loadu_ps(CO1 + 0 * ldc); \ - row1 += _mm_loadu_ps(CO1 + 1 * ldc); \ - row2 += _mm_loadu_ps(CO1 + 2 * ldc); \ - row3 += _mm_loadu_ps(CO1 + 3 * ldc); \ - row4 += _mm_loadu_ps(CO1 + 4 * ldc); \ - row5 += _mm_loadu_ps(CO1 + 5 * ldc); \ - row6 += _mm_loadu_ps(CO1 + 6 * ldc); \ - row7 += _mm_loadu_ps(CO1 + 7 * ldc); \ - _mm_storeu_ps(CO1 + 0 * ldc, row0); \ - _mm_storeu_ps(CO1 + 1 * ldc, row1); \ - _mm_storeu_ps(CO1 + 2 * ldc, row2); \ - _mm_storeu_ps(CO1 + 3 * ldc, row3); \ - _mm_storeu_ps(CO1 + 4 * ldc, row4); \ - _mm_storeu_ps(CO1 + 5 * ldc, row5); \ - _mm_storeu_ps(CO1 + 6 * ldc, row6); \ - _mm_storeu_ps(CO1 + 7 * ldc, row7); \ - - -/*******************************************************************************************/ - -#define INIT2x8() \ - row0a = row0b = 0; \ - row1a = row1b = 0; \ - row2a = row2b = 0; \ - row3a = row3b = 0; \ - row4a = row4b = 0; \ - row5a = row5b = 0; \ - row6a = row6b = 0; \ - row7a = row7b = 0; \ - -#define KERNEL2x8_SUB() \ - xmm0 = *(AO); \ - xmm1 = *(AO + 1); \ - xmm2 = *(BO + 0); \ - xmm3 = *(BO + 1); \ - row0a += xmm0 * xmm2; \ - row0b += xmm1 * xmm2; \ - row1a += xmm0 * xmm3; \ - row1b += xmm1 * xmm3; \ - xmm2 = *(BO + 2); \ - xmm3 = *(BO + 3); \ - row2a += xmm0 * xmm2; \ - row2b += xmm1 * xmm2; \ - row3a += xmm0 * xmm3; \ - row3b += xmm1 * xmm3; \ - xmm2 = *(BO + 4); \ - xmm3 = *(BO + 5); \ - row4a += xmm0 * xmm2; \ - row4b += xmm1 * xmm2; \ - row5a += xmm0 * xmm3; \ - row5b += xmm1 * xmm3; \ - xmm2 = *(BO + 6); \ - xmm3 = *(BO + 7); \ - row6a += xmm0 * xmm2; \ - row6b += xmm1 * xmm2; \ - row7a += xmm0 * xmm3; \ - row7b += xmm1 * xmm3; \ - BO += 8; \ - AO += 2; - - -#define SAVE2x8(ALPHA) \ - xmm0 = ALPHA; \ - row0a *= xmm0; \ - row0b *= xmm0; \ - row1a *= xmm0; \ - row1b *= xmm0; \ - row2a *= xmm0; \ - row2b *= xmm0; \ - row3a *= xmm0; \ - row3b *= xmm0; \ - row4a *= xmm0; \ - row4b *= xmm0; \ - row5a *= xmm0; \ - row5b *= xmm0; \ - row6a *= xmm0; \ - row6b *= xmm0; \ - row7a *= xmm0; \ - row7b *= xmm0; \ - *(CO1 + 0 * ldc + 0) += row0a; \ - *(CO1 + 0 * ldc + 1) += row0b; \ - *(CO1 + 1 * ldc + 0) += row1a; \ - *(CO1 + 1 * ldc + 1) += 
row1b; \ - *(CO1 + 2 * ldc + 0) += row2a; \ - *(CO1 + 2 * ldc + 1) += row2b; \ - *(CO1 + 3 * ldc + 0) += row3a; \ - *(CO1 + 3 * ldc + 1) += row3b; \ - *(CO1 + 4 * ldc + 0) += row4a; \ - *(CO1 + 4 * ldc + 1) += row4b; \ - *(CO1 + 5 * ldc + 0) += row5a; \ - *(CO1 + 5 * ldc + 1) += row5b; \ - *(CO1 + 6 * ldc + 0) += row6a; \ - *(CO1 + 6 * ldc + 1) += row6b; \ - *(CO1 + 7 * ldc + 0) += row7a; \ - *(CO1 + 7 * ldc + 1) += row7b; \ - - - -/*******************************************************************************************/ - -#define INIT1x8() \ - row0 = row1 = row2 = row3 = row4 = row5 = row6 = row7 = 0; - -#define KERNEL1x8_SUB() \ - xmm0 = *(AO ); \ - xmm2 = *(BO + 0); \ - xmm3 = *(BO + 1); \ - row0 += xmm0 * xmm2; \ - row1 += xmm0 * xmm3; \ - xmm2 = *(BO + 2); \ - xmm3 = *(BO + 3); \ - row2 += xmm0 * xmm2; \ - row3 += xmm0 * xmm3; \ - xmm2 = *(BO + 4); \ - xmm3 = *(BO + 5); \ - row4 += xmm0 * xmm2; \ - row5 += xmm0 * xmm3; \ - xmm2 = *(BO + 6); \ - xmm3 = *(BO + 7); \ - row6 += xmm0 * xmm2; \ - row7 += xmm0 * xmm3; \ - BO += 8; \ - AO += 1; - - -#define SAVE1x8(ALPHA) \ - xmm0 = ALPHA; \ - row0 *= xmm0; \ - row1 *= xmm0; \ - row2 *= xmm0; \ - row3 *= xmm0; \ - row4 *= xmm0; \ - row5 *= xmm0; \ - row6 *= xmm0; \ - row7 *= xmm0; \ - *(CO1 + 0 * ldc) += row0; \ - *(CO1 + 1 * ldc) += row1; \ - *(CO1 + 2 * ldc) += row2; \ - *(CO1 + 3 * ldc) += row3; \ - *(CO1 + 4 * ldc) += row4; \ - *(CO1 + 5 * ldc) += row5; \ - *(CO1 + 6 * ldc) += row6; \ - *(CO1 + 7 * ldc) += row7; \ @@ -1184,142 +771,6 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f return 0; - - // L8_0 - while (N >= 8 && 0) { - float *CO1; - float *AO; - int i; - // L8_10 - CO1 = C; - C += 8 * ldc; - - AO = A; - - i = m; - - while (i >= 32 && 0) { - float *BO, *AOb; - // L8_11 - __m512 zmm0, zmm0b, zmm2, zmm3, row0, row1, row2, row3, row4, row5, row6, row7, row0b, row1b, row2b, row3b, row4b, row5b, row6b, row7b; - BO = B; - int kloop = K; - AOb = AO + 16 * K; - - INIT32x8() - - while (kloop > 0) { - // L12_17 - KERNEL32x8_SUB() - kloop--; - } - // L8_19 - SAVE32x8(alpha) - CO1 += 32; - AO += 16 * K; - - i -= 32; - } - while (i >= 16) { - float *BO; - // L8_11 - __m512 zmm0, zmm2, zmm3, row0, row1, row2, row3, row4, row5, row6, row7; - BO = B; - int kloop = K; - - INIT16x8() - - while (kloop > 0) { - KERNEL16x8_SUB() - kloop--; - } - SAVE16x8(alpha) - CO1 += 16; - - i -= 16; - } - while (i >= 8) { - float *BO; - // L8_11 - __m256 ymm0, ymm2, ymm3, row0, row1, row2, row3, row4, row5, row6, row7; - BO = B; - int kloop = K; - - INIT8x8() - - while (kloop > 0) { - // L12_17 - KERNEL8x8_SUB() - kloop--; - } - // L8_19 - SAVE8x8(alpha) - CO1 += 8; - - i -= 8; - } - while (i >= 4) { - // L8_11 - float *BO; - __m128 xmm0, xmm2, xmm3, row0, row1, row2, row3, row4, row5, row6, row7; - BO = B; - int kloop = K; - - INIT4x8() - // L8_16 - while (kloop > 0) { - // L12_17 - KERNEL4x8_SUB() - kloop--; - } - // L8_19 - SAVE4x8(alpha) - CO1 += 4; - - i -= 4; - } - -/************************************************************************** -* Rest of M -***************************************************************************/ - - while (i >= 2) { - float *BO; - float xmm0, xmm1, xmm2, xmm3, row0a, row1a, row2a, row3a, row4a, row5a, row6a, row7a, row0b, row1b, row2b, row3b, row4b, row5b, row6b, row7b; - BO = B; - - INIT2x8() - int kloop = K; - - while (kloop > 0) { - KERNEL2x8_SUB() - kloop--; - } - SAVE2x8(alpha) - CO1 += 2; - i -= 2; - } - // L13_40 - while (i >= 1) { - float *BO; - float xmm0, xmm2, xmm3, row0, 
row1, row2, row3, row4, row5, row6, row7; - int kloop = K; - BO = B; - INIT1x8() - - while (kloop > 0) { - KERNEL1x8_SUB() - kloop--; - } - SAVE1x8(alpha) - CO1 += 1; - i -= 1; - } - - B += K * 8; - N -= 8; - } - while (N >= 4) { float *CO1; float *AO; From c3d93caa8d58e18422014c3ceb4f49ea73cd1f96 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Wed, 17 Oct 2018 08:01:27 -0700 Subject: [PATCH 078/236] ARM64: Remove dependency of XGENE1 Makefile on ARMV8 Makefile --- kernel/arm64/KERNEL.XGENE1 | 136 ++++++++++++++++++++++++++++++++++++- 1 file changed, 135 insertions(+), 1 deletion(-) diff --git a/kernel/arm64/KERNEL.XGENE1 b/kernel/arm64/KERNEL.XGENE1 index 6ee0c730c..d05754628 100644 --- a/kernel/arm64/KERNEL.XGENE1 +++ b/kernel/arm64/KERNEL.XGENE1 @@ -1 +1,135 @@ -include $(KERNELDIR)/KERNEL.ARMV8 \ No newline at end of file +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SDOTKERNEL = dot.S +DDOTKERNEL = dot.S +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +DSDOTKERNEL = dot.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +STRMMKERNEL = ../generic/trmmkernel_4x4.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = sgemm_kernel_4x4.S +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c 
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + + From 162e31283276a7c108968f3309e2e3371b639bc3 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Wed, 17 Oct 2018 08:01:45 -0700 Subject: [PATCH 079/236] ARM64: Remove dependency of CORTEXA57 Makefile on ARMV8 Makefile --- kernel/arm64/KERNEL.CORTEXA57 | 47 ++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57 index 371e488cd..2fd2c3d87 100644 --- a/kernel/arm64/KERNEL.CORTEXA57 +++ b/kernel/arm64/KERNEL.CORTEXA57 @@ -1,4 +1,49 @@ -include $(KERNELDIR)/KERNEL.ARMV8 +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRMMKERNEL = ../generic/trmmkernel_4x4.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c SAMAXKERNEL = amax.S DAMAXKERNEL = amax.S From 8001fdcd2a6796c0747e5df25c38a082c0261b0f Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Wed, 17 Oct 2018 08:02:16 -0700 Subject: [PATCH 080/236] ARM64: Remove dependency of THUNDERX Makefile on ARMV8 Makefile --- kernel/arm64/KERNEL.THUNDERX | 135 +++++++++++++++++++++++++++++++++-- 1 file changed, 131 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/KERNEL.THUNDERX b/kernel/arm64/KERNEL.THUNDERX index 11b7a2ca8..e19655e8c 100644 --- a/kernel/arm64/KERNEL.THUNDERX +++ b/kernel/arm64/KERNEL.THUNDERX @@ -1,6 +1,133 @@ -include $(KERNELDIR)/KERNEL.ARMV8 +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = 
iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx.c +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SDOTKERNEL = dot_thunderx.c +DDOTKERNEL = ddot_thunderx.c +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +DSDOTKERNEL = dot.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +STRMMKERNEL = ../generic/trmmkernel_4x4.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = sgemm_kernel_4x4.S +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -SDOTKERNEL=dot_thunderx.c -DDOTKERNEL=ddot_thunderx.c -DAXPYKERNEL=daxpy_thunderx.c From caf339412f9e828ffd3e43ec4b58ecd992eeff7a Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Wed, 17 Oct 2018 08:02:40 -0700 Subject: [PATCH 081/236] ARM64: Remove dependency of THUNDERX2T99 Makefile on CORTEXA57 Makefile --- kernel/arm64/KERNEL.THUNDERX2T99 | 137 ++++++++++++++++++++++++++++++- 1 file changed, 135 insertions(+), 2 deletions(-) diff --git a/kernel/arm64/KERNEL.THUNDERX2T99 b/kernel/arm64/KERNEL.THUNDERX2T99 index b66cd0e8b..a73d4cee8 100644 --- 
a/kernel/arm64/KERNEL.THUNDERX2T99 +++ b/kernel/arm64/KERNEL.THUNDERX2T99 @@ -1,4 +1,137 @@ -include $(KERNELDIR)/KERNEL.CORTEXA57 +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = 
../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy.o +ZGEMMITCOPYOBJ = zgemm_itcopy.o +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o SASUMKERNEL = sasum_thunderx2t99.c DASUMKERNEL = dasum_thunderx2t99.c @@ -27,12 +160,12 @@ CNRM2KERNEL = scnrm2_thunderx2t99.c DNRM2KERNEL = dznrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DAXPYKERNEL = daxpy_thunderx2t99.S DDOTKERNEL = dot_thunderx2t99.c SDOTKERNEL = dot_thunderx2t99.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S From 21f46a1cf2cefbdedf89878e3a6324578d0fe8ca Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Wed, 17 Oct 2018 08:11:27 -0700 Subject: [PATCH 082/236] ARM64: Use THUNDERX2T99 Neon Kernels for ARMV8 Currently the generic ARMV8 target uses C implementations for many routines. Replace these with the neon implementations written for THUNDERX2T99 target which are upto 6x faster for certain routines. --- driver/others/parameter.c | 4 +- interface/swap.c | 2 +- kernel/arm64/KERNEL.ARMV8 | 276 +++++++++++++++++++++++++------------- param.h | 47 ++++++- 4 files changed, 224 insertions(+), 105 deletions(-) diff --git a/driver/others/parameter.c b/driver/others/parameter.c index e7332c0c4..0f2364d9f 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -730,7 +730,7 @@ void blas_set_parameter(void){ #if defined(ARCH_ARM64) -#if defined(VULCAN) || defined(THUNDERX2T99) +#if defined(VULCAN) || defined(THUNDERX2T99) || defined(ARMV8) unsigned long dgemm_prefetch_size_a; unsigned long dgemm_prefetch_size_b; unsigned long dgemm_prefetch_size_c; @@ -738,7 +738,7 @@ unsigned long dgemm_prefetch_size_c; void blas_set_parameter(void) { -#if defined(VULCAN) || defined(THUNDERX2T99) +#if defined(VULCAN) || defined(THUNDERX2T99) || defined(ARMV8) dgemm_p = 160; dgemm_q = 128; dgemm_r = 4096; diff --git a/interface/swap.c b/interface/swap.c index f7642edf1..17a9868a9 100644 --- a/interface/swap.c +++ b/interface/swap.c @@ -42,7 +42,7 @@ #include "functable.h" #endif -#if defined(THUNDERX2T99) || defined(VULCAN) +#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) // Multithreaded swap gives performance benefits in ThunderX2T99 #else // Disable multi-threading as it does not show any performance diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index 4c6d6fb71..7e7a900fb 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -1,8 +1,3 @@ -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - SAMINKERNEL = ../arm/amin.c DAMINKERNEL = ../arm/amin.c CAMINKERNEL = ../arm/zamin.c @@ -14,11 +9,6 @@ DMAXKERNEL = ../arm/max.c SMINKERNEL = ../arm/min.c DMINKERNEL = ../arm/min.c -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - ISAMINKERNEL = ../arm/iamin.c IDAMINKERNEL = ../arm/iamin.c ICAMINKERNEL = ../arm/izamin.c @@ -30,92 +20,6 @@ IDMAXKERNEL = ../arm/imax.c ISMINKERNEL = ../arm/imin.c IDMINKERNEL = 
../arm/imin.c -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SDOTKERNEL = dot.S -DDOTKERNEL = dot.S -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -DSDOTKERNEL = dot.S - -ifneq ($(OS_DARWIN)$(CROSS),11) -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S -endif - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - -STRMMKERNEL = ../generic/trmmkernel_4x4.c -DTRMMKERNEL = ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -ifneq ($(OS_DARWIN)$(CROSS),11) -SGEMMKERNEL = sgemm_kernel_4x4.S -else -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -endif -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -DGEMMKERNEL = ../generic/gemmkernel_2x2.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o - -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c @@ -136,6 +40,186 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +ifneq ($(OS_DARWIN)$(CROSS),11) +SNRM2KERNEL = scnrm2_thunderx2t99.c 
+CNRM2KERNEL = scnrm2_thunderx2t99.c +#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c +#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c +endif + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +ifneq ($(OS_DARWIN)$(CROSS),11) + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy.o +ZGEMMITCOPYOBJ = zgemm_itcopy.o +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) +DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S +endif + +ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4) +SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S +endif + +ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4) +CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S +endif + +ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4) +ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S +endif + +else + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = 
../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +endif diff --git a/param.h b/param.h index ded9fe0b8..c7952e113 100644 --- a/param.h +++ b/param.h @@ -2583,6 +2583,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(ARMV8) + +#if defined(OS_DARWIN) && defined(CROSS) #define SNUMOPT 2 #define DNUMOPT 2 @@ -2590,13 +2592,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#if defined(OS_DARWIN) && defined(CROSS) #define SGEMM_DEFAULT_UNROLL_M 2 -#define SGEMM_DEFAULT_UNROLL N 2 -#else -#define SGEMM_DEFAULT_UNROLL_M 4 -#define SGEMM_DEFAULT_UNROLL_N 4 -#endif +#define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_N 2 @@ -2622,10 +2619,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#define SYMV_P 16 +#else + +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P sgemm_p +#define DGEMM_DEFAULT_P dgemm_p +#define CGEMM_DEFAULT_P cgemm_p +#define ZGEMM_DEFAULT_P zgemm_p + +#define SGEMM_DEFAULT_Q sgemm_q +#define DGEMM_DEFAULT_Q dgemm_q +#define CGEMM_DEFAULT_Q cgemm_q +#define ZGEMM_DEFAULT_Q zgemm_q + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r #define SYMV_P 16 #endif +#endif + #if defined(THUNDERX) #define SNUMOPT 2 #define DNUMOPT 2 From 898a8dcaba6d86358ae73575926f8689d6ede155 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sat, 20 Oct 2018 10:55:04 +0300 Subject: [PATCH 083/236] init From c7bbf9c987a0473aafbd8a4f48ed07cd52fccc38 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sat, 20 Oct 2018 11:13:29 +0300 Subject: [PATCH 084/236] Attempt to tame _hemv threading #1820 --- interface/zhemv.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/interface/zhemv.c b/interface/zhemv.c index d1996ad69..8995ca1c2 100644 --- a/interface/zhemv.c +++ b/interface/zhemv.c @@ -195,7 +195,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP - nthreads = num_cpu_avail(2); + // see graph in issue #1820 for explanation and room for improvement + if (n<362) { + nthreads = 1 ; + } else { + nthreads = num_cpu_avail(2); + }; if 
(nthreads == 1) { #endif From a293bdcd5eaa610ed960264c4e1c48af662502e9 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sat, 20 Oct 2018 21:37:53 +0300 Subject: [PATCH 085/236] re-arrange new code for readability --- interface/zhemv.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/interface/zhemv.c b/interface/zhemv.c index 8995ca1c2..9c31f31d9 100644 --- a/interface/zhemv.c +++ b/interface/zhemv.c @@ -43,6 +43,10 @@ #include "functable.h" #endif +// this is smallest dimension N of square input a to permit threading +// see graph in issue #1820 for explanation +#define MULTI_THREAD_MINIMAL 362 + #ifdef XDOUBLE #define ERROR_NAME "XHEMV " #elif defined(DOUBLE) @@ -195,8 +199,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP - // see graph in issue #1820 for explanation and room for improvement - if (n<362) { + if (n Date: Thu, 18 Oct 2018 04:51:24 -0700 Subject: [PATCH 086/236] ARM64: Remove XGENE1 references Remove XGENE1 target as the implementation for the same is incomplete. Moreover whoever wishes to use on XGENE1 can use the generic ARMV8 target as there are no XGENE1 specific optimizations in OpenBLAS. --- kernel/arm64/KERNEL.XGENE1 | 135 ------------------------------------- 1 file changed, 135 deletions(-) delete mode 100644 kernel/arm64/KERNEL.XGENE1 diff --git a/kernel/arm64/KERNEL.XGENE1 b/kernel/arm64/KERNEL.XGENE1 deleted file mode 100644 index d05754628..000000000 --- a/kernel/arm64/KERNEL.XGENE1 +++ /dev/null @@ -1,135 +0,0 @@ -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SDOTKERNEL = dot.S -DDOTKERNEL = dot.S -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -DSDOTKERNEL = dot.S - -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - -STRMMKERNEL = ../generic/trmmkernel_4x4.c -DTRMMKERNEL = ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -SGEMMKERNEL = sgemm_kernel_4x4.S -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMONCOPYOBJ = 
sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -DGEMMKERNEL = ../generic/gemmkernel_2x2.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o - -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - - From d50abc8903089089357766d3ada7db090ff6e63d Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Thu, 18 Oct 2018 05:02:23 -0700 Subject: [PATCH 087/236] ARM64: Move parameters from parameter.c to param.h Remove the runtime setting of P, Q, R parameters for targets ARMV8, THUNDERX2T99. Instead set them as constants in param.h at compile time. --- driver/others/parameter.c | 27 ----------- kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S | 10 ++-- param.h | 48 ++++++++++---------- 3 files changed, 27 insertions(+), 58 deletions(-) diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 0f2364d9f..8bf7da78b 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -730,35 +730,8 @@ void blas_set_parameter(void){ #if defined(ARCH_ARM64) -#if defined(VULCAN) || defined(THUNDERX2T99) || defined(ARMV8) -unsigned long dgemm_prefetch_size_a; -unsigned long dgemm_prefetch_size_b; -unsigned long dgemm_prefetch_size_c; -#endif - void blas_set_parameter(void) { -#if defined(VULCAN) || defined(THUNDERX2T99) || defined(ARMV8) - dgemm_p = 160; - dgemm_q = 128; - dgemm_r = 4096; - - sgemm_p = 128; - sgemm_q = 352; - sgemm_r = 4096; - - cgemm_p = 128; - cgemm_q = 224; - cgemm_r = 4096; - - zgemm_p = 128; - zgemm_q = 112; - zgemm_r = 4096; - - dgemm_prefetch_size_a = 3584; - dgemm_prefetch_size_b = 512; - dgemm_prefetch_size_c = 128; -#endif } #endif diff --git a/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S b/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S index 598db6e0c..d1551ffea 100644 --- a/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S +++ b/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S @@ -943,13 +943,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPA] - - ldr A_PRE_SIZE, =dgemm_prefetch_size_a - ldr A_PRE_SIZE, [A_PRE_SIZE] - ldr B_PRE_SIZE, =dgemm_prefetch_size_b - ldr B_PRE_SIZE, [B_PRE_SIZE] - ldr C_PRE_SIZE, =dgemm_prefetch_size_c - ldr C_PRE_SIZE, [C_PRE_SIZE] + mov A_PRE_SIZE, #3584 + mov B_PRE_SIZE, #512 + mov C_PRE_SIZE, #128 add A_PRE_SIZE_64, A_PRE_SIZE, #64 add B_PRE_SIZE_64, B_PRE_SIZE, #64 diff --git a/param.h b/param.h index c7952e113..e4ec1b2b5 100644 --- a/param.h +++ b/param.h @@ -2641,20 +2641,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_P sgemm_p -#define DGEMM_DEFAULT_P dgemm_p -#define CGEMM_DEFAULT_P cgemm_p -#define ZGEMM_DEFAULT_P zgemm_p +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_Q sgemm_q -#define DGEMM_DEFAULT_Q dgemm_q -#define CGEMM_DEFAULT_Q cgemm_q -#define ZGEMM_DEFAULT_Q zgemm_q +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 -#define SGEMM_DEFAULT_R sgemm_r -#define DGEMM_DEFAULT_R dgemm_r -#define CGEMM_DEFAULT_R cgemm_r -#define ZGEMM_DEFAULT_R zgemm_r +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 #define SYMV_P 16 #endif @@ -2720,20 +2720,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_P sgemm_p -#define DGEMM_DEFAULT_P dgemm_p -#define CGEMM_DEFAULT_P cgemm_p -#define ZGEMM_DEFAULT_P zgemm_p +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_Q sgemm_q -#define DGEMM_DEFAULT_Q dgemm_q -#define CGEMM_DEFAULT_Q cgemm_q -#define ZGEMM_DEFAULT_Q zgemm_q +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 -#define SGEMM_DEFAULT_R sgemm_r -#define DGEMM_DEFAULT_R dgemm_r -#define CGEMM_DEFAULT_R cgemm_r -#define ZGEMM_DEFAULT_R zgemm_r +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 #define SYMV_P 16 #endif From e7b66cd36e12845701aaae979c29120439294368 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Thu, 18 Oct 2018 05:13:02 -0700 Subject: [PATCH 088/236] ARM64: Fix DYNAMIC_ARCH compilation for cores which dont use GEMM3M --- kernel/Makefile | 4 ++ kernel/setparam-ref.c | 85 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/kernel/Makefile b/kernel/Makefile index a0a8fcd21..923ffc363 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -88,7 +88,11 @@ lsame.$(SUFFIX): $(KERNELDIR)/$(LSAME_KERNEL) $(CC) -c $(CFLAGS) -DF_INTERFACE $< -o $(@F) setparam$(TSUFFIX).$(SUFFIX): setparam$(TSUFFIX).c kernel$(TSUFFIX).h +ifeq ($(USE_GEMM3M), 1) + $(CC) -c $(CFLAGS) -DUSE_GEMM3M $< -o $@ +else $(CC) -c $(CFLAGS) $< -o $@ +endif setparam$(TSUFFIX).c : setparam-ref.c sed 's/TS/$(TSUFFIX)/g' $< > $(@F) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index f654de110..e035d5bda 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -294,6 +294,8 @@ gotoblas_t TABLE_NAME = { chemm_outcopyTS, chemm_oltcopyTS, 0, 0, 0, + +#if defined(USE_GEMM3M) #ifdef CGEMM3M_DEFAULT_UNROLL_M CGEMM3M_DEFAULT_UNROLL_M, 
CGEMM3M_DEFAULT_UNROLL_N, MAX(CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N), #else @@ -324,6 +326,33 @@ gotoblas_t TABLE_NAME = { chemm3m_oucopybTS, chemm3m_olcopybTS, chemm3m_oucopyrTS, chemm3m_olcopyrTS, chemm3m_oucopyiTS, chemm3m_olcopyiTS, +#else + 0, 0, 0, + + NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, +#endif #ifndef NO_LAPACK cneg_tcopyTS, claswp_ncopyTS, @@ -400,6 +429,7 @@ gotoblas_t TABLE_NAME = { zhemm_outcopyTS, zhemm_oltcopyTS, 0, 0, 0, +#if defined(USE_GEMM3M) #ifdef ZGEMM3M_DEFAULT_UNROLL_M ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N, MAX(ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N), #else @@ -430,6 +460,33 @@ gotoblas_t TABLE_NAME = { zhemm3m_oucopybTS, zhemm3m_olcopybTS, zhemm3m_oucopyrTS, zhemm3m_olcopyrTS, zhemm3m_oucopyiTS, zhemm3m_olcopyiTS, +#else + 0, 0, 0, + + NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, +#endif #ifndef NO_LAPACK zneg_tcopyTS, zlaswp_ncopyTS, @@ -503,6 +560,7 @@ gotoblas_t TABLE_NAME = { xhemm_outcopyTS, xhemm_oltcopyTS, 0, 0, 0, +#if defined(USE_GEMM3M) QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N, MAX(QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N), xgemm3m_kernelTS, @@ -528,6 +586,33 @@ gotoblas_t TABLE_NAME = { xhemm3m_oucopybTS, xhemm3m_olcopybTS, xhemm3m_oucopyrTS, xhemm3m_olcopyrTS, xhemm3m_oucopyiTS, xhemm3m_olcopyiTS, +#else + 0, 0, 0, + + NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, +#endif #ifndef NO_LAPACK xneg_tcopyTS, xlaswp_ncopyTS, From af2837c392344c54e03e517902ae4fa4983570c0 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Mon, 22 Oct 2018 01:49:16 -0700 Subject: [PATCH 089/236] ARM64: Remove #define ARMV8 for THUNDERX --- cpuid_arm64.c | 1 - 1 file changed, 1 deletion(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index a42346c88..17078fe7f 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -237,7 +237,6 @@ void get_cpuconfig(void) break; case CPU_THUNDERX: - printf("#define ARMV8\n"); printf("#define THUNDERX\n"); printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_LINESIZE 128\n"); From d5aeff636f2d8ba99d1e5ed511c3770970f440af Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Thu, 18 Oct 2018 05:15:45 -0700 Subject: [PATCH 090/236] ARM64: Enable DYNAMIC_ARCH Enable DYNAMIC_ARCH feature on ARM64. This patch uses the cpuid feature in linux kernel to detect the core type at runtime (https://www.kernel.org/doc/Documentation/arm64/cpu-feature-registers.txt). If this feature is missing in kernel, then the user should use the OPENBLAS_CORETYPE env variable to select the desired core type. 
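The detection relies on the kernel trapping and emulating the MRS instruction for ID registers, a capability it advertises through the HWCAP_CPUID auxv bit. The following is a minimal standalone sketch of that read path, for illustration only and not part of the patch (the program and its output format are invented here), assuming an AArch64 Linux host:

#include <stdio.h>
#include <stdint.h>
#include <sys/auxv.h>

#ifndef HWCAP_CPUID
#define HWCAP_CPUID (1 << 11)   /* same fallback definition the patch adds */
#endif

int main(void)
{
  uint64_t midr;
  unsigned implementer, part;

  /* The kernel advertises MRS emulation for ID registers via this auxv bit. */
  if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
    fprintf(stderr, "no cpuid feature; set OPENBLAS_CORETYPE manually\n");
    return 1;
  }

  /* Reading MIDR_EL1 from EL0 traps into the kernel, which emulates the access. */
  __asm__("mrs %0, MIDR_EL1" : "=r" (midr));

  /* MIDR_EL1 layout: bits [31:24] implementer, [15:4] part number. */
  implementer = (midr >> 24) & 0xFF;
  part        = (midr >>  4) & 0xFFF;

  printf("implementer 0x%02x part 0x%03x\n", implementer, part);
  return 0;
}

When HWCAP_CPUID is absent (older kernels), the register cannot be read from user space at all, which is why the commit message points users at the OPENBLAS_CORETYPE environment variable instead.
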
--- Makefile.system | 7 ++ driver/others/Makefile | 8 ++ driver/others/dynamic_arm64.c | 198 +++++++++++++++++++++++++++++++ kernel/arm64/KERNEL.ARMV8 | 48 ++++---- kernel/arm64/KERNEL.CORTEXA57 | 32 ++--- kernel/arm64/KERNEL.THUNDERX | 16 +-- kernel/arm64/KERNEL.THUNDERX2T99 | 32 ++--- kernel/setparam-ref.c | 73 ++++++++++++ 8 files changed, 350 insertions(+), 64 deletions(-) create mode 100644 driver/others/dynamic_arm64.c diff --git a/Makefile.system b/Makefile.system index b4cd4222a..7847c7525 100644 --- a/Makefile.system +++ b/Makefile.system @@ -510,6 +510,13 @@ CCOMMON_OPT += $(XCCOMMON_OPT) #CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)' endif +ifeq ($(ARCH), arm64) +DYNAMIC_CORE = ARMV8 +DYNAMIC_CORE += CORTEXA57 +DYNAMIC_CORE += THUNDERX +DYNAMIC_CORE += THUNDERX2T99 +endif + # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty ifndef DYNAMIC_CORE override DYNAMIC_ARCH= diff --git a/driver/others/Makefile b/driver/others/Makefile index e61ba7bc8..3dc2e7c1b 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -15,7 +15,11 @@ endif # COMMONOBJS += info.$(SUFFIX) ifeq ($(DYNAMIC_ARCH), 1) +ifeq ($(ARCH),arm64) +COMMONOBJS += dynamic_arm64.$(SUFFIX) +else COMMONOBJS += dynamic.$(SUFFIX) +endif else COMMONOBJS += parameter.$(SUFFIX) endif @@ -71,7 +75,11 @@ BLAS_SERVER = blas_server.c endif ifeq ($(DYNAMIC_ARCH), 1) +ifeq ($(ARCH),arm64) +HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX) +else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) +endif else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) endif diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c new file mode 100644 index 000000000..b4ce6b67d --- /dev/null +++ b/driver/others/dynamic_arm64.c @@ -0,0 +1,198 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include +#include + +extern gotoblas_t gotoblas_ARMV8; +extern gotoblas_t gotoblas_CORTEXA57; +extern gotoblas_t gotoblas_THUNDERX; +extern gotoblas_t gotoblas_THUNDERX2T99; + +extern void openblas_warning(int verbose, const char * msg); + +#define NUM_CORETYPES 4 + +/* + * In case asm/hwcap.h is outdated on the build system, make sure + * that HWCAP_CPUID is defined + */ +#ifndef HWCAP_CPUID +#define HWCAP_CPUID (1 << 11) +#endif + +#define get_cpu_ftr(id, var) ({ \ + asm("mrs %0, "#id : "=r" (var)); \ + }) + +static char *corename[] = { + "armv8", + "cortexa57", + "thunderx", + "thunderx2t99", + "unknown" +}; + +char *gotoblas_corename(void) { + if (gotoblas == &gotoblas_ARMV8) return corename[ 0]; + if (gotoblas == &gotoblas_CORTEXA57) return corename[ 1]; + if (gotoblas == &gotoblas_THUNDERX) return corename[ 2]; + if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 3]; + return corename[NUM_CORETYPES]; +} + +static gotoblas_t *force_coretype(char *coretype) { + int i ; + int found = -1; + char message[128]; + + for ( i=0 ; i < NUM_CORETYPES; i++) + { + if (!strncasecmp(coretype, corename[i], 20)) + { + found = i; + break; + } + } + + switch (found) + { + case 0: return (&gotoblas_ARMV8); + case 1: return (&gotoblas_CORTEXA57); + case 2: return (&gotoblas_THUNDERX); + case 3: return (&gotoblas_THUNDERX2T99); + } + snprintf(message, 128, "Core not found: %s\n", coretype); + openblas_warning(1, message); + return NULL; +} + +static gotoblas_t *get_coretype(void) { + int implementer, variant, part, arch, revision, midr_el1; + + if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) { + char coremsg[128]; + snprintf(coremsg, 128, "Kernel lacks cpuid feature support. 
Auto detection of core type failed !!!\n"); + openblas_warning(1, coremsg); + return NULL; + } + + get_cpu_ftr(MIDR_EL1, midr_el1); + /* + * MIDR_EL1 + * + * 31 24 23 20 19 16 15 4 3 0 + * ----------------------------------------------------------------- + * | Implementer | Variant | Architecture | Part Number | Revision | + * ----------------------------------------------------------------- + */ + implementer = (midr_el1 >> 24) & 0xFF; + part = (midr_el1 >> 4) & 0xFFF; + + switch(implementer) + { + case 0x41: // ARM + switch (part) + { + case 0xd07: // Cortex A57 + case 0xd08: // Cortex A72 + case 0xd03: // Cortex A53 + return &gotoblas_CORTEXA57; + } + break; + case 0x42: // Broadcom + switch (part) + { + case 0x516: // Vulcan + return &gotoblas_THUNDERX2T99; + } + break; + case 0x43: // Cavium + switch (part) + { + case 0x0a1: // ThunderX + return &gotoblas_THUNDERX; + case 0x0af: // ThunderX2 + return &gotoblas_THUNDERX2T99; + } + break; + } + return NULL; +} + +void gotoblas_dynamic_init(void) { + + char coremsg[128]; + char coren[22]; + char *p; + + if (gotoblas) return; + + p = getenv("OPENBLAS_CORETYPE"); + if ( p ) + { + gotoblas = force_coretype(p); + } + else + { + gotoblas = get_coretype(); + } + + if (gotoblas == NULL) + { + snprintf(coremsg, 128, "Falling back to generic ARMV8 core\n"); + openblas_warning(1, coremsg); + gotoblas = &gotoblas_ARMV8; + } + + if (gotoblas && gotoblas->init) { + strncpy(coren, gotoblas_corename(), 20); + sprintf(coremsg, "Core: %s\n", coren); + openblas_warning(2, coremsg); + gotoblas -> init(); + } else { + openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); + exit(1); + } + +} + +void gotoblas_dynamic_quit(void) { + gotoblas = NULL; +} diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index 7e7a900fb..bcecd0026 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -113,13 +113,13 @@ STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S @@ -134,8 +134,8 @@ DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c endif -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) endif ifeq ($(DGEMM_UNROLL_N), 4) @@ -146,34 +146,34 @@ DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c endif -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S ifneq 
($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) endif CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy.o -ZGEMMITCOPYOBJ = zgemm_itcopy.o +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S @@ -201,25 +201,25 @@ ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c SGEMMKERNEL = ../generic/gemmkernel_2x2.c SGEMMONCOPY = ../generic/gemm_ncopy_2.c SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = ../generic/gemmkernel_2x2.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = ../generic/zgemmkernel_2x2.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) endif diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57 index 2fd2c3d87..04d6940d7 100644 --- a/kernel/arm64/KERNEL.CORTEXA57 +++ b/kernel/arm64/KERNEL.CORTEXA57 @@ -111,13 +111,13 @@ STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = 
sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S @@ -132,8 +132,8 @@ DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c endif -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) endif ifeq ($(DGEMM_UNROLL_N), 4) @@ -144,32 +144,32 @@ DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c endif -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) endif CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy.o -ZGEMMITCOPYOBJ = zgemm_itcopy.o +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/KERNEL.THUNDERX b/kernel/arm64/KERNEL.THUNDERX index e19655e8c..cb02c7bc5 100644 --- a/kernel/arm64/KERNEL.THUNDERX +++ b/kernel/arm64/KERNEL.THUNDERX @@ -89,26 +89,26 @@ ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c SGEMMKERNEL = sgemm_kernel_4x4.S SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = ../generic/gemmkernel_2x2.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = ../generic/zgemmkernel_2x2.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = 
zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/arm64/KERNEL.THUNDERX2T99 b/kernel/arm64/KERNEL.THUNDERX2T99 index a73d4cee8..a20d0d4a6 100644 --- a/kernel/arm64/KERNEL.THUNDERX2T99 +++ b/kernel/arm64/KERNEL.THUNDERX2T99 @@ -74,13 +74,13 @@ STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S @@ -94,8 +94,8 @@ DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c endif -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) endif ifeq ($(DGEMM_UNROLL_N), 4) @@ -106,32 +106,32 @@ DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c endif -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) endif CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy.o -ZGEMMITCOPYOBJ = zgemm_itcopy.o +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) SASUMKERNEL = sasum_thunderx2t99.c DASUMKERNEL = dasum_thunderx2t99.c diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index e035d5bda..6d4028b0b 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -646,6 +646,78 @@ gotoblas_t TABLE_NAME = { }; +#if defined(ARCH_ARM64) +static void init_parameter(void) { + TABLE_NAME.sgemm_p = 
SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; + + TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; + TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; + TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; + TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; + + TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; + TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; + TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; + TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; + +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; + TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q; + TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; + TABLE_NAME.qgemm_r = QGEMM_DEFAULT_R; + TABLE_NAME.xgemm_r = XGEMM_DEFAULT_R; +#endif + +#if defined(USE_GEMM3M) +#ifdef CGEMM3M_DEFAULT_P + TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; +#else + TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p; +#endif + +#ifdef ZGEMM3M_DEFAULT_P + TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P; +#else + TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p; +#endif + +#ifdef CGEMM3M_DEFAULT_Q + TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q; +#else + TABLE_NAME.cgemm3m_q = TABLE_NAME.sgemm_q; +#endif + +#ifdef ZGEMM3M_DEFAULT_Q + TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q; +#else + TABLE_NAME.zgemm3m_q = TABLE_NAME.dgemm_q; +#endif + +#ifdef CGEMM3M_DEFAULT_R + TABLE_NAME.cgemm3m_r = CGEMM3M_DEFAULT_R; +#else + TABLE_NAME.cgemm3m_r = TABLE_NAME.sgemm_r; +#endif + +#ifdef ZGEMM3M_DEFAULT_R + TABLE_NAME.zgemm3m_r = ZGEMM3M_DEFAULT_R; +#else + TABLE_NAME.zgemm3m_r = TABLE_NAME.dgemm_r; +#endif + +#ifdef EXPRECISION + TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p; + TABLE_NAME.xgemm3m_q = TABLE_NAME.qgemm_q; + TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r; +#endif +#endif + +} +#else // defined(ARCH_ARM64) #ifdef ARCH_X86 static int get_l2_size_old(void){ int i, eax, ebx, ecx, edx, cpuid_level; @@ -1231,3 +1303,4 @@ static void init_parameter(void) { } +#endif //defined(ARCH_ARM64) From 2992e3886aa6304ac2715890f4fbd8548e891c53 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Mon, 22 Oct 2018 23:21:49 +0300 Subject: [PATCH 091/236] disable threading in C/ZSWAP copying from S/DSWAP --- interface/zswap.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/interface/zswap.c b/interface/zswap.c index e33bbafba..372b15447 100644 --- a/interface/zswap.c +++ b/interface/zswap.c @@ -42,6 +42,14 @@ #include "functable.h" #endif +#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) +// Multithreaded swap gives performance benefits in ThunderX2T99 +#else +// Disable multi-threading as it does not show any performance +// benefits. Keep the multi-threading code for the record. +#undef SMP +#endif + #ifndef CBLAS void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ @@ -81,7 +89,7 @@ FLOAT *y = (FLOAT*)vy; #ifdef SMP //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. - if (incx == 0 || incy == 0) + if (incx == 0 || incy == 0 || n < 1048576 * GEMM_MULTITHREAD_THRESHOLD / sizeof(FLOAT)) nthreads = 1; else nthreads = num_cpu_avail(1); From 2d8064174c444bb377cc2e3879a9c8e76e45b314 Mon Sep 17 00:00:00 2001 From: fengrl <42458138+fengrl@users.noreply.github.com> Date: Fri, 26 Oct 2018 17:55:15 +0800 Subject: [PATCH 092/236] register push/pop command change 64bit push/pop register command should be used. Otherwise, data will lost. 
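Editor's note (illustration only, not part of the patch below): the fix spills the callee-saved FPRs $f24-$f28 with the 64-bit sdc1/ldc1 instructions; in this single-precision kernel the ST/LD macros presumably expand to the 32-bit swc1/lwc1, so only the low half of each 64-bit register survives the call. A minimal, hypothetical C sketch of that failure mode (the variables merely stand in for a register and its stack save slots):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main(void) {
    double reg = 3.141592653589793;  /* stands in for callee-saved $f24 */
    uint32_t save32;                 /* 32-bit stack slot, swc1-style save */
    double   save64;                 /* 64-bit stack slot, sdc1-style save */

    memcpy(&save32, &reg, sizeof save32);   /* saves only 32 of the 64 bits */
    memcpy(&save64, &reg, sizeof save64);   /* saves the whole register     */

    double restored32 = 0.0, restored64;    /* register clobbered by the callee */
    memcpy(&restored32, &save32, sizeof save32); /* upper half never comes back */
    memcpy(&restored64, &save64, sizeof save64); /* full value restored         */

    printf("32-bit save/restore: %.17g\n", restored32);
    printf("64-bit save/restore: %.17g\n", restored64);
    return 0;
}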
--- kernel/mips64/sgemm_kernel_8x4_ps.S | 36 ++++++++++++++--------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S index 37b20a880..82703ff5d 100644 --- a/kernel/mips64/sgemm_kernel_8x4_ps.S +++ b/kernel/mips64/sgemm_kernel_8x4_ps.S @@ -146,11 +146,11 @@ sd $21, 40($sp) sd $22, 48($sp) - ST $f24, 56($sp) - ST $f25, 64($sp) - ST $f26, 72($sp) - ST $f27, 80($sp) - ST $f28, 88($sp) + sdc1 $f24, 56($sp) + sdc1 $f25, 64($sp) + sdc1 $f26, 72($sp) + sdc1 $f27, 80($sp) + sdc1 $f28, 88($sp) #if defined(TRMMKERNEL) sd $23, 96($sp) @@ -161,10 +161,10 @@ #endif #ifndef __64BIT__ - ST $f20,120($sp) - ST $f21,128($sp) - ST $f22,136($sp) - ST $f23,144($sp) + sdc1 $f20,120($sp) + sdc1 $f21,128($sp) + sdc1 $f22,136($sp) + sdc1 $f23,144($sp) #endif .align 4 @@ -7766,11 +7766,11 @@ ld $21, 40($sp) ld $22, 48($sp) - LD $f24, 56($sp) - LD $f25, 64($sp) - LD $f26, 72($sp) - LD $f27, 80($sp) - LD $f28, 88($sp) + ldc1 $f24, 56($sp) + ldc1 $f25, 64($sp) + ldc1 $f26, 72($sp) + ldc1 $f27, 80($sp) + ldc1 $f28, 88($sp) #if defined(TRMMKERNEL) ld $23, 96($sp) @@ -7779,10 +7779,10 @@ #endif #ifndef __64BIT__ - LD $f20,120($sp) - LD $f21,128($sp) - LD $f22,136($sp) - LD $f23,144($sp) + ldc1 $f20,120($sp) + ldc1 $f21,128($sp) + ldc1 $f22,136($sp) + ldc1 $f23,144($sp) #endif daddiu $sp,$sp,STACKSIZE From 64ca44873bd9d960c63456a43fd565c56514e895 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Oct 2018 18:36:55 +0100 Subject: [PATCH 093/236] Fix detection of Ryzen2 (missing CORE_ZEN) --- cpuid_x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 512ad877b..8e4a7cb84 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -2009,6 +2009,8 @@ int get_coretype(void){ switch (model) { case 1: // AMD Ryzen + case 8: + // Ryzen 2 if(support_avx()) #ifndef NO_AVX2 return CORE_ZEN; From 38cf5d93647bf5ffb5fe3e17447eba0c157bb305 Mon Sep 17 00:00:00 2001 From: "Erik M. 
Bray" Date: Sun, 28 Oct 2018 21:16:52 +0000 Subject: [PATCH 094/236] ensure that threading has been initialized in the first place before calling openblas_set_num_threads --- driver/others/blas_server.c | 5 +++++ driver/others/blas_server_win32.c | 7 ++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 6a25e2d07..e5db1804f 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -850,6 +850,11 @@ void goto_set_num_threads(int num_threads) { long i; +#ifdef SMP_SERVER + // Handle lazy re-init of the thread-pool after a POSIX fork + if (unlikely(blas_server_avail == 0)) blas_thread_init(); +#endif + if (num_threads < 1) num_threads = blas_num_threads; #ifndef NO_AFFINITY diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 02a25ac39..bae344c59 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -478,7 +478,12 @@ int BLASFUNC(blas_thread_shutdown)(void){ void goto_set_num_threads(int num_threads) { - long i; + long i; + +#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) + // Handle lazy re-init of the thread-pool after a POSIX fork + if (unlikely(blas_server_avail == 0)) blas_thread_init(); +#endif if (num_threads < 1) num_threads = blas_cpu_number; From 326d394a0fbcc8226bb958f523ca1005696c33b6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Oct 2018 18:38:22 +0100 Subject: [PATCH 095/236] Add get_num_procs implementation for AIX (and copy HAIKU implementation to the non-TLS version of the code as well) --- driver/others/memory.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/driver/others/memory.c b/driver/others/memory.c index 4a8e6c067..25f198623 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -259,6 +259,16 @@ int get_num_procs(void) { } #endif +#ifdef OS_AIX +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif + + + #ifdef OS_WINDOWS int get_num_procs(void) { @@ -1738,6 +1748,22 @@ int get_num_procs(void) { return nums; } #endif + +#ifdef OS_HAIKU +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif + +#ifdef OS_AIX +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif #ifdef OS_WINDOWS From 7b5aea52bb105c15d7e80e0749b80f6bfb0566b6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Oct 2018 21:50:34 +0100 Subject: [PATCH 096/236] Accomodate AIX install, which has different syntax for #1803 --- Makefile.install | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/Makefile.install b/Makefile.install index fa657beba..7aa477cf0 100644 --- a/Makefile.install +++ b/Makefile.install @@ -48,6 +48,7 @@ ifndef NO_CBLAS @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" endif +ifneq (($OSNAME), AIX) ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" @@ -72,6 +73,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif + ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) @cp $(LIBSONAME) 
"$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @@ -93,6 +95,33 @@ ifeq ($(OSNAME), CYGWIN_NT) endif endif +else +#install on AIX has different options syntax +ifndef NO_LAPACKE + @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) + @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" + @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" + @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" + @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" +endif + +#for install static library +ifndef NO_STATIC + @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) + @install -M 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ + ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) +endif +#for install shared library +ifndef NO_SHARED + @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) + @install -M 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ + ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ + ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) +endif + +endif #Generating openblas.pc @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" From dcc5d6291e7b02761acfb6161c04ba1f8f25b502 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 1 Nov 2018 01:42:09 +0000 Subject: [PATCH 097/236] skylakex: Make the sgemm/dgemm beta code robust for a N=0 or M=0 case in the threading code there are cases where N or M can become 0, and the optimized beta code did not handle this well, leading to a crash during the audit for the crash a few edge conditions on the if statements were found and fixed as well --- kernel/x86_64/dgemm_beta_skylakex.c | 6 ++++-- kernel/x86_64/sgemm_beta_skylakex.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/dgemm_beta_skylakex.c b/kernel/x86_64/dgemm_beta_skylakex.c index 384e9f60b..6a824c9b5 100644 --- a/kernel/x86_64/dgemm_beta_skylakex.c +++ b/kernel/x86_64/dgemm_beta_skylakex.c @@ -55,6 +55,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, return 0; } + if (m == 0 || n == 0) + return 0; c_offset = c; @@ -69,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, i = m; - while (i > 32) { + while (i >= 32) { _mm512_storeu_pd(c_offset1, z_zero); _mm512_storeu_pd(c_offset1 + 8, z_zero); _mm512_storeu_pd(c_offset1 + 16, z_zero); @@ -77,7 +79,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset1 += 32; i -= 32; } - while (i > 8) { + while (i >= 8) { _mm512_storeu_pd(c_offset1, z_zero); c_offset1 += 8; i -= 8; diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index 54f9664e9..4e40acadf 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -55,6 +55,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, return 0; } + if (n == 0 || m == 0) + return; c_offset = c; @@ -71,13 +73,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, i = m; - while (i > 32) { + while (i >= 32) { _mm512_storeu_ps(c_offset1, z_zero); _mm512_storeu_ps(c_offset1 + 16, z_zero); c_offset1 += 32; i -= 32; } - while (i > 8) { + while (i >= 8) 
{ _mm256_storeu_ps(c_offset1, y_zero); c_offset1 += 8; i -= 8; From 5b708e5eb1b17af9c45e0da2993da8a4756cb912 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 1 Nov 2018 01:43:20 +0000 Subject: [PATCH 098/236] sgemm/dgemm: add a way for an arch kernel to specify prefered sizes The current gemm threading code can make very unfortunate choices, for example on my 10 core system a 1024x1024x1024 matrix multiply ends up chunking into blocks of 102... which is not a vector friendly size and performance ends up horrible. this patch adds a helper define where an architecture can specify a preference for size multiples. This is different from existing defines that are minimum sizes and such. The performance increase with this patch for the 1024x1024x1024 sgemm is 2.3x (!!) --- driver/level3/level3_thread.c | 22 ++++++++++++++++++++++ param.h | 1 + 2 files changed, 23 insertions(+) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index aeb5e6ed4..de29247d4 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -48,6 +48,10 @@ #define SWITCH_RATIO 2 #endif +#ifndef GEMM_PREFERED_SIZE +#define GEMM_PREFERED_SIZE 1 +#endif + //The array of job_t may overflow the stack. //Instead, use malloc to alloc job_t. #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD @@ -510,6 +514,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, return 0; } +static int round_up(int remainder, int width, int multiple) +{ + if (multiple > remainder || width <= multiple) + return width; + width = (width + multiple - 1) / multiple; + width = width * multiple; + return width; +} + + static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG nthreads_m, BLASLONG nthreads_n) { @@ -601,9 +615,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG num_parts = 0; while (m > 0){ width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts); + + width = round_up(m, width, GEMM_PREFERED_SIZE); + m -= width; + if (m < 0) width = width + m; range_M[num_parts + 1] = range_M[num_parts] + width; + num_parts ++; } for (i = num_parts; i < MAX_CPU_NUMBER; i++) { @@ -645,9 +664,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG if (width < SWITCH_RATIO) { width = SWITCH_RATIO; } + width = round_up(n, width, GEMM_PREFERED_SIZE); + n -= width; if (n < 0) width = width + n; range_N[num_parts + 1] = range_N[num_parts] + width; + num_parts ++; } for (j = num_parts; j < MAX_CPU_NUMBER; j++) { diff --git a/param.h b/param.h index e4ec1b2b5..d1b211584 100644 --- a/param.h +++ b/param.h @@ -1627,6 +1627,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 8 #define SWITCH_RATIO 32 +#define GEMM_PREFERED_SIZE 32 #ifdef ARCH_X86 From b0255231979ac40444fea06bc8958731fdcdef7a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Nov 2018 18:26:08 +0100 Subject: [PATCH 099/236] Use installbsd on AIX (and fix misplaced parenthesis from previous commit). 
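Editor's note on the GEMM_PREFERED_SIZE change in patch 098 above (illustration only; blas_quickdivide is modeled here as plain integer division, which is an assumption of this sketch): a standalone program showing how the new round_up() helper changes the per-thread partition described in that commit message.

#include <stdio.h>

/* same logic as the round_up() helper added in level3_thread.c above */
static int round_up(int remainder, int width, int multiple)
{
    if (multiple > remainder || width <= multiple)
        return width;
    return ((width + multiple - 1) / multiple) * multiple;
}

/* mimics the gemm_driver() loop that splits M (or N) across threads */
static void partition(int m, int threads, int multiple)
{
    int part = 0;
    while (m > 0) {
        int width = (m + threads - part - 1) / (threads - part); /* ceil over remaining threads */
        if (multiple > 1)
            width = round_up(m, width, multiple);
        m -= width;
        if (m < 0)
            width += m;                 /* clamp the final chunk */
        printf("%d ", width);
        part++;
    }
    printf("\n");
}

int main(void)
{
    partition(1024, 10, 1);   /* old behaviour: 103 103 103 103 102 ...  - not vector friendly  */
    partition(1024, 10, 32);  /* GEMM_PREFERED_SIZE=32: 128 128 96 96 ... - all multiples of 32 */
    return 0;
}

With the rounding in place every block except possibly the tail is a multiple of the kernel's unroll width, which, per the commit message, is where the reported 2.3x sgemm speedup comes from.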
See #1803 --- Makefile.install | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Makefile.install b/Makefile.install index 7aa477cf0..069c96c6a 100644 --- a/Makefile.install +++ b/Makefile.install @@ -48,7 +48,7 @@ ifndef NO_CBLAS @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" endif -ifneq (($OSNAME), AIX) +ifneq ($(OSNAME), AIX) ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" @@ -99,23 +99,23 @@ else #install on AIX has different options syntax ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" - @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" - @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" - @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" + @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" + @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" + @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" + @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" endif #for install static library ifndef NO_STATIC @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @install -M 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif #for install shared library ifndef NO_SHARED @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @install -M 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) From 9c177d270b7ae78c4542a15ec02d8cab9cc7f367 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Nov 2018 18:50:25 +0100 Subject: [PATCH 100/236] Restore Android/ARMv7 build fix from #778 for #1811 --- lapack-netlib/LAPACKE/include/lapacke_config.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lapack-netlib/LAPACKE/include/lapacke_config.h b/lapack-netlib/LAPACKE/include/lapacke_config.h index 1e2509bf0..8262c3488 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_config.h +++ b/lapack-netlib/LAPACKE/include/lapacke_config.h @@ -34,6 +34,13 @@ #ifndef _LAPACKE_CONFIG_H_ #define _LAPACKE_CONFIG_H_ +// For Android prior to API 21 (no include) +#if defined(__ANDROID__) +#if __ANDROID_API__ < 21 +#define LAPACK_COMPLEX_STRUCTURE +#endif +#endif + #ifdef __cplusplus #if defined(LAPACK_COMPLEX_CPP) #include From fb5b2177ca794f81f85530f223dd630e147092ca Mon Sep 17 00:00:00 2001 From: Renato Golin Date: Mon, 5 Nov 2018 11:30:12 +0000 Subject: [PATCH 101/236] [Arm64) Revert A53 
detection as A57 This patch reverts the decision of treating A53 like A57, which was based on an analysis done on server class hardware and is not representative of all A53s out there. Fixes #1855. --- cpuid_arm64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 17078fe7f..3acb395b5 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -115,8 +115,8 @@ int detect(void) fclose(infile); if(cpu_part != NULL && cpu_implementer != NULL) { if (strstr(cpu_implementer, "0x41") && - (strstr(cpu_part, "0xd07") || strstr(cpu_part,"0xd08") || strstr(cpu_part,"0xd03") )) - return CPU_CORTEXA57; //or compatible A53, A72 + (strstr(cpu_part, "0xd07") || strstr(cpu_part,"0xd08"))) + return CPU_CORTEXA57; //or compatible, ex. A72 else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42")) return CPU_VULCAN; else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43")) From 066f8065d19c5d0e9525e9ccf99e6dac9712dffa Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Tue, 6 Nov 2018 08:19:08 +0000 Subject: [PATCH 102/236] init From 7d3502b5003ad54903b7a9e9aec5a853dfbe0221 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Tue, 6 Nov 2018 08:20:55 +0000 Subject: [PATCH 103/236] Add -frecursive gfortran option by default --- Makefile.rule | 4 ++-- Makefile.system | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 6522b0777..d97607f2e 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -192,8 +192,8 @@ NO_AFFINITY = 1 # Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT # COMMON_OPT = -O2 -# gfortran option for LAPACK -# enable this flag only on 64bit Linux and if you need a thread safe lapack library +# gfortran option for LAPACK to improve thread-safety +# It is enabled by default in Makefile.system for gfortran # Flags for POWER8 are defined in Makefile.power. 
Don't modify FCOMMON_OPT # FCOMMON_OPT = -frecursive diff --git a/Makefile.system b/Makefile.system index b4cd4222a..8de0b8f6e 100644 --- a/Makefile.system +++ b/Makefile.system @@ -718,6 +718,8 @@ endif ifeq ($(F_COMPILER), GFORTRAN) CCOMMON_OPT += -DF_INTERFACE_GFORT FCOMMON_OPT += -Wall +# make single-threaded LAPACK calls thread-safe #1847 +FCOMMON_OPT += -frecursive #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran From a931afe269efc21a6710376254fb14d7bed085d8 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Tue, 6 Nov 2018 09:39:05 +0000 Subject: [PATCH 104/236] init From 3fd41313fc2c36ea55a5e3aaf02cf2734f2d18c5 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Tue, 6 Nov 2018 09:40:13 +0000 Subject: [PATCH 105/236] add low bound for number of buffers --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index 6c3d5b15e..60da2416a 100644 --- a/common.h +++ b/common.h @@ -183,7 +183,7 @@ extern "C" { #define ALLOCA_ALIGN 63UL -#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) +#define NUM_BUFFERS MAX(64,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)) #ifdef NEEDBUNDERSCORE #define BLASFUNC(FUNC) FUNC##_ From 40cce0e353ca21ed1d045b4fc58faddd2ff6c2a7 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Tue, 6 Nov 2018 09:45:49 +0000 Subject: [PATCH 106/236] handle cmake too --- cmake/fc.cmake | 2 +- common.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 38d59f956..adec28a91 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -44,7 +44,7 @@ endif () if (${F_COMPILER} STREQUAL "GFORTRAN") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") - set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") + set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc if (NOT NO_LAPACK) set(EXTRALIB "{EXTRALIB} -lgfortran") diff --git a/common.h b/common.h index 6c3d5b15e..60da2416a 100644 --- a/common.h +++ b/common.h @@ -183,7 +183,7 @@ extern "C" { #define ALLOCA_ALIGN 63UL -#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) +#define NUM_BUFFERS MAX(64,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)) #ifdef NEEDBUNDERSCORE #define BLASFUNC(FUNC) FUNC##_ From 9531d0e1757dc0edd64c5c439d65fb236195410a Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Tue, 6 Nov 2018 17:51:24 +0000 Subject: [PATCH 107/236] lets fit it in one 4k page --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index 60da2416a..7fcd5e316 100644 --- a/common.h +++ b/common.h @@ -183,7 +183,7 @@ extern "C" { #define ALLOCA_ALIGN 63UL -#define NUM_BUFFERS MAX(64,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)) +#define NUM_BUFFERS MAX(50,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)) #ifdef NEEDBUNDERSCORE #define BLASFUNC(FUNC) FUNC##_ From cfb0f5b0f82e67cf3cc854c8319ddb79ecd1366c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Nov 2018 22:39:10 +0100 Subject: [PATCH 108/236] Set LIBSONAME suffix to .a for AIX another fix for #1803 --- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index 7847c7525..716bd18e2 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1211,7 +1211,11 @@ endif LIBDLLNAME = $(LIBPREFIX).dll IMPLIBNAME = lib$(LIBNAMEBASE).dll.a +ifneq ($(OSNAME), AIX) 
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) +else +LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a) +endif LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) From 0427277ceff6e477e06d98abe03e0b2348d6d26a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Nov 2018 15:45:54 +0100 Subject: [PATCH 109/236] Allow optimization for small m, large n only if it can be made threadsafe otherwise the introduction of a static array in 8e5a108 to improve #532 breaks concurrent calls from multiple threads as seen in #1844 --- driver/level2/gemv_thread.c | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/driver/level2/gemv_thread.c b/driver/level2/gemv_thread.c index 061454848..fc4e4f7fe 100644 --- a/driver/level2/gemv_thread.c +++ b/driver/level2/gemv_thread.c @@ -62,9 +62,36 @@ #endif #endif -#ifndef TRANSA +#ifndef thread_local +# if __STDC_VERSION__ >= 201112 && !defined __STDC_NO_THREADS__ +# define thread_local _Thread_local +# elif defined _WIN32 && ( \ + defined _MSC_VER || \ + defined __ICL || \ + defined __DMC__ || \ + defined __BORLANDC__ ) +# define thread_local __declspec(thread) +/* note that ICC (linux) and Clang are covered by __GNUC__ */ +# elif defined __GNUC__ || \ + defined __SUNPRO_C || \ + defined __xlC__ +# define thread_local __thread +# else +# define UNSAFE +#endif +#endif +#if defined USE_OPENMP +#undef UNSAFE +#endif + +#if !defined(TRANSA) && !defined(UNSAFE) #define Y_DUMMY_NUM 1024 +#if defined(USE_OPENMP) static FLOAT y_dummy[Y_DUMMY_NUM]; +#pragma omp threadprivate(y_dummy) +# else +static thread_local FLOAT y_dummy[Y_DUMMY_NUM]; +# endif #endif static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ @@ -105,10 +132,12 @@ static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #ifdef TRANSA y += n_from * incy * COMPSIZE; #else +# ifndef UNSAFE //for split matrix row (n) direction and vector x of gemv_n x += n_from * incx * COMPSIZE; //store partial result for every thread y += (m_to - m_from) * 1 * COMPSIZE * pos; +# endif #endif } @@ -136,7 +165,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x BLASLONG width, i, num_cpu; -#ifndef TRANSA +#if !defined(TRANSA) && !defined(iUNSAFE) int split_x=0; #endif @@ -212,7 +241,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x i -= width; } -#ifndef TRANSA +#if !defined(TRANSA) && !defined(UNSAFE) //try to split matrix on row direction and x. //Then, reduction. if (num_cpu < nthreads) { @@ -272,7 +301,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x exec_blas(num_cpu, queue); } -#ifndef TRANSA +#if !defined(TRANSA) && !defined(UNSAFE) if(split_x==1){ //reduction for(i=0; i Date: Sat, 10 Nov 2018 17:16:53 +0100 Subject: [PATCH 110/236] Fix argument in SLASET call to zero S fixes #1859 in accordance with https://github.com/LAPACK-Reference/issue/296 --- lapack-netlib/SRC/sgelss.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/sgelss.f b/lapack-netlib/SRC/sgelss.f index 29380d4dc..84a882d2e 100644 --- a/lapack-netlib/SRC/sgelss.f +++ b/lapack-netlib/SRC/sgelss.f @@ -407,7 +407,7 @@ * Matrix all zero. Return zero solution. 
* CALL SLASET( 'F', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) - CALL SLASET( 'F', MINMN, 1, ZERO, ZERO, S, 1 ) + CALL SLASET( 'F', MINMN, 1, ZERO, ZERO, S, MINMN ) RANK = 0 GO TO 70 END IF From e3666931d8b54f0bf918e45bc3da6ce51ea2a52a Mon Sep 17 00:00:00 2001 From: Arda Aytekin Date: Fri, 9 Nov 2018 00:25:30 +0100 Subject: [PATCH 111/236] Update .travis.yml Updated `.travis.yml` file to add emulated tests for `ARMV6` and `ARMV8` architectures with `gcc` and `clang`. Created prebuilt images with required dependencies. Squashed layers into one. --- .travis.yml | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index a0af0472e..4efa23b8d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,11 +4,10 @@ dist: precise sudo: true language: c -jobs: +matrix: include: - &test-ubuntu os: linux - stage: test compiler: gcc addons: apt: @@ -59,7 +58,6 @@ jobs: - BTYPE="BINARY=32" - os: linux - stage: test compiler: gcc addons: apt: @@ -80,7 +78,6 @@ jobs: # that don't require sudo. - &test-alpine os: linux - stage: test dist: trusty sudo: true language: minimal @@ -124,7 +121,6 @@ jobs: - &test-cmake os: linux - stage: test compiler: clang addons: apt: @@ -153,7 +149,6 @@ jobs: - &test-macos os: osx - stage: test osx_image: xcode8 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" @@ -168,6 +163,42 @@ jobs: env: - BTYPE="BINARY=32" + - &emulated-arm + dist: trusty + sudo: required + services: docker + env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc + name: "Emulated Build for ARMV6 with gcc" + before_install: sudo docker run --rm --privileged multiarch/qemu-user-static:register --reset + script: | + echo "FROM openblas/alpine:${IMAGE_ARCH} + COPY . /tmp/openblas + RUN mkdir /tmp/openblas/build && \ + cd /tmp/openblas/build && \ + CC=${COMPILER} cmake -D DYNAMIC_ARCH=OFF \ + -D TARGET=${TARGET_ARCH} \ + -D BUILD_SHARED_LIBS=ON \ + -D BUILD_WITHOUT_LAPACK=ON \ + -D BUILD_WITHOUT_CBLAS=ON \ + -D CMAKE_BUILD_TYPE=Release ../ && \ + cmake --build ." > Dockerfile + docker build . 
+ - <<: *emulated-arm + env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang + name: "Emulated Build for ARMV6 with clang" + - <<: *emulated-arm + env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc + name: "Emulated Build for ARMV8 with gcc" + - <<: *emulated-arm + env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang + name: "Emulated Build for ARMV8 with clang" + + allow_failures: + - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc + - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang + - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc + - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang + # whitelist branches: only: From 807f6e6922d7b7c53f79171e5224d11368c28235 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Nov 2018 18:52:29 +0100 Subject: [PATCH 112/236] Use prtconf to determine CPU type on AIX for #1803 --- cpuid_power.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/cpuid_power.c b/cpuid_power.c index 6c7baef4a..ebd9e151e 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -127,6 +127,33 @@ int detect(void){ #endif #ifdef _AIX + FILE *infile; + char buffer[512], *p; + + p = (char *)NULL; + infile = popen("prtconf|grep 'Processor Type'"); + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("Pro", buffer, 3)){ + p = strchr(buffer, ':') + 2; +#if 0 + fprintf(stderr, "%s\n", p); +#endif + break; + } + } + + pclose(infile); + + if (!strncasecmp(p, "POWER3", 6)) return CPUTYPE_POWER3; + if (!strncasecmp(p, "POWER4", 6)) return CPUTYPE_POWER4; + if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970; + if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; + if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; + if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; + if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; + if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; + if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; + return CPUTYPE_POWER5; #endif From 2f04cf22accecc0befcc00fbb77dfc76e0506c84 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Nov 2018 08:16:14 +0100 Subject: [PATCH 113/236] Detect POWER9 as POWER8 on AIX and Linux (already supported by the *BSD version) --- cpuid_power.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/cpuid_power.c b/cpuid_power.c index ebd9e151e..afc94d2d5 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -56,6 +56,7 @@ #define CPUTYPE_CELL 6 #define CPUTYPE_PPCG4 7 #define CPUTYPE_POWER8 8 +#define CPUTYPE_POWER9 9 char *cpuname[] = { "UNKNOWN", @@ -66,7 +67,8 @@ char *cpuname[] = { "POWER6", "CELL", "PPCG4", - "POWER8" + "POWER8", + "POWER9" }; char *lowercpuname[] = { @@ -78,7 +80,8 @@ char *lowercpuname[] = { "power6", "cell", "ppcg4", - "power8" + "power8", + "power9" }; char *corename[] = { @@ -90,7 +93,8 @@ char *corename[] = { "POWER6", "CELL", "PPCG4", - "POWER8" + "POWER8", + "POWER8" }; int detect(void){ @@ -120,6 +124,7 @@ int detect(void){ if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; + if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; @@ -151,9 +156,9 @@ int detect(void){ if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; + if (!strncasecmp(p, 
"POWER8", 6)) return CPUTYPE_POWER8; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; - return CPUTYPE_POWER5; #endif From c171b8ad13054518869cdc54db5af5cf6b886089 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Nov 2018 13:57:18 +0100 Subject: [PATCH 114/236] Handle special case INCX=0,INCY=0 in the axpy interface --- interface/axpy.c | 5 +++++ interface/zaxpy.c | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/interface/axpy.c b/interface/axpy.c index 39edea6af..9032946d2 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -75,6 +75,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc if (alpha == ZERO) return; + if (incx == 0 && incy == 0) { + *y += n * alpha *(*x); + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/zaxpy.c b/interface/zaxpy.c index 1a0259c96..dbd559628 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -82,6 +82,12 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + if (incx == 0 && incy == 0) { + *y += n * (alpha_r * (*x) - alpha_i* (*(x+1)) ); + *(y+1) += n * (alpha_i * (*x) + alpha_r * (*(x +1)) ); + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); From 43bb386b10d94b341d5c8a27b5634081bb87de7f Mon Sep 17 00:00:00 2001 From: fengruilin Date: Thu, 15 Nov 2018 11:11:59 +0800 Subject: [PATCH 115/236] fix dot problem on 64bit mips --- kernel/mips64/KERNEL | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index e257dcfc9..3804b245d 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -6,6 +6,11 @@ CROTKERNEL = ../mips/zrot.c ZROTKERNEL = ../mips/zrot.c CSWAPKERNEL = ../mips/zswap.c ZSWAPKERNEL = ../mips/zswap.c +SDOTKERNEL = ../mips/dot.c +DDOTKERNEL = ../mips/dot.c +CDOTKERNEL = ../mips/zdot.c +ZDOTKERNEL = ../mips/zdot.c + ifndef SNRM2KERNEL SNRM2KERNEL = snrm2.S From 42bc2a92023070ee871ffd81b6a9b8fb6dd1892b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 16 Nov 2018 12:10:44 +0100 Subject: [PATCH 116/236] Fix copy-paste errors (POWER8/9 and extraneous return) --- cpuid_power.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpuid_power.c b/cpuid_power.c index afc94d2d5..fc36f8e2c 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -156,7 +156,7 @@ int detect(void){ if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; - if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; + if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; return CPUTYPE_POWER5; @@ -180,7 +180,7 @@ int id; id = __asm __volatile("mfpvr %0" : "=r"(id)); switch ( id >> 16 ) { case 0x4e: // POWER9 - return return CPUTYPE_POWER8; + return CPUTYPE_POWER8; break; case 0x4d: case 0x4b: // POWER8/8E From 368d14f8c8b2eb2916d7cd6765f40c5aa31e2184 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 16 Nov 2018 14:58:28 +0100 Subject: [PATCH 117/236] Fix harmless typo fixes #1872 --- driver/level2/gemv_thread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/level2/gemv_thread.c b/driver/level2/gemv_thread.c index fc4e4f7fe..d57740314 100644 --- a/driver/level2/gemv_thread.c +++ b/driver/level2/gemv_thread.c @@ -165,7 +165,7 @@ 
int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x BLASLONG width, i, num_cpu; -#if !defined(TRANSA) && !defined(iUNSAFE) +#if !defined(TRANSA) && !defined(UNSAFE) int split_x=0; #endif From 2e6fae2aad240fe6be8273cc53bc239ee920ee7c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 19 Nov 2018 14:02:50 +0100 Subject: [PATCH 118/236] Serialize accesses to parallelized level3 functions from multiple callers for #1851 --- driver/level3/level3_thread.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index aeb5e6ed4..15cad9274 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -514,6 +514,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG nthreads_m, BLASLONG nthreads_n) { +#ifndef USE_OPENMP +static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; +#endif + blas_arg_t newarg; #ifndef USE_ALLOC_HEAP @@ -554,6 +558,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG #endif #endif +#ifndef USE_OPENMP +pthread_mutex_lock(&level3_lock); +#endif + #ifdef USE_ALLOC_HEAP /* Dynamically allocate workspace */ job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); @@ -671,6 +679,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG free(job); #endif +#ifndef USE_OPENMP + pthread_mutex_unlock(&level3_lock); +#endif + return 0; } From 310ea55f29f16771438386fb2f1f140e2fd7e397 Mon Sep 17 00:00:00 2001 From: Renato Golin Date: Fri, 16 Nov 2018 15:45:12 +0000 Subject: [PATCH 119/236] Simplifying ARMv8 build parameters ARMv8 builds were a bit mixed up, with ThunderX2 code in ARMv8 mode (which is not right because TX2 is ARMv8.1) as well as requiring a few redundancies in the defines, making it harder to maintain and understand what core has what. A few other minor issues were also fixed. Tests were made on the following cores: A53, A57, A72, Falkor, ThunderX, ThunderX2, and XGene. Tests were: OpenBLAS/test, OpenBLAS/benchmark, BLAS-Tester. A summary: * Removed TX2 code from ARMv8 build, to make sure it is compatible with all ARMv8 cores, not just v8.1. Also, the TX2 code has actually harmed performance on big cores. * Commoned up ARMv8 architectures' defines in params.h, to make sure that all will benefit from ARMv8 settings, in addition to their own. * Adding a few more cores, using ARMv8's include strategy, to benefit from compiler optimisations using mtune. Also updated cache information from the manuals, making sure we set good conservative values by default. Removed Vulcan, as it's an alias to TX2. * Auto-detecting most of those cores, but also updating the forced compilation in getarch.c, to make sure the parameters are the same whether compiled natively or forced arch. Benefits: * ARMv8 build is now guaranteed to work on all ARMv8 cores * Improved performance for ARMv8 builds on some cores (A72, Falkor, ThunderX1 and 2: up to 11%) over current develop * Improved performance for *all* cores comparing to develop branch before TX2's patch (9% ~ 36%) * ThunderX1 builds are 14% faster than ARMv8 on TX1, 9% faster than current develop's branch and 8% faster than deveop before tx2 patches Issues: * Regression from current develop branch for A53 (-12%) and A57 (-3%) with ARMv8 builds, but still faster than before TX2's commit (+15% and +24% respectively). This can be improved with a simplification of TX2's code, to be done in future patches. 
At least the code is guaranteed to be ARMv8.0 now. Comments: * CortexA57 builds are unchanged on A57 hardware from develop's branch, which makes sense, as it's untouched. * CortexA72 builds improve over A57 on A72 hardware, even if they're using the same includes due to new compiler tunning in the makefile. --- Makefile.arm64 | 35 ++++++--- TargetList.txt | 5 +- cpuid_arm64.c | 126 +++++++++++++++++-------------- getarch.c | 78 +++++++++++++++---- kernel/arm64/KERNEL.ARMV8 | 68 +++++++---------- kernel/arm64/KERNEL.CORTEXA53 | 3 + kernel/arm64/KERNEL.CORTEXA72 | 3 + kernel/arm64/KERNEL.CORTEXA73 | 3 + kernel/arm64/KERNEL.FALKOR | 3 + kernel/arm64/KERNEL.VULCAN | 3 - param.h | 136 ++++++++++++++-------------------- 11 files changed, 256 insertions(+), 207 deletions(-) create mode 100644 kernel/arm64/KERNEL.CORTEXA53 create mode 100644 kernel/arm64/KERNEL.CORTEXA72 create mode 100644 kernel/arm64/KERNEL.CORTEXA73 create mode 100644 kernel/arm64/KERNEL.FALKOR delete mode 100644 kernel/arm64/KERNEL.VULCAN diff --git a/Makefile.arm64 b/Makefile.arm64 index d19e796a5..a529fab80 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -4,22 +4,37 @@ CCOMMON_OPT += -march=armv8-a FCOMMON_OPT += -march=armv8-a endif -ifeq ($(CORE), CORTEXA57) -CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 -FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 +ifeq ($(CORE), CORTEXA53) +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 endif -ifeq ($(CORE), VULCAN) -CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan -FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan +ifeq ($(CORE), CORTEXA57) +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 +endif + +ifeq ($(CORE), CORTEXA72) +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +endif + +ifeq ($(CORE), CORTEXA73) +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 endif ifeq ($(CORE), THUNDERX) -CCOMMON_OPT += -mtune=thunderx -mcpu=thunderx -FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx +CCOMMON_OPT += -march=armv8-a -mtune=thunderx +FCOMMON_OPT += -march=armv8-a -mtune=thunderx +endif + +ifeq ($(CORE), FALKOR) +CCOMMON_OPT += -march=armv8.1-a -mtune=falkor +FCOMMON_OPT += -march=armv8.1-a -mtune=falkor endif ifeq ($(CORE), THUNDERX2T99) -CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 -FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 +CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif diff --git a/TargetList.txt b/TargetList.txt index 31e4881c4..3d04a57cf 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -83,8 +83,11 @@ ARMV5 8.ARM 64-bit CPU: ARMV8 +CORTEXA53 CORTEXA57 -VULCAN +CORTEXA72 +CORTEXA73 +FALKOR THUNDERX THUNDERX2T99 diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 3acb395b5..c914fbc2b 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -29,25 +29,37 @@ #define CPU_UNKNOWN 0 #define CPU_ARMV8 1 -#define CPU_CORTEXA57 2 -#define CPU_VULCAN 3 -#define CPU_THUNDERX 4 -#define CPU_THUNDERX2T99 5 +// Arm +#define CPU_CORTEXA53 2 +#define CPU_CORTEXA57 3 +#define CPU_CORTEXA72 4 +#define CPU_CORTEXA73 5 +// Qualcomm +#define CPU_FALKOR 6 +// Cavium +#define CPU_THUNDERX 7 +#define CPU_THUNDERX2T99 8 static char *cpuname[] = { "UNKNOWN", "ARMV8" , + "CORTEXA53", "CORTEXA57", - "VULCAN", + "CORTEXA72", + "CORTEXA73", + "FALKOR", "THUNDERX", "THUNDERX2T99" }; static char *cpuname_lower[] = { 
"unknown", - "armv8" , + "armv8", + "cortexa53", "cortexa57", - "vulcan", + "cortexa72", + "cortexa73", + "falkor", "thunderx", "thunderx2t99" }; @@ -114,14 +126,24 @@ int detect(void) fclose(infile); if(cpu_part != NULL && cpu_implementer != NULL) { - if (strstr(cpu_implementer, "0x41") && - (strstr(cpu_part, "0xd07") || strstr(cpu_part,"0xd08"))) - return CPU_CORTEXA57; //or compatible, ex. A72 - else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42")) - return CPU_VULCAN; - else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43")) + // Arm + if (strstr(cpu_implementer, "0x41")) { + if (strstr(cpu_part, "0xd03")) + return CPU_CORTEXA53; + else if (strstr(cpu_part, "0xd07")) + return CPU_CORTEXA57; + else if (strstr(cpu_part, "0xd08")) + return CPU_CORTEXA72; + else if (strstr(cpu_part, "0xd09")) + return CPU_CORTEXA73; + } + // Qualcomm + else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00")) + return CPU_FALKOR; + // Cavium + else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0a1")) return CPU_THUNDERX; - else if (strstr(cpu_part, "0x0af") && strstr(cpu_implementer, "0x43")) + else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) return CPU_THUNDERX2T99; } @@ -180,62 +202,62 @@ void get_subdirname(void) void get_cpuconfig(void) { + // All arches should define ARMv8 + printf("#define ARMV8\n"); + printf("#define HAVE_NEON\n"); // This shouldn't be necessary + printf("#define HAVE_VFPV4\n"); // This shouldn't be necessary + int d = detect(); switch (d) { + case CPU_CORTEXA53: + printf("#define %s\n", cpuname[d]); + // Fall-through case CPU_ARMV8: - printf("#define ARMV8\n"); - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L2_SIZE 262144\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 4\n"); - break; - - case CPU_VULCAN: - printf("#define VULCAN \n"); - printf("#define HAVE_VFP \n"); - printf("#define HAVE_VFPV3 \n"); - printf("#define HAVE_NEON \n"); - printf("#define HAVE_VFPV4 \n"); - printf("#define L1_CODE_SIZE 32768 \n"); - printf("#define L1_CODE_LINESIZE 64 \n"); - printf("#define L1_CODE_ASSOCIATIVE 8 \n"); - printf("#define L1_DATA_SIZE 32768 \n"); - printf("#define L1_DATA_LINESIZE 64 \n"); - printf("#define L1_DATA_ASSOCIATIVE 8 \n"); - printf("#define L2_SIZE 262144 \n"); - printf("#define L2_LINESIZE 64 \n"); - printf("#define L2_ASSOCIATIVE 8 \n"); - printf("#define L3_SIZE 33554432 \n"); - printf("#define L3_LINESIZE 64 \n"); - printf("#define L3_ASSOCIATIVE 32 \n"); - printf("#define DTB_DEFAULT_ENTRIES 64 \n"); - printf("#define DTB_SIZE 4096 \n"); + // Minimum parameters for ARMv8 (based on A53) + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 262144\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); break; case CPU_CORTEXA57: - printf("#define CORTEXA57\n"); - printf("#define HAVE_VFP\n"); - printf("#define HAVE_VFPV3\n"); - printf("#define HAVE_NEON\n"); - printf("#define HAVE_VFPV4\n"); + case CPU_CORTEXA72: + case CPU_CORTEXA73: + // Common minimum settings for these Arm cores + // Can change a lot, but we need to be conservative + // TODO: detect info from /sys if possible + printf("#define %s\n", cpuname[d]); printf("#define L1_CODE_SIZE 49152\n"); 
printf("#define L1_CODE_LINESIZE 64\n"); printf("#define L1_CODE_ASSOCIATIVE 3\n"); printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_LINESIZE 64\n"); printf("#define L1_DATA_ASSOCIATIVE 2\n"); - printf("#define L2_SIZE 2097152\n"); + printf("#define L2_SIZE 524288\n"); printf("#define L2_LINESIZE 64\n"); printf("#define L2_ASSOCIATIVE 16\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); break; + case CPU_FALKOR: + printf("#define FALKOR\n"); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 128\n"); + printf("#define L2_SIZE 524288\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + break; + case CPU_THUNDERX: printf("#define THUNDERX\n"); printf("#define L1_DATA_SIZE 32768\n"); @@ -249,10 +271,6 @@ void get_cpuconfig(void) case CPU_THUNDERX2T99: printf("#define VULCAN \n"); - printf("#define HAVE_VFP \n"); - printf("#define HAVE_VFPV3 \n"); - printf("#define HAVE_NEON \n"); - printf("#define HAVE_VFPV4 \n"); printf("#define L1_CODE_SIZE 32768 \n"); printf("#define L1_CODE_LINESIZE 64 \n"); printf("#define L1_CODE_ASSOCIATIVE 8 \n"); diff --git a/getarch.c b/getarch.c index 31f41d62c..146f1f36f 100644 --- a/getarch.c +++ b/getarch.c @@ -927,11 +927,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DARMV8 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "armv8" #define CORENAME "ARMV8" #endif +#ifdef FORCE_CORTEXA53 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "CORTEXA53" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DCORTEXA53 " \ + "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "cortexa53" +#define CORENAME "CORTEXA53" +#else +#endif + #ifdef FORCE_CORTEXA57 #define FORCE #define ARCHITECTURE "ARM64" @@ -942,26 +959,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ - "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "cortexa57" #define CORENAME "CORTEXA57" #else #endif -#ifdef FORCE_VULCAN +#ifdef FORCE_CORTEXA72 #define FORCE #define ARCHITECTURE "ARM64" -#define SUBARCHITECTURE "VULCAN" +#define SUBARCHITECTURE "CORTEXA72" #define SUBDIRNAME "arm64" -#define ARCHCONFIG "-DVULCAN " \ - "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ - "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ - "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ - "-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ +#define ARCHCONFIG "-DCORTEXA72 " \ + "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ + "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ - "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" -#define LIBNAME "vulcan" -#define CORENAME "VULCAN" + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "cortexa72" +#define CORENAME "CORTEXA72" +#else +#endif + +#ifdef FORCE_CORTEXA73 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "CORTEXA73" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DCORTEXA73 " \ + "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ + "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "cortexa73" +#define CORENAME "CORTEXA73" +#else +#endif + +#ifdef FORCE_FALKOR +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "FALKOR" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DFALKOR " \ + "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ + "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "falkor" +#define CORENAME "FALKOR" #else #endif @@ -973,13 +1021,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DTHUNDERX " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ "-DL2_SIZE=16777216 -DL2_LINESIZE=128 -DL2_ASSOCIATIVE=16 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "thunderx" #define CORENAME "THUNDERX" #else #endif #ifdef FORCE_THUNDERX2T99 +#define ARMV8 #define FORCE #define ARCHITECTURE "ARM64" #define SUBARCHITECTURE "THUNDERX2T99" @@ -990,7 +1040,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ "-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ - "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "thunderx2t99" #define CORENAME "THUNDERX2T99" #else diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index bcecd0026..5c70390dc 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -46,7 +46,7 @@ CAMAXKERNEL = zamax.S ZAMAXKERNEL = zamax.S SAXPYKERNEL = axpy.S -DAXPYKERNEL = daxpy_thunderx2t99.S +DAXPYKERNEL = axpy.S CAXPYKERNEL = zaxpy.S ZAXPYKERNEL = zaxpy.S @@ -71,39 +71,37 @@ CGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t.S -SASUMKERNEL = sasum_thunderx2t99.c -DASUMKERNEL = dasum_thunderx2t99.c -CASUMKERNEL = casum_thunderx2t99.c -ZASUMKERNEL = zasum_thunderx2t99.c +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S -SCOPYKERNEL = copy_thunderx2t99.c -DCOPYKERNEL = copy_thunderx2t99.c -CCOPYKERNEL = copy_thunderx2t99.c -ZCOPYKERNEL = copy_thunderx2t99.c +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S -SSWAPKERNEL = swap_thunderx2t99.S -DSWAPKERNEL = swap_thunderx2t99.S -CSWAPKERNEL = swap_thunderx2t99.S -ZSWAPKERNEL = swap_thunderx2t99.S +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S -ISAMAXKERNEL = iamax_thunderx2t99.c -IDAMAXKERNEL = iamax_thunderx2t99.c -ICAMAXKERNEL = izamax_thunderx2t99.c -IZAMAXKERNEL = izamax_thunderx2t99.c +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S ifneq ($(OS_DARWIN)$(CROSS),11) -SNRM2KERNEL = scnrm2_thunderx2t99.c -CNRM2KERNEL = scnrm2_thunderx2t99.c -#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c -#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c -DNRM2KERNEL = dznrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c +SNRM2KERNEL = nrm2.S +CNRM2KERNEL = nrm2.S +DNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S endif -DDOTKERNEL = dot_thunderx2t99.c -SDOTKERNEL = dot_thunderx2t99.c -CDOTKERNEL = zdot_thunderx2t99.c -ZDOTKERNEL = zdot_thunderx2t99.c +DDOTKERNEL = dot.S +SDOTKERNEL = dot.S +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S DSDOTKERNEL = dot.S ifneq ($(OS_DARWIN)$(CROSS),11) @@ -175,22 +173,6 @@ ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) -ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) -DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S -endif - -ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4) -SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S -endif - -ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4) -CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S -endif - -ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4) -ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S -endif - else STRMMKERNEL = ../generic/trmmkernel_2x2.c diff --git a/kernel/arm64/KERNEL.CORTEXA53 b/kernel/arm64/KERNEL.CORTEXA53 new file mode 100644 index 000000000..c1d33fa3e --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXA53 @@ -0,0 +1,3 @@ +include $(KERNELDIR)/KERNEL.ARMV8 + + diff --git a/kernel/arm64/KERNEL.CORTEXA72 b/kernel/arm64/KERNEL.CORTEXA72 new file mode 100644 index 000000000..007b2ce26 --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXA72 @@ -0,0 +1,3 @@ +include $(KERNELDIR)/KERNEL.CORTEXA57 + + diff --git a/kernel/arm64/KERNEL.CORTEXA73 b/kernel/arm64/KERNEL.CORTEXA73 new file mode 100644 index 
000000000..007b2ce26 --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXA73 @@ -0,0 +1,3 @@ +include $(KERNELDIR)/KERNEL.CORTEXA57 + + diff --git a/kernel/arm64/KERNEL.FALKOR b/kernel/arm64/KERNEL.FALKOR new file mode 100644 index 000000000..007b2ce26 --- /dev/null +++ b/kernel/arm64/KERNEL.FALKOR @@ -0,0 +1,3 @@ +include $(KERNELDIR)/KERNEL.CORTEXA57 + + diff --git a/kernel/arm64/KERNEL.VULCAN b/kernel/arm64/KERNEL.VULCAN deleted file mode 100644 index 8b0273951..000000000 --- a/kernel/arm64/KERNEL.VULCAN +++ /dev/null @@ -1,3 +0,0 @@ -include $(KERNELDIR)/KERNEL.THUNDERX2T99 - - diff --git a/param.h b/param.h index d1b211584..8f56cdaaa 100644 --- a/param.h +++ b/param.h @@ -2543,49 +2543,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif - -#if defined(CORTEXA57) -#define SNUMOPT 2 -#define DNUMOPT 2 - -#define GEMM_DEFAULT_OFFSET_A 0 -#define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL - -#define SGEMM_DEFAULT_UNROLL_M 16 -#define SGEMM_DEFAULT_UNROLL_N 4 - -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 4 - -#define CGEMM_DEFAULT_UNROLL_M 8 -#define CGEMM_DEFAULT_UNROLL_N 4 - -#define ZGEMM_DEFAULT_UNROLL_M 4 -#define ZGEMM_DEFAULT_UNROLL_N 4 - -#define SGEMM_DEFAULT_P 512 -#define DGEMM_DEFAULT_P 256 -#define CGEMM_DEFAULT_P 256 -#define ZGEMM_DEFAULT_P 128 - -#define SGEMM_DEFAULT_Q 1024 -#define DGEMM_DEFAULT_Q 512 -#define CGEMM_DEFAULT_Q 512 -#define ZGEMM_DEFAULT_Q 512 - -#define SGEMM_DEFAULT_R 4096 -#define DGEMM_DEFAULT_R 4096 -#define CGEMM_DEFAULT_R 4096 -#define ZGEMM_DEFAULT_R 2048 - - -#define SYMV_P 16 -#endif - +// Common ARMv8 parameters #if defined(ARMV8) -#if defined(OS_DARWIN) && defined(CROSS) #define SNUMOPT 2 #define DNUMOPT 2 @@ -2593,6 +2553,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL +#define SYMV_P 16 + +// Darwin / Cross +#if defined(OS_DARWIN) && defined(CROSS) + #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -2620,15 +2585,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#define SYMV_P 16 -#else +#else // Linux / Native -#define SNUMOPT 2 -#define DNUMOPT 2 - -#define GEMM_DEFAULT_OFFSET_A 0 -#define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#if defined(CORTEXA53) || defined(CORTEXA57) || \ + defined(CORTEXA72) || defined(CORTEXA73) || \ + defined(FALKOR) #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2642,33 +2603,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_P 128 -#define DGEMM_DEFAULT_P 160 -#define CGEMM_DEFAULT_P 128 +#define SGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_P 256 +#define CGEMM_DEFAULT_P 256 #define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_Q 352 -#define DGEMM_DEFAULT_Q 128 -#define CGEMM_DEFAULT_Q 224 -#define ZGEMM_DEFAULT_Q 112 +#define SGEMM_DEFAULT_Q 1024 +#define DGEMM_DEFAULT_Q 512 +#define CGEMM_DEFAULT_Q 512 +#define ZGEMM_DEFAULT_Q 512 #define SGEMM_DEFAULT_R 4096 #define DGEMM_DEFAULT_R 4096 #define CGEMM_DEFAULT_R 4096 -#define ZGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 2048 -#define SYMV_P 16 -#endif - -#endif - -#if defined(THUNDERX) -#define SNUMOPT 2 -#define DNUMOPT 2 - -#define GEMM_DEFAULT_OFFSET_A 0 -#define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#elif defined(THUNDERX) #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2697,17 +2647,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 - -#define SYMV_P 16 -#endif - -#if defined(THUNDERX2T99) || defined(VULCAN) -#define SNUMOPT 2 -#define DNUMOPT 2 - -#define GEMM_DEFAULT_OFFSET_A 0 -#define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#elif defined(THUNDERX2T99) #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2736,8 +2676,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#define SYMV_P 16 -#endif +#else // Other/undetected ARMv8 cores + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#endif // Cores + +#endif // Linux / Darwin + +#endif // ARMv8 #if defined(ARMV5) #define SNUMOPT 2 From 5192651706d39b35e82b6f62f2b02764cdb3983c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 19 Nov 2018 17:58:22 +0100 Subject: [PATCH 120/236] Add CriticalSection handling instead of mutexes for Windows --- driver/level3/level3_thread.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 15cad9274..ac96f9424 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -515,7 +515,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG BLASLONG nthreads_m, BLASLONG nthreads_n) { #ifndef USE_OPENMP +#ifndef OS_WINDOWS static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; +#else +CRITICAL_SECTION level3_lock; +InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock; +#endif #endif blas_arg_t newarg; @@ -559,7 +564,11 @@ static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; #endif #ifndef USE_OPENMP +#ifndef OS_WINDOWS pthread_mutex_lock(&level3_lock); +#else +EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); +#endif #endif #ifdef USE_ALLOC_HEAP @@ -680,7 +689,11 @@ pthread_mutex_lock(&level3_lock); 
#endif #ifndef USE_OPENMP +#ifndef OS_WINDOWS pthread_mutex_unlock(&level3_lock); +#else + LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock); +#endif #endif return 0; From 113cb00b95626d037647107aaa1f00027772b0da Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 19 Nov 2018 21:01:36 +0100 Subject: [PATCH 121/236] fix missing parenthesis --- driver/level3/level3_thread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index ac96f9424..3411a3e9b 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -519,7 +519,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; #else CRITICAL_SECTION level3_lock; -InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock; +InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); #endif #endif From 0184713e1a2c3ae99f500edce105ab0f42e96de6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Bissey?= Date: Wed, 21 Nov 2018 14:24:56 +1300 Subject: [PATCH 122/236] Correct link flags for PGI compiler. --- f_check | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/f_check b/f_check index 997e02393..34caa00be 100644 --- a/f_check +++ b/f_check @@ -292,9 +292,6 @@ if ($link ne "") { && ($flags !~ /^-LIST:/) && ($flags !~ /^-LANG:/) ) { - if ($vendor eq "PGI") { - $flags =~ s/lib$/libso/; - } $linker_L .= $flags . " "; } @@ -311,17 +308,11 @@ if ($link ne "") { if ($flags =~ /^\-rpath\@/) { $flags =~ s/\@/\,/g; - if ($vendor eq "PGI") { - $flags =~ s/lib$/libso/; - } $linker_L .= "-Wl,". $flags . " " ; } if ($flags =~ /^\-rpath-link\@/) { $flags =~ s/\@/\,/g; - if ($vendor eq "PGI") { - $flags =~ s/lib$/libso/; - } $linker_L .= "-Wl,". $flags . " " ; } @@ -330,7 +321,6 @@ if ($link ne "") { && ($flags !~ /gfortranbegin/) && ($flags !~ /frtbegin/) && ($flags !~ /pathfstart/) - && ($flags !~ /numa/) && ($flags !~ /crt[0-9]/) && ($flags !~ /gcc/) && ($flags !~ /user32/) From f049a4c84f5854d72ac3c06d9867c1b46d7e8bff Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sun, 25 Nov 2018 21:34:09 +0100 Subject: [PATCH 123/236] init From 19c4bdd8b3f3fc5a97a5b756f6590bdb6d2a3ee9 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sun, 25 Nov 2018 21:35:01 +0100 Subject: [PATCH 124/236] Add return value so that freebsd system clang does not err out --- kernel/x86_64/sgemm_beta_skylakex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index 4e40acadf..498c46f0d 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -56,7 +56,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, } if (n == 0 || m == 0) - return; + return 0; c_offset = c; From 816775e3099cba07b4ad2636090c1f752d9f8b3e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 29 Nov 2018 00:06:44 +0100 Subject: [PATCH 125/236] Add version information to openblas_get_config output --- driver/others/openblas_get_config.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 3e87f2cc2..471be21bc 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -42,8 +42,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif static char* openblas_config_str="" +"VERSION " + VERSION #ifdef USE64BITINT - "USE64BITINT " + " USE64BITINT " #endif #ifdef NO_CBLAS "NO_CBLAS " From a29ec458c238a9b1183baaf6d5c99d14d206987a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 29 Nov 2018 00:10:49 +0100 Subject: [PATCH 126/236] propagate verison number for openblas_config_version --- Makefile.system | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.system b/Makefile.system index 1427d09fb..22fe24337 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1036,6 +1036,8 @@ ifdef USE_TLS CCOMMON_OPT += -DUSE_TLS endif +CCOMMON_OPT += -DVERSION=\"$(VERSION)\" + ifndef SYMBOLPREFIX SYMBOLPREFIX = endif From 081ceb3e029e04b3a2773915cc67dc848bab3ef2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 29 Nov 2018 00:12:04 +0100 Subject: [PATCH 127/236] Propagate version number for openblas_get_config --- cmake/system.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index 61f96edb0..d803bb9eb 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -310,6 +310,8 @@ if (MIXED_MEMORY_ALLOCATION) set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION") endif () +set(CCOMMON_OPT "${CCOMMON_OPT} -DVERSION=\"\\\"${OpenBLAS_VERSION}\\\"\"") + set(REVISION "-r${OpenBLAS_VERSION}") set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION}) From de0d0ed52f314a6b370fab03bc21ebbb3d943bbc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 29 Nov 2018 11:28:19 +0100 Subject: [PATCH 128/236] Improve formatting of config output --- driver/others/openblas_get_config.c | 1 + 1 file changed, 1 insertion(+) diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 471be21bc..4f22325b6 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -44,6 +44,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static char* openblas_config_str="" "VERSION " VERSION +" " #ifdef USE64BITINT " USE64BITINT " #endif From 97d72989739163171930046dba8d7a3214f49b9c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 29 Nov 2018 11:52:08 +0100 Subject: [PATCH 129/236] call it OpenBLAS not just version --- driver/others/openblas_get_config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 4f22325b6..eca494dca 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -42,7 +42,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif static char* openblas_config_str="" -"VERSION " +"OpenBLAS " VERSION " " #ifdef USE64BITINT From 7a2e1bc8041a898cadea475a0562e5b40ec49750 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Nov 2018 10:57:09 +0100 Subject: [PATCH 130/236] Use generic kernel for DSDOT/SDSDOT as discussed in #1834 --- kernel/mips64/KERNEL.LOONGSON3A | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A index 2d03ad7fa..0298faaad 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3A @@ -63,6 +63,7 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DSDOTKERNEL = ../mips/dot.c From 95a5542e3c21def6e63e9de8b5c1850830fc0289 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Nov 2018 11:16:24 +0100 Subject: [PATCH 131/236] Revert DOT kernel changes from #1834 as the failures seen on Loongson3A appear to be limited to DSDOT/SDSDOT (i.e. my hackish "fix" from #1684) --- kernel/mips64/KERNEL | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index e257dcfc9..f77ca19ed 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -1,9 +1,9 @@ CAXPYKERNEL = ../mips/zaxpy.c ZAXPYKERNEL = ../mips/zaxpy.c -SROTKERNEL = ../mips/rot.c -DROTKERNEL = ../mips/rot.c -CROTKERNEL = ../mips/zrot.c -ZROTKERNEL = ../mips/zrot.c +SROTKERNEL = ../mips/rot.c +DROTKERNEL = ../mips/rot.c +CROTKERNEL = ../mips/zrot.c +ZROTKERNEL = ../mips/zrot.c CSWAPKERNEL = ../mips/zswap.c ZSWAPKERNEL = ../mips/zswap.c From 2601cd58ab55d0b76c305bde1d320b8ab0da25ed Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Thu, 11 Oct 2018 23:29:34 +0300 Subject: [PATCH 132/236] remove surplus locking code , only enabled w x86, disabled or never enabled on all others --- driver/others/memory.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 25f198623..36815a39c 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2586,7 +2586,7 @@ void *blas_memory_alloc(int procpos){ printf("Alloc Start ...\n"); #endif -#if defined(WHEREAMI) && !defined(USE_OPENMP) +/* #if defined(WHEREAMI) && !defined(USE_OPENMP) mypos = WhereAmI(); @@ -2596,12 +2596,12 @@ void *blas_memory_alloc(int procpos){ do { if (!memory[position].used && (memory[position].pos == mypos)) { LOCK_COMMAND(&alloc_lock); -/* blas_lock(&memory[position].lock);*/ +// blas_lock(&memory[position].lock); if (!memory[position].used) goto allocation; UNLOCK_COMMAND(&alloc_lock); -/* blas_unlock(&memory[position].lock);*/ +// blas_unlock(&memory[position].lock); } position ++; @@ -2609,7 +2609,7 @@ void *blas_memory_alloc(int procpos){ } while (position < NUM_BUFFERS); -#endif +#endif */ position = 0; From f85ce54d4a2c23b27d80ec454e150b5388d5d38c Mon Sep 17 00:00:00 2001 From: pkubaj Date: Fri, 30 Nov 2018 16:05:49 +0000 Subject: [PATCH 133/236] Use correct Makefile on powerpc64 FreeBSD uses powerpc64 name for POWER architecture. Use correct Makefile for this platform. 
--- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index 22fe24337..bf2b76fae 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1060,7 +1060,11 @@ endif KERNELDIR = $(TOPDIR)/kernel/$(ARCH) +ifneq ($(ARCH), powerpc64) include $(TOPDIR)/Makefile.$(ARCH) +else +include $(TOPDIR)/Makefile.power +endif CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" From 731b2722ba4ba25d982682e47cbad0b780bd24d3 Mon Sep 17 00:00:00 2001 From: pkubaj Date: Fri, 30 Nov 2018 16:04:07 +0000 Subject: [PATCH 134/236] Fix build on POWER, remove DragonFly, add NetBSD. __asm is complete on its own. DragonFly BSD developers state they will only support amd64, but NetBSD supports POWER. --- cpuid_power.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpuid_power.c b/cpuid_power.c index fc36f8e2c..23e98ebb0 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -175,9 +175,9 @@ int detect(void){ return CPUTYPE_PPC970; #endif -#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) int id; -id = __asm __volatile("mfpvr %0" : "=r"(id)); +__asm __volatile("mfpvr %0" : "=r"(id)); switch ( id >> 16 ) { case 0x4e: // POWER9 return CPUTYPE_POWER8; From 6c7b69108300511f4b4bece422c62a7e4ff89d87 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Nov 2018 21:32:01 +0100 Subject: [PATCH 135/236] Really revert xDOT changes from #1832; neglected to rebase #1892 on merging --- kernel/mips64/KERNEL | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index 3a26b820c..61da7445f 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -6,12 +6,8 @@ CROTKERNEL = ../mips/zrot.c ZROTKERNEL = ../mips/zrot.c CSWAPKERNEL = ../mips/zswap.c ZSWAPKERNEL = ../mips/zswap.c -SDOTKERNEL = ../mips/dot.c -DDOTKERNEL = ../mips/dot.c -CDOTKERNEL = ../mips/zdot.c -ZDOTKERNEL = ../mips/zdot.c - - + + ifndef SNRM2KERNEL SNRM2KERNEL = snrm2.S endif From dceff5542ce5aaf9b0a7198612c7fdf36228f3bb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 1 Dec 2018 20:56:11 +0100 Subject: [PATCH 136/236] Handle Android environments that identify as Linux (#1898) * Handle Android environments that identify as Linux. The Termux terminal emulator does this, causing build failures through missed defines in common.h. --- cmake/system_check.cmake | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index fe30c7600..6b602c1b0 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -10,6 +10,16 @@ if (${HOST_OS} STREQUAL "WINDOWS") set(HOST_OS WINNT) endif () +if (${HOST_OS} STREQUAL "LINUX") +# check if we're building natively on Android (TERMUX) + EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM) + if(${OPERATING_SYSTEM} MATCHES "Android") + set(HOST_OS ANDROID) + endif(${OPERATING_SYSTEM} MATCHES "Android") +endif() + + + if(CMAKE_COMPILER_IS_GNUCC AND WIN32) execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE From f85ce54d4a2c23b27d80ec454e150b5388d5d38c Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sun, 2 Dec 2018 12:05:15 +0100 Subject: [PATCH 137/236] init From 26b3710485dbcd614f352713a2fc2637741fa25a Mon Sep 17 00:00:00 2001 From: Andrew
<16061801+brada4@users.noreply.github.com> Date: Sun, 2 Dec 2018 12:07:41 +0100 Subject: [PATCH 138/236] Add architecture mappings for FreeBSD12 --- Makefile.system | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 1427d09fb..42f446996 100644 --- a/Makefile.system +++ b/Makefile.system @@ -12,7 +12,13 @@ endif # Catch conflicting usage of ARCH in some BSD environments ifeq ($(ARCH), amd64) override ARCH=x86_64 -endif +else ifeq ($(ARCH), powerpc64) +override ARCH=power +else ifeq (($ARCH), i386) +override ARCH=x86 +else ifeq ($(ARCH), aarch64) +override ARCH=arm64 +endif NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib From 44c81fd1355cef9b07189ebaad061709be0cd7c6 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sun, 2 Dec 2018 20:27:53 +0100 Subject: [PATCH 139/236] oops --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 42f446996..25ac38dc0 100644 --- a/Makefile.system +++ b/Makefile.system @@ -14,7 +14,7 @@ ifeq ($(ARCH), amd64) override ARCH=x86_64 else ifeq ($(ARCH), powerpc64) override ARCH=power -else ifeq (($ARCH), i386) +else ifeq ($(ARCH), i386) override ARCH=x86 else ifeq ($(ARCH), aarch64) override ARCH=arm64 From 3c9e3faedb1d861dc094ebff0c508c679c4a3cb8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Dec 2018 23:24:53 +0100 Subject: [PATCH 140/236] fixup BSD naming of powerpc arch --- Makefile.system | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Makefile.system b/Makefile.system index bf2b76fae..6919c0114 100644 --- a/Makefile.system +++ b/Makefile.system @@ -11,7 +11,11 @@ endif # Catch conflicting usage of ARCH in some BSD environments ifeq ($(ARCH), amd64) -override ARCH=x86_64 +override ARCH=x86_64 +else ifeq ($(ARCH), powerpc64) +override ARCH=power +endif + endif NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib @@ -1060,11 +1064,7 @@ endif KERNELDIR = $(TOPDIR)/kernel/$(ARCH) -ifneq ($(ARCH), powerpc64) include $(TOPDIR)/Makefile.$(ARCH) -else -include $(TOPDIR)/Makefile.power -endif CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" From c0827a716473bd61d3e8fa44c25184d370400267 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Dec 2018 23:41:17 +0100 Subject: [PATCH 141/236] Update with changes from 0.3.4 --- Changelog.txt | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index faecd82e3..0dd17a558 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,77 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.4 +02-Dec-2018 + +common: + * the new, experimental thread-local memory allocation had + inadvertently been left enabled for gmake builds in 0.3.3 + despite the announcement. It is now disabled by default, and + single-threaded builds will keep using the old allocator even + if the USE_TLS option is turned on. + * OpenBLAS will now provide enough buffer space for at least 50 + threads by default. + * The output of openblas_get_config() now contains the version + number. + * A serious thread safety bug in GEMV operation with small M and + large N size has been fixed. 
+ * The code will now automatically call blas_thread_init after a + fork if needed before handling a call to openblas_set_num_threads + * Accesses to parallelized level3 functions from multiple callers + are now serialized to avoid thread races (unless using OpenMP). + This should provide better performance than the known-threadsafe + (but non-default) USE_SIMPLE_THREADED_LEVEL3 option. + * When building LAPACK with gfortran, -frecursive is now (again) + enabled by default to ensure correct behaviour. + * The OpenBLAS version cblas.h now supports both CBLAS_ORDER and + CBLAS_LAYOUT as the name of the matrix row/column order option. + * Externally set LDFLAGS are now passed through to the final compile/link + steps to facilitate setting platform-specific linker flags. + * A potential race condition during the build of LAPACK (that would + usually manifest itself as a failure to build TESTING/MATGEN) has been + fixed. + * xHEMV has been changed to stay single-threaded for small input sizes + where the overhead of multithreading exceeds any possible gains + * CSWAP and ZSWAP have been limited to a single thread except on ARMV8 or + ThunderX hardware with sizable input. + * Linker flags for the PGI compiler have been updated + * Behaviour of AXPY with zero increments is now handled in the C interface, + correcting the result on at least Intel Atom. + * The result matrix from calling SGELSS with an all-zero input matrix is + now zeroed completely. + +x86_64: + * Autodetection of AMD Ryzen2 has been fixed (again). + * CMAKE builds now support labeling of an INTERFACE64=1 build of + the library with the _64 suffix. + * AVX512 version of DGEMM has been added and the AVX512 SGEMM kernel + has been sped up by rewriting with C intrinsics + * Fixed compilation on RHEL5/CENTOS5 (issue with typename __WAIT_STATUS) + +POWER: + * added support for building on AIX (with gcc and GNU tools from AIX Toolbox). + * CPU type detection has been implemented for AIX. + * CPU type detection has been fixed for NETBSD. + +MIPS64: + * AXPY on LOONGSON3A has been corrected to pass "zero increment" utest. + * DSDOT on LOONGSON3A has been fixed. + * the SGEMM microkernel has been hardened against potential data loss. + +ARMV8: + * DYNAMic_ARCH support is now available for 64bit ARM + * cross-compiling for ARMV8 under iOS now works. + * cpu-specific code has been rearranged to make better use of both + hardware commonalities and model-specific compiler optimizations. + * XGENE1 has been removed as a TARGET, superseded by the improved generic + ARMV8 support. + +ARMV7: + * Older assembly mnemonics have been converted to UAL form to allow + building with clang 7.0 + * Cross compiling LAPACKE for Android has been fixed again (broken by + update to LAPACK 3.7.0 some while ago). 
+ ==================================================================== Version 0.3.3 31-Aug-2018 From 93fa6b7b76ffbd56ffce54ac11467d580f53537c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Dec 2018 23:42:33 +0100 Subject: [PATCH 142/236] Increment version to 0.3.5.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 296113941..24c169afe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 4) +set(OpenBLAS_PATCH_VERSION 5.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From f5acaad8f0590502e26539917a0704e572e17abc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Dec 2018 23:43:15 +0100 Subject: [PATCH 143/236] Increment version to 0.3.5.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index f3086a01b..0d5b83b39 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.4 +VERSION = 0.3.5.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 360374be62cab8f5be8baecfa675da59a571608d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Dec 2018 23:44:13 +0100 Subject: [PATCH 144/236] Update with the changes from 0.3.4 --- Changelog.txt | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index faecd82e3..0dd17a558 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,77 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.4 +02-Dec-2018 + +common: + * the new, experimental thread-local memory allocation had + inadvertently been left enabled for gmake builds in 0.3.3 + despite the announcement. It is now disabled by default, and + single-threaded builds will keep using the old allocator even + if the USE_TLS option is turned on. + * OpenBLAS will now provide enough buffer space for at least 50 + threads by default. + * The output of openblas_get_config() now contains the version + number. + * A serious thread safety bug in GEMV operation with small M and + large N size has been fixed. + * The code will now automatically call blas_thread_init after a + fork if needed before handling a call to openblas_set_num_threads + * Accesses to parallelized level3 functions from multiple callers + are now serialized to avoid thread races (unless using OpenMP). + This should provide better performance than the known-threadsafe + (but non-default) USE_SIMPLE_THREADED_LEVEL3 option. + * When building LAPACK with gfortran, -frecursive is now (again) + enabled by default to ensure correct behaviour. + * The OpenBLAS version cblas.h now supports both CBLAS_ORDER and + CBLAS_LAYOUT as the name of the matrix row/column order option. + * Externally set LDFLAGS are now passed through to the final compile/link + steps to facilitate setting platform-specific linker flags. + * A potential race condition during the build of LAPACK (that would + usually manifest itself as a failure to build TESTING/MATGEN) has been + fixed. 
+ * xHEMV has been changed to stay single-threaded for small input sizes + where the overhead of multithreading exceeds any possible gains + * CSWAP and ZSWAP have been limited to a single thread except on ARMV8 or + ThunderX hardware with sizable input. + * Linker flags for the PGI compiler have been updated + * Behaviour of AXPY with zero increments is now handled in the C interface, + correcting the result on at least Intel Atom. + * The result matrix from calling SGELSS with an all-zero input matrix is + now zeroed completely. + +x86_64: + * Autodetection of AMD Ryzen2 has been fixed (again). + * CMAKE builds now support labeling of an INTERFACE64=1 build of + the library with the _64 suffix. + * AVX512 version of DGEMM has been added and the AVX512 SGEMM kernel + has been sped up by rewriting with C intrinsics + * Fixed compilation on RHEL5/CENTOS5 (issue with typename __WAIT_STATUS) + +POWER: + * added support for building on AIX (with gcc and GNU tools from AIX Toolbox). + * CPU type detection has been implemented for AIX. + * CPU type detection has been fixed for NETBSD. + +MIPS64: + * AXPY on LOONGSON3A has been corrected to pass "zero increment" utest. + * DSDOT on LOONGSON3A has been fixed. + * the SGEMM microkernel has been hardened against potential data loss. + +ARMV8: + * DYNAMic_ARCH support is now available for 64bit ARM + * cross-compiling for ARMV8 under iOS now works. + * cpu-specific code has been rearranged to make better use of both + hardware commonalities and model-specific compiler optimizations. + * XGENE1 has been removed as a TARGET, superseded by the improved generic + ARMV8 support. + +ARMV7: + * Older assembly mnemonics have been converted to UAL form to allow + building with clang 7.0 + * Cross compiling LAPACKE for Android has been fixed again (broken by + update to LAPACK 3.7.0 some while ago). 
+ ==================================================================== Version 0.3.3 31-Aug-2018 From ea6d1b96bd3fdaf8e8b4d912bdd906cbcb9b1bbf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 3 Dec 2018 08:59:10 +0100 Subject: [PATCH 145/236] Update Makefile.system --- Makefile.system | 2 -- 1 file changed, 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 6919c0114..3cf5a16b2 100644 --- a/Makefile.system +++ b/Makefile.system @@ -16,8 +16,6 @@ else ifeq ($(ARCH), powerpc64) override ARCH=power endif -endif - NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib # Default C compiler From 701ea88347461e4c5d896765438dc870281b3834 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 3 Dec 2018 13:06:43 +0100 Subject: [PATCH 146/236] Use p2align instead of align for OSX compatibility fixes #1902 --- kernel/x86_64/dgemm_kernel_4x8_skylakex.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c index a83ca98fa..6257e569e 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c @@ -869,7 +869,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, "vmovapd %%zmm1, %%zmm27\n" "vmovapd %%zmm1, %%zmm28\n" "jmp .label24\n" - ".align 32\n" + ".p2align 5\n" /* Inner math loop */ ".label24:\n" "vmovupd -128(%[AO]),%%zmm0\n" @@ -1037,7 +1037,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, "vmovapd %%zmm1, %%zmm17\n" "vmovapd %%zmm1, %%zmm18\n" "jmp .label16\n" - ".align 32\n" + ".p2align 5\n" /* Inner math loop */ ".label16:\n" "vmovupd -128(%[AO]),%%zmm0\n" @@ -1165,7 +1165,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, "vmovapd %%zmm1, %%zmm8\n" "vbroadcastsd (%[alpha]), %%zmm9\n" "jmp .label1\n" - ".align 32\n" + ".p2align 5\n" /* Inner math loop */ ".label1:\n" "vmovupd -128(%[AO]),%%zmm0\n" From 31a490ea887dd078233aebffc5a57a093fe2d886 Mon Sep 17 00:00:00 2001 From: Renato Golin Date: Wed, 5 Dec 2018 18:51:38 +0000 Subject: [PATCH 147/236] Fix two mistakes on Arm64 builds * Falkor is an ARMv8.0 with ARMv8.1 features, and chosing armv8.1-a for march generates instructions it cannot cope with. Reverting it back to armv8-a. * ThunderX2's build was left with a #define VULCAN, which made it miss the right compiler flags in Makefile.arm64, although it did create the right library in the end. 
--- Makefile.arm64 | 4 ++-- cpuid_arm64.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.arm64 b/Makefile.arm64 index a529fab80..cd16dbfae 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -30,8 +30,8 @@ FCOMMON_OPT += -march=armv8-a -mtune=thunderx endif ifeq ($(CORE), FALKOR) -CCOMMON_OPT += -march=armv8.1-a -mtune=falkor -FCOMMON_OPT += -march=armv8.1-a -mtune=falkor +CCOMMON_OPT += -march=armv8-a -mtune=falkor +FCOMMON_OPT += -march=armv8-a -mtune=falkor endif ifeq ($(CORE), THUNDERX2T99) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index c914fbc2b..5077d7b11 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -270,7 +270,7 @@ void get_cpuconfig(void) break; case CPU_THUNDERX2T99: - printf("#define VULCAN \n"); + printf("#define THUNDERX2T99 \n"); printf("#define L1_CODE_SIZE 32768 \n"); printf("#define L1_CODE_LINESIZE 64 \n"); printf("#define L1_CODE_ASSOCIATIVE 8 \n"); From 6ba30e270d0a6988e02f45cd0b5ef2b505c5619c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Dec 2018 13:42:25 +0100 Subject: [PATCH 148/236] Fix typo that broke CNRM2 on ARMV8 since 0.3.0 must have happened in my #1449 --- kernel/arm64/KERNEL.ARMV8 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index 5c70390dc..07d6cee99 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -93,8 +93,8 @@ IZAMAXKERNEL = izamax.S ifneq ($(OS_DARWIN)$(CROSS),11) SNRM2KERNEL = nrm2.S -CNRM2KERNEL = nrm2.S -DNRM2KERNEL = znrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S ZNRM2KERNEL = znrm2.S endif From 2fc712469d1e29220e2e3f3f83d2ab7b17c0bc60 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Dec 2018 13:56:06 +0100 Subject: [PATCH 149/236] Avoid creating spurious non-suffixed c/zgemm_kernels Plain cgemm_kernel and zgemm_kernel are not used anywhere, only cgemm_kernel_b etc. Needlessly building them (without any define like NN, CN, etc.) just happened to work on most platforms, but not on arm64. See #1870 --- kernel/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 947114ebe..2a330df4e 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -125,10 +125,13 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) set(USE_TRMM true) endif () - foreach (float_type ${FLOAT_TYPES}) + foreach (float_type SINGLE DOUBLE) string(SUBSTRING ${float_type} 0 1 float_char) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) + endforeach() + foreach (float_type ${FLOAT_TYPES}) + string(SUBSTRING ${float_type} 0 1 float_char) if (${float_char}GEMMINCOPY) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) endif () From 7639f2e1f004d441757a43bcdfff6c32611a2aa3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Dec 2018 14:04:27 +0100 Subject: [PATCH 150/236] Rewrite the conditional for OSX to fix cmake parsing on others The Makefile variable parser in utils.cmake currently does not handle conditionals. Having the definitions for non-OSX last will at least make cmake builds work again on non-OSX platforms. 
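As a minimal illustration of the limitation (an assumption inferred from the behaviour described above: the cmake-side parser in cmake/utils.cmake only collects plain "VAR = value" assignments in file order and skips the ifeq/else/endif keywords, so the assignment it sees last is the one that ends up being used), a hypothetical KERNEL-file fragment would be read like this:

    # GNU make evaluates the conditional; the cmake parser does not and
    # simply records both assignments, keeping the last one it encounters.
    ifeq ($(OS_DARWIN)$(CROSS),11)
    SGEMMKERNEL = ../generic/gemmkernel_2x2.c
    else
    SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
    endif
    # With the generic Darwin cross-build fallback first and the optimized
    # default last, a non-OSX cmake build again picks up the optimized kernel.
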
--- kernel/arm64/KERNEL.ARMV8 | 63 +++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index 07d6cee99..a2a435738 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -104,8 +104,38 @@ CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S DSDOTKERNEL = dot.S -ifneq ($(OS_DARWIN)$(CROSS),11) +ifeq ($(OS_DARWIN)$(CROSS),11) +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +else SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) @@ -173,35 +203,4 @@ ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) -else - -STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL = ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DGEMMKERNEL = ../generic/gemmkernel_2x2.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) - endif From 0b095166788b28dc9270edca2eb62ef2f201f6fe Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Dec 2018 18:33:05 +0100 Subject: [PATCH 151/236] Fix missing parameter in popen call --- cpuid_power.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_power.c b/cpuid_power.c index 23e98ebb0..82a3f4aac 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -136,7 +136,7 @@ int detect(void){ char buffer[512], *p; p = (char *)NULL; - infile = popen("prtconf|grep 'Processor Type'"); + infile = popen("prtconf|grep 'Processor 
Type'", "r"); while (fgets(buffer, sizeof(buffer), infile)){ if (!strncmp("Pro", buffer, 3)){ p = strchr(buffer, ':') + 2; From 2b355592e34b07f4d0c5f81c275c902c0578236d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 7 Dec 2018 16:25:55 +0100 Subject: [PATCH 152/236] Make sure to use the arm version of dynamic.c in ARM64 DYNAMIC_ARCH cf. #1908 --- driver/others/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index e20b14e79..f7cce4d46 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -47,7 +47,11 @@ GenerateNamedObjects("abs.c" "DOUBLE" "z_abs" 0 "" "" 1) GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" "" 1) if (DYNAMIC_ARCH) - list(APPEND COMMON_SOURCES dynamic.c) + if (ARM64) + list(APPEND COMMON_SOURcES dynamic_arm64.c) + else () + list(APPEND COMMON_SOURCES dynamic.c) + endif () else () list(APPEND COMMON_SOURCES parameter.c) endif () From 133c278ee565e91ff65d627b363aee36b71feeba Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 7 Dec 2018 17:42:23 +0100 Subject: [PATCH 153/236] Add DYNAMIC_CORE list for ARM64 cf #1908 --- cmake/arch.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 52fb64eaa..63fb86fa2 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -44,6 +44,10 @@ endif () if (DYNAMIC_ARCH) + if (ARM64) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99) + endif () + if (X86) set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO) endif () From 0bf6d74e5f9855ddf2028dcc099ee58e4f13446b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 7 Dec 2018 19:37:33 +0100 Subject: [PATCH 154/236] Fix typo in previous commit for arm dynamic arch --- driver/others/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index f7cce4d46..a07e00b3b 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -48,7 +48,7 @@ GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" if (DYNAMIC_ARCH) if (ARM64) - list(APPEND COMMON_SOURcES dynamic_arm64.c) + list(APPEND COMMON_SOURCES dynamic_arm64.c) else () list(APPEND COMMON_SOURCES dynamic.c) endif () From 38cc63859131921885b80ed5139304dc80c5a163 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Dec 2018 21:09:26 +0100 Subject: [PATCH 155/236] Avoid adding blanket march=skylake-avx512 to dynamic_arch builds --- Makefile.x86_64 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index f2647fb7d..dbee28079 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -9,6 +9,7 @@ endif endif ifeq ($(CORE), SKYLAKEX) +ifndef DYNAMIC_ARCH ifndef NO_AVX512 CCOMMON_OPT += -march=skylake-avx512 FCOMMON_OPT += -march=skylake-avx512 @@ -22,6 +23,7 @@ endif endif endif endif +endif ifeq ($(OSNAME), Interix) ARFLAGS = -m x64 From 06f7d78d70b95f936765312b8c8b3cadf7265ae5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Dec 2018 21:10:38 +0100 Subject: [PATCH 156/236] Add -march=skylake-avx512 to SkylakeX part of DYNAMIC_ARCH builds --- kernel/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/Makefile b/kernel/Makefile index 923ffc363..6e178f80b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ 
-6,7 +6,11 @@ TOPDIR = .. include $(TOPDIR)/Makefile.system ifdef TARGET_CORE +ifeq ($(TARGET_CORE), SKYLAKEX) +override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 +else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) +endif BUILD_KERNEL = 1 KDIR = TSUFFIX = _$(TARGET_CORE) From 51aec8e96b78f93f9a6dcbbf1edd212c5f1ab2ca Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Dec 2018 22:47:32 +0100 Subject: [PATCH 157/236] Make sure the added march=skylake-avx512 does not cause problems on Windows --- kernel/Makefile | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 6e178f80b..a441bde7c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,7 +7,15 @@ include $(TOPDIR)/Makefile.system ifdef TARGET_CORE ifeq ($(TARGET_CORE), SKYLAKEX) -override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 + ifeq ($(OSNAME), CYGWIN_NT) + override CFLAGS += -fno-asynchronous-unwind-tables + endif + ifeq ($(OSNAME), WINNT) + ifeq ($(C_COMPILER), GCC) + override CFLAGS += -fno-asynchronous-unwind-tables + endif + endif else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif From cdc668d82b7afd6a2ddee33987ecfebcaccebc2d Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 12 Dec 2018 16:45:57 +0000 Subject: [PATCH 158/236] Add a "sgemm direct" mode for small matrices OpenBLAS has a fancy algorithm for copying the input data while laying it out in a more CPU-friendly memory layout. This is great for large matrices; the cost of the copy is easily amortized by the gains from the better memory layout. But for small matrices (on CPUs that can do efficient unaligned loads) this copy can be a net loss. This patch adds (for SKYLAKEX initially) a "sgemm direct" mode that bypasses the whole copy machinery for ALPHA=1/BETA=0/... standard arguments, for small matrices only. What is small?
For the non-threaded case this has been measured to be in the M*N*K = 28 * 512 * 512 range, while in the threaded case it's less, around M*N*K = 1 * 512 * 512 --- common_level3.h | 8 + interface/gemm.c | 8 + kernel/x86_64/sgemm_kernel_16x4_skylakex.c | 467 ++++++++++++++++++++- param.h | 1 + 4 files changed, 483 insertions(+), 1 deletion(-) diff --git a/common_level3.h b/common_level3.h index 1f5490baa..6fa902be8 100644 --- a/common_level3.h +++ b/common_level3.h @@ -47,6 +47,14 @@ __global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *); extern "C" { #endif +extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K, + float * A, BLASLONG strideA, + float * B, BLASLONG strideB, + float * R, BLASLONG strideR); + +extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); + + int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double, diff --git a/interface/gemm.c b/interface/gemm.c index a3bac5984..97e71bc85 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -271,6 +271,14 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS PRINT_DEBUG_CNAME; +#if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT) + if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && sgemm_kernel_direct_performant(m,n,k)) { + sgemm_kernel_direct(m, n, k, a, lda, b, ldb, c, ldc); + return; + } + +#endif + #ifndef COMPLEX args.alpha = (void *)α args.beta = (void *)β diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c index 10d3d22ed..3246e681f 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c @@ -760,7 +760,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************************************/ int __attribute__ ((noinline)) -CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG ldc) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, float * __restrict B, float * __restrict C, BLASLONG ldc) { unsigned long M = m, N = n, K = k; if (M == 0) @@ -1175,3 +1175,468 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f return 0; } + + +/* + * "Direct sgemm" code. This code operates directly on the inputs and outputs + * of the sgemm call, avoiding the copies, memory realignments and threading, + * and only supports alpha = 1 and beta = 0. + * This is a common case and provides value for relatively small matrixes. + * For larger matrixes the "regular" sgemm code is superior, there the cost of + * copying/shuffling the B matrix really pays off. 
+ */ + + + +#define DECLARE_RESULT_512(N,M) __m512 result##N##M = _mm512_setzero_ps() +#define BROADCAST_LOAD_A_512(N,M) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)])) +#define LOAD_B_512(N,M) __m512 Bval##N = _mm512_loadu_ps(&B[strideB * k + j + (N*16)]) +#define MATMUL_512(N,M) result##N##M = _mm512_fmadd_ps(Aval##M, Bval##N , result##N##M) +#define STORE_512(N,M) _mm512_storeu_ps(&R[(i+M) * strideR + j+(N*16)], result##N##M) + + +#define DECLARE_RESULT_256(N,M) __m256 result##N##M = _mm256_setzero_ps() +#define BROADCAST_LOAD_A_256(N,M) __m256 Aval##M = _mm256_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)])) +#define LOAD_B_256(N,M) __m256 Bval##N = _mm256_loadu_ps(&B[strideB * k + j + (N*8)]) +#define MATMUL_256(N,M) result##N##M = _mm256_fmadd_ps(Aval##M, Bval##N , result##N##M) +#define STORE_256(N,M) _mm256_storeu_ps(&R[(i+M) * strideR + j+(N*8)], result##N##M) + +#define DECLARE_RESULT_128(N,M) __m128 result##N##M = _mm_setzero_ps() +#define BROADCAST_LOAD_A_128(N,M) __m128 Aval##M = _mm_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)])) +#define LOAD_B_128(N,M) __m128 Bval##N = _mm_loadu_ps(&B[strideB * k + j + (N*4)]) +#define MATMUL_128(N,M) result##N##M = _mm_fmadd_ps(Aval##M, Bval##N , result##N##M) +#define STORE_128(N,M) _mm_storeu_ps(&R[(i+M) * strideR + j+(N*4)], result##N##M) + +#define DECLARE_RESULT_SCALAR(N,M) float result##N##M = 0; +#define BROADCAST_LOAD_A_SCALAR(N,M) float Aval##M = A[k + strideA * (i + M)]; +#define LOAD_B_SCALAR(N,M) float Bval##N = B[k * strideB + j + N]; +#define MATMUL_SCALAR(N,M) result##N##M += Aval##M * Bval##N; +#define STORE_SCALAR(N,M) R[(i+M) * strideR + j + N] = result##N##M; + +int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) +{ + int mnk = M * N * K; + /* large matrixes -> not performant */ + if (mnk >= 28 * 512 * 512) + return 0; + + /* + * if the B matrix is not a nice multiple if 4 we get many unaligned accesses, + * and the regular sgemm copy/realignment of data pays off much quicker + */ + if ((N & 3) != 0 && (mnk >= 8 * 512 * 512)) + return 0; + +#ifdef SMP + /* if we can run multithreaded, the threading changes the based threshold */ + if (mnk > 2 * 350 * 512 && num_cpu_avail(3)> 1) + return 0; +#endif + + return 1; +} + + + +void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) +{ + int i, j, k; + + int m4 = M & ~3; + int m2 = M & ~1; + + int n64 = N & ~63; + int n32 = N & ~31; + int n16 = N & ~15; + int n8 = N & ~7; + int n4 = N & ~3; + int n2 = N & ~1; + + i = 0; + + for (i = 0; i < m4; i+=4) { + + for (j = 0; j < n64; j+= 64) { + k = 0; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + BROADCAST_LOAD_A_512(x, 2); + BROADCAST_LOAD_A_512(x, 3); + + LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 
2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + } + + for (; j < n32; j+= 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + BROADCAST_LOAD_A_512(x, 2); + BROADCAST_LOAD_A_512(x, 3); + + LOAD_B_512(0, x); LOAD_B_512(1, x); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + } + + for (; j < n16; j+= 16) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + BROADCAST_LOAD_A_512(x, 2); + BROADCAST_LOAD_A_512(x, 3); + + LOAD_B_512(0, x); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + } + + for (; j < n8; j+= 8) { + DECLARE_RESULT_256(0, 0); + DECLARE_RESULT_256(0, 1); + DECLARE_RESULT_256(0, 2); + DECLARE_RESULT_256(0, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_256(x, 0); + BROADCAST_LOAD_A_256(x, 1); + BROADCAST_LOAD_A_256(x, 2); + BROADCAST_LOAD_A_256(x, 3); + + LOAD_B_256(0, x); + + MATMUL_256(0, 0); + MATMUL_256(0, 1); + MATMUL_256(0, 2); + MATMUL_256(0, 3); + } + STORE_256(0, 0); + STORE_256(0, 1); + STORE_256(0, 2); + STORE_256(0, 3); + } + + for (; j < n4; j+= 4) { + DECLARE_RESULT_128(0, 0); + DECLARE_RESULT_128(0, 1); + DECLARE_RESULT_128(0, 2); + DECLARE_RESULT_128(0, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_128(x, 0); + BROADCAST_LOAD_A_128(x, 1); + BROADCAST_LOAD_A_128(x, 2); + BROADCAST_LOAD_A_128(x, 3); + + LOAD_B_128(0, x); + + MATMUL_128(0, 0); + MATMUL_128(0, 1); + MATMUL_128(0, 2); + MATMUL_128(0, 3); + } + STORE_128(0, 0); + STORE_128(0, 1); + STORE_128(0, 2); + STORE_128(0, 3); + } + + for (; j < n2; j+= 2) { + DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0); + DECLARE_RESULT_SCALAR(0, 1); DECLARE_RESULT_SCALAR(1, 1); + DECLARE_RESULT_SCALAR(0, 2); DECLARE_RESULT_SCALAR(1, 2); + DECLARE_RESULT_SCALAR(0, 3); DECLARE_RESULT_SCALAR(1, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(x, 0); + BROADCAST_LOAD_A_SCALAR(x, 1); + BROADCAST_LOAD_A_SCALAR(x, 2); + BROADCAST_LOAD_A_SCALAR(x, 3); + + LOAD_B_SCALAR(0, x); LOAD_B_SCALAR(1, x); + + MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0); + MATMUL_SCALAR(0, 1); MATMUL_SCALAR(1, 1); + MATMUL_SCALAR(0, 2); MATMUL_SCALAR(1, 2); + MATMUL_SCALAR(0, 3); MATMUL_SCALAR(1, 3); + } + STORE_SCALAR(0, 0); STORE_SCALAR(1, 0); + STORE_SCALAR(0, 1); STORE_SCALAR(1, 1); + STORE_SCALAR(0, 2); STORE_SCALAR(1, 2); + STORE_SCALAR(0, 3); STORE_SCALAR(1, 3); + } + + for (; j < N; j++) { + DECLARE_RESULT_SCALAR(0, 0) + DECLARE_RESULT_SCALAR(0, 1) + DECLARE_RESULT_SCALAR(0, 2) + 
DECLARE_RESULT_SCALAR(0, 3) + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(0, 0); + BROADCAST_LOAD_A_SCALAR(0, 1); + BROADCAST_LOAD_A_SCALAR(0, 2); + BROADCAST_LOAD_A_SCALAR(0, 3); + + LOAD_B_SCALAR(0, 0); + + MATMUL_SCALAR(0, 0); + MATMUL_SCALAR(0, 1); + MATMUL_SCALAR(0, 2); + MATMUL_SCALAR(0, 3); + } + STORE_SCALAR(0, 0); + STORE_SCALAR(0, 1); + STORE_SCALAR(0, 2); + STORE_SCALAR(0, 3); + } + } + + for (; i < m2; i+=2) { + j = 0; + + for (; j < n64; j+= 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + + LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + + for (; j < n32; j+= 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + + LOAD_B_512(0, x); LOAD_B_512(1, x); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + + + for (; j < n16; j+= 16) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + + LOAD_B_512(0, x); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + + for (; j < n8; j+= 8) { + DECLARE_RESULT_256(0, 0); + DECLARE_RESULT_256(0, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_256(x, 0); + BROADCAST_LOAD_A_256(x, 1); + + LOAD_B_256(0, x); + + MATMUL_256(0, 0); + MATMUL_256(0, 1); + } + STORE_256(0, 0); + STORE_256(0, 1); + } + + for (; j < n4; j+= 4) { + DECLARE_RESULT_128(0, 0); + DECLARE_RESULT_128(0, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_128(x, 0); + BROADCAST_LOAD_A_128(x, 1); + + LOAD_B_128(0, x); + + MATMUL_128(0, 0); + MATMUL_128(0, 1); + } + STORE_128(0, 0); + STORE_128(0, 1); + } + for (; j < n2; j+= 2) { + DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0); + DECLARE_RESULT_SCALAR(0, 1); DECLARE_RESULT_SCALAR(1, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(x, 0); + BROADCAST_LOAD_A_SCALAR(x, 1); + + LOAD_B_SCALAR(0, x); LOAD_B_SCALAR(1, x); + + MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0); + MATMUL_SCALAR(0, 1); MATMUL_SCALAR(1, 1); + } + STORE_SCALAR(0, 0); STORE_SCALAR(1, 0); + STORE_SCALAR(0, 1); STORE_SCALAR(1, 1); + } + + for (; j < N; j++) { + DECLARE_RESULT_SCALAR(0, 0); + DECLARE_RESULT_SCALAR(0, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(0, 0); + BROADCAST_LOAD_A_SCALAR(0, 1); + + LOAD_B_SCALAR(0, 0); + + MATMUL_SCALAR(0, 0); + MATMUL_SCALAR(0, 1); + } + STORE_SCALAR(0, 0); + STORE_SCALAR(0, 1); + } + } + + for (; i < M; i+=1) { + j = 0; + for (; j < n64; j+= 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x); + MATMUL_512(0, 0); MATMUL_512(1, 
0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + for (; j < n32; j+= 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + LOAD_B_512(0, x); LOAD_B_512(1, x); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + + + for (; j < n16; j+= 16) { + DECLARE_RESULT_512(0, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + + LOAD_B_512(0, x); + + MATMUL_512(0, 0); + } + STORE_512(0, 0); + } + + for (; j < n8; j+= 8) { + DECLARE_RESULT_256(0, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_256(x, 0); + LOAD_B_256(0, x); + MATMUL_256(0, 0); + } + STORE_256(0, 0); + } + + for (; j < n4; j+= 4) { + DECLARE_RESULT_128(0, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_128(x, 0); + LOAD_B_128(0, x); + MATMUL_128(0, 0); + } + STORE_128(0, 0); + } + + for (; j < n2; j+= 2) { + DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(x, 0); + LOAD_B_SCALAR(0, 0); LOAD_B_SCALAR(1, 0); + MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0); + } + STORE_SCALAR(0, 0); STORE_SCALAR(1, 0); + } + + for (; j < N; j++) { + DECLARE_RESULT_SCALAR(0, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(0, 0); + LOAD_B_SCALAR(0, 0); + MATMUL_SCALAR(0, 0); + } + STORE_SCALAR(0, 0); + } + } +} \ No newline at end of file diff --git a/param.h b/param.h index 8f56cdaaa..7a18d82d7 100644 --- a/param.h +++ b/param.h @@ -1628,6 +1628,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SWITCH_RATIO 32 #define GEMM_PREFERED_SIZE 32 +#define USE_SGEMM_KERNEL_DIRECT 1 #ifdef ARCH_X86 From 00dc09ad198aedec53fd05ea1b13d72d7a9a517a Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 15 Dec 2018 13:18:59 +0000 Subject: [PATCH 159/236] Use the skylake sgemm beta code also for haswell with a few small changes it's possible to use the skylake sgemm code also for haswell, this gives a modest gain (10% range) for smallish matrixes but does wonders for very skinny matrixes --- kernel/x86_64/KERNEL.HASWELL | 1 + kernel/x86_64/sgemm_beta_skylakex.c | 15 +++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 848de38df..2aec60064 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -33,6 +33,7 @@ ZAXPYKERNEL = zaxpy.c STRMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMKERNEL = sgemm_kernel_16x4_haswell.S +SGEMM_BETA = sgemm_beta_skylakex.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index 498c46f0d..e8653112c 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -61,11 +61,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset = c; if (beta == ZERO){ - __m512 z_zero; - __m256 y_zero; +#ifdef __AVX512CD__ + __m512 z_zero = _mm512_setzero_ps(); +#endif + __m256 y_zero = _mm256_setzero_ps(); - z_zero = _mm512_setzero_ps(); - y_zero = _mm256_setzero_ps(); j = n; do { c_offset1 = c_offset; @@ -74,8 +74,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, i = m; while (i >= 32) { +#ifdef __AVX512CD__ _mm512_storeu_ps(c_offset1, z_zero); _mm512_storeu_ps(c_offset1 + 16, z_zero); +#else + _mm256_storeu_ps(c_offset1, 
y_zero); + _mm256_storeu_ps(c_offset1 + 8, y_zero); + _mm256_storeu_ps(c_offset1 + 16, y_zero); + _mm256_storeu_ps(c_offset1 + 24, y_zero); +#endif c_offset1 += 32; i -= 32; } From 0586899a10b97bf1baf50e4988d18b4268317420 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 15 Dec 2018 13:43:07 +0000 Subject: [PATCH 160/236] Use sgemm_ncopy_4_skylakex.c also for Haswell sgemm_ncopy_4_skylakex.c uses SSE transpose operations where the real perf win happens; this also works great for Haswell. This gives double digit percentage gains on small and skinny matrices --- kernel/x86_64/KERNEL.HASWELL | 2 +- kernel/x86_64/sgemm_ncopy_4_skylakex.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 2aec60064..422e6c315 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -36,7 +36,7 @@ SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMM_BETA = sgemm_beta_skylakex.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c -SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMONCOPY = sgemm_ncopy_4_skylakex.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/x86_64/sgemm_ncopy_4_skylakex.c b/kernel/x86_64/sgemm_ncopy_4_skylakex.c index 8577e3b38..6b2b0f5b1 100644 --- a/kernel/x86_64/sgemm_ncopy_4_skylakex.c +++ b/kernel/x86_64/sgemm_ncopy_4_skylakex.c @@ -49,8 +49,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __ FLOAT *b_offset; FLOAT ctemp1, ctemp2, ctemp3, ctemp4; FLOAT ctemp5, ctemp6, ctemp7, ctemp8; - FLOAT ctemp9, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp9, ctemp13; a_offset = a; b_offset = b; From 1ebe5c0f499575d42e85b4f89e4205882be8ebe3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Dec 2018 19:35:35 +0100 Subject: [PATCH 161/236] Add -march=haswell to HASWELL part of DYNAMIC_ARCH build --- kernel/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/Makefile b/kernel/Makefile index a441bde7c..d86411d91 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -16,6 +16,8 @@ ifeq ($(TARGET_CORE), SKYLAKEX) override CFLAGS += -fno-asynchronous-unwind-tables endif endif +elseifeq($(TARGET_CORE), HASWELL) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=haswell else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif From 2a3190dc76a3eb60fabe298b1df04c46cdca5350 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Dec 2018 20:17:44 +0100 Subject: [PATCH 162/236] fix elseifeq and use older option core2-avx for compatibility --- kernel/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile b/kernel/Makefile index d86411d91..169c7f79c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -16,8 +16,8 @@ ifeq ($(TARGET_CORE), SKYLAKEX) override CFLAGS += -fno-asynchronous-unwind-tables endif endif -elseifeq($(TARGET_CORE), HASWELL) - override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=haswell +else ifeq($(TARGET_CORE), HASWELL) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=core2-avx else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif From fbcb14a74bb252ea344f5b10d3d741268326906f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Dec 2018 20:18:59 +0100 Subject: [PATCH 
163/236] should be core-avx2 --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 169c7f79c..a9208619f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -17,7 +17,7 @@ ifeq ($(TARGET_CORE), SKYLAKEX) endif endif else ifeq($(TARGET_CORE), HASWELL) - override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=core2-avx + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=core-avx2 else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif From 3843e3e01781970690325542fe15a722f87407c6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Dec 2018 23:30:31 +0100 Subject: [PATCH 164/236] use -maxv2 on haswell --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index a9208619f..b01893175 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -17,7 +17,7 @@ ifeq ($(TARGET_CORE), SKYLAKEX) endif endif else ifeq($(TARGET_CORE), HASWELL) - override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=core-avx2 + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -mavx2 else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif From 69d206440ab669794201d65d4e8087060e519474 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 16 Dec 2018 00:19:41 +0000 Subject: [PATCH 165/236] Make the skylakex/haswell sgemm code compile and run even with compilers without avx2 support --- kernel/x86_64/sgemm_beta_skylakex.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index e8653112c..cdc9c44be 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -61,10 +61,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset = c; if (beta == ZERO){ -#ifdef __AVX512CD__ - __m512 z_zero = _mm512_setzero_ps(); -#endif - __m256 y_zero = _mm256_setzero_ps(); j = n; do { @@ -72,12 +68,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset += ldc; i = m; - +#ifdef __AVX2__ while (i >= 32) { #ifdef __AVX512CD__ + __m512 z_zero = _mm512_setzero_ps(); _mm512_storeu_ps(c_offset1, z_zero); _mm512_storeu_ps(c_offset1 + 16, z_zero); #else + __m256 y_zero = _mm256_setzero_ps(); _mm256_storeu_ps(c_offset1, y_zero); _mm256_storeu_ps(c_offset1 + 8, y_zero); _mm256_storeu_ps(c_offset1 + 16, y_zero); @@ -87,11 +85,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, i -= 32; } while (i >= 8) { + __m256 y_zero = _mm256_setzero_ps(); _mm256_storeu_ps(c_offset1, y_zero); c_offset1 += 8; i -= 8; } - +#endif while (i > 0) { *c_offset1 = ZERO; c_offset1 ++; From 545c2b1bbbbe9a1c548150189e54fc76e62e4b13 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Dec 2018 13:09:19 +0100 Subject: [PATCH 166/236] Add -mavx2 on Haswell only if the compiler supports it --- kernel/Makefile | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile b/kernel/Makefile index b01893175..17bfd4063 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,6 +5,27 @@ endif TOPDIR = .. 
include $(TOPDIR)/Makefile.system +AVX2OPT = +ifeq ($(C_COMPILER), GCC) +# AVX2 support was added in 4.7.0 + GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) + GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) + ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11) + AVX2OPT = -mavx2 + endif +endif +ifeq ($(C_COMPILER), CLANG) +# Any clang posing as gcc 4.2 should be new enough (3.4 or later) + GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) + GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2) + ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2), 11) + AVX2OPT -mavx2 + endif +endif +ifdef NO_AVX2 + AVX2OPT= +endif + ifdef TARGET_CORE ifeq ($(TARGET_CORE), SKYLAKEX) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 @@ -17,9 +38,9 @@ ifeq ($(TARGET_CORE), SKYLAKEX) endif endif else ifeq($(TARGET_CORE), HASWELL) - override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -mavx2 + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) else -override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif BUILD_KERNEL = 1 KDIR = From cfc4acc221344d53d72550d157c5050ddaa26ed7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Dec 2018 16:19:51 +0100 Subject: [PATCH 167/236] typo --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 17bfd4063..30292cd80 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -19,7 +19,7 @@ ifeq ($(C_COMPILER), CLANG) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2) ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2), 11) - AVX2OPT -mavx2 + AVX2OPT = -mavx2 endif endif ifdef NO_AVX2 From c4e23dd016ed2852ebf59a0d744deb55a48e66c2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Dec 2018 18:14:40 +0100 Subject: [PATCH 168/236] Update Makefile --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 30292cd80..e81225075 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -37,7 +37,7 @@ ifeq ($(TARGET_CORE), SKYLAKEX) override CFLAGS += -fno-asynchronous-unwind-tables endif endif -else ifeq($(TARGET_CORE), HASWELL) +else ifeq ($(TARGET_CORE), HASWELL) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) From c43331ad0aeaefe4b4d90aab06c93655c851feab Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 16 Dec 2018 22:59:02 +0000 Subject: [PATCH 169/236] dgemm: Use the skylakex beta function also for haswell it's more efficient for certain tall/skinny matrices --- kernel/x86_64/KERNEL.HASWELL | 1 + kernel/x86_64/dgemm_beta_skylakex.c | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 422e6c315..4cd67a705 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -45,6 +45,7 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DTRMMKERNEL = dtrmm_kernel_4x8_haswell.c DGEMMKERNEL = dgemm_kernel_4x8_haswell.S +DGEMM_BETA = dgemm_beta_skylakex.c DGEMMINCOPY = ../generic/gemm_ncopy_4.c DGEMMITCOPY = ../generic/gemm_tcopy_4.c 
DGEMMONCOPY = ../generic/gemm_ncopy_8.c diff --git a/kernel/x86_64/dgemm_beta_skylakex.c b/kernel/x86_64/dgemm_beta_skylakex.c index 6a824c9b5..8c24725a1 100644 --- a/kernel/x86_64/dgemm_beta_skylakex.c +++ b/kernel/x86_64/dgemm_beta_skylakex.c @@ -61,17 +61,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset = c; if (beta == ZERO){ - __m512d z_zero; - z_zero = _mm512_setzero_pd(); j = n; do { c_offset1 = c_offset; c_offset += ldc; i = m; - +#ifdef __AVX2__ +#ifdef __AVX512CD__ while (i >= 32) { + __m512d z_zero = _mm512_setzero_pd(); _mm512_storeu_pd(c_offset1, z_zero); _mm512_storeu_pd(c_offset1 + 8, z_zero); _mm512_storeu_pd(c_offset1 + 16, z_zero); @@ -79,12 +79,20 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset1 += 32; i -= 32; } +#endif while (i >= 8) { +#ifdef __AVX512CD__ + __m512d z_zero = _mm512_setzero_pd(); _mm512_storeu_pd(c_offset1, z_zero); +#else + __m256d y_zero = _mm256_setzero_pd(); + _mm256_storeu_pd(c_offset1, y_zero); + _mm256_storeu_pd(c_offset1 + 4, y_zero); +#endif c_offset1 += 8; i -= 8; } - +#endif while (i > 0) { *c_offset1 = ZERO; c_offset1 ++; From d321448a63954d536f90592cd0cc53c304b08d2e Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 16 Dec 2018 23:06:58 +0000 Subject: [PATCH 170/236] dgemm: use dgemm_ncopy_8_skylakex.c also for Haswell The dgemm_ncopy_8_skylakex.c code is not avx512 specific and gives a nice performance boost for medium sized matrices --- kernel/x86_64/KERNEL.HASWELL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 4cd67a705..f98728a41 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -48,7 +48,7 @@ DGEMMKERNEL = dgemm_kernel_4x8_haswell.S DGEMM_BETA = dgemm_beta_skylakex.c DGEMMINCOPY = ../generic/gemm_ncopy_4.c DGEMMITCOPY = ../generic/gemm_tcopy_4.c -DGEMMONCOPY = ../generic/gemm_ncopy_8.c +DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = ../generic/gemm_tcopy_8.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) From b28f75cd7e61cf5bdcf404ebece07f75553ecde0 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 16 Dec 2018 23:08:31 +0000 Subject: [PATCH 171/236] set GEMM_PREFERED_SIZE for HASWELL Haswell likes a GEMM_PREFERED_SIZE of 16 to improve the split that the threading code does to make it a nice multiple of the SIMD kernel size --- param.h | 1 + 1 file changed, 1 insertion(+) diff --git a/param.h b/param.h index 7a18d82d7..fa6730208 100644 --- a/param.h +++ b/param.h @@ -1508,6 +1508,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 8 #define SWITCH_RATIO 32 +#define GEMM_PREFERED_SIZE 16 #ifdef ARCH_X86 From f343ed65b59b04d9757bf10fcc9fec938d9895a2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Dec 2018 22:30:29 +0100 Subject: [PATCH 172/236] Avoid taking the root of a negative number Fixes #1924 where numpy 1.17+ would report the (transient) FE_INVALID exception raised for the domain error. 
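As a side note, the behaviour being worked around here can be reproduced with a few lines of standalone C (illustration only, this is not OpenBLAS code): sqrt() of a negative radicand returns NaN and raises the sticky FE_INVALID flag, and that flag is what numpy 1.17+ reports even though the bogus width value is discarded afterwards. Testing the radicand first, as the change below does, never raises the flag in the first place.

    /* illustration only -- compile with e.g. "cc demo.c -lm" */
    #include <fenv.h>
    #include <math.h>
    #include <stdio.h>

    int main(void) {
        volatile double radicand = -1.0;  /* stands in for di * di + dnum gone negative */
        double width;

        feclearexcept(FE_ALL_EXCEPT);
        width = sqrt(radicand);                        /* NaN, raises FE_INVALID */
        printf("unguarded: %g FE_INVALID=%d\n", width, fetestexcept(FE_INVALID) != 0);

        feclearexcept(FE_ALL_EXCEPT);
        width = (radicand < 0) ? 0.0 : sqrt(radicand); /* guarded as in the fix below */
        printf("guarded:   %g FE_INVALID=%d\n", width, fetestexcept(FE_INVALID) != 0);
        return 0;
    }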
--- driver/level3/syrk_thread.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/driver/level3/syrk_thread.c b/driver/level3/syrk_thread.c index 5f40853dc..b26d363c4 100644 --- a/driver/level3/syrk_thread.c +++ b/driver/level3/syrk_thread.c @@ -48,7 +48,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( BLASLONG width, i; BLASLONG n_from, n_to; - double dnum, nf, nt, di; + double dnum, nf, nt, di, dinum; int num_cpu; int mask = 0; @@ -109,7 +109,11 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( if (nthreads - num_cpu > 1) { di = (double)i; - width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1); + dinum = di * di +dnum; + if (dinum <0) + width = (BLASLONG)(( - di + mask)/(mask+1)) * (mask+1); + else + width = (BLASLONG)(( sqrt(dinum) - di + mask)/(mask+1)) * (mask+1); if ((width <= 0) || (width > n_to - i)) width = n_to - i; @@ -136,9 +140,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( nf = (double)(arg -> n - n_from); nt = (double)(arg -> n - n_to); - dnum = (nt * nt - nf * nf) / (double)nthreads; - num_cpu = 0; range[0] = n_from; @@ -149,8 +151,11 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( if (nthreads - num_cpu > 1) { di = (double)(arg -> n - i); - width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1); - + dinum = di * di + dnum; + if (dinum<0) + width = ((BLASLONG)(di + mask)/(mask+1)) * (mask+1); + else + width = ((BLASLONG)((-sqrt(dinum) + di) + mask)/(mask+1)) * (mask+1); if ((width <= 0) || (width > n_to - i)) width = n_to - i; } else { From 26a3402773050c8fb3c0e633e967fc1a6456fe0b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 23 Dec 2018 12:26:01 +0100 Subject: [PATCH 173/236] Reflect ARMV8 target definition changes from PR1876 and create config target directory for cross-compiles. 
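To make the escaped strings below easier to read: when cross-compiling, this prebuild step writes an ordinary C header in place of the one getarch would generate on a native build. For example, with CORE=CORTEXA57 the appended fragment would look roughly like the excerpt below (values as set in the diff that follows; shown only for illustration, the full header contains a few more cache-geometry defines):

    /* illustrative excerpt of the generated target config header */
    #define CORTEXA57
    #define CHAR_CORENAME "CORTEXA57"
    #define L1_CODE_SIZE        32768
    #define L1_DATA_SIZE        32768
    #define L2_SIZE             262144
    #define DTB_DEFAULT_ENTRIES 64
    #define DTB_SIZE            4096
    #define HAVE_NEON
    #define ARMV8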
--- cmake/prebuild.cmake | 116 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 111 insertions(+), 5 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index f29bc3a75..6ed99e807 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -116,18 +116,19 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define L2_LINESIZE\t64\n" "#define DTB_DEFAULT_ENTRIES\t64\n" "#define DTB_SIZE\t4096\n" - "#define L2_ASSOCIATIVE\t32\n") + "#define L2_ASSOCIATIVE\t32\n" + "#define ARMV8\n") set(SGEMM_UNROLL_M 4) set(SGEMM_UNROLL_N 4) - elseif ("${CORE}" STREQUAL "CORTEXA57") + elseif ("${CORE}" STREQUAL "CORTEXA57" OR "${CORE}" STREQUAL "CORTEXA53") file(APPEND ${TARGET_CONF_TEMP} - "#define L1_CODE_SIZE\t49152\n" + "#define L1_CODE_SIZE\t32768\n" "#define L1_CODE_LINESIZE\t64\n" "#define L1_CODE_ASSOCIATIVE\t3\n" "#define L1_DATA_SIZE\t32768\n" "#define L1_DATA_LINESIZE\t64\n" "#define L1_DATA_ASSOCIATIVE\t2\n" - "#define L2_SIZE\t2097152\n" + "#define L2_SIZE\t262144\n" "#define L2_LINESIZE\t64\n" "#define L2_ASSOCIATIVE\t16\n" "#define DTB_DEFAULT_ENTRIES\t64\n" @@ -135,7 +136,8 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define HAVE_VFPV4\n" "#define HAVE_VFPV3\n" "#define HAVE_VFP\n" - "#define HAVE_NEON\n") + "#define HAVE_NEON\n" + "#define ARMV8\n") set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_N 4) set(DGEMM_UNROLL_M 8) @@ -144,6 +146,109 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(CGEMM_UNROLL_N 4) set(ZGEMM_UNROLL_M 8) set(ZGEMM_UNROLL_N 4) + elseif ("${CORE}" STREQUAL "CORTEXA72" OR "${CORE}" STREQUAL "CORTEXA73") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t49152\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t3\n" + "#define L1_DATA_SIZE\t32768\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t2\n" + "#define L2_SIZE\t524288\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t16\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_N 4) + elseif ("${CORE}" STREQUAL "FALKOR") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t3\n" + "#define L1_DATA_SIZE\t32768\n" + "#define L1_DATA_LINESIZE\t128\n" + "#define L1_DATA_ASSOCIATIVE\t2\n" + "#define L2_SIZE\t524288\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t16\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_N 4) + elseif ("${CORE}" STREQUAL "THUNDERX) + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t32768\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t3\n" + "#define L1_DATA_SIZE\t32768\n" + "#define L1_DATA_LINESIZE\t128\n" + "#define L1_DATA_ASSOCIATIVE\t2\n" + "#define L2_SIZE\t167772164\n" + "#define L2_LINESIZE\t128\n" + "#define L2_ASSOCIATIVE\t16\n" + 
"#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 4) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 2) + set(DGEMM_UNROLL_N 2) + set(CGEMM_UNROLL_M 2) + set(CGEMM_UNROLL_N 2) + set(ZGEMM_UNROLL_M 2) + set(ZGEMM_UNROLL_N 2) + elseif ("${CORE}" STREQUAL "THUNDERX2T99) + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t32768\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t8\n" + "#define L1_DATA_SIZE\t32768\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t8\n" + "#define L2_SIZE\t262144\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define L3_SIZE\t33554432\n" + "#define L3_LINESIZE\t64\n" + "#define L3_ASSOCIATIVE\t32\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n" + "#define VULCAN\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) endif() # Or should this actually be NUM_CORES? @@ -163,6 +268,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS file(APPEND ${TARGET_CONF_TEMP} "#define GEMM_MULTITHREAD_THRESHOLD\t${GEMM_MULTITHREAD_THRESHOLD}\n") # Move to where gen_config_h would place it + file(MAKE_DIRECTORY ${TARGET_CONF_DIR}) file(RENAME ${TARGET_CONF_TEMP} "${TARGET_CONF_DIR}/${TARGET_CONF}") else(NOT CMAKE_CROSSCOMPILING) From 43c2b0eb5594bbcb0c48882965a6d655b0f99bc5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 23 Dec 2018 17:16:43 +0100 Subject: [PATCH 174/236] Add -mavx2 to TARGET=HASWELL builds to leverage improvements from PR#1921 --- Makefile.x86_64 | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index dbee28079..1b7fe3ef4 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -25,6 +25,17 @@ endif endif endif +ifeq ($(CORE), HASWELL) +ifndef DYNAMIC_ARCH +ifndef NO_AVX2 +CCOMMON_OPT += -mavx2 +FCOMMON_OPT += -mavx2 +endif +endif +endif + + + ifeq ($(OSNAME), Interix) ARFLAGS = -m x64 endif From 49e0f485dac263e3b26cff01ed1759e46880e497 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 23 Dec 2018 17:26:09 +0100 Subject: [PATCH 175/236] Add -mavx2 for TARGET=HASWELL if compiler supports and requires it --- cmake/system.cmake | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index d803bb9eb..ba2c4f351 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -45,6 +45,12 @@ if (DEFINED TARGET) if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") endif() +if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + endif() +endif() endif() if (DEFINED TARGET) From 76b4b8980f7cec3ad0dde05d3c0ef2f395d04622 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 23 Dec 2018 19:08:19 +0100 Subject: [PATCH 176/236] Use -dumpversion with gcc only --- cmake/system.cmake | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index ba2c4f351..a060d98cb 100644 --- a/cmake/system.cmake +++ 
b/cmake/system.cmake @@ -42,15 +42,19 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) endif () if (DEFINED TARGET) -if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") -endif() -if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") - endif() -endif() + if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2) + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + endif() + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + endif() + endif() endif() if (DEFINED TARGET) From 5bd21ab6e1e4da023185c1472877d9806b1d0c48 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 23 Dec 2018 23:46:48 +0100 Subject: [PATCH 177/236] Make sure that -fPIC is present when needed override user-provided FFLAGS if necessary --- Makefile.system | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 3987460ec..fb8e7ea41 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1154,8 +1154,6 @@ ifndef FCOMMON_OPT FCOMMON_OPT = -O2 -frecursive endif - - override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) @@ -1163,6 +1161,12 @@ override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) #MAKEOVERRIDES = +ifdef NEED_PIC +ifeq (,$(findstring PIC,$(FFLAGS))) +override FFLAGS += -fPIC +endif +endif + #For LAPACK Fortran codes. #Disable -fopenmp for LAPACK Fortran codes on Windows. ifdef OS_WINDOWS From d6818777d1ed7ead02c0d0b448b2d60e783c97f5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 23 Dec 2018 23:47:37 +0100 Subject: [PATCH 178/236] Make sure that -fPIC is present if needed --- exports/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/exports/Makefile b/exports/Makefile index 3a5f77db3..5628eacac 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -2,6 +2,12 @@ TOPDIR = .. include ../Makefile.system +ifdef NEED_PIC +ifeq (,$(findstring PIC,$(CFLAGS))) +CFLAGS+= -fPIC +endif +endif + ifndef EXPRECISION EXPRECISION = 0 endif From 795285c587d40c004910ad8cde72abacfe8f5e2a Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 24 Dec 2018 18:49:50 +0000 Subject: [PATCH 179/236] Fix thinko in skylake beta handling casting ints is cheaper but it has a rounding, not memory casing effect, resulting in invalid outcome --- kernel/x86_64/dgemm_beta_skylakex.c | 2 +- kernel/x86_64/sgemm_beta_skylakex.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/dgemm_beta_skylakex.c b/kernel/x86_64/dgemm_beta_skylakex.c index 8c24725a1..5cd001920 100644 --- a/kernel/x86_64/dgemm_beta_skylakex.c +++ b/kernel/x86_64/dgemm_beta_skylakex.c @@ -50,7 +50,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, FLOAT ctemp5, ctemp6, ctemp7, ctemp8; /* fast path.. 
just zero the whole matrix */ - if (m == ldc && (unsigned long)beta == (unsigned long)ZERO) { + if (m == ldc && beta == ZERO) { memset(c, 0, m * n * sizeof(FLOAT)); return 0; } diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index cdc9c44be..1c29c1168 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -50,7 +50,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, FLOAT ctemp5, ctemp6, ctemp7, ctemp8; /* fast path.. just zero the whole matrix */ - if (m == ldc && (unsigned long)beta == (unsigned long)ZERO) { + if (m == ldc && beta == ZERO) { memset(c, 0, m * n * sizeof(FLOAT)); return 0; } From fe02ba86a46699f5bba3a403bbb1e513273bdd53 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 24 Dec 2018 20:46:04 +0100 Subject: [PATCH 180/236] Remove unnecessary change again --- exports/Makefile | 6 ------ 1 file changed, 6 deletions(-) diff --git a/exports/Makefile b/exports/Makefile index 5628eacac..3a5f77db3 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -2,12 +2,6 @@ TOPDIR = .. include ../Makefile.system -ifdef NEED_PIC -ifeq (,$(findstring PIC,$(CFLAGS))) -CFLAGS+= -fPIC -endif -endif - ifndef EXPRECISION EXPRECISION = 0 endif From 211120c50832f8f338872c891a51b86e291f13b9 Mon Sep 17 00:00:00 2001 From: TiborGY Date: Thu, 27 Dec 2018 23:09:21 +0100 Subject: [PATCH 181/236] Fix typo in UNKNOWN core name Should be of no consequence, right? --- cpuid_x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 8e4a7cb84..eb986b6b6 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1649,7 +1649,7 @@ static char *lowercpuname[] = { }; static char *corename[] = { - "UNKOWN", + "UNKNOWN", "80486", "P5", "P6", From 09170268a31a2113c1203e44da54f3129ca572cf Mon Sep 17 00:00:00 2001 From: TiborGY Date: Fri, 28 Dec 2018 14:33:18 +0100 Subject: [PATCH 182/236] Update cpuid_arm.c --- cpuid_arm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_arm.c b/cpuid_arm.c index 2f8959242..19aa90718 100644 --- a/cpuid_arm.c +++ b/cpuid_arm.c @@ -34,7 +34,7 @@ #define CPU_CORTEXA15 4 static char *cpuname[] = { - "UNKOWN", + "UNKNOWN", "ARMV6", "ARMV7", "CORTEXA9", From 187233953cadbb876477e511c38e6ac95f44feed Mon Sep 17 00:00:00 2001 From: TiborGY Date: Fri, 28 Dec 2018 14:34:38 +0100 Subject: [PATCH 183/236] Update cpuid_mips.c --- cpuid_mips.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_mips.c b/cpuid_mips.c index c09902936..6f2932c94 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -75,7 +75,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CPU_1004K 2 static char *cpuname[] = { - "UNKOWN", + "UNKNOWN", "P5600", "1004K" }; From c329de2931fd524be15aba7c7f04336758552459 Mon Sep 17 00:00:00 2001 From: TiborGY Date: Fri, 28 Dec 2018 14:35:41 +0100 Subject: [PATCH 184/236] Update Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index d42f9b8c3..21096f893 100644 --- a/Makefile +++ b/Makefile @@ -131,7 +131,7 @@ endif endif libs : -ifeq ($(CORE), UNKOWN) +ifeq ($(CORE), UNKNOWN) $(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.) 
endif ifeq ($(NOFORTRAN), 1) From 7cbc2c37d64665d221e6db7537354a09809ff2f3 Mon Sep 17 00:00:00 2001 From: TiborGY Date: Fri, 28 Dec 2018 14:36:39 +0100 Subject: [PATCH 185/236] Update cpuid_mips64.c --- cpuid_mips64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_mips64.c b/cpuid_mips64.c index dcb559a7c..0e32bfc0b 100644 --- a/cpuid_mips64.c +++ b/cpuid_mips64.c @@ -79,7 +79,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CPU_I6500 6 static char *cpuname[] = { - "UNKOWN", + "UNKNOWN", "SICORTEX", "LOONGSON3A", "LOONGSON3B", From 93240f489eaf6352f07366c79e62168583f74b98 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 29 Dec 2018 18:12:54 +0100 Subject: [PATCH 186/236] Fix wrong case in TARGET setting for Alpine --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 4efa23b8d..3f323a854 100644 --- a/.travis.yml +++ b/.travis.yml @@ -117,7 +117,7 @@ matrix: - <<: *test-alpine env: - TARGET_BOX=LINUX64_MUSL - - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2" + - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2" - &test-cmake os: linux From bba1e672691cd62a2a0607865a2514334f8700e4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 29 Dec 2018 21:59:31 +0100 Subject: [PATCH 187/236] Delete the pthread key on cleanup in TLS mode to avoid a crash when OpenBLAS was loaded via dlopen and libc tries to clean up the leaked TLS after dlclose Fixes #1720 --- driver/others/memory.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/driver/others/memory.c b/driver/others/memory.c index 36815a39c..6f7a7db82 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1073,6 +1073,11 @@ static volatile int memory_initialized = 0; } free(table); } +#if defined(OS_WINDOWS) + TlsFree(local_storage_key); +#else + pthread_key_delete(local_storage_key); +#endif } static void blas_memory_init(){ From 9f80e0f5fcfe883b5f355d71831bc22880c40271 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 30 Dec 2018 14:39:18 +0100 Subject: [PATCH 188/236] Remove stray include of complex.h already provided conditionally by common.h via openblas_utest.h Unconditional inclusion breaks older Android and similar platforms that use OPENBLAS_COMPLEX_STRUCT --- utest/test_dotu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/utest/test_dotu.c b/utest/test_dotu.c index ef04dd9a8..918541848 100644 --- a/utest/test_dotu.c +++ b/utest/test_dotu.c @@ -32,7 +32,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include "openblas_utest.h" -#include CTEST( zdotu,zdotu_n_1) { From 5a720cf9cac5266079c06032fb2ab36da4ed84f5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 30 Dec 2018 15:22:37 +0100 Subject: [PATCH 189/236] Re-enable loop unrolling in trmv and remove the scary warning fixes #1748 as that half of the fix for #1332 appears to have been an overreaction on my part. 
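For context, the "unrolling" switched back on here is the panel scheme sketched below (a simplified reference version in plain loops; the actual driver dispatches to GEMV_N and the optimized TRMV kernel, and this shows only the column-major, upper-triangular, non-transposed, non-unit case): the matrix is walked in DTB_ENTRIES-wide column panels, the strip above each diagonal block is an ordinary GEMV update, and only the small diagonal block needs triangular handling.

    /* simplified sketch of the blocked upper-triangular b := U*b update */
    static void trmv_upper_blocked(int m, const double *a, int lda,
                                   double *b, int block)
    {
        for (int is = 0; is < m; is += block) {
            int mi = (m - is < block) ? (m - is) : block;

            /* rectangular strip above the diagonal block:
               b[0..is-1] += A[0..is-1][is..is+mi-1] * b[is..is+mi-1]  (the GEMV part) */
            for (int j = 0; j < mi; j++)
                for (int i = 0; i < is; i++)
                    b[i] += a[i + (is + j) * lda] * b[is + j];

            /* small triangular diagonal block, updated in place row by row */
            for (int i = 0; i < mi; i++) {
                double t = 0.0;
                for (int j = i; j < mi; j++)
                    t += a[(is + i) + (is + j) * lda] * b[is + j];
                b[is + i] = t;
            }
        }
    }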
--- driver/level2/trmv_U.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/driver/level2/trmv_U.c b/driver/level2/trmv_U.c index 7f8895e7f..90ffb7370 100644 --- a/driver/level2/trmv_U.c +++ b/driver/level2/trmv_U.c @@ -54,16 +54,12 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu COPY_K(m, b, incb, buffer, 1); } -/*FIXME the GEMV unrolling performed here was found to be broken, see issue 1332 */ -/* Multiplying DTB size by 100 is just a quick-and-dirty hack to disable it for now[B */ + for (is = 0; is < m; is += DTB_ENTRIES){ - for (is = 0; is < m; is += DTB_ENTRIES * 100){ - - min_i = MIN(m - is, DTB_ENTRIES * 100); + min_i = MIN(m - is, DTB_ENTRIES); #ifndef TRANSA - if (is > 0){ -fprintf(stderr,"WARNING unrolling of the trmv_U loop may give wrong results\n"); + if (is > 0){ GEMV_N(is, min_i, 0, dp1, a + is * lda, lda, B + is, 1, From 0d52aefc6b462db2fcdb9ff800d11b7ba8a4f7ab Mon Sep 17 00:00:00 2001 From: George Hartzell Date: Sun, 30 Dec 2018 14:55:34 -0800 Subject: [PATCH 190/236] Typo: Skyalke -> Skylake Worth fixing, it gets in the way of searching.... --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9ed9be337..26055c745 100644 --- a/README.md +++ b/README.md @@ -201,7 +201,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. Clang 3.0 will generate the wrong AVX binary code. -* Please use GCC version 6 or LLVM version 6 and above to compile Skyalke AVX512 kernels. +* Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels. * The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`), there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build the library with `BIGNUMA=1`. From 13d006339b2082ec871b839b73349a2f4645bf83 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 31 Dec 2018 23:00:46 +0100 Subject: [PATCH 191/236] Update ChangeLog.txt with changes from 0.3.5 --- Changelog.txt | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 0dd17a558..49b26873a 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,36 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.5 +31-Dec-2018 + +common: + * loop unrolling in TRMV has been enabled again. + * A domain error in the thread workload distribution for SYRK + has been fixed. + * gmake builds will now automatically add -fPIC to the build + options if the platform requires it. + * a pthreads key leakage (and associate crash on dlclose) in + the USE_TLS codepath was fixed. + * building of the utest cases on systems that do not provide + an implementation of complex.h was fixed. + +x86_64: + * the SkylakeX code was changed to compile on OSX. + * unwanted application of the -march=skylake-avx512 option + to the common code parts of a DYNAMIC_ARCH build was fixed. + * improved performance of SGEMM for small workloads on Skylake X. + * performance of SGEMM and DGEMM was improved on Haswell. + +ARMV8: + * a configuration error that broke the CNRM2 kernel was corrected. + * compilation of the GEMM kernels with CMAKE was fixed. + * DYNAMIC_ARCH builds are now available with CMAKE as well. 
+ * using CMAKE for cross-compilation to the new cpu TARGETs + introduced in 0.3.4 now works. + +POWER: + * a problem in cpu autodetection for AIX has been corrected. + ==================================================================== Version 0.3.4 02-Dec-2018 From 2940798ea7efb799d682739e3e5d00985b3efd3b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 31 Dec 2018 23:10:59 +0100 Subject: [PATCH 192/236] Increment version to 0.3.6.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 24c169afe..812e6bf6f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 5.dev) +set(OpenBLAS_PATCH_VERSION 6.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From ed704185abd09fe04c6c82cf809c1cb09d359651 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 31 Dec 2018 23:11:37 +0100 Subject: [PATCH 193/236] Increment version to 0.3.6.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 0d5b83b39..7c128fb49 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.5.dev +VERSION = 0.3.6.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From d11554c88fdf1b6a9cad1c4c1252f27995117378 Mon Sep 17 00:00:00 2001 From: TiborGY Date: Mon, 31 Dec 2018 23:19:44 +0100 Subject: [PATCH 194/236] Validate user supplied TARGET (#1941) the build will now abort with an error message when an undefined build TARGET is named Fixes #1938 --- Makefile.system | 1 + getarch.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/Makefile.system b/Makefile.system index fb8e7ea41..20d4f6492 100644 --- a/Makefile.system +++ b/Makefile.system @@ -65,6 +65,7 @@ endif ifdef TARGET GETARCH_FLAGS := -DFORCE_$(TARGET) +GETARCH_FLAGS += -DUSER_TARGET endif # Force fallbacks for 32bit diff --git a/getarch.c b/getarch.c index 146f1f36f..78ba0fefd 100644 --- a/getarch.c +++ b/getarch.c @@ -1068,6 +1068,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef FORCE +#ifdef USER_TARGET +#error "The TARGET specified on the command line or in Makefile.rule is not supported. 
Please choose a target from TargetList.txt" +#endif + #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__) #ifndef POWER From 20d1aad13f59d6146bcdf8be6716cd8cc020d2bd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Jan 2019 20:15:35 +0100 Subject: [PATCH 195/236] Fix missing quotes around thunderx targets --- cmake/prebuild.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 6ed99e807..757461008 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -198,7 +198,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(CGEMM_UNROLL_N 4) set(ZGEMM_UNROLL_M 8) set(ZGEMM_UNROLL_N 4) - elseif ("${CORE}" STREQUAL "THUNDERX) + elseif ("${CORE}" STREQUAL "THUNDERX") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t32768\n" "#define L1_CODE_LINESIZE\t64\n" @@ -224,7 +224,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(CGEMM_UNROLL_N 2) set(ZGEMM_UNROLL_M 2) set(ZGEMM_UNROLL_N 2) - elseif ("${CORE}" STREQUAL "THUNDERX2T99) + elseif ("${CORE}" STREQUAL "THUNDERX2T99") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t32768\n" "#define L1_CODE_LINESIZE\t64\n" From 802f0dbde153b166f533ab1660336d7832e5b616 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Jan 2019 22:17:31 +0100 Subject: [PATCH 196/236] More fixes for cross-compiling ARM64 targets Fixed core naming for DYNAMIC_ARCH. Corrected GEMM_DEFAULT entries and added SYMV_P. Replaced outdated VULCAN define for ThunderX2T99 with ARMV8 to get basic definitions back. For issue #1908 --- cmake/prebuild.cmake | 45 ++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 757461008..a67c44bf5 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -87,13 +87,18 @@ endif () # Cannot run getarch on target if we are cross-compiling if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE")) # Write to config as getarch would + if (DEFINED TARGET_CORE) + set(TCORE ${TARGET_CORE}) + else() + set(TCORE ${CORE}) + endif() # TODO: Set up defines that getarch sets up based on every other target # Perhaps this should be inside a different file as it grows larger file(APPEND ${TARGET_CONF_TEMP} - "#define ${CORE}\n" - "#define CHAR_CORENAME \"${CORE}\"\n") - if ("${CORE}" STREQUAL "ARMV7") + "#define ${TCORE}\n" + "#define CHAR_CORENAME \"${TCORE}\"\n") + if ("${TCORE}" STREQUAL "ARMV7") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE\t65536\n" "#define L1_DATA_LINESIZE\t32\n" @@ -108,7 +113,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(SGEMM_UNROLL_N 4) set(DGEMM_UNROLL_M 4) set(DGEMM_UNROLL_N 4) - elseif ("${CORE}" STREQUAL "ARMV8") + elseif ("${TCORE}" STREQUAL "ARMV8") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE\t32768\n" "#define L1_DATA_LINESIZE\t64\n" @@ -118,9 +123,16 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define DTB_SIZE\t4096\n" "#define L2_ASSOCIATIVE\t32\n" "#define ARMV8\n") - set(SGEMM_UNROLL_M 4) + set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_N 4) - elseif ("${CORE}" STREQUAL "CORTEXA57" OR "${CORE}" STREQUAL "CORTEXA53") + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 
4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t32768\n" "#define L1_CODE_LINESIZE\t64\n" @@ -144,9 +156,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(DGEMM_UNROLL_N 4) set(CGEMM_UNROLL_M 8) set(CGEMM_UNROLL_N 4) - set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) - elseif ("${CORE}" STREQUAL "CORTEXA72" OR "${CORE}" STREQUAL "CORTEXA73") + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t49152\n" "#define L1_CODE_LINESIZE\t64\n" @@ -170,9 +183,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(DGEMM_UNROLL_N 4) set(CGEMM_UNROLL_M 8) set(CGEMM_UNROLL_N 4) - set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) - elseif ("${CORE}" STREQUAL "FALKOR") + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "FALKOR") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t65536\n" "#define L1_CODE_LINESIZE\t64\n" @@ -196,9 +210,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(DGEMM_UNROLL_N 4) set(CGEMM_UNROLL_M 8) set(CGEMM_UNROLL_N 4) - set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) - elseif ("${CORE}" STREQUAL "THUNDERX") + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "THUNDERX") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t32768\n" "#define L1_CODE_LINESIZE\t64\n" @@ -224,7 +239,8 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(CGEMM_UNROLL_N 2) set(ZGEMM_UNROLL_M 2) set(ZGEMM_UNROLL_N 2) - elseif ("${CORE}" STREQUAL "THUNDERX2T99") + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "THUNDERX2T99") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t32768\n" "#define L1_CODE_LINESIZE\t64\n" @@ -240,7 +256,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define L3_ASSOCIATIVE\t32\n" "#define DTB_DEFAULT_ENTRIES\t64\n" "#define DTB_SIZE\t4096\n" - "#define VULCAN\n") + "#define ARMV8\n") set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_N 4) set(DGEMM_UNROLL_M 8) @@ -249,6 +265,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(CGEMM_UNROLL_N 4) set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) endif() # Or should this actually be NUM_CORES? 
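(A quick aside on what the *GEMM_UNROLL_M/N values above actually control, since the cross-compile path has to set them by hand: they describe the register tile that the selected GEMM microkernel computes per pass over K, roughly as in the scalar sketch below, so the packing routines and blocking logic must use the same numbers as the kernel chosen for the core. Illustration only; names and data layout are simplified.)

    /* schematic UNROLL_M x UNROLL_N GEMM register tile, C += A*B with packed inputs */
    #define UNROLL_M 8
    #define UNROLL_N 4

    static void gemm_microkernel_tile(int K, const double *packed_a,
                                      const double *packed_b, double *c, int ldc)
    {
        double acc[UNROLL_M][UNROLL_N] = {{0.0}};

        for (int k = 0; k < K; k++)                   /* one rank-1 update per k  */
            for (int j = 0; j < UNROLL_N; j++)
                for (int i = 0; i < UNROLL_M; i++)
                    acc[i][j] += packed_a[k * UNROLL_M + i] * packed_b[k * UNROLL_N + j];

        for (int j = 0; j < UNROLL_N; j++)            /* write the tile back to C */
            for (int i = 0; i < UNROLL_M; i++)
                c[i + j * ldc] += acc[i][j];
    }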
From ae1d1f74f7ff96b8345189bcba058b7acdc7d494 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Jan 2019 16:55:33 +0100 Subject: [PATCH 197/236] Query AVX2 and AVX512 capability for runtime cpu selection --- driver/others/dynamic.c | 141 +++++++++++++++++++++++++++++----------- 1 file changed, 102 insertions(+), 39 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 1f67dc521..7cc911d32 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -304,9 +304,47 @@ int support_avx(){ #endif } +int support_avx2(){ +#ifndef NO_AVX2 + int eax, ebx, ecx=0, edx; + int ret=0; + + if (!support_avx) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & (1<<7)) != 0) + ret=1; //OS supports AVX2 + return ret; +#else + return 0; +#endif +} + +int support_avx512(){ +#ifndef NO_AVX512 + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & (1<<7)) != 1){ + ret=0; //OS does not even support AVX2 + } + if((ebx & (1<<31)) != 0){ + ret=1; //OS supports AVX512VL + } + return ret; +#else + return 0; +#endif +} + extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" +#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n" +#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512 instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" #define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n" static int get_vendor(void){ @@ -403,18 +441,24 @@ static gotoblas_t *get_coretype(void){ } //Intel Haswell if (model == 12 || model == 15) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Broadwell if (model == 13) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } @@ -424,27 +468,36 @@ static gotoblas_t *get_coretype(void){ case 4: //Intel Haswell if (model == 5 || model == 6) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Broadwell if (model == 7 || model == 15) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
} } //Intel Skylake if (model == 14) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } @@ -457,40 +510,50 @@ static gotoblas_t *get_coretype(void){ case 5: //Intel Broadwell if (model == 6) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } if (model == 5) { // Intel Skylake X -#ifndef NO_AVX512 - return &gotoblas_SKYLAKEX; -#else - if(support_avx()) + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()) return &gotoblas_HASWELL; - else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; - } -#endif + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } } //Intel Skylake if (model == 14) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Phi Knights Landing if (model == 7) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } @@ -503,26 +566,26 @@ static gotoblas_t *get_coretype(void){ case 6: if (model == 6) { // Cannon Lake -#ifndef NO_AVX512 - return &gotoblas_SKYLAKEX; -#else - if(support_avx()) -#ifndef NO_AVX2 - return &gotoblas_HASWELL; -#else - return &gotoblas_SANDYBRIDGE; -#endif - else - return &gotoblas_NEHALEM; -#endif + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } } return NULL; case 9: case 8: if (model == 14 ) { // Kaby Lake - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
} From 0afaae4b2323b28af49ffe81b98d17bd4ced96f3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Jan 2019 16:58:56 +0100 Subject: [PATCH 198/236] Query AVX2 and AVX512VL capability in x86 cpu detection --- common_x86_64.h | 2 +- cpuid.h | 1 + cpuid_x86.c | 132 +++++++++++++++++++++++++++--------------------- 3 files changed, 76 insertions(+), 59 deletions(-) diff --git a/common_x86_64.h b/common_x86_64.h index 62e138e34..f27c1e9be 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -134,7 +134,7 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ "=b" (*ebx), "=c" (*ecx), "=d" (*edx) - : "0" (op)); + : "0" (op), "c"(0)); #endif } diff --git a/cpuid.h b/cpuid.h index a6bc211f3..c56672ad8 100644 --- a/cpuid.h +++ b/cpuid.h @@ -139,6 +139,7 @@ #define HAVE_FMA4 (1 << 19) #define HAVE_FMA3 (1 << 20) #define HAVE_AVX512VL (1 << 21) +#define HAVE_AVX2 (1 << 22) #define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_D 2 diff --git a/cpuid_x86.c b/cpuid_x86.c index eb986b6b6..ddc09857b 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -97,10 +97,10 @@ static C_INLINE void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ ("mov %%ebx, %%edi;" "cpuid;" "xchgl %%ebx, %%edi;" - : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); + : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op), "c" (0) : "cc"); #else __asm__ __volatile__ - ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); + ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) , "c" (0) : "cc"); #endif } @@ -211,6 +211,42 @@ int support_avx(){ #endif } +int support_avx2(){ +#ifndef NO_AVX2 + int eax, ebx, ecx=0, edx; + int ret=0; + + if (!support_avx) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & (1<<7)) != 0) + ret=1; //OS supports AVX2 + return ret; +#else + return 0; +#endif +} + +int support_avx512(){ +#ifndef NO_AVX512 + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & 32) != 32){ + ret=0; //OS does not even support AVX2 + } + if((ebx & (1<<31)) != 0){ + ret=1; //OS supports AVX512VL + } + return ret; +#else + return 0; +#endif +} + int get_vendor(void){ int eax, ebx, ecx, edx; @@ -294,6 +330,8 @@ int get_cputype(int gettype){ if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; #ifndef NO_AVX if (support_avx()) feature |= HAVE_AVX; + if (support_avx2()) feature |= HAVE_AVX2; + if (support_avx512()) feature |= HAVE_AVX512VL; if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; #endif @@ -1228,22 +1266,18 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; case 12: case 15: - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 13: //Broadwell - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; } @@ -1252,33 +1286,27 @@ int get_cpuname(void){ switch (model) { case 5: case 6: - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 7: case 15: //Broadwell - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 14: //Skylake - if(support_avx()) -#ifndef NO_AVX2 + 
if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 12: @@ -1292,46 +1320,36 @@ int get_cpuname(void){ switch (model) { case 6: //Broadwell - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 5: // Skylake X -#ifndef NO_AVX512 - return CPUTYPE_SKYLAKEX; -#else - if(support_avx()) -#ifndef NO_AVX2 - return CPUTYPE_HASWELL; -#else - return CPUTYPE_SANDYBRIDGE; -#endif + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; -#endif case 14: // Skylake - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 7: // Xeon Phi Knights Landing - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 12: @@ -1342,30 +1360,24 @@ int get_cpuname(void){ case 6: switch (model) { case 6: // Cannon Lake -#ifndef NO_AVX512 - return CPUTYPE_SKYLAKEX; -#else - if(support_avx()) -#ifndef NO_AVX2 - return CPUTYPE_HASWELL; -#else - return CPUTYPE_SANDYBRIDGE; -#endif + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; -#endif } break; case 9: case 8: switch (model) { case 14: // Kaby Lake - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; } @@ -2112,6 +2124,8 @@ void get_cpuconfig(void){ if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); + if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n"); + if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); @@ -2180,6 +2194,8 @@ void get_sse(void){ if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); + if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n"); + if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); From 68eb3146ce4c50ac557cf5f199cc1b4294ba3817 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Jan 2019 18:07:14 +0100 Subject: [PATCH 199/236] Add xcr0 (os support) check --- cpuid_x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index ddc09857b..377267fcc 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -239,6 +239,8 @@ int support_avx512(){ ret=0; //OS does not even support AVX2 } if((ebx & (1<<31)) != 0){ + xgetbv(0, &eax, &edx); + if((eax & 0xe0) == 0xe0) ret=1; //OS supports AVX512VL } return ret; From e1574fa2b4a2a781be70d8d521bb3b80a572ca9d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Jan 2019 18:08:02 +0100 
Subject: [PATCH 200/236] Add xcr0 (os support) check --- driver/others/dynamic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 7cc911d32..4c966260d 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -332,6 +332,8 @@ int support_avx512(){ ret=0; //OS does not even support AVX2 } if((ebx & (1<<31)) != 0){ + xgetbv(0, &eax, &edx); + if((eax & 0xe0) == 0xe0) ret=1; //OS supports AVX512VL } return ret; From 31ed19e8b907f72ed4c8ef3165d8577b55264861 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Jan 2019 19:41:13 +0100 Subject: [PATCH 201/236] Add message for SkylakeX and KNL fallbacks to Haswell --- driver/others/dynamic.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 4c966260d..ba93fca8b 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -346,7 +346,7 @@ extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" #define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n" -#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512 instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" +#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" #define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. 
OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n" static int get_vendor(void){ @@ -526,8 +526,10 @@ static gotoblas_t *get_coretype(void){ // Intel Skylake X if (support_avx512()) return &gotoblas_SKYLAKEX; - if(support_avx2()) + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); return &gotoblas_HASWELL; + } if(support_avx()) { openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); return &gotoblas_SANDYBRIDGE; @@ -550,8 +552,10 @@ static gotoblas_t *get_coretype(void){ } //Intel Phi Knights Landing if (model == 7) { - if(support_avx2()) + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); return &gotoblas_HASWELL; + } if(support_avx()) { openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); return &gotoblas_SANDYBRIDGE; From 191677b902054d1476f3bb12b5360c337c47eb7e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Jan 2019 10:46:47 +0100 Subject: [PATCH 202/236] Add travis_wait to the OSX brew install phase --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 3f323a854..e8b7e0a27 100644 --- a/.travis.yml +++ b/.travis.yml @@ -153,7 +153,7 @@ matrix: before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - brew update - - brew install gcc # for gfortran + - travis_wait 30 brew install gcc # for gfortran script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: From cf5d48e83300a5eb2bb047829fc793ba78959c35 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Jan 2019 14:41:48 +0100 Subject: [PATCH 203/236] Update OSX environment to Sierra as homebrew seems to have dropped support for El Capitan in their gcc packages --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 3f323a854..51679af62 100644 --- a/.travis.yml +++ b/.travis.yml @@ -149,7 +149,7 @@ matrix: - &test-macos os: osx - osx_image: xcode8 + osx_image: xcode8.3 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - brew update From 1650311246d185ca2631c76c33c0212848b57d2a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Jan 2019 14:43:45 +0100 Subject: [PATCH 204/236] Bump xcode to 8.3 --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index e8b7e0a27..51679af62 100644 --- a/.travis.yml +++ b/.travis.yml @@ -149,11 +149,11 @@ matrix: - &test-macos os: osx - osx_image: xcode8 + osx_image: xcode8.3 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - brew update - - travis_wait 30 brew install gcc # for gfortran + - brew install gcc # for gfortran script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: From ad2c386d6ad99d3021e33cbbfb311150b2586c93 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Jan 2019 00:32:50 +0100 Subject: [PATCH 205/236] Move TLS key deletion to openblas_quit fixes #1954 (as suggested by thrasibule in that issue) --- driver/others/memory.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 6f7a7db82..72d3e173c 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1073,11 +1073,6 @@ static volatile int memory_initialized = 0; } free(table); } -#if defined(OS_WINDOWS) - TlsFree(local_storage_key); -#else - pthread_key_delete(local_storage_key); -#endif } static void blas_memory_init(){ @@ -1491,6 +1486,14 @@ 
void DESTRUCTOR gotoblas_quit(void) { blas_shutdown(); +#if defined(SMP) +#if defined(OS_WINDOWS) + TlsFree(local_storage_key); +#else + pthread_key_delete(local_storage_key); +#endif +#endif + #ifdef PROFILE moncontrol (0); #endif From 00401489c2d82e1dd997f91480fe6bc441cd6b40 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Jan 2019 22:38:32 +0100 Subject: [PATCH 206/236] Fix missing braces in support_avx() --- cpuid_x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 377267fcc..74cc6655b 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -216,7 +216,7 @@ int support_avx2(){ int eax, ebx, ecx=0, edx; int ret=0; - if (!support_avx) + if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); if((ebx & (1<<7)) != 0) @@ -232,7 +232,7 @@ int support_avx512(){ int eax, ebx, ecx, edx; int ret=0; - if (!support_avx) + if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); if((ebx & 32) != 32){ From dbc9a060ef4d6ba08b21352f22bb2fa989db0919 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Jan 2019 22:41:31 +0100 Subject: [PATCH 207/236] Fix missing braces in support_av() call --- driver/others/dynamic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index ba93fca8b..9e59da2cc 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -309,7 +309,7 @@ int support_avx2(){ int eax, ebx, ecx=0, edx; int ret=0; - if (!support_avx) + if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); if((ebx & (1<<7)) != 0) @@ -325,7 +325,7 @@ int support_avx512(){ int eax, ebx, ecx, edx; int ret=0; - if (!support_avx) + if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); if((ebx & (1<<7)) != 1){ From 29dc72889f5c0544aee8bc5f2dee98603cbfec36 Mon Sep 17 00:00:00 2001 From: caiyu Date: Wed, 16 Jan 2019 14:25:19 +0800 Subject: [PATCH 208/236] Add support for Hygon Dhyana --- cpuid.h | 5 ++++ cpuid_x86.c | 54 +++++++++++++++++++++++++++++++++++++---- driver/others/dynamic.c | 11 ++++++++- 3 files changed, 64 insertions(+), 6 deletions(-) diff --git a/cpuid.h b/cpuid.h index c56672ad8..697f43133 100644 --- a/cpuid.h +++ b/cpuid.h @@ -53,6 +53,7 @@ #define VENDOR_SIS 8 #define VENDOR_TRANSMETA 9 #define VENDOR_NSC 10 +#define VENDOR_HYGON 11 #define VENDOR_UNKNOWN 99 #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) @@ -116,6 +117,7 @@ #define CORE_EXCAVATOR 26 #define CORE_ZEN 27 #define CORE_SKYLAKEX 28 +#define CORE_DHYANA 29 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -215,5 +217,8 @@ typedef struct { #define CPUTYPE_EXCAVATOR 50 #define CPUTYPE_ZEN 51 #define CPUTYPE_SKYLAKEX 52 +#define CPUTYPE_DHYANA 53 + +#define CPUTYPE_HYGON_UNKNOWN 54 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index 74cc6655b..726014033 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -271,6 +271,7 @@ int get_vendor(void){ if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; if (!strcmp(vendor, "Geode by NSC")) return VENDOR_NSC; + if (!strcmp(vendor, "HygonGenuine")) return VENDOR_HYGON; if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; @@ -1046,7 +1047,9 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ } } - if ((get_vendor() == VENDOR_AMD) || (get_vendor() == VENDOR_CENTAUR)) { + if ((get_vendor() == VENDOR_AMD) || + (get_vendor() == VENDOR_HYGON) || + (get_vendor() == VENDOR_CENTAUR)) { cpuid(0x80000005, &eax, &ebx, &ecx, &edx); LDTB.size = 4096; @@ -1483,6 
+1486,26 @@ int get_cpuname(void){ return CPUTYPE_AMD_UNKNOWN; } + if (vendor == VENDOR_HYGON){ + switch (family) { + case 0xf: + switch (exfamily) { + case 9: + //Hygon Dhyana + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_ZEN; +#else + return CPUTYPE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator +#endif + else + return CPUTYPE_BARCELONA; + } + break; + } + return CPUTYPE_HYGON_UNKNOWN; + } + if (vendor == VENDOR_CYRIX){ switch (family) { case 0x4: @@ -1604,7 +1627,8 @@ static char *cpuname[] = { "STEAMROLLER", "EXCAVATOR", "ZEN", - "SKYLAKEX" + "SKYLAKEX", + "DHYANA" }; static char *lowercpuname[] = { @@ -1659,7 +1683,8 @@ static char *lowercpuname[] = { "steamroller", "excavator", "zen", - "skylakex" + "skylakex", + "dhyana" }; static char *corename[] = { @@ -1691,7 +1716,8 @@ static char *corename[] = { "STEAMROLLER", "EXCAVATOR", "ZEN", - "SKYLAKEX" + "SKYLAKEX", + "DHYANA" }; static char *corename_lower[] = { @@ -1723,7 +1749,8 @@ static char *corename_lower[] = { "steamroller", "excavator", "zen", - "skylakex" + "skylakex", + "dhyana" }; @@ -2040,6 +2067,23 @@ int get_coretype(void){ } } + if (vendor == VENDOR_HYGON){ + if (family == 0xf){ + if (exfamily == 9) { + if(support_avx()) +#ifndef NO_AVX2 + return CORE_ZEN; +#else + return CORE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator +#endif + else + return CORE_BARCELONA; + } else { + return CORE_BARCELONA; + } + } + } + if (vendor == VENDOR_CENTAUR) { switch (family) { case 0x6: diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 9e59da2cc..99c9254ac 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -274,6 +274,7 @@ extern gotoblas_t gotoblas_SKYLAKEX; #define VENDOR_INTEL 1 #define VENDOR_AMD 2 #define VENDOR_CENTAUR 3 +#define VENDOR_HYGON 4 #define VENDOR_UNKNOWN 99 #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) @@ -369,6 +370,7 @@ static int get_vendor(void){ if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; + if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; @@ -604,7 +606,7 @@ static gotoblas_t *get_coretype(void){ } } - if (vendor == VENDOR_AMD){ + if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){ if (family <= 0xe) { // Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon cpuid(0x80000000, &eax, &ebx, &ecx, &edx); @@ -684,6 +686,13 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. } } + } else if (exfamily == 9) { + if(support_avx()) + return &gotoblas_ZEN; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. 
+ } }else { return &gotoblas_BARCELONA; } From def0385caaa054411676032bddafe1aee903f656 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 09:51:29 +0200 Subject: [PATCH 209/236] init From b70fd238366c6a822c7f1766ab125f64c67a6b39 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 11:18:54 +0200 Subject: [PATCH 210/236] disable NaN checks before BLAS calls dsolve.R --- benchmark/scripts/R/dsolve.R | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/benchmark/scripts/R/dsolve.R b/benchmark/scripts/R/dsolve.R index a3fb78da7..6f1b8ef7b 100755 --- a/benchmark/scripts/R/dsolve.R +++ b/benchmark/scripts/R/dsolve.R @@ -2,6 +2,10 @@ argv <- commandArgs(trailingOnly = TRUE) +if (!is.null(options("matprod")[[1]])) { + options(matprod = "blas") +} + nfrom <- 128 nto <- 2048 nstep <- 128 @@ -19,7 +23,6 @@ if (length(argv) > 0) { loops <- as.numeric(argv[z]) } } - } p <- Sys.getenv("OPENBLAS_LOOPS") @@ -27,31 +30,23 @@ if (p != "") { loops <- as.numeric(p) } - -cat(sprintf( - "From %.0f To %.0f Step=%.0f Loops=%.0f\n", - nfrom, - nto, - nstep, - loops -)) +cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) cat(sprintf(" SIZE Flops Time\n")) n <- nfrom while (n <= nto) { - A <- matrix(rnorm(n * n), ncol = n, nrow = n) - B <- matrix(rnorm(n * n), ncol = n, nrow = n) + A <- matrix(rnorm(n * n), nrow = n) + B <- matrix(rnorm(n * n), nrow = n) z <- system.time(for (l in 1:loops) { solve(A, B) }) - mflops <- - (2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6) + mflops <- (2.0/3 * n * n * n + 2 * n * n * n) * loops/ (z[3] * 1e6) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep - } + From 2777a7f506308550e37f7ef26ce05f53a0d096ef Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 11:23:51 +0200 Subject: [PATCH 211/236] disable NaN checks before BLAS calls dsolve.R (shorter config part) --- benchmark/scripts/R/dsolve.R | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/benchmark/scripts/R/dsolve.R b/benchmark/scripts/R/dsolve.R index 6f1b8ef7b..ad2045900 100755 --- a/benchmark/scripts/R/dsolve.R +++ b/benchmark/scripts/R/dsolve.R @@ -2,9 +2,7 @@ argv <- commandArgs(trailingOnly = TRUE) -if (!is.null(options("matprod")[[1]])) { - options(matprod = "blas") -} +if (!is.null(options("matprod")[[1]])) options(matprod = "blas") nfrom <- 128 nto <- 2048 @@ -42,11 +40,10 @@ while (n <= nto) { solve(A, B) }) - mflops <- (2.0/3 * n * n * n + 2 * n * n * n) * loops/ (z[3] * 1e6) + mflops <- (2.0/3 * n * n * n + 2 * n * n * n) * loops/ (z[3] * 1e+06) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep } - From 7af8b21dbbb523b0e9ab6caff271cb63affaa5f2 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 11:34:46 +0200 Subject: [PATCH 212/236] disable NaN checks before BLAS calls dsolve.R (shorter formula) --- benchmark/scripts/R/dsolve.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/scripts/R/dsolve.R b/benchmark/scripts/R/dsolve.R index ad2045900..46301570b 100755 --- a/benchmark/scripts/R/dsolve.R +++ b/benchmark/scripts/R/dsolve.R @@ -40,7 +40,7 @@ while (n <= nto) { solve(A, B) }) - mflops <- (2.0/3 * n * n * n + 2 * n * n * n) * loops/ (z[3] * 
1e+06) + mflops <- (8.0 / 3 * n * n * n) * loops / (z[3] * 1e+06) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) From 3afceb6c2a220ff61878c9a328846cc723de42ed Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 11:38:14 +0200 Subject: [PATCH 213/236] disable NaN checks before BLAS calls deig.R --- benchmark/scripts/R/deig.R | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/benchmark/scripts/R/deig.R b/benchmark/scripts/R/deig.R index ece727fb3..32716471b 100755 --- a/benchmark/scripts/R/deig.R +++ b/benchmark/scripts/R/deig.R @@ -2,6 +2,8 @@ argv <- commandArgs(trailingOnly = TRUE) +if (!is.null(options("matprod")[[1]])) options(matprod = "blas") + nfrom <- 128 nto <- 2048 nstep <- 128 @@ -19,7 +21,6 @@ if (length(argv) > 0) { loops <- as.numeric(argv[z]) } } - } p <- Sys.getenv("OPENBLAS_LOOPS") @@ -27,14 +28,7 @@ if (p != "") { loops <- as.numeric(p) } - -cat(sprintf( - "From %.0f To %.0f Step=%.0f Loops=%.0f\n", - nfrom, - nto, - nstep, - loops -)) +cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) cat(sprintf(" SIZE Flops Time\n")) n <- nfrom @@ -45,11 +39,10 @@ while (n <= nto) { ev <- eigen(A) }) - mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6) + mflops <- (26.66 * n * n * n) * loops / (z[3] * 1e+06) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep - } From 478d3c4569cd4957bbef779423ee7e51686b5c0a Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 11:41:46 +0200 Subject: [PATCH 214/236] disable NaN checks before BLAS calls deig.R (shorten matrix def) --- benchmark/scripts/R/deig.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/scripts/R/deig.R b/benchmark/scripts/R/deig.R index 32716471b..c6d541dcf 100755 --- a/benchmark/scripts/R/deig.R +++ b/benchmark/scripts/R/deig.R @@ -33,7 +33,7 @@ cat(sprintf(" SIZE Flops Time\n")) n <- nfrom while (n <= nto) { - A <- matrix(rnorm(n * n), ncol = n, nrow = n) + A <- matrix(rnorm(n * n), nrow = n) ev <- 0 z <- system.time(for (l in 1:loops) { ev <- eigen(A) From 3e601bd4195b24568eb4f7db2402ba3258fd82cc Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 11:54:22 +0200 Subject: [PATCH 215/236] disable NaN checks before BLAS calls dgemm.R --- benchmark/scripts/R/dgemm.R | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/benchmark/scripts/R/dgemm.R b/benchmark/scripts/R/dgemm.R index 75297dfb8..d7c3e8108 100755 --- a/benchmark/scripts/R/dgemm.R +++ b/benchmark/scripts/R/dgemm.R @@ -2,6 +2,8 @@ argv <- commandArgs(trailingOnly = TRUE) +if (!is.null(options("matprod")[[1]])) options(matprod = "blas") + nfrom <- 128 nto <- 2048 nstep <- 128 @@ -19,7 +21,6 @@ if (length(argv) > 0) { loops <- as.numeric(argv[z]) } } - } p <- Sys.getenv("OPENBLAS_LOOPS") @@ -27,26 +28,13 @@ if (p != "") { loops <- as.numeric(p) } - -cat(sprintf( - "From %.0f To %.0f Step=%.0f Loops=%.0f\n", - nfrom, - nto, - nstep, - loops -)) +cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) cat(sprintf(" SIZE Flops Time\n")) n <- nfrom while (n <= nto) { - A <- matrix(runif(n * n), - ncol = n, - nrow = n, - byrow = TRUE) - B <- matrix(runif(n * n), - ncol = n, - nrow = n, - byrow = TRUE) + A <- matrix(runif(n * n), nrow = n) + B <- 
matrix(runif(n * n), nrow = n) C <- 1 z <- system.time(for (l in 1:loops) { @@ -54,11 +42,10 @@ while (n <= nto) { l <- l + 1 }) - mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6) + mflops <- (2.0 * n * n * n) * loops / (z[3] * 1e+06) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep - } From 8c3386be8780bdf631ffebe085fde2591d4cd062 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 16 Jan 2019 15:16:21 +0000 Subject: [PATCH 216/236] Added missing Blas1 single fp {saxpy, caxpy, cdot, crot(refactored version of srot),isamax ,isamin, icamax, icamin}, Fixed idamin,icamin choosing the first occurance index of equal minimals --- kernel/power/KERNEL.POWER8 | 20 +-- kernel/power/caxpy.c | 145 +++++++++++++++++++ kernel/power/cdot.c | 164 +++++++++++++++++++++ kernel/power/crot.c | 213 +++++++++++++++++++++++++++ kernel/power/icamax.c | 261 +++++++++++++++++++++++++++++++++ kernel/power/icamin.c | 266 ++++++++++++++++++++++++++++++++++ kernel/power/idamin.c | 50 +++---- kernel/power/isamax.c | 288 +++++++++++++++++++++++++++++++++++++ kernel/power/isamin.c | 288 +++++++++++++++++++++++++++++++++++++ kernel/power/izamin.c | 26 ++-- kernel/power/saxpy.c | 129 +++++++++++++++++ 11 files changed, 1802 insertions(+), 48 deletions(-) create mode 100644 kernel/power/caxpy.c create mode 100644 kernel/power/cdot.c create mode 100644 kernel/power/crot.c create mode 100644 kernel/power/icamax.c create mode 100644 kernel/power/icamin.c create mode 100644 kernel/power/isamax.c create mode 100644 kernel/power/isamin.c create mode 100644 kernel/power/saxpy.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 00ff8682a..cbcffb8fe 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -89,14 +89,14 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SMINKERNEL = ../arm/min.c #DMINKERNEL = ../arm/min.c # -#ISAMAXKERNEL = ../arm/iamax.c +ISAMAXKERNEL = isamax.c IDAMAXKERNEL = idamax.c -#ICAMAXKERNEL = ../arm/izamax.c -IZAMAXKERNEL = izamax.c +ICAMAXKERNEL = icamax.c +IZAMAXKERNEL = izamax.c # -#ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = idamin.c -#ICAMINKERNEL = ../arm/izamin.c +ISAMINKERNEL = isamin.c +IDAMINKERNEL = idamin.c +ICAMINKERNEL = icamin.c IZAMINKERNEL = izamin.c # #ISMAXKERNEL = ../arm/imax.c @@ -110,9 +110,9 @@ DASUMKERNEL = dasum.c CASUMKERNEL = casum.c ZASUMKERNEL = zasum.c # -#SAXPYKERNEL = ../arm/axpy.c +SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c -#CAXPYKERNEL = ../arm/zaxpy.c +CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c # SCOPYKERNEL = scopy.c @@ -123,7 +123,7 @@ ZCOPYKERNEL = zcopy.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c DSDOTKERNEL = sdot.c -#CDOTKERNEL = ../arm/zdot.c +CDOTKERNEL = cdot.c ZDOTKERNEL = zdot.c # SNRM2KERNEL = ../arm/nrm2.c @@ -133,7 +133,7 @@ ZNRM2KERNEL = ../arm/znrm2.c # SROTKERNEL = srot.c DROTKERNEL = drot.c -#CROTKERNEL = ../arm/zrot.c +CROTKERNEL = crot.c ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c diff --git a/kernel/power/caxpy.c b/kernel/power/caxpy.c new file mode 100644 index 000000000..4bdf13c34 --- /dev/null +++ b/kernel/power/caxpy.c @@ -0,0 +1,145 @@ +/* +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + + +#ifndef HAVE_ASM_KERNEL +#include +static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) +{ + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register __vector float valpha_r = {alpha_r, alpha_r,alpha_r, alpha_r}; + register __vector float valpha_i = {-alpha_i, alpha_i,-alpha_i, alpha_i}; + +#else + register __vector float valpha_r = {alpha_r, -alpha_r,alpha_r, -alpha_r}; + register __vector float valpha_i = {alpha_i, alpha_i,alpha_i, alpha_i}; +#endif + + __vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + register __vector float *vy = (__vector float *) y; + register __vector float *vx = (__vector float *) x; + BLASLONG i=0; + for (; i < n/2; i += 8) { + + register __vector float vy_0 = vy[i]; + register __vector float vy_1 = vy[i + 1]; + register __vector float vy_2 = vy[i + 2]; + register __vector float vy_3 = vy[i + 3]; + register __vector float vy_4 = vy[i + 4]; + register __vector float vy_5 = vy[i + 5]; + register __vector float vy_6 = vy[i + 6]; + register __vector float vy_7 = vy[i + 7]; + register __vector float vx_0 = vx[i]; + register __vector float vx_1 = vx[i + 1]; + register __vector float vx_2 = vx[i + 2]; + register __vector float vx_3 = vx[i + 3]; + register __vector float vx_4 = vx[i + 4]; + register __vector float vx_5 = vx[i + 5]; + register __vector float vx_6 = vx[i + 6]; + register __vector float vx_7 = vx[i + 7]; + vy_0 += vx_0*valpha_r; + vy_1 += vx_1*valpha_r; + vy_2 += vx_2*valpha_r; + vy_3 += vx_3*valpha_r; + vy_4 += vx_4*valpha_r; + vy_5 += vx_5*valpha_r; + vy_6 += vx_6*valpha_r; + vy_7 += vx_7*valpha_r; + vx_0 = vec_perm(vx_0, vx_0, swap_mask); + vx_1 = vec_perm(vx_1, vx_1, swap_mask); + vx_2 = vec_perm(vx_2, vx_2, swap_mask); + vx_3 = vec_perm(vx_3, vx_3, swap_mask); + vx_4 = vec_perm(vx_4, vx_4, swap_mask); + vx_5 = vec_perm(vx_5, vx_5, swap_mask); + vx_6 = vec_perm(vx_6, vx_6, swap_mask); + vx_7 = vec_perm(vx_7, vx_7, swap_mask); + vy_0 += vx_0*valpha_i; + vy_1 += vx_1*valpha_i; + vy_2 += vx_2*valpha_i; + vy_3 += vx_3*valpha_i; + vy_4 += vx_4*valpha_i; + vy_5 += vx_5*valpha_i; + vy_6 += vx_6*valpha_i; + vy_7 += vx_7*valpha_i; + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + 
vy[i + 4] = vy_4; + vy[i + 5] = vy_5 ; + vy[i + 6] = vy_6 ; + vy[i + 7] = vy_7 ; + + } +} +#endif +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + if (n <= 0) return (0); + if ((inc_x == 1) && (inc_y == 1)) { + BLASLONG n1 = n & -16; + if (n1) { + caxpy_kernel_16(n1, x, y, da_r,da_i); + ix = 2 * n1; + } + i = n1; + while (i < n) { +#if !defined(CONJ) + y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); + y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); +#else + y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); + y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); +#endif + i++; + ix += 2; + } + return (0); + + } + inc_x *= 2; + inc_y *= 2; + while (i < n) { +#if !defined(CONJ) + y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); + y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); +#else + y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); + y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); +#endif + ix += inc_x; + iy += inc_y; + i++; + } + return (0); +} + diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c new file mode 100644 index 000000000..f86a33f22 --- /dev/null +++ b/kernel/power/cdot.c @@ -0,0 +1,164 @@ +/*Copyright (c) 2013-201\n8, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +#include "common.h" + +#ifndef HAVE_KERNEL_8 +#include +static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot) +{ + __vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + register __vector float *vy = (__vector float *) y; + register __vector float *vx = (__vector float *) x; + BLASLONG i = 0; + register __vector float vd_0 = { 0 }; + register __vector float vd_1 = { 0 }; + register __vector float vd_2 = { 0 }; + register __vector float vd_3 = { 0 }; + register __vector float vdd_0 = { 0 }; + register __vector float vdd_1 = { 0 }; + register __vector float vdd_2 = { 0 }; + register __vector float vdd_3 = { 0 }; + for (; i < n/2; i += 4) { + + register __vector float vyy_0 ; + register __vector float vyy_1 ; + register __vector float vyy_2 ; + register __vector float vyy_3 ; + + register __vector float vy_0 = vy[i]; + register __vector float vy_1 = vy[i + 1]; + register __vector float vy_2 = vy[i + 2]; + register __vector float vy_3 = vy[i + 3]; + register __vector float vx_0= vx[i]; + register __vector float vx_1 = vx[i + 1]; + register __vector float vx_2 = vx[i + 2]; + register __vector float vx_3 = vx[i + 3]; + vyy_0 = vec_perm(vy_0, vy_0, swap_mask); + vyy_1 = vec_perm(vy_1, vy_1, swap_mask); + vyy_2 = vec_perm(vy_2, vy_2, swap_mask); + vyy_3 = vec_perm(vy_3, vy_3, swap_mask); + + vd_0 += vx_0 * vy_0; + vd_1 += vx_1 * vy_1; + vd_2 += vx_2 * vy_2; + vd_3 += vx_3 * vy_3; + + vdd_0 += vx_0 * vyy_0; + vdd_1 += vx_1 * vyy_1; + vdd_2 += vx_2 * vyy_2; + vdd_3 += vx_3 * vyy_3; + + + } + //aggregate + vd_0 = vd_0 + vd_1 +vd_2 +vd_3; + vdd_0= vdd_0 + vdd_1 +vdd_2 +vdd_3; + //reverse and aggregate + vd_1=vec_xxpermdi(vd_0,vd_0,2) ; + vdd_1=vec_xxpermdi(vdd_0,vdd_0,2); + vd_2=vd_0+vd_1; + vdd_2=vdd_0+vdd_1; + + dot[0]=vd_2[0]; + dot[1]=vd_2[1]; + dot[2]=vdd_2[0]; + dot[3]=vdd_2[1]; + +} +#endif + + +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix=0, iy=0; + OPENBLAS_COMPLEX_FLOAT result; + FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; + + if (n <= 0) { + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + return (result); + + } + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -8; + BLASLONG j=0; + + if (n1){ + cdot_kernel_8(n1, x, y, dot); + i = n1; + j = n1 <<1; + } + + + while (i < n) { + + dot[0] += x[j] * y[j]; + dot[1] += x[j + 1] * y[j + 1]; + dot[2] += x[j] * y[j + 1]; + dot[3] += x[j + 1] * y[j]; + + j += 2; + i++; + + } + + + } else { + i = 0; + ix = 0; + iy = 0; + inc_x <<= 1; + inc_y <<= 1; + while (i < n) { + + dot[0] += x[ix] * y[iy]; + dot[1] += x[ix + 1] * y[iy + 1]; + dot[2] += x[ix] * y[iy + 1]; + dot[3] += x[ix + 1] * y[iy]; + + ix += inc_x; + iy += inc_y; + i++; + + } + } + +#if !defined(CONJ) + CREAL(result) = dot[0] - dot[1]; + CIMAG(result) = dot[2] + dot[3]; +#else + CREAL(result) = dot[0] + dot[1]; + CIMAG(result) = dot[2] - dot[3]; + +#endif + + return (result); + +} diff --git a/kernel/power/crot.c b/kernel/power/crot.c new file mode 100644 index 000000000..7e04a09e8 --- /dev/null +++ b/kernel/power/crot.c @@ -0,0 +1,213 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(POWER8) + +static void crot_kernel_8 (long n, float *x, float *y, float c, float s) +{ + __vector float t0; + __vector float t1; + __vector float t2; + __vector float t3; + __vector float t4; + __vector float t5; + __vector float t6; + __vector float t7; + __asm__ + ( + "xscvdpspn 36, %x[cos] \n\t" // load c to all words + "xxspltw 36, 36, 0 \n\t" + "xscvdpspn 37, %x[sin] \n\t" // load s to all words + "xxspltw 37, 37, 0 \n\t" + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + "addi %[x_ptr], %[x_ptr], 64 \n\t" + "addi %[y_ptr], %[y_ptr], 64 \n\t" + "addic. 
%[temp_n], %[temp_n], -16 \n\t" + "ble 2f \n\t" + ".p2align 5 \n\t" + "1: \n\t" + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + "xvmulsp %x[x0], 48, 36 \n\t" // c * y + "xvmulsp %x[x2], 49, 36 \n\t" + "xvmulsp %x[x1], 50, 36 \n\t" + "xvmulsp %x[x3], 51, 36 \n\t" + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + "xvmulsp %x[x4], 48, 37 \n\t" // s * y + "xvmulsp %x[x5], 49, 37 \n\t" + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "xvmulsp %x[x6], 50, 37 \n\t" + "xvmulsp %x[x7], 51, 37 \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y + "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x + "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x + "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" + "addi %[x_ptr], %[x_ptr], 128 \n\t" + "addi %[y_ptr], %[y_ptr], 128 \n\t" + "addic. 
%[temp_n], %[temp_n], -16 \n\t" + "bgt 1b \n\t" + "2: \n\t" + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + "xvmulsp %x[x0], 48, 36 \n\t" // c * y + "xvmulsp %x[x2], 49, 36 \n\t" + "xvmulsp %x[x1], 50, 36 \n\t" + "xvmulsp %x[x3], 51, 36 \n\t" + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + "xvmulsp %x[x4], 48, 37 \n\t" // s * y + "xvmulsp %x[x5], 49, 37 \n\t" + "xvmulsp %x[x6], 50, 37 \n\t" + "xvmulsp %x[x7], 51, 37 \n\t" + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y + "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y + "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x + "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x + "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] " + : + [mem_x] "+m" (*(float (*)[2*n])x), + [mem_y] "+m" (*(float (*)[2*n])y), + [temp_n] "+r" (n), + [x_ptr] "+&b" (x), + [y_ptr] "+&b" (y), + [x0] "=wa" (t0), + [x1] "=wa" (t2), + [x2] "=wa" (t1), + [x3] "=wa" (t3), + [x4] "=wa" (t4), + [x5] "=wa" (t5), + [x6] "=wa" (t6), + [x7] "=wa" (t7) + : + [cos] "f" (c), + [sin] "f" (s), + [i16] "b" (16), + [i32] "b" (32), + [i48] "b" (48) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); +} + +#endif + + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT temp; + if ( n <= 0 ) return(0); + if ( (inc_x == 1) && (inc_y == 1) ) + { + BLASLONG n1 = n & -8; + if ( n1 > 0 ) + { + crot_kernel_8(n1, x1, y1, c, s); + i=n1; + } + while(i < n) + { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + i++ ; + } + + } + else + { + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + ix += inc_x ; + iy += inc_y ; + i++ ; + } + } + return(0); +} + diff --git a/kernel/power/icamax.c b/kernel/power/icamax.c new file mode 100644 index 000000000..aa0531dc6 --- /dev/null +++ b/kernel/power/icamax.c @@ -0,0 +1,261 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + + + + +/** + * Find maximum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { + + BLASLONG index; + BLASLONG i; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + register __vector float quadruple_values={0,0,0,0}; + + register __vector float * v_ptrx=(__vector float *)x; + register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; + register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; + for(; i31 + + //find final quadruple from 32 elements + r2=vec_cmpgt(vv0,vf0); + ind2 = vec_sel( indf0,indv0,r2); + vv0= vec_sel(vf0,vv0,r2); + //get asbolute index + ind2+=temp0; + //compare with old quadruple and update + r1=vec_cmpgt(vv0,quadruple_values); + quadruple_indices = vec_sel( quadruple_indices,ind2,r1); + quadruple_values= vec_sel(quadruple_values,vv0,r1); + + temp0+=temp_add; + } + + //now we have to chose from 4 values and 4 different indices + // we will compare pairwise if pairs are exactly the same we will choose minimum between index + // otherwise we will assign index of the maximum value + float a1,a2,a3,a4; + unsigned int i1,i2,i3,i4; + a1=vec_extract(quadruple_values,0); + a2=vec_extract(quadruple_values,1); + a3=vec_extract(quadruple_values,2); + a4=vec_extract(quadruple_values,3); + i1=vec_extract(quadruple_indices,0); + i2=vec_extract(quadruple_indices,1); + i3=vec_extract(quadruple_indices,2); + i4=vec_extract(quadruple_indices,3); + if(a1==a2){ + index=i1>i2?i2:i1; + }else if(a2>a1){ + index=i2; + a1=a2; + }else{ + index= i1; + } + + if(a4==a3){ + i1=i3>i4?i4:i3; + }else if(a4>a3){ + i1=i4; + a3=a4; + }else{ + i1= i3; + } + + if(a1==a3){ + index=i1>index?index:i1; + *maxf=a1; + }else 
if(a3>a1){ + index=i1; + *maxf=a3; + }else{ + *maxf=a1; + } + return index; + +} + + + + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = ciamax_kernel_32(n1, x, &maxf); + i = n1; + ix = n1 << 1; + } + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (max + 1); + + } else { + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (max + 1); + } + +} + + diff --git a/kernel/power/icamin.c b/kernel/power/icamin.c new file mode 100644 index 000000000..36432c993 --- /dev/null +++ b/kernel/power/icamin.c @@ -0,0 +1,266 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + + + + +/** + * Find minimum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return index + */ +static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { + + BLASLONG index; + BLASLONG i; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + float first_min=CABS1(x,0); + register __vector float quadruple_values={first_min,first_min,first_min,first_min}; + + register __vector float * v_ptrx=(__vector float *)x; + register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; + register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; + for(; i31 + + //find final quadruple from 32 elements + r2=vec_cmpgt(vf0,vv0); + ind2 = vec_sel( indf0,indv0,r2); + vv0= vec_sel(vf0,vv0,r2); + //get asbolute index + ind2+=temp0; + //compare with old quadruple and update + r1=vec_cmpgt(quadruple_values,vv0); + quadruple_indices = vec_sel( quadruple_indices,ind2,r1); + quadruple_values= vec_sel(quadruple_values,vv0,r1); + + temp0+=temp_add; + } + + //now we have to chose from 4 values and 4 different indices + // we will compare pairwise if pairs are exactly the same we will choose minimum between index + // otherwise we will assign index of the minimum value + float a1,a2,a3,a4; + unsigned int i1,i2,i3,i4; + a1=vec_extract(quadruple_values,0); + a2=vec_extract(quadruple_values,1); + a3=vec_extract(quadruple_values,2); + a4=vec_extract(quadruple_values,3); + i1=vec_extract(quadruple_indices,0); + i2=vec_extract(quadruple_indices,1); + i3=vec_extract(quadruple_indices,2); + i4=vec_extract(quadruple_indices,3); + if(a1==a2){ + index=i1>i2?i2:i1; + }else if(a2i4?i4:i3; + }else if(a4index?index:i1; + *minf=a1; + }else if(a3 0) { + + min = ciamin_kernel_32(n1, x, &minf); + i = n1; + ix = n1 << 1; + } + + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (min + 1); + + } else { + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (min + 1); + } + +} + + diff --git a/kernel/power/idamin.c b/kernel/power/idamin.c index f4d1d1bdb..7fe0f8a33 100644 --- a/kernel/power/idamin.c +++ b/kernel/power/idamin.c @@ -89,10 +89,10 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { ".p2align 5 \n\t" "1: \n\t" - "xvcmpgedp 2,44,45 \n\t " - "xvcmpgedp 3,46,47 \n\t " - "xvcmpgedp 4,48,49 \n\t " - "xvcmpgedp 5,50,51 \n\t" + "xvcmpgtdp 2,44,45 \n\t " + 
"xvcmpgtdp 3,46,47 \n\t " + "xvcmpgtdp 4,48,49 \n\t " + "xvcmpgtdp 5,50,51 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -103,8 +103,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 35,42,43,5 \n\t" "xxsel 47,50,51,5 \n\t" - "xvcmpgedp 2,0, 1 \n\t" - "xvcmpgedp 3, 45,47 \n\t" + "xvcmpgtdp 2,0, 1 \n\t" + "xvcmpgtdp 3, 45,47 \n\t" "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" @@ -125,7 +125,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" //choose smaller from first and second part - "xvcmpgedp 4, 0,5 \n\t" + "xvcmpgtdp 4, 0,5 \n\t" "xxsel 3, 0,5,4 \n\t" "xxsel 33,32,34,4 \n\t" @@ -139,7 +139,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) - "xvcmpgedp 2,39, 3 \n\t" + "xvcmpgtdp 2,39, 3 \n\t" "xxsel 39,39,3,2 \n\t" "xxsel 38,38,33,2 \n\t" @@ -162,10 +162,10 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { //<-----------jump here from first load "2: \n\t" - "xvcmpgedp 2,44,45 \n\t " - "xvcmpgedp 3,46,47 \n\t " - "xvcmpgedp 4,48,49 \n\t " - "xvcmpgedp 5,50,51 \n\t" + "xvcmpgtdp 2,44,45 \n\t " + "xvcmpgtdp 3,46,47 \n\t " + "xvcmpgtdp 4,48,49 \n\t " + "xvcmpgtdp 5,50,51 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -176,8 +176,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 35,42,43,5 \n\t" "xxsel 47,50,51,5 \n\t" - "xvcmpgedp 2,0, 1 \n\t" - "xvcmpgedp 3, 45,47 \n\t" + "xvcmpgtdp 2,0, 1 \n\t" + "xvcmpgtdp 3, 45,47 \n\t" "xxsel 32,32,33,2 \n\t" "xxsel 0 ,0,1,2 \n\t" "xxsel 34,34,35,3 \n\t" @@ -194,7 +194,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" //choose smaller from first and second part - "xvcmpgedp 4, 0,5 \n\t" + "xvcmpgtdp 4, 0,5 \n\t" "xxsel 3, 0,5,4 \n\t" "xxsel 33,32,34,4 \n\t" @@ -210,7 +210,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) - "xvcmpgedp 2,39, 3 \n\t" + "xvcmpgtdp 2,39, 3 \n\t" "xxsel 39,39,3,2 \n\t" "xxsel 38,38,33,2 \n\t" @@ -238,10 +238,10 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { //============================================================================== - "xvcmpgedp 2,44,45 \n\t " - "xvcmpgedp 3,46,47 \n\t " - "xvcmpgedp 4,48,49 \n\t " - "xvcmpgedp 5,50,51 \n\t" + "xvcmpgtdp 2,44,45 \n\t " + "xvcmpgtdp 3,46,47 \n\t " + "xvcmpgtdp 4,48,49 \n\t " + "xvcmpgtdp 5,50,51 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -252,8 +252,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 35,42,43,5 \n\t" "xxsel 47,50,51,5 \n\t" - "xvcmpgedp 2,0, 1 \n\t" - "xvcmpgedp 3, 45,47 \n\t" + "xvcmpgtdp 2,0, 1 \n\t" + "xvcmpgtdp 3, 45,47 \n\t" "xxsel 32,32,33,2 \n\t" @@ -264,14 +264,14 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { // for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16} "vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8} //choose smaller from first and second part - "xvcmpgedp 4, 0,5 \n\t" + "xvcmpgtdp 4, 0,5 \n\t" "xxsel 3, 0,5,4 \n\t" "xxsel 33,32,34,4 \n\t" "vaddudm 1,1,5 \n\t" // get real index for first smaller //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) - "xvcmpgedp 2,39, 3 \n\t" + "xvcmpgtdp 2,39, 3 \n\t" "xxsel 39,39,3,2 \n\t" 
"xxsel 38,38,33,2 \n\t" @@ -284,7 +284,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //0b001110=14 "bc 14,24, 3f \n\t" - "xvcmpgedp 4,39, 40 \n\t" + "xvcmpgtdp 4,39, 40 \n\t" "xxsel 0,39,40,4 \n\t" "xxsel 1,38,32,4 \n\t" "stxsdx 0,0,%[ptr_minf] \n\t" diff --git a/kernel/power/isamax.c b/kernel/power/isamax.c new file mode 100644 index 000000000..bf1af78d6 --- /dev/null +++ b/kernel/power/isamax.c @@ -0,0 +1,288 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "common.h" +#include +#include + + +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif + +/** + * Find maximum index + * Warning: requirements n>0 and n % 64 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + register __vector float quadruple_values={0,0,0,0}; + register __vector float * v_ptrx=(__vector float *)x; + for(; ii2?i2:i1; + }else if(a2>a1){ + index=i2; + a1=a2; + }else{ + index= i1; + } + + if(a4==a3){ + i1=i3>i4?i4:i3; + }else if(a4>a3){ + i1=i4; + a3=a4; + }else{ + i1= i3; + } + + if(a1==a3){ + index=i1>index?index:i1; + *maxf=a1; + }else if(a3>a1){ + index=i1; + *maxf=a3; + }else{ + *maxf=a1; + } + return index; + +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = siamax_kernel_64(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/power/isamin.c b/kernel/power/isamin.c new file mode 100644 index 000000000..1c1f0ad78 --- /dev/null +++ b/kernel/power/isamin.c @@ -0,0 +1,288 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +/** + * Find minimum index + * Warning: requirements n>0 and n % 64 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return index + */ +static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; + register __vector float * v_ptrx=(__vector float *)x; + register __vector float quadruple_values=vec_abs(v_ptrx[0]); + for(; ii2?i2:i1; + }else if(a2i4?i4:i3; + }else if(a4index?index:i1; + *minf=a1; + }else if(a3 0) { + + min = siamin_kernel_64(n1, x, &minf); + i = n1; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/power/izamin.c b/kernel/power/izamin.c index 448247ffd..1ffa3ba8b 100644 --- a/kernel/power/izamin.c +++ b/kernel/power/izamin.c @@ -101,8 +101,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { - "xvcmpgedp 50,46,47 \n\t " - "xvcmpgedp 51,48,49 \n\t " + "xvcmpgtdp 50,46,47 \n\t " + "xvcmpgtdp 51,48,49 \n\t " "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" @@ -114,7 +114,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" - "xvcmpgedp 2,0,1 \n\t " + "xvcmpgtdp 2,0,1 \n\t " "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" @@ -126,7 +126,7 @@ static BLASLONG 
ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { //cmp with previous - "xvcmpgedp 4,39,3 \n\t " + "xvcmpgtdp 4,39,3 \n\t " "vaddudm 5,5,4 \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" @@ -166,8 +166,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvadddp 48, 4,5 \n\t" "xvadddp 49, 44,45 \n\t" - "xvcmpgedp 50,46,47 \n\t " - "xvcmpgedp 51,48,49 \n\t " + "xvcmpgtdp 50,46,47 \n\t " + "xvcmpgtdp 51,48,49 \n\t " "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" @@ -179,7 +179,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" - "xvcmpgedp 2,0,1 \n\t " + "xvcmpgtdp 2,0,1 \n\t " "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" @@ -191,7 +191,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { //cmp with previous - "xvcmpgedp 4,39,3 \n\t " + "xvcmpgtdp 4,39,3 \n\t " "vaddudm 5,5,4 \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" @@ -235,15 +235,15 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { - "xvcmpgedp 50,46,47 \n\t " - "xvcmpgedp 51,48,49 \n\t " + "xvcmpgtdp 50,46,47 \n\t " + "xvcmpgtdp 51,48,49 \n\t " "xxsel 32,40,41,50 \n\t" "xxsel 0,46,47,50 \n\t" "xxsel 33,42,43,51 \n\t" "xxsel 1,48,49,51 \n\t" - "xvcmpgedp 2,0,1 \n\t " + "xvcmpgtdp 2,0,1 \n\t " "xxsel 32,32,33,2 \n\t" "xxsel 3,0,1,2 \n\t" @@ -252,7 +252,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" //cmp with previous - "xvcmpgedp 4,39,3 \n\t " + "xvcmpgtdp 4,39,3 \n\t " "vaddudm 5,5,4 \n\t" "xxsel 38,38,32,4 \n\t" "xxsel 39,39,3,4 \n\t" @@ -267,7 +267,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //0b001110=14 "bc 14,24, 3f \n\t" - "xvcmpgedp 4,39, 40 \n\t" + "xvcmpgtdp 4,39, 40 \n\t" "xxsel 0,39,40,4 \n\t" "xxsel 1,38,32,4 \n\t" "stxsdx 0,0,%[ptr_minf] \n\t" diff --git a/kernel/power/saxpy.c b/kernel/power/saxpy.c new file mode 100644 index 000000000..393cdfadc --- /dev/null +++ b/kernel/power/saxpy.c @@ -0,0 +1,129 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + + + + +#ifndef HAVE_KERNEL_8 +#include + +static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) +{ + BLASLONG i = 0; + __vector float v_a = {alpha,alpha,alpha,alpha}; + __vector float * v_y=(__vector float *)y; + __vector float * v_x=(__vector float *)x; + + for(; i Date: Thu, 17 Jan 2019 14:45:31 +0000 Subject: [PATCH 217/236] crot fix --- kernel/power/crot.c | 90 +++++++++++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 36 deletions(-) diff --git a/kernel/power/crot.c b/kernel/power/crot.c index 7e04a09e8..40e350ba3 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -55,7 +55,7 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s) "lxvd2x 51, %[i48], %[y_ptr] \n\t" "addi %[x_ptr], %[x_ptr], 64 \n\t" "addi %[y_ptr], %[y_ptr], 64 \n\t" - "addic. %[temp_n], %[temp_n], -16 \n\t" + "addic. %[temp_n], %[temp_n], -8 \n\t" "ble 2f \n\t" ".p2align 5 \n\t" "1: \n\t" @@ -103,7 +103,7 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s) "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" "addi %[x_ptr], %[x_ptr], 128 \n\t" "addi %[y_ptr], %[y_ptr], 128 \n\t" - "addic. %[temp_n], %[temp_n], -16 \n\t" + "addic. 
%[temp_n], %[temp_n], -8 \n\t" "bgt 1b \n\t" "2: \n\t" "xvmulsp 40, 32, 36 \n\t" // c * x @@ -173,41 +173,59 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s) int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) { - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT *x1=x; - FLOAT *y1=y; - FLOAT temp; - if ( n <= 0 ) return(0); - if ( (inc_x == 1) && (inc_y == 1) ) - { - BLASLONG n1 = n & -8; - if ( n1 > 0 ) - { - crot_kernel_8(n1, x1, y1, c, s); - i=n1; - } - while(i < n) - { - temp = c*x[i] + s*y[i] ; - y[i] = c*y[i] - s*x[i] ; - x[i] = temp ; - i++ ; - } + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; - } - else - { - while(i < n) - { - temp = c*x[ix] + s*y[iy] ; - y[iy] = c*y[iy] - s*x[ix] ; - x[ix] = temp ; - ix += inc_x ; - iy += inc_y ; - i++ ; - } - } + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -8; + if ( n1 > 0 ) + { + crot_kernel_8(n1, x, y, c, s); + i=n1; + ix=2*n1; + } + + while(i < n) + { + temp[0] = c*x[ix] + s*y[ix] ; + temp[1] = c*x[ix+1] + s*y[ix+1] ; + y[ix] = c*y[ix] - s*x[ix] ; + y[ix+1] = c*y[ix+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += 2 ; + i++ ; + + } + + } + else + { + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + } return(0); } From 3e9fd6359dabb1c9c8ce3fa5e980e94a3536d2c0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Jan 2019 16:19:03 +0100 Subject: [PATCH 218/236] Bump xcode version to 10.1 to make sure it handles AVX512 --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 51679af62..ec5dc8a9b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -149,7 +149,7 @@ matrix: - &test-macos os: osx - osx_image: xcode8.3 + osx_image: xcode10.1 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - brew update From d5e6940253b2ee638509de283b8b1d7695fefbbf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Jan 2019 23:20:32 +0100 Subject: [PATCH 219/236] Fix declaration of input arguments in the x86_64 microkernels for DOT and AXPY (#1965) * Tag operands 0 and 1 as both input and output For #1964 (basically a continuation of coding problems first seen in #1292) --- kernel/x86_64/caxpy_microk_bulldozer-2.c | 14 +++++++------- kernel/x86_64/caxpy_microk_haswell-2.c | 6 +++--- kernel/x86_64/caxpy_microk_sandy-2.c | 8 ++++---- kernel/x86_64/caxpy_microk_steamroller-2.c | 14 +++++++------- kernel/x86_64/cdot_microk_bulldozer-2.c | 14 +++++++------- kernel/x86_64/cdot_microk_haswell-2.c | 6 +++--- kernel/x86_64/cdot_microk_sandy-2.c | 8 ++++---- kernel/x86_64/cdot_microk_steamroller-2.c | 14 +++++++------- kernel/x86_64/daxpy_microk_bulldozer-2.c | 6 +++--- kernel/x86_64/daxpy_microk_haswell-2.c | 8 ++++---- kernel/x86_64/daxpy_microk_nehalem-2.c | 6 +++--- kernel/x86_64/daxpy_microk_piledriver-2.c | 16 ++++++++-------- kernel/x86_64/daxpy_microk_sandy-2.c | 8 ++++---- kernel/x86_64/daxpy_microk_steamroller-2.c | 16 ++++++++-------- kernel/x86_64/ddot_microk_bulldozer-2.c | 8 ++++---- kernel/x86_64/ddot_microk_haswell-2.c | 6 +++--- kernel/x86_64/ddot_microk_nehalem-2.c | 8 ++++---- kernel/x86_64/ddot_microk_piledriver-2.c | 16 ++++++++-------- 
kernel/x86_64/ddot_microk_sandy-2.c | 8 ++++---- kernel/x86_64/ddot_microk_steamroller-2.c | 8 ++++---- kernel/x86_64/saxpy_microk_haswell-2.c | 8 ++++---- kernel/x86_64/saxpy_microk_nehalem-2.c | 6 +++--- kernel/x86_64/saxpy_microk_piledriver-2.c | 16 ++++++++-------- kernel/x86_64/saxpy_microk_sandy-2.c | 8 ++++---- kernel/x86_64/sdot_microk_bulldozer-2.c | 8 ++++---- kernel/x86_64/sdot_microk_haswell-2.c | 8 ++++---- kernel/x86_64/sdot_microk_nehalem-2.c | 8 ++++---- kernel/x86_64/sdot_microk_sandy-2.c | 8 ++++---- kernel/x86_64/sdot_microk_steamroller-2.c | 16 ++++++++-------- kernel/x86_64/zaxpy_microk_bulldozer-2.c | 16 ++++++++-------- kernel/x86_64/zaxpy_microk_haswell-2.c | 8 ++++---- kernel/x86_64/zaxpy_microk_sandy-2.c | 16 ++++++++-------- kernel/x86_64/zaxpy_microk_steamroller-2.c | 16 ++++++++-------- kernel/x86_64/zdot_microk_bulldozer-2.c | 16 ++++++++-------- kernel/x86_64/zdot_microk_haswell-2.c | 16 ++++++++-------- kernel/x86_64/zdot_microk_sandy-2.c | 16 ++++++++-------- kernel/x86_64/zdot_microk_steamroller-2.c | 16 ++++++++-------- 37 files changed, 202 insertions(+), 202 deletions(-) diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c index 33bda0943..ca2209340 100644 --- a/kernel/x86_64/caxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c @@ -114,9 +114,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -180,10 +180,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/caxpy_microk_haswell-2.c b/kernel/x86_64/caxpy_microk_haswell-2.c index 00e2e6a42..b605ea34c 100644 --- a/kernel/x86_64/caxpy_microk_haswell-2.c +++ b/kernel/x86_64/caxpy_microk_haswell-2.c @@ -112,9 +112,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/caxpy_microk_sandy-2.c b/kernel/x86_64/caxpy_microk_sandy-2.c index a798fd977..72d37afed 100644 --- a/kernel/x86_64/caxpy_microk_sandy-2.c +++ b/kernel/x86_64/caxpy_microk_sandy-2.c @@ -95,10 +95,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/caxpy_microk_steamroller-2.c b/kernel/x86_64/caxpy_microk_steamroller-2.c index 87370b032..7ca7af070 100644 --- a/kernel/x86_64/caxpy_microk_steamroller-2.c +++ b/kernel/x86_64/caxpy_microk_steamroller-2.c @@ -113,10 +113,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -181,9 +181,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/cdot_microk_bulldozer-2.c 
b/kernel/x86_64/cdot_microk_bulldozer-2.c index f587aa036..118655913 100644 --- a/kernel/x86_64/cdot_microk_bulldozer-2.c +++ b/kernel/x86_64/cdot_microk_bulldozer-2.c @@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/cdot_microk_haswell-2.c b/kernel/x86_64/cdot_microk_haswell-2.c index fe195a63b..8b9d6d104 100644 --- a/kernel/x86_64/cdot_microk_haswell-2.c +++ b/kernel/x86_64/cdot_microk_haswell-2.c @@ -98,9 +98,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/cdot_microk_sandy-2.c b/kernel/x86_64/cdot_microk_sandy-2.c index 01816917d..fe142c38f 100644 --- a/kernel/x86_64/cdot_microk_sandy-2.c +++ b/kernel/x86_64/cdot_microk_sandy-2.c @@ -105,10 +105,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/cdot_microk_steamroller-2.c b/kernel/x86_64/cdot_microk_steamroller-2.c index 76a3aa0eb..7350b21c9 100644 --- a/kernel/x86_64/cdot_microk_steamroller-2.c +++ b/kernel/x86_64/cdot_microk_steamroller-2.c @@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/daxpy_microk_bulldozer-2.c b/kernel/x86_64/daxpy_microk_bulldozer-2.c index 8c520dcf1..9c1305b97 100644 --- a/kernel/x86_64/daxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/daxpy_microk_bulldozer-2.c @@ -64,9 +64,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c index bbe8b9550..f3682e6d7 100644 --- a/kernel/x86_64/daxpy_microk_haswell-2.c +++ b/kernel/x86_64/daxpy_microk_haswell-2.c @@ -59,10 +59,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_nehalem-2.c b/kernel/x86_64/daxpy_microk_nehalem-2.c index 943d893af..8feb9f26c 100644 --- a/kernel/x86_64/daxpy_microk_nehalem-2.c +++ b/kernel/x86_64/daxpy_microk_nehalem-2.c @@ -73,9 +73,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" : - : - "r" (i), // 
0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_piledriver-2.c b/kernel/x86_64/daxpy_microk_piledriver-2.c index 95eb953b4..4b83124c7 100644 --- a/kernel/x86_64/daxpy_microk_piledriver-2.c +++ b/kernel/x86_64/daxpy_microk_piledriver-2.c @@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "subq $16, %1 \n\t" "jnz 1b \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 @@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "subq $16, %1 \n\t" "jnz 1b \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_sandy-2.c b/kernel/x86_64/daxpy_microk_sandy-2.c index 85e038cef..db9a45de8 100644 --- a/kernel/x86_64/daxpy_microk_sandy-2.c +++ b/kernel/x86_64/daxpy_microk_sandy-2.c @@ -99,10 +99,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_steamroller-2.c b/kernel/x86_64/daxpy_microk_steamroller-2.c index e40009037..8e63fcc1d 100644 --- a/kernel/x86_64/daxpy_microk_steamroller-2.c +++ b/kernel/x86_64/daxpy_microk_steamroller-2.c @@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "subq $16, %1 \n\t" "jnz 1b \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 @@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "subq $16, %1 \n\t" "jnz 1b \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/ddot_microk_bulldozer-2.c b/kernel/x86_64/ddot_microk_bulldozer-2.c index 9756ee46a..5590c5b17 100644 --- a/kernel/x86_64/ddot_microk_bulldozer-2.c +++ b/kernel/x86_64/ddot_microk_bulldozer-2.c @@ -65,10 +65,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c index 365737363..dbb5487f7 100644 --- a/kernel/x86_64/ddot_microk_haswell-2.c +++ b/kernel/x86_64/ddot_microk_haswell-2.c @@ -77,9 +77,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_nehalem-2.c b/kernel/x86_64/ddot_microk_nehalem-2.c index fb5ec9bca..e5e234e22 100644 --- a/kernel/x86_64/ddot_microk_nehalem-2.c +++ b/kernel/x86_64/ddot_microk_nehalem-2.c @@ -75,10 +75,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "movsd %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_piledriver-2.c b/kernel/x86_64/ddot_microk_piledriver-2.c index ac950885c..cc4bcd90a 100644 --- 
a/kernel/x86_64/ddot_microk_piledriver-2.c +++ b/kernel/x86_64/ddot_microk_piledriver-2.c @@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -145,10 +145,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_sandy-2.c b/kernel/x86_64/ddot_microk_sandy-2.c index 160f95604..84493ec27 100644 --- a/kernel/x86_64/ddot_microk_sandy-2.c +++ b/kernel/x86_64/ddot_microk_sandy-2.c @@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_steamroller-2.c b/kernel/x86_64/ddot_microk_steamroller-2.c index 5ce20b5de..27d5244ce 100644 --- a/kernel/x86_64/ddot_microk_steamroller-2.c +++ b/kernel/x86_64/ddot_microk_steamroller-2.c @@ -78,10 +78,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c index 3a743d64c..7099ba4c6 100644 --- a/kernel/x86_64/saxpy_microk_haswell-2.c +++ b/kernel/x86_64/saxpy_microk_haswell-2.c @@ -59,10 +59,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/saxpy_microk_nehalem-2.c b/kernel/x86_64/saxpy_microk_nehalem-2.c index 68f68ea3a..88bbb695d 100644 --- a/kernel/x86_64/saxpy_microk_nehalem-2.c +++ b/kernel/x86_64/saxpy_microk_nehalem-2.c @@ -73,9 +73,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/saxpy_microk_piledriver-2.c b/kernel/x86_64/saxpy_microk_piledriver-2.c index 204cf8bac..5feea7f24 100644 --- a/kernel/x86_64/saxpy_microk_piledriver-2.c +++ b/kernel/x86_64/saxpy_microk_piledriver-2.c @@ -78,10 +78,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 @@ -139,10 +139,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/saxpy_microk_sandy-2.c b/kernel/x86_64/saxpy_microk_sandy-2.c index 0a6bef046..0d448d5f8 100644 --- a/kernel/x86_64/saxpy_microk_sandy-2.c +++ b/kernel/x86_64/saxpy_microk_sandy-2.c @@ -99,10 +99,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" - : - : - "r" 
(i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/sdot_microk_bulldozer-2.c b/kernel/x86_64/sdot_microk_bulldozer-2.c index 36e61b077..8958a33dc 100644 --- a/kernel/x86_64/sdot_microk_bulldozer-2.c +++ b/kernel/x86_64/sdot_microk_bulldozer-2.c @@ -66,10 +66,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c index df367b61f..91dc928d3 100644 --- a/kernel/x86_64/sdot_microk_haswell-2.c +++ b/kernel/x86_64/sdot_microk_haswell-2.c @@ -79,10 +79,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c index 1a27177f5..5a715d008 100644 --- a/kernel/x86_64/sdot_microk_nehalem-2.c +++ b/kernel/x86_64/sdot_microk_nehalem-2.c @@ -75,10 +75,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "movss %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/sdot_microk_sandy-2.c b/kernel/x86_64/sdot_microk_sandy-2.c index ca13536f2..ae25d5a50 100644 --- a/kernel/x86_64/sdot_microk_sandy-2.c +++ b/kernel/x86_64/sdot_microk_sandy-2.c @@ -82,10 +82,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/sdot_microk_steamroller-2.c b/kernel/x86_64/sdot_microk_steamroller-2.c index 6b8b2566b..bf6a5f287 100644 --- a/kernel/x86_64/sdot_microk_steamroller-2.c +++ b/kernel/x86_64/sdot_microk_steamroller-2.c @@ -80,10 +80,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -143,10 +143,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c index 0e15761f7..15d367971 100644 --- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c @@ -113,10 +113,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -180,10 +180,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/zaxpy_microk_haswell-2.c b/kernel/x86_64/zaxpy_microk_haswell-2.c index 30e8b1955..89d23daf3 100644 --- 
a/kernel/x86_64/zaxpy_microk_haswell-2.c +++ b/kernel/x86_64/zaxpy_microk_haswell-2.c @@ -111,10 +111,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/zaxpy_microk_sandy-2.c b/kernel/x86_64/zaxpy_microk_sandy-2.c index 233af143a..17b8b24f7 100644 --- a/kernel/x86_64/zaxpy_microk_sandy-2.c +++ b/kernel/x86_64/zaxpy_microk_sandy-2.c @@ -99,10 +99,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -176,10 +176,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/zaxpy_microk_steamroller-2.c b/kernel/x86_64/zaxpy_microk_steamroller-2.c index 728d09213..907b1ae00 100644 --- a/kernel/x86_64/zaxpy_microk_steamroller-2.c +++ b/kernel/x86_64/zaxpy_microk_steamroller-2.c @@ -113,10 +113,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -180,10 +180,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/zdot_microk_bulldozer-2.c b/kernel/x86_64/zdot_microk_bulldozer-2.c index 30a9552d6..db9a48cce 100644 --- a/kernel/x86_64/zdot_microk_bulldozer-2.c +++ b/kernel/x86_64/zdot_microk_bulldozer-2.c @@ -96,10 +96,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -175,10 +175,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c index 11056a3c1..9f2fc2c1d 100644 --- a/kernel/x86_64/zdot_microk_haswell-2.c +++ b/kernel/x86_64/zdot_microk_haswell-2.c @@ -101,10 +101,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -186,10 +186,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/zdot_microk_sandy-2.c b/kernel/x86_64/zdot_microk_sandy-2.c index 87c5b0340..33415e26e 100644 --- a/kernel/x86_64/zdot_microk_sandy-2.c +++ b/kernel/x86_64/zdot_microk_sandy-2.c @@ -107,10 +107,10 @@ if ( n < 1280 ) "vmovups %%xmm4, 
16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -199,10 +199,10 @@ if ( n < 1280 ) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/zdot_microk_steamroller-2.c b/kernel/x86_64/zdot_microk_steamroller-2.c index 325f74ae3..87138fe9a 100644 --- a/kernel/x86_64/zdot_microk_steamroller-2.c +++ b/kernel/x86_64/zdot_microk_steamroller-2.c @@ -95,10 +95,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -172,10 +172,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 From b495e54310a99049c50c20425269f4b026b47dbb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 18 Jan 2019 08:11:07 +0100 Subject: [PATCH 220/236] Fix declaration of input arguments in the x86_64 SCAL microkernels (#1966) * Tag arguments 0 and 1 as both input and output (see #1964) --- kernel/x86_64/cscal_microk_bulldozer-2.c | 32 +++++++++++----------- kernel/x86_64/cscal_microk_haswell-2.c | 30 ++++++++++---------- kernel/x86_64/cscal_microk_steamroller-2.c | 32 +++++++++++----------- kernel/x86_64/dscal_microk_bulldozer-2.c | 12 ++++---- kernel/x86_64/dscal_microk_haswell-2.c | 12 ++++---- kernel/x86_64/dscal_microk_sandy-2.c | 12 ++++---- kernel/x86_64/zscal_microk_bulldozer-2.c | 28 +++++++++---------- kernel/x86_64/zscal_microk_haswell-2.c | 32 +++++++++++----------- kernel/x86_64/zscal_microk_steamroller-2.c | 32 +++++++++++----------- 9 files changed, 111 insertions(+), 111 deletions(-) diff --git a/kernel/x86_64/cscal_microk_bulldozer-2.c b/kernel/x86_64/cscal_microk_bulldozer-2.c index 3abffc4cf..31451aa6c 100644 --- a/kernel/x86_64/cscal_microk_bulldozer-2.c +++ b/kernel/x86_64/cscal_microk_bulldozer-2.c @@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,11 +208,11 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -285,11 +285,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -330,11 +330,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", 
"%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", diff --git a/kernel/x86_64/cscal_microk_haswell-2.c b/kernel/x86_64/cscal_microk_haswell-2.c index 0a4eb683c..a04a4c4ab 100644 --- a/kernel/x86_64/cscal_microk_haswell-2.c +++ b/kernel/x86_64/cscal_microk_haswell-2.c @@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,9 +208,9 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 : "cc", // "0", "1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", @@ -285,9 +285,9 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", @@ -329,12 +329,12 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" - : - : - "r" (n), // 0 - "r" (x), // 1 + : + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", diff --git a/kernel/x86_64/cscal_microk_steamroller-2.c b/kernel/x86_64/cscal_microk_steamroller-2.c index 8346e1748..e8073d485 100644 --- a/kernel/x86_64/cscal_microk_steamroller-2.c +++ b/kernel/x86_64/cscal_microk_steamroller-2.c @@ -117,11 +117,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,12 +208,12 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n), // 0 + "+r" (x) // 1 : - : - "r" (n), // 0 - "r" (x), // 1 "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -286,11 +286,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -331,11 +331,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", diff --git a/kernel/x86_64/dscal_microk_bulldozer-2.c b/kernel/x86_64/dscal_microk_bulldozer-2.c index de53b0bc4..096662781 100644 --- a/kernel/x86_64/dscal_microk_bulldozer-2.c +++ b/kernel/x86_64/dscal_microk_bulldozer-2.c @@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n1), // 0 - "r" (x), // 1 + "+r" (n1), // 0 + "+r" (x) // 1 + : "r" (alpha), // 2 "r" 
(n2) // 3 : "cc", @@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n1), // 0 - "r" (x), // 1 + "+r" (n1), // 0 + "+r" (x) // 1 + : "r" (alpha), // 2 "r" (n2) // 3 : "cc", diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c index e732a2718..77ed59a4e 100644 --- a/kernel/x86_64/dscal_microk_haswell-2.c +++ b/kernel/x86_64/dscal_microk_haswell-2.c @@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n1), // 0 - "r" (x), // 1 + "+r" (n1), // 0 + "+r" (x) // 1 + : "r" (alpha), // 2 "r" (n2) // 3 : "cc", @@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n1), // 0 + "+r" (x) // 1 : - : - "r" (n1), // 0 - "r" (x), // 1 "r" (alpha), // 2 "r" (n2) // 3 : "cc", diff --git a/kernel/x86_64/dscal_microk_sandy-2.c b/kernel/x86_64/dscal_microk_sandy-2.c index 8d855072b..9982b8e58 100644 --- a/kernel/x86_64/dscal_microk_sandy-2.c +++ b/kernel/x86_64/dscal_microk_sandy-2.c @@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n1), // 0 - "r" (x), // 1 + "+r" (n1), // 0 + "+r" (x) // 1 + : "r" (alpha), // 2 "r" (n2) // 3 : "cc", @@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n1), // 0 + "+r" (x) // 1 : - : - "r" (n1), // 0 - "r" (x), // 1 "r" (alpha), // 2 "r" (n2) // 3 : "cc", diff --git a/kernel/x86_64/zscal_microk_bulldozer-2.c b/kernel/x86_64/zscal_microk_bulldozer-2.c index 03882d6b6..5e733ffda 100644 --- a/kernel/x86_64/zscal_microk_bulldozer-2.c +++ b/kernel/x86_64/zscal_microk_bulldozer-2.c @@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -285,9 +285,9 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", @@ -329,10 +329,10 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n), // 0 + "+r" (x) // 1 : - : - "r" (n), // 0 - "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", diff --git a/kernel/x86_64/zscal_microk_haswell-2.c b/kernel/x86_64/zscal_microk_haswell-2.c index d9253c1ed..8c8f5b75c 100644 --- a/kernel/x86_64/zscal_microk_haswell-2.c +++ b/kernel/x86_64/zscal_microk_haswell-2.c @@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ 
-208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -285,11 +285,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -330,11 +330,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", diff --git a/kernel/x86_64/zscal_microk_steamroller-2.c b/kernel/x86_64/zscal_microk_steamroller-2.c index 97b07add6..c9267ee0c 100644 --- a/kernel/x86_64/zscal_microk_steamroller-2.c +++ b/kernel/x86_64/zscal_microk_steamroller-2.c @@ -116,12 +116,12 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n), // 0 + "+r" (x) // 1 : - : - "r" (n), // 0 - "r" (x), // 1 "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -209,11 +209,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -286,11 +286,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -331,11 +331,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", From 32b0f1168ec5eb93e146245d732c5a2fa9d73282 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 18 Jan 2019 08:11:39 +0100 Subject: [PATCH 221/236] Fix declaration of input arguments in the Sandybridge GER microkernels (#1967) * Tag arguments 0 and 1 as both input and output --- kernel/x86_64/dger_microk_sandy-2.c | 6 +++--- kernel/x86_64/sger_microk_sandy-2.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/dger_microk_sandy-2.c b/kernel/x86_64/dger_microk_sandy-2.c index 2bf966a5f..e8494500f 100644 --- a/kernel/x86_64/dger_microk_sandy-2.c +++ b/kernel/x86_64/dger_microk_sandy-2.c @@ -105,9 +105,9 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git 
a/kernel/x86_64/sger_microk_sandy-2.c b/kernel/x86_64/sger_microk_sandy-2.c index 79180b991..14f13475b 100644 --- a/kernel/x86_64/sger_microk_sandy-2.c +++ b/kernel/x86_64/sger_microk_sandy-2.c @@ -105,9 +105,9 @@ static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 From cda81cfae0e3dc18b1c2e9d05d6e0f8e1bec3917 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Jan 2019 00:10:01 +0100 Subject: [PATCH 222/236] Shift transition to multithreading towards larger matrix sizes See #1886 and JuliaRobotics issue 500. trsm benchmarks on Haswell and Zen showed that with these values performance is roughly doubled for matrix sizes between 8x8 and 14x14, and still 10 to 20 percent better near the new cutoff at 32x32. --- interface/trsm.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/interface/trsm.c b/interface/trsm.c index 5c2750e79..faec03ac2 100644 --- a/interface/trsm.c +++ b/interface/trsm.c @@ -81,6 +81,12 @@ #endif #endif +#ifndef COMPLEX +#define SMP_FACTOR 8 +#else +#define SMP_FACTOR 4 +#endif + static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { #ifndef TRMM TRSM_LNUU, TRSM_LNUN, TRSM_LNLU, TRSM_LNLN, @@ -366,10 +372,10 @@ void CNAME(enum CBLAS_ORDER order, mode |= (trans << BLAS_TRANSA_SHIFT); mode |= (side << BLAS_RSIDE_SHIFT); - if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD ) + if ( args.m < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; else - if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD ) + if ( args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; else args.nthreads = num_cpu_avail(3); From bbfdd6c0fe1e7d90099fe14f1e1f2fd775a47a36 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Jan 2019 23:01:31 +0100 Subject: [PATCH 223/236] Increase Zen SWITCH_RATIO to 16 following GEMM benchmarks on Ryzen2700X. For #1464 --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index fa6730208..15ea663a8 100644 --- a/param.h +++ b/param.h @@ -605,7 +605,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 8 -#define SWITCH_RATIO 4 +#define SWITCH_RATIO 16 #ifdef ARCH_X86 From 83b5c6b92dc6f66becae1418beef60042eb92c6d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Jan 2019 12:18:53 +0100 Subject: [PATCH 224/236] Fix compilation with NO_AVX=1 set fixes #1974 --- cpuid_x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 726014033..c45ddd968 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -228,7 +228,7 @@ int support_avx2(){ } int support_avx512(){ -#ifndef NO_AVX512 +#if !defined(NO_AVX) && !defined(NO_AVX512) int eax, ebx, ecx, edx; int ret=0; From 63bbd7b0d79d41da2a7cc81139a62b81fa247640 Mon Sep 17 00:00:00 2001 From: Daniel Cohen Gindi Date: Mon, 21 Jan 2019 08:35:23 +0200 Subject: [PATCH 225/236] Better support for MSVC/Windows in CMake --- CMakeLists.txt | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 812e6bf6f..8f3abe4b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,6 +42,19 @@ endif() ####### +if(MSVC AND MSVC_STATIC_CRT) + set(CompilerFlags + CMAKE_CXX_FLAGS + CMAKE_CXX_FLAGS_DEBUG + CMAKE_CXX_FLAGS_RELEASE + CMAKE_C_FLAGS + CMAKE_C_FLAGS_DEBUG + CMAKE_C_FLAGS_RELEASE + ) + foreach(CompilerFlag ${CompilerFlags}) + string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") + endforeach() +endif() message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") @@ -149,12 +162,6 @@ if (${DYNAMIC_ARCH}) endforeach() endif () -# Only build shared libs for MSVC -if (MSVC) - set(BUILD_SHARED_LIBS ON) -endif() - - # add objects to the openblas lib add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $) @@ -314,7 +321,7 @@ install (FILES ${OPENBLAS_CONFIG_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) if(NOT NOFORTRAN) message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") - set(F77BLAS_H ${CMAKE_BINARY_DIR}/f77blas.h) + set(F77BLAS_H ${CMAKE_BINARY_DIR}/generated/f77blas.h) file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n") file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n") file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n") @@ -327,10 +334,11 @@ endif() if(NOT NO_CBLAS) message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") + set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}") - install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h) + file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") + install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() if(NOT NO_LAPACKE) From f0d834b824fd5723c5cd8df01ed1aaa7a78548c3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Jan 2019 12:32:24 +0100 Subject: [PATCH 226/236] Use VERSION_LESS for comparisons involving software version numbers --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8f3abe4b8..afd9d2cf2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -147,7 +147,7 @@ endif () # Only generate .def for dll on MSVC and always produce pdb files for debug and release 
if(MSVC) - if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) + if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4) set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def") endif() set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi") @@ -173,7 +173,7 @@ endif() # Handle MSVC exports if(MSVC AND BUILD_SHARED_LIBS) - if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) + if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4) include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") else() # Creates verbose .def file (51KB vs 18KB) From 24288803b3cde043bc4c10d82080509989680efb Mon Sep 17 00:00:00 2001 From: Daniel Cohen Gindi Date: Tue, 22 Jan 2019 14:38:01 +0200 Subject: [PATCH 227/236] Adjust test script for correct deployment --- appveyor.yml | 2 +- utest/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 141d3a130..95f6cf7c5 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -53,7 +53,7 @@ before_build: - ps: if (-Not (Test-Path .\build)) { mkdir build } - cd build - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. - - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl .. + - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT .. - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON .. diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 1b426afe7..dc306501f 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -61,7 +61,7 @@ foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${CMAKE_CURRENT_BINARY_DIR}) endforeach() -if (MSVC) +if (MSVC AND BUILD_SHARED_LIBS) add_custom_command(TARGET ${OpenBLAS_utest_bin} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/lib/${CMAKE_CFG_INTDIR}/${OpenBLAS_LIBNAME}.dll ${CMAKE_CURRENT_BINARY_DIR}/. From 21eda8b5774aa92aecb9babba0b3eda0a992ddb9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Jan 2019 18:47:12 +0100 Subject: [PATCH 228/236] Report SkylakeX as Haswell if compiler does not support AVX512 ... or make was invoked with NO_AVX512=1 --- getarch.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/getarch.c b/getarch.c index 78ba0fefd..d03ce6e98 100644 --- a/getarch.c +++ b/getarch.c @@ -91,6 +91,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#else +#define NO_AVX512 +#endif /* #define FORCE_P2 */ /* #define FORCE_KATMAI */ /* #define FORCE_COPPERMINE */ @@ -327,6 +331,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef FORCE_SKYLAKEX +#ifdef NO_AVX512 +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "HASWELL" +#define ARCHCONFIG "-DHASWELL " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DFMA3" +#define LIBNAME "haswell" +#define CORENAME "HASWELL" +#else #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" @@ -340,6 +358,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "skylakex" #define CORENAME "SKYLAKEX" #endif +#endif #ifdef FORCE_ATOM #define FORCE From b56b34a75cf3ae253cf8904416c6716406aad1fd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Jan 2019 18:55:43 +0100 Subject: [PATCH 229/236] Syntax fix --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 95f6cf7c5..741c66291 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -53,7 +53,7 @@ before_build: - ps: if (-Not (Test-Path .\build)) { mkdir build } - cd build - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. - - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT .. + - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON .. From 8533aca96470d361cc5cc81da329190811951df1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Jan 2019 10:03:00 +0100 Subject: [PATCH 230/236] Avoid penalizing tall skinny matrices --- interface/trsm.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/interface/trsm.c b/interface/trsm.c index faec03ac2..f2da285de 100644 --- a/interface/trsm.c +++ b/interface/trsm.c @@ -82,9 +82,9 @@ #endif #ifndef COMPLEX -#define SMP_FACTOR 8 +#define SMP_FACTOR 256 #else -#define SMP_FACTOR 4 +#define SMP_FACTOR 128 #endif static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { @@ -372,11 +372,15 @@ void CNAME(enum CBLAS_ORDER order, mode |= (trans << BLAS_TRANSA_SHIFT); mode |= (side << BLAS_RSIDE_SHIFT); - if ( args.m < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD ) +/* + if ( args.m < 2 * GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; else - if ( args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD ) + if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; +*/ + if ( args.m * args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD) + args.nthreads = 1; else args.nthreads = num_cpu_avail(3); From e908ac2a5145ac1a0d43e6baf39df14ade061d57 Mon Sep 17 00:00:00 2001 From: Edison Gustavo Muenz Date: Wed, 23 Jan 2019 15:09:13 +0100 Subject: [PATCH 231/236] Fix include directory of exported targets --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 812e6bf6f..d3a9a2797 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -157,7 +157,7 @@ endif() # add objects to the openblas lib add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) 
-target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $) +target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $) # Android needs to explicitly link against libm if(ANDROID) From e882b239aa75090c7871d5848a0ead7d37bafb6f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Jan 2019 00:45:45 +0100 Subject: [PATCH 232/236] Correct naming of getrf_parallel object fixes #1984 --- lapack/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index c0a7543ca..d48a270ab 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -63,7 +63,6 @@ if (USE_THREAD) # these do not have 'z' versions set(PARALLEL_SOURCES - ${GETRF_SRC} lauum/lauum_U_parallel.c lauum/lauum_L_parallel.c potrf/potrf_U_parallel.c @@ -81,6 +80,10 @@ if (USE_THREAD) trtri/trtri_L_parallel.c ) + foreach (float_type ${FLOAT_TYPES}) + GenerateNamedObjects("${GETRF_SRC}" "" "getrf_parallel" false "" "" false ${float_type}) + endforeach() + GenerateNamedObjects("${PARALLEL_SOURCES}") endif () From 36b844af889374934a4c5af19cf371cf29731d2e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Jan 2019 17:47:22 +0100 Subject: [PATCH 233/236] Change ARMV8 target to ARMV7 when BINARY32 is set fixes #1961 --- Makefile.system | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile.system b/Makefile.system index 20d4f6492..67c8cd197 100644 --- a/Makefile.system +++ b/Makefile.system @@ -95,6 +95,9 @@ endif ifeq ($(TARGET), ZEN) GETARCH_FLAGS := -DFORCE_BARCELONA endif +ifeq ($(TARGET), ARMV8) +GETARCH_FLAGS := -DFORCE_ARMV7 +endif endif From 58dd7e4501ad55ca03ae1da783de72cc36345f61 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Jan 2019 17:52:33 +0100 Subject: [PATCH 234/236] Change ARMV8 target to ARMV7 for BINARY=32 --- cmake/system.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index a060d98cb..4cee7bd18 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -39,6 +39,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") set(TARGET "BARCELONA") endif () + if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53") + set(TARGET "ARMV7") + endif () endif () if (DEFINED TARGET) From 808410c2c7a6e1fe6f83c5dc7ee5c45b2d08c732 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 15:25:15 +0100 Subject: [PATCH 235/236] Fix wrong comparison that made IMIN identical to IMAX as suggested in #1990 --- kernel/arm/imin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm/imin.c b/kernel/arm/imin.c index 598cba387..ffc65226e 100644 --- a/kernel/arm/imin.c +++ b/kernel/arm/imin.c @@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( x[ix] > minf ) + if( x[ix] < minf ) { min = i; minf = x[ix]; From 86a824c97f1f4ccfe8b24678dc0fdaf4846a7055 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 15:27:21 +0100 Subject: [PATCH 236/236] Fix wrong comparison that made IMIN identical to IMAX as reported by aarnez in #1990 --- kernel/mips/imin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/mips/imin.c b/kernel/mips/imin.c index d9b283d2d..bf130613b 100644 --- a/kernel/mips/imin.c +++ b/kernel/mips/imin.c @@ -45,7 +45,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( x[ix] > minf ) + if( x[ix] 
< minf ) { min = i; minf = x[ix];