From 06ea72f5a5d758c43cac19b6c2782032f5125cb9 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Sun, 5 Aug 2018 17:43:40 +0000
Subject: [PATCH] write saxpy_haswell kernel using C intrinsics and don't
 disallow inlining

The intrinsics version of saxpy is more readable than the inline asm
version, and in the intrinsics version there is no longer any reason to
forbid inlining, since the compiler now has full visibility into the
kernel. This gives a mid-single-digit improvement in performance.
---
 kernel/x86_64/saxpy_microk_haswell-2.c | 69 +++++++++++---------------
 1 file changed, 29 insertions(+), 40 deletions(-)

diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c
index 36d1cd882..2ca8270b2 100644
--- a/kernel/x86_64/saxpy_microk_haswell-2.c
+++ b/kernel/x86_64/saxpy_microk_haswell-2.c
@@ -25,54 +25,43 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
+
+#ifndef __AVX512CD__
+#pragma GCC target("avx2,fma")
+#endif
+
+#ifdef __AVX2__
+
 #define HAVE_KERNEL_16 1
-static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
+
+#include <immintrin.h>
 
 static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 {
+	BLASLONG i = 0;
+	__m256 __alpha;
 
-	BLASLONG register i = 0;
+	__alpha = _mm256_broadcastss_ps(_mm_load_ss(alpha));
 
-	__asm__  __volatile__
-	(
-	"vbroadcastss		(%[alpha]), %%ymm0		\n\t"	// alpha
+	for (; i < n; i+= 32) {
+		__m256 y0, y8, y16, y24;
 
-	".p2align 4						\n\t"
-	"1:							\n\t"
+		y0  = _mm256_loadu_ps(&y[i + 0]);
+		y8  = _mm256_loadu_ps(&y[i + 8]);
+		y16 = _mm256_loadu_ps(&y[i + 16]);
+		y24 = _mm256_loadu_ps(&y[i + 24]);
 
-	"vmovups		(%[y],%[i],4), %%ymm12		\n\t"	// 8 * y
-	"vmovups		32(%[y],%[i],4), %%ymm13	\n\t"	// 8 * y
-	"vmovups		64(%[y],%[i],4), %%ymm14	\n\t"	// 8 * y
-	"vmovups		96(%[y],%[i],4), %%ymm15	\n\t"	// 8 * y
-	"vfmadd231ps		(%[x],%[i],4), %%ymm0 , %%ymm12	\n\t"	// y += alpha * x
-	"vfmadd231ps		32(%[x],%[i],4), %%ymm0 , %%ymm13	\n\t"	// y += alpha * x
-	"vfmadd231ps		64(%[x],%[i],4), %%ymm0 , %%ymm14	\n\t"	// y += alpha * x
-	"vfmadd231ps		96(%[x],%[i],4), %%ymm0 , %%ymm15	\n\t"	// y += alpha * x
-	"vmovups		%%ymm12, (%[y],%[i],4)		\n\t"
-	"vmovups		%%ymm13, 32(%[y],%[i],4)	\n\t"
-	"vmovups		%%ymm14, 64(%[y],%[i],4)	\n\t"
-	"vmovups		%%ymm15, 96(%[y],%[i],4)	\n\t"
-
-	"addq			$32, %[i]			\n\t"
-	"subq			$32, %[n]			\n\t"
-	"jnz			1b				\n\t"
-	"vzeroupper						\n\t"
-
-	:
-	:
-	  [i] "r" (i),		// 0
-	  [n] "r" (n),		// 1
-	  [x] "r" (x),		// 2
-	  [y] "r" (y),		// 3
-	  [alpha] "r" (alpha)	// 4
-	: "cc",
-	  "%xmm0",
-	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
-	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
-	  "memory"
-	);
-
-}
+		y0  += __alpha * _mm256_loadu_ps(&x[i + 0]);
+		y8  += __alpha * _mm256_loadu_ps(&x[i + 8]);
+		y16 += __alpha * _mm256_loadu_ps(&x[i + 16]);
+		y24 += __alpha * _mm256_loadu_ps(&x[i + 24]);
 
+		_mm256_storeu_ps(&y[i + 0], y0);
+		_mm256_storeu_ps(&y[i + 8], y8);
+		_mm256_storeu_ps(&y[i + 16], y16);
+		_mm256_storeu_ps(&y[i + 24], y24);
+	}
+}
+#endif
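
For reference (not part of the patch), below is a minimal standalone harness sketching how a caller drives the 32-element-wide kernel, with the remainder handled by a plain scalar loop, roughly the way the generic saxpy driver splits the work. It assumes FLOAT is float and BLASLONG is long, as in a single-precision build; the file name, array size, and build flags are illustrative only.

/* Hypothetical harness; assumes FLOAT == float and BLASLONG == long.
 * Build with something like: gcc -O2 -mavx2 -mfma harness.c -o harness */
#include <stdio.h>

#define FLOAT float
#define BLASLONG long

#include "saxpy_microk_haswell-2.c"	/* provides saxpy_kernel_16() */

int main(void)
{
	enum { N = 1000 };
	FLOAT x[N], y[N], alpha = 2.0f;
	BLASLONG n = N;

	for (BLASLONG i = 0; i < n; i++) {
		x[i] = 1.0f;
		y[i] = (FLOAT)i;
	}

	/* The kernel consumes 32 floats per iteration, so only the largest
	 * multiple of 32 is passed to it; the tail is done in scalar code. */
	BLASLONG n1 = n & -32;
	if (n1 > 0)
		saxpy_kernel_16(n1, x, y, &alpha);
	for (BLASLONG i = n1; i < n; i++)
		y[i] += alpha * x[i];

	printf("y[%d] = %f\n", N - 1, (double)y[N - 1]);
	return 0;
}

With the kernel now eligible for inlining, the compiler can integrate it directly into a caller like this, which is what the commit message credits for the mid-single-digit performance gain.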