From b073d759d056cef826e3906ef00a42068df46091 Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Tue, 20 Oct 2020 02:16:47 +0000 Subject: [PATCH] x86_64: clobber all xmm registers after vzeroupper As observed using GCC 10 using -march=native -ftree-vectorize on Knights Landing, it is now smart enough to find clobbers inside non-inlined static functions. In particular, sgemv counted on a kernel to preserve the whole %ymm2 register (since it was not in the clobber list), but the top part was destroyed by vzeroupper. This caused many tests to fail. This patch makes sure all xmm (and ymm/zmm by extension) registers are listed as clobbered to avoid this happening, as most kernels already did correctly in fact. --- kernel/x86_64/caxpy_microk_bulldozer-2.c | 5 +++-- kernel/x86_64/caxpy_microk_haswell-2.c | 2 +- kernel/x86_64/caxpy_microk_sandy-2.c | 2 +- kernel/x86_64/caxpy_microk_steamroller-2.c | 5 +++-- kernel/x86_64/daxpy_microk_haswell-2.c | 5 +++-- kernel/x86_64/ddot_microk_haswell-2.c | 5 +++-- kernel/x86_64/ddot_microk_piledriver-2.c | 2 ++ kernel/x86_64/ddot_microk_sandy-2.c | 5 +++-- kernel/x86_64/ddot_microk_steamroller-2.c | 1 + kernel/x86_64/dgemv_n_microk_haswell-4.c | 14 ++++++-------- kernel/x86_64/dgemv_n_microk_piledriver-4.c | 6 ++++-- kernel/x86_64/dgemv_t_microk_haswell-4.c | 2 ++ kernel/x86_64/saxpy_microk_haswell-2.c | 3 ++- kernel/x86_64/saxpy_microk_piledriver-2.c | 6 ++++-- kernel/x86_64/sdot_microk_haswell-2.c | 5 +++-- kernel/x86_64/sdot_microk_sandy-2.c | 5 +++-- kernel/x86_64/sgemv_n_microk_haswell-4.c | 14 ++++++-------- kernel/x86_64/sgemv_t_microk_haswell-4.c | 2 ++ kernel/x86_64/zaxpy_microk_bulldozer-2.c | 5 +++-- kernel/x86_64/zaxpy_microk_haswell-2.c | 2 +- kernel/x86_64/zaxpy_microk_sandy-2.c | 6 ++++-- kernel/x86_64/zaxpy_microk_steamroller-2.c | 5 +++-- 22 files changed, 63 insertions(+), 44 deletions(-) diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c index ca2209340..a32558dc9 100644 --- a/kernel/x86_64/caxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c @@ -122,7 +122,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", @@ -189,9 +189,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/caxpy_microk_haswell-2.c b/kernel/x86_64/caxpy_microk_haswell-2.c index b605ea34c..129ce7a49 100644 --- a/kernel/x86_64/caxpy_microk_haswell-2.c +++ b/kernel/x86_64/caxpy_microk_haswell-2.c @@ -120,7 +120,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", diff --git a/kernel/x86_64/caxpy_microk_sandy-2.c b/kernel/x86_64/caxpy_microk_sandy-2.c index 72d37afed..564dfbd0f 100644 --- a/kernel/x86_64/caxpy_microk_sandy-2.c +++ b/kernel/x86_64/caxpy_microk_sandy-2.c @@ -104,7 +104,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", diff --git a/kernel/x86_64/caxpy_microk_steamroller-2.c b/kernel/x86_64/caxpy_microk_steamroller-2.c index 7ca7af070..cc5c5de76 100644 --- a/kernel/x86_64/caxpy_microk_steamroller-2.c +++ b/kernel/x86_64/caxpy_microk_steamroller-2.c @@ -122,7 +122,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", @@ -189,9 +189,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c index f3682e6d7..ecc0ecbd3 100644 --- a/kernel/x86_64/daxpy_microk_haswell-2.c +++ b/kernel/x86_64/daxpy_microk_haswell-2.c @@ -67,8 +67,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (y), // 3 "r" (alpha) // 4 : "cc", - "%xmm0", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c index dbb5487f7..faac72870 100644 --- a/kernel/x86_64/ddot_microk_haswell-2.c +++ b/kernel/x86_64/ddot_microk_haswell-2.c @@ -84,8 +84,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "r" (y), // 3 "r" (dot) // 4 : "cc", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/ddot_microk_piledriver-2.c b/kernel/x86_64/ddot_microk_piledriver-2.c index cc4bcd90a..0320a2e36 100644 --- a/kernel/x86_64/ddot_microk_piledriver-2.c +++ b/kernel/x86_64/ddot_microk_piledriver-2.c @@ -91,6 +91,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); @@ -155,6 +156,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/ddot_microk_sandy-2.c b/kernel/x86_64/ddot_microk_sandy-2.c index 84493ec27..35ba86a7d 100644 --- a/kernel/x86_64/ddot_microk_sandy-2.c +++ b/kernel/x86_64/ddot_microk_sandy-2.c @@ -89,8 +89,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "r" (y), // 3 "r" (dot) // 4 : "cc", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/ddot_microk_steamroller-2.c b/kernel/x86_64/ddot_microk_steamroller-2.c index 27d5244ce..94c012f0d 100644 --- a/kernel/x86_64/ddot_microk_steamroller-2.c +++ b/kernel/x86_64/ddot_microk_steamroller-2.c @@ -88,6 +88,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c index da0fa2fff..c20c0a030 100644 --- a/kernel/x86_64/dgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c @@ -105,9 +105,8 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "r" (alpha) // 8 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); @@ -182,11 +181,10 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "r" (ap[1]), // 5 "r" (alpha) // 6 : "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", - "%xmm6", - "%xmm8", - "%xmm12", "%xmm13", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } diff --git a/kernel/x86_64/dgemv_n_microk_piledriver-4.c b/kernel/x86_64/dgemv_n_microk_piledriver-4.c index 466931b82..57fa426ba 100644 --- a/kernel/x86_64/dgemv_n_microk_piledriver-4.c +++ b/kernel/x86_64/dgemv_n_microk_piledriver-4.c @@ -140,7 +140,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); @@ -235,9 +235,11 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "r" (ap[3]), // 7 "r" (alpha) // 8 : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/dgemv_t_microk_haswell-4.c b/kernel/x86_64/dgemv_t_microk_haswell-4.c index 958fd3e0a..b398307d3 100644 --- a/kernel/x86_64/dgemv_t_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_t_microk_haswell-4.c @@ -117,7 +117,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "r" (ap[2]), // 6 "r" (ap[3]) // 7 : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c index 7099ba4c6..8cc697f05 100644 --- a/kernel/x86_64/saxpy_microk_haswell-2.c +++ b/kernel/x86_64/saxpy_microk_haswell-2.c @@ -67,7 +67,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (y), // 3 "r" (alpha) // 4 : "cc", - "%xmm0", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" diff --git a/kernel/x86_64/saxpy_microk_piledriver-2.c b/kernel/x86_64/saxpy_microk_piledriver-2.c index 5feea7f24..ebbcc0045 100644 --- a/kernel/x86_64/saxpy_microk_piledriver-2.c +++ b/kernel/x86_64/saxpy_microk_piledriver-2.c @@ -86,7 +86,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (y), // 3 "r" (alpha) // 4 : "cc", - "%xmm0", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" @@ -147,7 +148,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (y), // 3 "r" (alpha) // 4 : "cc", - "%xmm0", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" diff --git a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c index 91dc928d3..322f4b28c 100644 --- a/kernel/x86_64/sdot_microk_haswell-2.c +++ b/kernel/x86_64/sdot_microk_haswell-2.c @@ -87,8 +87,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "r" (y), // 3 "r" (dot) // 4 : "cc", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/sdot_microk_sandy-2.c b/kernel/x86_64/sdot_microk_sandy-2.c index ae25d5a50..ce09b06cf 100644 --- a/kernel/x86_64/sdot_microk_sandy-2.c +++ b/kernel/x86_64/sdot_microk_sandy-2.c @@ -90,8 +90,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "r" (y), // 3 "r" (dot) // 4 : "cc", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c index 93e1e26e8..556dcfde5 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -164,11 +164,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "r" (ap[3]), // 8 "r" (alpha) // 9 : "cc", - "%xmm0", "%xmm1", - "%xmm2", "%xmm3", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); @@ -286,9 +284,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "r" (ap[3]), // 7 "r" (alpha) // 8 : "cc", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/sgemv_t_microk_haswell-4.c b/kernel/x86_64/sgemv_t_microk_haswell-4.c index 8c370b4c0..fcabc0def 100644 --- a/kernel/x86_64/sgemv_t_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_t_microk_haswell-4.c @@ -138,7 +138,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "r" (ap[2]), // 6 "r" (ap[3]) // 7 : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c index 15d367971..ccb26134f 100644 --- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c @@ -122,7 +122,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", @@ -189,9 +189,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/zaxpy_microk_haswell-2.c b/kernel/x86_64/zaxpy_microk_haswell-2.c index 89d23daf3..8f299ea2d 100644 --- a/kernel/x86_64/zaxpy_microk_haswell-2.c +++ b/kernel/x86_64/zaxpy_microk_haswell-2.c @@ -120,7 +120,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", diff --git a/kernel/x86_64/zaxpy_microk_sandy-2.c b/kernel/x86_64/zaxpy_microk_sandy-2.c index 17b8b24f7..5246c72e8 100644 --- a/kernel/x86_64/zaxpy_microk_sandy-2.c +++ b/kernel/x86_64/zaxpy_microk_sandy-2.c @@ -108,9 +108,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); return; @@ -185,9 +186,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/zaxpy_microk_steamroller-2.c b/kernel/x86_64/zaxpy_microk_steamroller-2.c index 907b1ae00..88e3a680b 100644 --- a/kernel/x86_64/zaxpy_microk_steamroller-2.c +++ b/kernel/x86_64/zaxpy_microk_steamroller-2.c @@ -122,7 +122,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", @@ -189,9 +189,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" );