Use intrinsics instead of inline asm

Intrinsics-based code is generally easier to read for the non-math part
of the algorithm, and it is easier to add, say, AVX512 support to it later.
Arjan van de Ven 2018-08-05 14:45:54 +00:00
parent 4fb9f3b7a5
commit 732abce9f1
1 changed file with 36 additions and 55 deletions
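
The commit message's point that an intrinsics version is easier to extend can be illustrated with a rough AVX512 sketch. The following is not part of the change below; it is a minimal, hypothetical variant of the same kernel, assuming AVX512F support, an n that is a multiple of 32, and plain long/double in place of OpenBLAS's BLASLONG/FLOAT macros:

	/* Hypothetical AVX512 variant -- illustration only, not part of this commit.
	 * Assumes the compiler targets avx512f and that n is a multiple of 32;
	 * long/double stand in for OpenBLAS's BLASLONG/FLOAT. */
	#include <immintrin.h>

	static void ddot_kernel_avx512(long n, double *x, double *y, double *dot)
	{
		__m512d accum_0 = _mm512_setzero_pd();
		__m512d accum_1 = _mm512_setzero_pd();
		__m512d accum_2 = _mm512_setzero_pd();
		__m512d accum_3 = _mm512_setzero_pd();

		for (long i = 0; i < n; i += 32) {
			/* one fused multiply-add per 8 doubles, four independent accumulators */
			accum_0 = _mm512_fmadd_pd(_mm512_loadu_pd(&x[i +  0]), _mm512_loadu_pd(&y[i +  0]), accum_0);
			accum_1 = _mm512_fmadd_pd(_mm512_loadu_pd(&x[i +  8]), _mm512_loadu_pd(&y[i +  8]), accum_1);
			accum_2 = _mm512_fmadd_pd(_mm512_loadu_pd(&x[i + 16]), _mm512_loadu_pd(&y[i + 16]), accum_2);
			accum_3 = _mm512_fmadd_pd(_mm512_loadu_pd(&x[i + 24]), _mm512_loadu_pd(&y[i + 24]), accum_3);
		}

		/* consolidate the four partial sums, then reduce the 8 lanes to a scalar */
		accum_0 = _mm512_add_pd(_mm512_add_pd(accum_0, accum_1),
		                        _mm512_add_pd(accum_2, accum_3));
		*dot = _mm512_reduce_add_pd(accum_0);
	}

In this sketch the whole horizontal sum collapses into a single _mm512_reduce_add_pd() call, which is the kind of change that is awkward to retrofit into hand-written asm.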


@@ -25,71 +25,52 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
+/* Ensure that the compiler knows how to generate AVX2 instructions if it doesn't already */
+#ifndef __AVX512CD__
+#if (defined(__GNUC__) && __GNUC__ < 6)
+#pragma GCC target("avx")
+#else
+#pragma GCC target("avx2,fma")
+#endif
+#endif
+#ifdef __AVX__
 #define HAVE_KERNEL_8 1
+#include <immintrin.h>
 static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
 static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 {
-	BLASLONG register i = 0;
-	__asm__ __volatile__
-	(
-	"vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t"
-	"vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t"
-	"vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t"
-	"vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t"
-	".p2align 4 \n\t"
-	"1: \n\t"
-	"vmovups   (%[x],%[i],8), %%ymm12 \n\t"	// 2 * x
-	"vmovups 32(%[x],%[i],8), %%ymm13 \n\t"	// 2 * x
-	"vmovups 64(%[x],%[i],8), %%ymm14 \n\t"	// 2 * x
-	"vmovups 96(%[x],%[i],8), %%ymm15 \n\t"	// 2 * x
-	"vfmadd231pd   (%[y],%[i],8), %%ymm12, %%ymm4 \n\t"	// 2 * y
-	"vfmadd231pd 32(%[y],%[i],8), %%ymm13, %%ymm5 \n\t"	// 2 * y
-	"vfmadd231pd 64(%[y],%[i],8), %%ymm14, %%ymm6 \n\t"	// 2 * y
-	"vfmadd231pd 96(%[y],%[i],8), %%ymm15, %%ymm7 \n\t"	// 2 * y
-	"addq $16 , %[i] \n\t"
-	"subq $16 , %[n] \n\t"
-	"jnz 1b \n\t"
-	"vextractf128 $1 , %%ymm4 , %%xmm12 \n\t"
-	"vextractf128 $1 , %%ymm5 , %%xmm13 \n\t"
-	"vextractf128 $1 , %%ymm6 , %%xmm14 \n\t"
-	"vextractf128 $1 , %%ymm7 , %%xmm15 \n\t"
-	"vaddpd %%xmm4, %%xmm12, %%xmm4 \n\t"
-	"vaddpd %%xmm5, %%xmm13, %%xmm5 \n\t"
-	"vaddpd %%xmm6, %%xmm14, %%xmm6 \n\t"
-	"vaddpd %%xmm7, %%xmm15, %%xmm7 \n\t"
-	"vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t"
-	"vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t"
-	"vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t"
-	"vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t"
-	"vmovsd %%xmm4, (%[dot]) \n\t"
-	"vzeroupper \n\t"
-	:
-	:
-	  [i] "r" (i),	// 0
-	  [n] "r" (n),	// 1
-	  [x] "r" (x),	// 2
-	  [y] "r" (y),	// 3
-	  [dot] "r" (dot)	// 4
-	: "cc",
-	  "%xmm4", "%xmm5",
-	  "%xmm6", "%xmm7",
-	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
-	  "memory"
-	);
+	int i = 0;
+	__m256d accum_0, accum_1, accum_2, accum_3;
+	accum_0 = _mm256_setzero_pd();
+	accum_1 = _mm256_setzero_pd();
+	accum_2 = _mm256_setzero_pd();
+	accum_3 = _mm256_setzero_pd();
+	for (; i < n; i += 16) {
+		accum_0 += _mm256_loadu_pd(&x[i+ 0]) * _mm256_loadu_pd(&y[i+ 0]);
+		accum_1 += _mm256_loadu_pd(&x[i+ 4]) * _mm256_loadu_pd(&y[i+ 4]);
+		accum_2 += _mm256_loadu_pd(&x[i+ 8]) * _mm256_loadu_pd(&y[i+ 8]);
+		accum_3 += _mm256_loadu_pd(&x[i+12]) * _mm256_loadu_pd(&y[i+12]);
+	}
+	/* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */
+	accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
+	__m128d half_accum0;
+	/* Add upper half to lower half of each of the 256 bit vectors to get a 128 bit vector */
+	half_accum0 = _mm256_extractf128_pd(accum_0, 0) + _mm256_extractf128_pd(accum_0, 1);
+	/* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
+	half_accum0 = _mm_hadd_pd(half_accum0, half_accum0);
+	*dot = half_accum0[0];
 }
+#endif
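
Both the removed asm (addq/subq by 16) and the new intrinsics loop consume 16 doubles per iteration, so the kernel expects n to be a multiple of 16 and leaves any remaining elements to its caller. A rough sketch of such a caller, with a hypothetical name and plain C types (the actual OpenBLAS ddot driver that includes this kernel is more involved and also handles non-unit strides and fallbacks):

	/* Illustrative caller -- not the actual OpenBLAS driver. Assumes unit stride
	 * and uses long/double in place of BLASLONG/FLOAT. */
	static double ddot_unit_stride(long n, double *x, double *y)
	{
		double dot = 0.0;
		long n1 = n & ~15L;                 /* round n down to a multiple of 16 */

		if (n1 > 0)
			ddot_kernel_8(n1, x, y, &dot);  /* vectorized bulk of the work */

		for (long i = n1; i < n; i++)       /* scalar tail, at most 15 elements */
			dot += x[i] * y[i];

		return dot;
	}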