Add AVX512 support to dsymv_L_microk_haswell-2.c
Now that the code is written in intrinsics it's relatively easy to add AVX512 support
This commit is contained in:
parent
c202e06297
commit
f2810beafb
|
@ -46,6 +46,56 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
|||
temp1_2 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[2]));
|
||||
temp1_3 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[3]));
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
__m512d temp2_05, temp2_15, temp2_25, temp2_35; // temp2_0 temp2_1 temp2_2 temp2_3
|
||||
__m512d temp1_05, temp1_15, temp1_25, temp1_35;
|
||||
BLASLONG to2;
|
||||
int delta;
|
||||
|
||||
temp2_05 = _mm512_setzero_pd();
|
||||
temp2_15 = _mm512_setzero_pd();
|
||||
temp2_25 = _mm512_setzero_pd();
|
||||
temp2_35 = _mm512_setzero_pd();
|
||||
|
||||
temp1_05 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[0]));
|
||||
temp1_15 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[1]));
|
||||
temp1_25 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[2]));
|
||||
temp1_35 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[3]));
|
||||
|
||||
delta = (to - from) & ~7;
|
||||
to2 = from + delta;
|
||||
|
||||
|
||||
for (; from < to2; from += 8) {
|
||||
__m512d _x, _y;
|
||||
__m512d a0, a1, a2, a3;
|
||||
|
||||
_y = _mm512_loadu_pd(&y[from]);
|
||||
_x = _mm512_loadu_pd(&x[from]);
|
||||
|
||||
a0 = _mm512_loadu_pd(&a[0][from]);
|
||||
a1 = _mm512_loadu_pd(&a[1][from]);
|
||||
a2 = _mm512_loadu_pd(&a[2][from]);
|
||||
a3 = _mm512_loadu_pd(&a[3][from]);
|
||||
|
||||
_y += temp1_05 * a0 + temp1_15 * a1 + temp1_25 * a2 + temp1_35 * a3;
|
||||
|
||||
temp2_05 += _x * a0;
|
||||
temp2_15 += _x * a1;
|
||||
temp2_25 += _x * a2;
|
||||
temp2_35 += _x * a3;
|
||||
|
||||
_mm512_storeu_pd(&y[from], _y);
|
||||
|
||||
};
|
||||
|
||||
temp2_0 = _mm256_add_pd(_mm512_extractf64x4_pd(temp2_05, 0), _mm512_extractf64x4_pd(temp2_05, 1));
|
||||
temp2_1 = _mm256_add_pd(_mm512_extractf64x4_pd(temp2_15, 0), _mm512_extractf64x4_pd(temp2_15, 1));
|
||||
temp2_2 = _mm256_add_pd(_mm512_extractf64x4_pd(temp2_25, 0), _mm512_extractf64x4_pd(temp2_25, 1));
|
||||
temp2_3 = _mm256_add_pd(_mm512_extractf64x4_pd(temp2_35, 0), _mm512_extractf64x4_pd(temp2_35, 1));
|
||||
|
||||
#endif
|
||||
|
||||
for (; from != to; from += 4) {
|
||||
__m256d _x, _y;
|
||||
__m256d a0, a1, a2, a3;
|
||||
|
|
Loading…
Reference in New Issue