diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c index 69437e665..1db955776 100644 --- a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c @@ -429,7 +429,8 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG double *c_pointer = c; __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033; BLASLONG ndiv8_count; - double *b_scratch = (double *)aligned_alloc(64,192*k); + double *b_scratch; + posix_memalign(&b_scratch,64,192*k); double *packed_b_pointer = packed_b; a_block_pointer = packed_a; for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){ @@ -637,9 +638,10 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG c_pointer ++;\ } #define SAVE_m1n4 {\ - *c_pointer += _mm256_cvtsd_f64(yc1);\ - ya1 = _mm256_unpackhi_pd(yc1,yc1);\ - c_pointer[LDC] += _mm256_cvtsd_f64(ya1);\ + xb1 = _mm256_extractf128_pd(yc1,0);\ + *c_pointer += _mm_cvtsd_f64(xb1);\ + xb2 = _mm_unpackhi_pd(xb1,xb1);\ + c_pointer[LDC] += _mm_cvtsd_f64(xb2);\ xb1 = _mm256_extractf128_pd(yc1,1);\ c_pointer[LDC*2] += _mm_cvtsd_f64(xb1);\ xb2 = _mm_unpackhi_pd(xb1,xb1);\