diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c index ff2a04beb..d9b380fff 100644 --- a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c @@ -372,8 +372,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, }; - __m512i idx_lo = _mm512_loadu_epi64(permute_table); - __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 8); for (; i < m4; i += 4, mi += 4) { for (j = 0; j < n4; j += 4) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); diff --git a/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c index 0a95a68e2..e757197ba 100644 --- a/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c @@ -385,7 +385,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp for (int ii = 0; ii < 8; ii++) { index_n[ii] = ii * ldc; } - __m512i vindex_n = _mm512_loadu_epi64(index_n); + __m512i vindex_n = _mm512_loadu_si512(index_n); for (; i < m4; i += 4) { for (j = 0; j < n32; j += 32) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); diff --git a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c index 0881f35b2..18c797283 100644 --- a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c @@ -105,8 +105,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, }; - __m512i idx_lo = _mm512_loadu_epi64(permute_table); - __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 8); for (i = 0; i < m4; i += 4) { for (j = 0; j < n4; j += 4) { diff --git a/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c index 8ff79d2c8..00f42aa76 100644 --- a/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c @@ -189,8 +189,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0, 1, 4, 5, 0|8, 1|8, 4|8, 5|8, 2, 3, 6, 7, 2|8, 3|8, 6|8, 7|8, }; - __m512i idx_lo = _mm512_loadu_epi64(permute_table); - __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 8); for (i = 0; i < m8; i += 8) { for (j = 0; j < n16; j += 16) { @@ -235,8 +235,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, }; - idx_lo = _mm512_loadu_epi64(permute_table2); - idx_hi = _mm512_loadu_epi64(permute_table2 + 8); + idx_lo = _mm512_loadu_si512(permute_table2); + idx_hi = _mm512_loadu_si512(permute_table2 + 8); for (j = 0; j < n32; j += 32) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); @@ -289,7 +289,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp for (int ii = 0; ii < 8; ii++) { index_n[ii] = ii * ldc; } - __m512i vindex_n = _mm512_loadu_epi64(index_n); + __m512i vindex_n = _mm512_loadu_si512(index_n); #if !defined(B0) __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta)); #endif diff --git a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c index f293bf9f9..a7d87f8c4 100644 --- a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c @@ -385,7 +385,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp for (int ii = 0; ii < 16; ii++) { index_n[ii] = ii * ldc; } - __m512i vindex_n = _mm512_loadu_epi32(index_n); + __m512i vindex_n = _mm512_loadu_si512(index_n); for (; i < m4; i += 4) { for (j = 0; j < n64; j += 64) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); diff --git a/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c index 8da560ef7..023f58746 100644 --- a/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c @@ -215,8 +215,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, }; - __m512i idx_lo = _mm512_loadu_epi32(permute_table); - __m512i idx_hi = _mm512_loadu_epi32(permute_table + 16); + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); __mmask16 kc = 0xcccc; __mmask16 k3 = 0x3333; __mmask8 mask8 = 0xff; // force use AVX128 instead of SSE @@ -311,7 +311,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp for (int ii = 0; ii < 16; ii++) { index_n[ii] = ii * ldc; } - __m512i vindex_n = _mm512_loadu_epi32(index_n); + __m512i vindex_n = _mm512_loadu_si512(index_n); #if !defined(B0) __m512 beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta)); #endif