From 20c5d668fe316d6f431a34f8734600194644e736 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 6 Oct 2018 14:12:32 +0000 Subject: [PATCH] dgemm/avx512 simplify and speed up the 4x4 kernel --- kernel/x86_64/dgemm_kernel_4x8_skylakex.c | 26 ++++------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c index b5693ea2c..bb121ca69 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c @@ -333,17 +333,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define KERNEL4x4_SUB() \ ymm0 = _mm256_loadu_pd(AO - 16); \ - ymm1 = _mm256_loadu_pd(BO - 12); \ + ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 12)); \ \ ymm4 += ymm0 * ymm1; \ \ - ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \ + ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 11)); \ ymm5 += ymm0 * ymm1; \ \ - ymm0 = _mm256_permute4x64_pd(ymm0, 0x1b); \ + ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 10)); \ ymm6 += ymm0 * ymm1; \ \ - ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \ + ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 9)); \ ymm7 += ymm0 * ymm1; \ AO += 4; \ BO += 4; @@ -356,24 +356,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ymm6 *= ymm0; \ ymm7 *= ymm0; \ \ - ymm5 = _mm256_permute4x64_pd(ymm5, 0xb1); \ - ymm7 = _mm256_permute4x64_pd(ymm7, 0xb1); \ - \ - ymm0 = _mm256_blend_pd(ymm4, ymm5, 0x0a); \ - ymm1 = _mm256_blend_pd(ymm4, ymm5, 0x05); \ - ymm2 = _mm256_blend_pd(ymm6, ymm7, 0x0a); \ - ymm3 = _mm256_blend_pd(ymm6, ymm7, 0x05); \ - \ - ymm2 = _mm256_permute4x64_pd(ymm2, 0x1b); \ - ymm3 = _mm256_permute4x64_pd(ymm3, 0x1b); \ - ymm2 = _mm256_permute4x64_pd(ymm2, 0xb1); \ - ymm3 = _mm256_permute4x64_pd(ymm3, 0xb1); \ - \ - ymm4 = _mm256_blend_pd(ymm2, ymm0, 0x03); \ - ymm5 = _mm256_blend_pd(ymm3, ymm1, 0x03); \ - ymm6 = _mm256_blend_pd(ymm0, ymm2, 0x03); \ - ymm7 = _mm256_blend_pd(ymm1, ymm3, 0x03); \ - \ ymm4 += _mm256_loadu_pd(CO1 + (0 * ldc)); \ ymm5 += _mm256_loadu_pd(CO1 + (1 * ldc)); \ ymm6 += _mm256_loadu_pd(CO1 + (2 * ldc)); \