dgemm/avx512: simplify and speed up the 4x4 kernel

This commit is contained in:
Arjan van de Ven 2018-10-06 14:12:32 +00:00
parent 6d43c51ccf
commit 20c5d668fe
1 changed file with 4 additions and 22 deletions

View File

@ -333,17 +333,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * KERNEL4x4_SUB: one accumulation step for the 4x4 tile.
 *
 * Reads four packed doubles from AO - 16 into ymm0 and scalar doubles from
 * BO - 12 .. BO - 9, each broadcast to every lane of ymm1, and adds the
 * products into the accumulators ymm4..ymm7. Afterwards AO and BO are both
 * advanced by 4 elements.
 *
 * The macro declares nothing itself: ymm0, ymm1, ymm4..ymm7, AO and BO must
 * already exist at the point of expansion.
 *
 * NOTE(review): this listing looks like a rendered diff whose +/- markers
 * were dropped, so old and new lines may be interleaved here -- the
 * _mm256_loadu_pd(BO - 12) assignment and the three _mm256_permute4x64_pd
 * lines are presumably the removed side. Confirm against the real file
 * before relying on the statement sequence below.
 */
#define KERNEL4x4_SUB() \
ymm0 = _mm256_loadu_pd(AO - 16); \
ymm1 = _mm256_loadu_pd(BO - 12); /* overwritten by the next line before it is read */ \
ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 12)); \
\
ymm4 += ymm0 * ymm1; \
\
ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); /* 0xb1: swap the two elements within each 128-bit half */ \
ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 11)); \
ymm5 += ymm0 * ymm1; \
\
ymm0 = _mm256_permute4x64_pd(ymm0, 0x1b); /* 0x1b: reverse the order of the four elements */ \
ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 10)); \
ymm6 += ymm0 * ymm1; \
\
ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); /* 0xb1: swap the two elements within each 128-bit half */ \
ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 9)); \
ymm7 += ymm0 * ymm1; \
AO += 4; \
BO += 4;
@ -356,24 +356,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ymm6 *= ymm0; \
ymm7 *= ymm0; \
\
ymm5 = _mm256_permute4x64_pd(ymm5, 0xb1); \
ymm7 = _mm256_permute4x64_pd(ymm7, 0xb1); \
\
ymm0 = _mm256_blend_pd(ymm4, ymm5, 0x0a); \
ymm1 = _mm256_blend_pd(ymm4, ymm5, 0x05); \
ymm2 = _mm256_blend_pd(ymm6, ymm7, 0x0a); \
ymm3 = _mm256_blend_pd(ymm6, ymm7, 0x05); \
\
ymm2 = _mm256_permute4x64_pd(ymm2, 0x1b); \
ymm3 = _mm256_permute4x64_pd(ymm3, 0x1b); \
ymm2 = _mm256_permute4x64_pd(ymm2, 0xb1); \
ymm3 = _mm256_permute4x64_pd(ymm3, 0xb1); \
\
ymm4 = _mm256_blend_pd(ymm2, ymm0, 0x03); \
ymm5 = _mm256_blend_pd(ymm3, ymm1, 0x03); \
ymm6 = _mm256_blend_pd(ymm0, ymm2, 0x03); \
ymm7 = _mm256_blend_pd(ymm1, ymm3, 0x03); \
\
ymm4 += _mm256_loadu_pd(CO1 + (0 * ldc)); \
ymm5 += _mm256_loadu_pd(CO1 + (1 * ldc)); \
ymm6 += _mm256_loadu_pd(CO1 + (2 * ldc)); \