dgemm/avx512 simplify and speed up the 4x4 kernel
This commit is contained in:
parent
6d43c51ccf
commit
20c5d668fe
|
@ -333,17 +333,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
/*
 * KERNEL4x4_SUB: one update step of the 4x4 kernel, reproduced here as it
 * appears in a rendered diff view (no +/- markers, "|" separator lines
 * between code lines).
 *
 * As listed, the macro:
 *   - loads four doubles from AO - 16 into ymm0;
 *   - assigns ymm1 before each multiply-accumulate;
 *   - accumulates ymm0 * ymm1 into ymm4, ymm5, ymm6 and ymm7 in turn;
 *   - advances AO and BO by 4 elements each.
 *
 * NOTE(review): the hunk header reports 17 lines on each side, but 20 macro
 * lines are listed, so old and new lines are interleaved. Presumably the
 * 256-bit load of BO - 12 and the three _mm256_permute4x64_pd(ymm0, ...)
 * lines are the removed side, and the four
 * _mm256_broadcastsd_pd(_mm_load_sd(BO - k)) lines (k = 12, 11, 10, 9) are
 * the added side -- confirm against the raw patch before treating this
 * listing as compilable code.
 */
#define KERNEL4x4_SUB() \
|
||||
ymm0 = _mm256_loadu_pd(AO - 16); \
|
||||
ymm1 = _mm256_loadu_pd(BO - 12); \
|
||||
ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 12)); \
|
||||
\
|
||||
ymm4 += ymm0 * ymm1; \
|
||||
\
|
||||
ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \
|
||||
ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 11)); \
|
||||
ymm5 += ymm0 * ymm1; \
|
||||
\
|
||||
ymm0 = _mm256_permute4x64_pd(ymm0, 0x1b); \
|
||||
ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 10)); \
|
||||
ymm6 += ymm0 * ymm1; \
|
||||
\
|
||||
ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \
|
||||
ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 9)); \
|
||||
ymm7 += ymm0 * ymm1; \
|
||||
AO += 4; \
|
||||
BO += 4;
|
||||
|
@ -356,24 +356,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ymm6 *= ymm0; \
|
||||
ymm7 *= ymm0; \
|
||||
\
|
||||
ymm5 = _mm256_permute4x64_pd(ymm5, 0xb1); \
|
||||
ymm7 = _mm256_permute4x64_pd(ymm7, 0xb1); \
|
||||
\
|
||||
ymm0 = _mm256_blend_pd(ymm4, ymm5, 0x0a); \
|
||||
ymm1 = _mm256_blend_pd(ymm4, ymm5, 0x05); \
|
||||
ymm2 = _mm256_blend_pd(ymm6, ymm7, 0x0a); \
|
||||
ymm3 = _mm256_blend_pd(ymm6, ymm7, 0x05); \
|
||||
\
|
||||
ymm2 = _mm256_permute4x64_pd(ymm2, 0x1b); \
|
||||
ymm3 = _mm256_permute4x64_pd(ymm3, 0x1b); \
|
||||
ymm2 = _mm256_permute4x64_pd(ymm2, 0xb1); \
|
||||
ymm3 = _mm256_permute4x64_pd(ymm3, 0xb1); \
|
||||
\
|
||||
ymm4 = _mm256_blend_pd(ymm2, ymm0, 0x03); \
|
||||
ymm5 = _mm256_blend_pd(ymm3, ymm1, 0x03); \
|
||||
ymm6 = _mm256_blend_pd(ymm0, ymm2, 0x03); \
|
||||
ymm7 = _mm256_blend_pd(ymm1, ymm3, 0x03); \
|
||||
\
|
||||
ymm4 += _mm256_loadu_pd(CO1 + (0 * ldc)); \
|
||||
ymm5 += _mm256_loadu_pd(CO1 + (1 * ldc)); \
|
||||
ymm6 += _mm256_loadu_pd(CO1 + (2 * ldc)); \
|
||||
|
|
Loading…
Reference in New Issue