From 55b244ca0da907b27c4e0306df0a1a90a2238c6a Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 12 Oct 2018 09:30:35 +0000 Subject: [PATCH 01/25] enable the SGEMM/SKX C based kernel In QA the final bug was found so now the skylakex sgemm C based kernel can be activated. --- kernel/x86_64/KERNEL.SKYLAKEX | 9 +- kernel/x86_64/sgemm_beta_skylakex.c | 6 +- kernel/x86_64/sgemm_kernel_16x4_skylakex.c | 549 --------------------- 3 files changed, 10 insertions(+), 554 deletions(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 48c81e80b..acc6356d6 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -1,6 +1,11 @@ include $(KERNELDIR)/KERNEL.HASWELL -SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S +SGEMMKERNEL = sgemm_kernel_16x4_skylakex.c + +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = sgemm_tcopy_16_skylakex.c +SGEMMONCOPY = sgemm_ncopy_4_skylakex.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c @@ -9,5 +14,5 @@ DGEMMITCOPY = dgemm_tcopy_8_skylakex.c DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c -SGEMM_BETA = ../generic/gemm_beta.c +SGEMM_BETA = sgemm_beta_skylakex.c DGEMM_BETA = dgemm_beta_skylakex.c diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index b1bf4d77a..54f9664e9 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -60,8 +60,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, if (beta == ZERO){ __m512 z_zero; + __m256 y_zero; z_zero = _mm512_setzero_ps(); + y_zero = _mm256_setzero_ps(); j = n; do { c_offset1 = c_offset; @@ -71,14 +73,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, while (i > 32) { _mm512_storeu_ps(c_offset1, z_zero); - _mm512_storeu_ps(c_offset1 + 8, z_zero); _mm512_storeu_ps(c_offset1 + 16, z_zero); - _mm512_storeu_ps(c_offset1 + 24 , z_zero); c_offset1 += 32; i -= 32; } while (i 
> 8) { - _mm512_storeu_ps(c_offset1, z_zero); + _mm256_storeu_ps(c_offset1, y_zero); c_offset1 += 8; i -= 8; } diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c index b2b1ab03f..10d3d22ed 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c @@ -64,419 +64,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#define INIT32x8() \ - row0 = _mm512_setzero_ps(); \ - row1 = _mm512_setzero_ps(); \ - row2 = _mm512_setzero_ps(); \ - row3 = _mm512_setzero_ps(); \ - row4 = _mm512_setzero_ps(); \ - row5 = _mm512_setzero_ps(); \ - row6 = _mm512_setzero_ps(); \ - row0b = _mm512_setzero_ps(); \ - row1b = _mm512_setzero_ps(); \ - row2b = _mm512_setzero_ps(); \ - row3b = _mm512_setzero_ps(); \ - row4b = _mm512_setzero_ps(); \ - row5b = _mm512_setzero_ps(); \ - row6b = _mm512_setzero_ps(); \ - row7b = _mm512_setzero_ps(); \ - -#define KERNEL32x8_SUB() \ - zmm0 = _mm512_loadu_ps(AO); \ - zmm0b = _mm512_loadu_ps(AOb); \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 0)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \ - row0 += zmm0 * zmm2; \ - row1 += zmm0 * zmm3; \ - row0b += zmm0b * zmm2; \ - row1b += zmm0b * zmm3; \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 2)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 3)); \ - row2 += zmm0 * zmm2; \ - row3 += zmm0 * zmm3; \ - row2b += zmm0b * zmm2; \ - row3b += zmm0b * zmm3; \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 4)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 5)); \ - row4 += zmm0 * zmm2; \ - row5 += zmm0 * zmm3; \ - row4b += zmm0b * zmm2; \ - row5b += zmm0b * zmm3; \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 6)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 7)); \ - row6 += zmm0 * zmm2; \ - row7 += zmm0 * zmm3; \ - row6b += zmm0b * zmm2; \ - row7b += zmm0b * zmm3; \ - BO += 8; \ - AO += 16; \ - AOb += 16; - - -#define SAVE32x8(ALPHA) \ - zmm0 = 
_mm512_set1_ps(ALPHA); \ - row0 *= zmm0; \ - row1 *= zmm0; \ - row2 *= zmm0; \ - row3 *= zmm0; \ - row4 *= zmm0; \ - row5 *= zmm0; \ - row6 *= zmm0; \ - row7 *= zmm0; \ - row0b *= zmm0; \ - row1b *= zmm0; \ - row2b *= zmm0; \ - row3b *= zmm0; \ - row4b *= zmm0; \ - row5b *= zmm0; \ - row6b *= zmm0; \ - row7b *= zmm0; \ - row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \ - row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \ - row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \ - row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \ - row4 += _mm512_loadu_ps(CO1 + 4 * ldc); \ - row5 += _mm512_loadu_ps(CO1 + 5 * ldc); \ - row6 += _mm512_loadu_ps(CO1 + 6 * ldc); \ - row7 += _mm512_loadu_ps(CO1 + 7 * ldc); \ - _mm512_storeu_ps(CO1 + 0 * ldc, row0); \ - _mm512_storeu_ps(CO1 + 1 * ldc, row1); \ - _mm512_storeu_ps(CO1 + 2 * ldc, row2); \ - _mm512_storeu_ps(CO1 + 3 * ldc, row3); \ - _mm512_storeu_ps(CO1 + 4 * ldc, row4); \ - _mm512_storeu_ps(CO1 + 5 * ldc, row5); \ - _mm512_storeu_ps(CO1 + 6 * ldc, row6); \ - _mm512_storeu_ps(CO1 + 7 * ldc, row7); \ - row0b += _mm512_loadu_ps(CO1 + 0 * ldc + 16); \ - row1b += _mm512_loadu_ps(CO1 + 1 * ldc + 16); \ - row2b += _mm512_loadu_ps(CO1 + 2 * ldc + 16); \ - row3b += _mm512_loadu_ps(CO1 + 3 * ldc + 16); \ - row4b += _mm512_loadu_ps(CO1 + 4 * ldc + 16); \ - row5b += _mm512_loadu_ps(CO1 + 5 * ldc + 16); \ - row6b += _mm512_loadu_ps(CO1 + 6 * ldc + 16); \ - row7b += _mm512_loadu_ps(CO1 + 7 * ldc + 16); \ - _mm512_storeu_ps(CO1 + 0 * ldc + 16, row0b); \ - _mm512_storeu_ps(CO1 + 1 * ldc + 16, row1b); \ - _mm512_storeu_ps(CO1 + 2 * ldc + 16, row2b); \ - _mm512_storeu_ps(CO1 + 3 * ldc + 16, row3b); \ - _mm512_storeu_ps(CO1 + 4 * ldc + 16, row4b); \ - _mm512_storeu_ps(CO1 + 5 * ldc + 16, row5b); \ - _mm512_storeu_ps(CO1 + 6 * ldc + 16, row6b); \ - _mm512_storeu_ps(CO1 + 7 * ldc + 16, row7b); \ - - -#define INIT16x8() \ - row0 = _mm512_setzero_ps(); \ - row1 = _mm512_setzero_ps(); \ - row2 = _mm512_setzero_ps(); \ - row3 = _mm512_setzero_ps(); \ - row4 = _mm512_setzero_ps(); \ - 
row5 = _mm512_setzero_ps(); \ - row6 = _mm512_setzero_ps(); \ - row7 = _mm512_setzero_ps(); \ - -#define KERNEL16x8_SUB() \ - zmm0 = _mm512_loadu_ps(AO); \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 0)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \ - row0 += zmm0 * zmm2; \ - row1 += zmm0 * zmm3; \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 2)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 3)); \ - row2 += zmm0 * zmm2; \ - row3 += zmm0 * zmm3; \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 4)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 5)); \ - row4 += zmm0 * zmm2; \ - row5 += zmm0 * zmm3; \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 6)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 7)); \ - row6 += zmm0 * zmm2; \ - row7 += zmm0 * zmm3; \ - BO += 8; \ - AO += 16; - - -#define SAVE16x8(ALPHA) \ - zmm0 = _mm512_set1_ps(ALPHA); \ - row0 *= zmm0; \ - row1 *= zmm0; \ - row2 *= zmm0; \ - row3 *= zmm0; \ - row4 *= zmm0; \ - row5 *= zmm0; \ - row6 *= zmm0; \ - row7 *= zmm0; \ - row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \ - row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \ - row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \ - row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \ - row4 += _mm512_loadu_ps(CO1 + 4 * ldc); \ - row5 += _mm512_loadu_ps(CO1 + 5 * ldc); \ - row6 += _mm512_loadu_ps(CO1 + 6 * ldc); \ - row7 += _mm512_loadu_ps(CO1 + 7 * ldc); \ - _mm512_storeu_ps(CO1 + 0 * ldc, row0); \ - _mm512_storeu_ps(CO1 + 1 * ldc, row1); \ - _mm512_storeu_ps(CO1 + 2 * ldc, row2); \ - _mm512_storeu_ps(CO1 + 3 * ldc, row3); \ - _mm512_storeu_ps(CO1 + 4 * ldc, row4); \ - _mm512_storeu_ps(CO1 + 5 * ldc, row5); \ - _mm512_storeu_ps(CO1 + 6 * ldc, row6); \ - _mm512_storeu_ps(CO1 + 7 * ldc, row7); - - - -/*******************************************************************************************/ - -#define INIT8x8() \ - row0 = _mm256_setzero_ps(); \ - row1 = _mm256_setzero_ps(); \ - row2 = _mm256_setzero_ps(); \ - row3 = _mm256_setzero_ps(); \ - row4 = 
_mm256_setzero_ps(); \ - row5 = _mm256_setzero_ps(); \ - row6 = _mm256_setzero_ps(); \ - row7 = _mm256_setzero_ps(); \ - -#define KERNEL8x8_SUB() \ - ymm0 = _mm256_loadu_ps(AO); \ - ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 0)); \ - ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 1)); \ - row0 += ymm0 * ymm2; \ - row1 += ymm0 * ymm3; \ - ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 2)); \ - ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 3)); \ - row2 += ymm0 * ymm2; \ - row3 += ymm0 * ymm3; \ - ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 4)); \ - ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 5)); \ - row4 += ymm0 * ymm2; \ - row5 += ymm0 * ymm3; \ - ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 6)); \ - ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 7)); \ - row6 += ymm0 * ymm2; \ - row7 += ymm0 * ymm3; \ - BO += 8; \ - AO += 8; - - -#define SAVE8x8(ALPHA) \ - ymm0 = _mm256_set1_ps(ALPHA); \ - row0 *= ymm0; \ - row1 *= ymm0; \ - row2 *= ymm0; \ - row3 *= ymm0; \ - row4 *= ymm0; \ - row5 *= ymm0; \ - row6 *= ymm0; \ - row7 *= ymm0; \ - row0 += _mm256_loadu_ps(CO1 + 0 * ldc); \ - row1 += _mm256_loadu_ps(CO1 + 1 * ldc); \ - row2 += _mm256_loadu_ps(CO1 + 2 * ldc); \ - row3 += _mm256_loadu_ps(CO1 + 3 * ldc); \ - row4 += _mm256_loadu_ps(CO1 + 4 * ldc); \ - row5 += _mm256_loadu_ps(CO1 + 5 * ldc); \ - row6 += _mm256_loadu_ps(CO1 + 6 * ldc); \ - row7 += _mm256_loadu_ps(CO1 + 7 * ldc); \ - _mm256_storeu_ps(CO1 + 0 * ldc, row0); \ - _mm256_storeu_ps(CO1 + 1 * ldc, row1); \ - _mm256_storeu_ps(CO1 + 2 * ldc, row2); \ - _mm256_storeu_ps(CO1 + 3 * ldc, row3); \ - _mm256_storeu_ps(CO1 + 4 * ldc, row4); \ - _mm256_storeu_ps(CO1 + 5 * ldc, row5); \ - _mm256_storeu_ps(CO1 + 6 * ldc, row6); \ - _mm256_storeu_ps(CO1 + 7 * ldc, row7); \ - - - -/*******************************************************************************************/ - -#define INIT4x8() \ - row0 = _mm_setzero_ps(); \ - row1 = _mm_setzero_ps(); \ - row2 = _mm_setzero_ps(); \ - row3 = _mm_setzero_ps(); \ - row4 = 
_mm_setzero_ps(); \ - row5 = _mm_setzero_ps(); \ - row6 = _mm_setzero_ps(); \ - row7 = _mm_setzero_ps(); \ - - -#define KERNEL4x8_SUB() \ - xmm0 = _mm_loadu_ps(AO); \ - xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 0)); \ - xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 1)); \ - row0 += xmm0 * xmm2; \ - row1 += xmm0 * xmm3; \ - xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 2)); \ - xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 3)); \ - row2 += xmm0 * xmm2; \ - row3 += xmm0 * xmm3; \ - xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 4)); \ - xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 5)); \ - row4 += xmm0 * xmm2; \ - row5 += xmm0 * xmm3; \ - xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 6)); \ - xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 7)); \ - row6 += xmm0 * xmm2; \ - row7 += xmm0 * xmm3; \ - BO += 8; \ - AO += 4; - - -#define SAVE4x8(ALPHA) \ - xmm0 = _mm_set1_ps(ALPHA); \ - row0 *= xmm0; \ - row1 *= xmm0; \ - row2 *= xmm0; \ - row3 *= xmm0; \ - row4 *= xmm0; \ - row5 *= xmm0; \ - row6 *= xmm0; \ - row7 *= xmm0; \ - row0 += _mm_loadu_ps(CO1 + 0 * ldc); \ - row1 += _mm_loadu_ps(CO1 + 1 * ldc); \ - row2 += _mm_loadu_ps(CO1 + 2 * ldc); \ - row3 += _mm_loadu_ps(CO1 + 3 * ldc); \ - row4 += _mm_loadu_ps(CO1 + 4 * ldc); \ - row5 += _mm_loadu_ps(CO1 + 5 * ldc); \ - row6 += _mm_loadu_ps(CO1 + 6 * ldc); \ - row7 += _mm_loadu_ps(CO1 + 7 * ldc); \ - _mm_storeu_ps(CO1 + 0 * ldc, row0); \ - _mm_storeu_ps(CO1 + 1 * ldc, row1); \ - _mm_storeu_ps(CO1 + 2 * ldc, row2); \ - _mm_storeu_ps(CO1 + 3 * ldc, row3); \ - _mm_storeu_ps(CO1 + 4 * ldc, row4); \ - _mm_storeu_ps(CO1 + 5 * ldc, row5); \ - _mm_storeu_ps(CO1 + 6 * ldc, row6); \ - _mm_storeu_ps(CO1 + 7 * ldc, row7); \ - - -/*******************************************************************************************/ - -#define INIT2x8() \ - row0a = row0b = 0; \ - row1a = row1b = 0; \ - row2a = row2b = 0; \ - row3a = row3b = 0; \ - row4a = row4b = 0; \ - row5a = row5b = 0; \ - row6a = row6b = 0; \ - row7a = row7b = 0; \ - -#define KERNEL2x8_SUB() \ - 
xmm0 = *(AO); \ - xmm1 = *(AO + 1); \ - xmm2 = *(BO + 0); \ - xmm3 = *(BO + 1); \ - row0a += xmm0 * xmm2; \ - row0b += xmm1 * xmm2; \ - row1a += xmm0 * xmm3; \ - row1b += xmm1 * xmm3; \ - xmm2 = *(BO + 2); \ - xmm3 = *(BO + 3); \ - row2a += xmm0 * xmm2; \ - row2b += xmm1 * xmm2; \ - row3a += xmm0 * xmm3; \ - row3b += xmm1 * xmm3; \ - xmm2 = *(BO + 4); \ - xmm3 = *(BO + 5); \ - row4a += xmm0 * xmm2; \ - row4b += xmm1 * xmm2; \ - row5a += xmm0 * xmm3; \ - row5b += xmm1 * xmm3; \ - xmm2 = *(BO + 6); \ - xmm3 = *(BO + 7); \ - row6a += xmm0 * xmm2; \ - row6b += xmm1 * xmm2; \ - row7a += xmm0 * xmm3; \ - row7b += xmm1 * xmm3; \ - BO += 8; \ - AO += 2; - - -#define SAVE2x8(ALPHA) \ - xmm0 = ALPHA; \ - row0a *= xmm0; \ - row0b *= xmm0; \ - row1a *= xmm0; \ - row1b *= xmm0; \ - row2a *= xmm0; \ - row2b *= xmm0; \ - row3a *= xmm0; \ - row3b *= xmm0; \ - row4a *= xmm0; \ - row4b *= xmm0; \ - row5a *= xmm0; \ - row5b *= xmm0; \ - row6a *= xmm0; \ - row6b *= xmm0; \ - row7a *= xmm0; \ - row7b *= xmm0; \ - *(CO1 + 0 * ldc + 0) += row0a; \ - *(CO1 + 0 * ldc + 1) += row0b; \ - *(CO1 + 1 * ldc + 0) += row1a; \ - *(CO1 + 1 * ldc + 1) += row1b; \ - *(CO1 + 2 * ldc + 0) += row2a; \ - *(CO1 + 2 * ldc + 1) += row2b; \ - *(CO1 + 3 * ldc + 0) += row3a; \ - *(CO1 + 3 * ldc + 1) += row3b; \ - *(CO1 + 4 * ldc + 0) += row4a; \ - *(CO1 + 4 * ldc + 1) += row4b; \ - *(CO1 + 5 * ldc + 0) += row5a; \ - *(CO1 + 5 * ldc + 1) += row5b; \ - *(CO1 + 6 * ldc + 0) += row6a; \ - *(CO1 + 6 * ldc + 1) += row6b; \ - *(CO1 + 7 * ldc + 0) += row7a; \ - *(CO1 + 7 * ldc + 1) += row7b; \ - - - -/*******************************************************************************************/ - -#define INIT1x8() \ - row0 = row1 = row2 = row3 = row4 = row5 = row6 = row7 = 0; - -#define KERNEL1x8_SUB() \ - xmm0 = *(AO ); \ - xmm2 = *(BO + 0); \ - xmm3 = *(BO + 1); \ - row0 += xmm0 * xmm2; \ - row1 += xmm0 * xmm3; \ - xmm2 = *(BO + 2); \ - xmm3 = *(BO + 3); \ - row2 += xmm0 * xmm2; \ - row3 += xmm0 * xmm3; \ - xmm2 = 
*(BO + 4); \ - xmm3 = *(BO + 5); \ - row4 += xmm0 * xmm2; \ - row5 += xmm0 * xmm3; \ - xmm2 = *(BO + 6); \ - xmm3 = *(BO + 7); \ - row6 += xmm0 * xmm2; \ - row7 += xmm0 * xmm3; \ - BO += 8; \ - AO += 1; - - -#define SAVE1x8(ALPHA) \ - xmm0 = ALPHA; \ - row0 *= xmm0; \ - row1 *= xmm0; \ - row2 *= xmm0; \ - row3 *= xmm0; \ - row4 *= xmm0; \ - row5 *= xmm0; \ - row6 *= xmm0; \ - row7 *= xmm0; \ - *(CO1 + 0 * ldc) += row0; \ - *(CO1 + 1 * ldc) += row1; \ - *(CO1 + 2 * ldc) += row2; \ - *(CO1 + 3 * ldc) += row3; \ - *(CO1 + 4 * ldc) += row4; \ - *(CO1 + 5 * ldc) += row5; \ - *(CO1 + 6 * ldc) += row6; \ - *(CO1 + 7 * ldc) += row7; \ @@ -1184,142 +771,6 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f return 0; - - // L8_0 - while (N >= 8 && 0) { - float *CO1; - float *AO; - int i; - // L8_10 - CO1 = C; - C += 8 * ldc; - - AO = A; - - i = m; - - while (i >= 32 && 0) { - float *BO, *AOb; - // L8_11 - __m512 zmm0, zmm0b, zmm2, zmm3, row0, row1, row2, row3, row4, row5, row6, row7, row0b, row1b, row2b, row3b, row4b, row5b, row6b, row7b; - BO = B; - int kloop = K; - AOb = AO + 16 * K; - - INIT32x8() - - while (kloop > 0) { - // L12_17 - KERNEL32x8_SUB() - kloop--; - } - // L8_19 - SAVE32x8(alpha) - CO1 += 32; - AO += 16 * K; - - i -= 32; - } - while (i >= 16) { - float *BO; - // L8_11 - __m512 zmm0, zmm2, zmm3, row0, row1, row2, row3, row4, row5, row6, row7; - BO = B; - int kloop = K; - - INIT16x8() - - while (kloop > 0) { - KERNEL16x8_SUB() - kloop--; - } - SAVE16x8(alpha) - CO1 += 16; - - i -= 16; - } - while (i >= 8) { - float *BO; - // L8_11 - __m256 ymm0, ymm2, ymm3, row0, row1, row2, row3, row4, row5, row6, row7; - BO = B; - int kloop = K; - - INIT8x8() - - while (kloop > 0) { - // L12_17 - KERNEL8x8_SUB() - kloop--; - } - // L8_19 - SAVE8x8(alpha) - CO1 += 8; - - i -= 8; - } - while (i >= 4) { - // L8_11 - float *BO; - __m128 xmm0, xmm2, xmm3, row0, row1, row2, row3, row4, row5, row6, row7; - BO = B; - int kloop = K; - - INIT4x8() - // 
L8_16 - while (kloop > 0) { - // L12_17 - KERNEL4x8_SUB() - kloop--; - } - // L8_19 - SAVE4x8(alpha) - CO1 += 4; - - i -= 4; - } - -/************************************************************************** -* Rest of M -***************************************************************************/ - - while (i >= 2) { - float *BO; - float xmm0, xmm1, xmm2, xmm3, row0a, row1a, row2a, row3a, row4a, row5a, row6a, row7a, row0b, row1b, row2b, row3b, row4b, row5b, row6b, row7b; - BO = B; - - INIT2x8() - int kloop = K; - - while (kloop > 0) { - KERNEL2x8_SUB() - kloop--; - } - SAVE2x8(alpha) - CO1 += 2; - i -= 2; - } - // L13_40 - while (i >= 1) { - float *BO; - float xmm0, xmm2, xmm3, row0, row1, row2, row3, row4, row5, row6, row7; - int kloop = K; - BO = B; - INIT1x8() - - while (kloop > 0) { - KERNEL1x8_SUB() - kloop--; - } - SAVE1x8(alpha) - CO1 += 1; - i -= 1; - } - - B += K * 8; - N -= 8; - } - while (N >= 4) { float *CO1; float *AO; From c3d93caa8d58e18422014c3ceb4f49ea73cd1f96 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Wed, 17 Oct 2018 08:01:27 -0700 Subject: [PATCH 02/25] ARM64: Remove dependency of XGENE1 Makefile on ARMV8 Makefile --- kernel/arm64/KERNEL.XGENE1 | 136 ++++++++++++++++++++++++++++++++++++- 1 file changed, 135 insertions(+), 1 deletion(-) diff --git a/kernel/arm64/KERNEL.XGENE1 b/kernel/arm64/KERNEL.XGENE1 index 6ee0c730c..d05754628 100644 --- a/kernel/arm64/KERNEL.XGENE1 +++ b/kernel/arm64/KERNEL.XGENE1 @@ -1 +1,135 @@ -include $(KERNELDIR)/KERNEL.ARMV8 \ No newline at end of file +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +ISAMINKERNEL = ../arm/iamin.c 
+IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SDOTKERNEL = dot.S +DDOTKERNEL = dot.S +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +DSDOTKERNEL = dot.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +STRMMKERNEL = ../generic/trmmkernel_4x4.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = sgemm_kernel_4x4.S +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c 
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + + From 162e31283276a7c108968f3309e2e3371b639bc3 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Wed, 17 Oct 2018 08:01:45 -0700 Subject: [PATCH 03/25] ARM64: Remove dependency of CORTEXA57 Makefile on ARMV8 Makefile --- kernel/arm64/KERNEL.CORTEXA57 | 47 ++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57 index 371e488cd..2fd2c3d87 100644 --- a/kernel/arm64/KERNEL.CORTEXA57 +++ b/kernel/arm64/KERNEL.CORTEXA57 @@ -1,4 +1,49 @@ -include $(KERNELDIR)/KERNEL.ARMV8 +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRMMKERNEL = ../generic/trmmkernel_4x4.c +DTRMMKERNEL = 
../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c SAMAXKERNEL = amax.S DAMAXKERNEL = amax.S From 8001fdcd2a6796c0747e5df25c38a082c0261b0f Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Wed, 17 Oct 2018 08:02:16 -0700 Subject: [PATCH 04/25] ARM64: Remove dependency of THUNDERX Makefile on ARMV8 Makefile --- kernel/arm64/KERNEL.THUNDERX | 135 +++++++++++++++++++++++++++++++++-- 1 file changed, 131 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/KERNEL.THUNDERX b/kernel/arm64/KERNEL.THUNDERX index 11b7a2ca8..e19655e8c 100644 --- a/kernel/arm64/KERNEL.THUNDERX +++ b/kernel/arm64/KERNEL.THUNDERX @@ -1,6 +1,133 @@ -include $(KERNELDIR)/KERNEL.ARMV8 +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c 
+IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx.c +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SDOTKERNEL = dot_thunderx.c +DDOTKERNEL = ddot_thunderx.c +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +DSDOTKERNEL = dot.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +STRMMKERNEL = ../generic/trmmkernel_4x4.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = sgemm_kernel_4x4.S +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = 
../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -SDOTKERNEL=dot_thunderx.c -DDOTKERNEL=ddot_thunderx.c -DAXPYKERNEL=daxpy_thunderx.c From caf339412f9e828ffd3e43ec4b58ecd992eeff7a Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Wed, 17 Oct 2018 08:02:40 -0700 Subject: [PATCH 05/25] ARM64: Remove dependency of THUNDERX2T99 Makefile on CORTEXA57 Makefile --- kernel/arm64/KERNEL.THUNDERX2T99 | 137 ++++++++++++++++++++++++++++++- 1 file changed, 135 insertions(+), 2 deletions(-) diff --git a/kernel/arm64/KERNEL.THUNDERX2T99 b/kernel/arm64/KERNEL.THUNDERX2T99 index b66cd0e8b..a73d4cee8 100644 --- a/kernel/arm64/KERNEL.THUNDERX2T99 +++ b/kernel/arm64/KERNEL.THUNDERX2T99 @@ -1,4 +1,137 @@ -include $(KERNELDIR)/KERNEL.CORTEXA57 +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = 
../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = 
dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy.o +ZGEMMITCOPYOBJ = zgemm_itcopy.o +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o SASUMKERNEL = sasum_thunderx2t99.c DASUMKERNEL = dasum_thunderx2t99.c @@ -27,12 +160,12 @@ CNRM2KERNEL = scnrm2_thunderx2t99.c DNRM2KERNEL = dznrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DAXPYKERNEL = daxpy_thunderx2t99.S DDOTKERNEL = dot_thunderx2t99.c SDOTKERNEL = dot_thunderx2t99.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 
8x4) DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S From 21f46a1cf2cefbdedf89878e3a6324578d0fe8ca Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Wed, 17 Oct 2018 08:11:27 -0700 Subject: [PATCH 06/25] ARM64: Use THUNDERX2T99 Neon Kernels for ARMV8 Currently the generic ARMV8 target uses C implementations for many routines. Replace these with the NEON implementations written for the THUNDERX2T99 target which are up to 6x faster for certain routines. --- driver/others/parameter.c | 4 +- interface/swap.c | 2 +- kernel/arm64/KERNEL.ARMV8 | 276 +++++++++++++++++++++++++------------- param.h | 47 ++++++- 4 files changed, 224 insertions(+), 105 deletions(-) diff --git a/driver/others/parameter.c b/driver/others/parameter.c index e7332c0c4..0f2364d9f 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -730,7 +730,7 @@ void blas_set_parameter(void){ #if defined(ARCH_ARM64) -#if defined(VULCAN) || defined(THUNDERX2T99) +#if defined(VULCAN) || defined(THUNDERX2T99) || defined(ARMV8) unsigned long dgemm_prefetch_size_a; unsigned long dgemm_prefetch_size_b; unsigned long dgemm_prefetch_size_c; @@ -738,7 +738,7 @@ unsigned long dgemm_prefetch_size_c; void blas_set_parameter(void) { -#if defined(VULCAN) || defined(THUNDERX2T99) +#if defined(VULCAN) || defined(THUNDERX2T99) || defined(ARMV8) dgemm_p = 160; dgemm_q = 128; dgemm_r = 4096; diff --git a/interface/swap.c b/interface/swap.c index f7642edf1..17a9868a9 100644 --- a/interface/swap.c +++ b/interface/swap.c @@ -42,7 +42,7 @@ #include "functable.h" #endif -#if defined(THUNDERX2T99) || defined(VULCAN) +#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) // Multithreaded swap gives performance benefits in ThunderX2T99 #else // Disable multi-threading as it does not show any performance diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index 4c6d6fb71..7e7a900fb 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -1,8 +1,3 @@ -SAMAXKERNEL = amax.S 
-DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - SAMINKERNEL = ../arm/amin.c DAMINKERNEL = ../arm/amin.c CAMINKERNEL = ../arm/zamin.c @@ -14,11 +9,6 @@ DMAXKERNEL = ../arm/max.c SMINKERNEL = ../arm/min.c DMINKERNEL = ../arm/min.c -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - ISAMINKERNEL = ../arm/iamin.c IDAMINKERNEL = ../arm/iamin.c ICAMINKERNEL = ../arm/izamin.c @@ -30,92 +20,6 @@ IDMAXKERNEL = ../arm/imax.c ISMINKERNEL = ../arm/imin.c IDMINKERNEL = ../arm/imin.c -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SDOTKERNEL = dot.S -DDOTKERNEL = dot.S -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -DSDOTKERNEL = dot.S - -ifneq ($(OS_DARWIN)$(CROSS),11) -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S -endif - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - -STRMMKERNEL = ../generic/trmmkernel_4x4.c -DTRMMKERNEL = ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -ifneq ($(OS_DARWIN)$(CROSS),11) -SGEMMKERNEL = sgemm_kernel_4x4.S -else -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -endif -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -DGEMMKERNEL = 
../generic/gemmkernel_2x2.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o - -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c @@ -136,6 +40,186 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = 
izamax_thunderx2t99.c + +ifneq ($(OS_DARWIN)$(CROSS),11) +SNRM2KERNEL = scnrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c +#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c +endif + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +ifneq ($(OS_DARWIN)$(CROSS),11) + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S 
+ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy.o +ZGEMMITCOPYOBJ = zgemm_itcopy.o +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) +DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S +endif + +ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4) +SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S +endif + +ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4) +CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S +endif + +ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4) +ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S +endif + +else + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = 
../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +endif diff --git a/param.h b/param.h index ded9fe0b8..c7952e113 100644 --- a/param.h +++ b/param.h @@ -2583,6 +2583,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(ARMV8) + +#if defined(OS_DARWIN) && defined(CROSS) #define SNUMOPT 2 #define DNUMOPT 2 @@ -2590,13 +2592,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#if defined(OS_DARWIN) && defined(CROSS) #define SGEMM_DEFAULT_UNROLL_M 2 -#define SGEMM_DEFAULT_UNROLL N 2 -#else -#define SGEMM_DEFAULT_UNROLL_M 4 -#define SGEMM_DEFAULT_UNROLL_N 4 -#endif +#define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_N 2 @@ -2622,10 +2619,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#define SYMV_P 16 +#else + +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P sgemm_p +#define DGEMM_DEFAULT_P dgemm_p +#define CGEMM_DEFAULT_P cgemm_p +#define ZGEMM_DEFAULT_P zgemm_p + +#define SGEMM_DEFAULT_Q sgemm_q +#define DGEMM_DEFAULT_Q dgemm_q +#define CGEMM_DEFAULT_Q cgemm_q +#define ZGEMM_DEFAULT_Q zgemm_q + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r #define SYMV_P 16 #endif +#endif + #if defined(THUNDERX) #define SNUMOPT 2 #define DNUMOPT 2 From 898a8dcaba6d86358ae73575926f8689d6ede155 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sat, 20 Oct 2018 10:55:04 +0300 Subject: [PATCH 07/25] init From c7bbf9c987a0473aafbd8a4f48ed07cd52fccc38 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sat, 20 Oct 2018 11:13:29 +0300 Subject: [PATCH 08/25] Attempt to tame _hemv threading #1820 --- interface/zhemv.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/interface/zhemv.c b/interface/zhemv.c index d1996ad69..8995ca1c2 100644 --- a/interface/zhemv.c +++ b/interface/zhemv.c @@ -195,7 +195,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP - nthreads = num_cpu_avail(2); + // see graph in issue #1820 for explanation and room for improvement + if (n<362) { + nthreads = 1 ; + } else { + nthreads = num_cpu_avail(2); + }; if (nthreads == 1) { 
#endif From a293bdcd5eaa610ed960264c4e1c48af662502e9 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sat, 20 Oct 2018 21:37:53 +0300 Subject: [PATCH 09/25] re-arrange new code for readability --- interface/zhemv.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/interface/zhemv.c b/interface/zhemv.c index 8995ca1c2..9c31f31d9 100644 --- a/interface/zhemv.c +++ b/interface/zhemv.c @@ -43,6 +43,10 @@ #include "functable.h" #endif +// this is smallest dimension N of square input a to permit threading +// see graph in issue #1820 for explanation +#define MULTI_THREAD_MINIMAL 362 + #ifdef XDOUBLE #define ERROR_NAME "XHEMV " #elif defined(DOUBLE) @@ -195,8 +199,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP - // see graph in issue #1820 for explanation and room for improvement - if (n<362) { + if (n Date: Thu, 18 Oct 2018 04:51:24 -0700 Subject: [PATCH 10/25] ARM64: Remove XGENE1 references Remove XGENE1 target as the implementation for the same is incomplete. Moreover whoever wishes to use on XGENE1 can use the generic ARMV8 target as there are no XGENE1 specific optimizations in OpenBLAS. 
--- kernel/arm64/KERNEL.XGENE1 | 135 ------------------------------------- 1 file changed, 135 deletions(-) delete mode 100644 kernel/arm64/KERNEL.XGENE1 diff --git a/kernel/arm64/KERNEL.XGENE1 b/kernel/arm64/KERNEL.XGENE1 deleted file mode 100644 index d05754628..000000000 --- a/kernel/arm64/KERNEL.XGENE1 +++ /dev/null @@ -1,135 +0,0 @@ -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SDOTKERNEL = dot.S -DDOTKERNEL = dot.S -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -DSDOTKERNEL = dot.S - -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - -STRMMKERNEL = ../generic/trmmkernel_4x4.c 
-DTRMMKERNEL = ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -SGEMMKERNEL = sgemm_kernel_4x4.S -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -DGEMMKERNEL = ../generic/gemmkernel_2x2.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o - -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - - From d50abc8903089089357766d3ada7db090ff6e63d Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Thu, 18 Oct 2018 05:02:23 -0700 Subject: [PATCH 11/25] ARM64: Move parameters from parameter.c to param.h Remove the runtime setting of P, Q, R parameters for targets ARMV8, THUNDERX2T99. Instead set them as constants in param.h at compile time. 
--- driver/others/parameter.c | 27 ----------- kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S | 10 ++-- param.h | 48 ++++++++++---------- 3 files changed, 27 insertions(+), 58 deletions(-) diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 0f2364d9f..8bf7da78b 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -730,35 +730,8 @@ void blas_set_parameter(void){ #if defined(ARCH_ARM64) -#if defined(VULCAN) || defined(THUNDERX2T99) || defined(ARMV8) -unsigned long dgemm_prefetch_size_a; -unsigned long dgemm_prefetch_size_b; -unsigned long dgemm_prefetch_size_c; -#endif - void blas_set_parameter(void) { -#if defined(VULCAN) || defined(THUNDERX2T99) || defined(ARMV8) - dgemm_p = 160; - dgemm_q = 128; - dgemm_r = 4096; - - sgemm_p = 128; - sgemm_q = 352; - sgemm_r = 4096; - - cgemm_p = 128; - cgemm_q = 224; - cgemm_r = 4096; - - zgemm_p = 128; - zgemm_q = 112; - zgemm_r = 4096; - - dgemm_prefetch_size_a = 3584; - dgemm_prefetch_size_b = 512; - dgemm_prefetch_size_c = 128; -#endif } #endif diff --git a/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S b/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S index 598db6e0c..d1551ffea 100644 --- a/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S +++ b/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S @@ -943,13 +943,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPA] - - ldr A_PRE_SIZE, =dgemm_prefetch_size_a - ldr A_PRE_SIZE, [A_PRE_SIZE] - ldr B_PRE_SIZE, =dgemm_prefetch_size_b - ldr B_PRE_SIZE, [B_PRE_SIZE] - ldr C_PRE_SIZE, =dgemm_prefetch_size_c - ldr C_PRE_SIZE, [C_PRE_SIZE] + mov A_PRE_SIZE, #3584 + mov B_PRE_SIZE, #512 + mov C_PRE_SIZE, #128 add A_PRE_SIZE_64, A_PRE_SIZE, #64 add B_PRE_SIZE_64, B_PRE_SIZE, #64 diff --git a/param.h b/param.h index c7952e113..e4ec1b2b5 100644 --- a/param.h +++ b/param.h @@ -2641,20 +2641,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_P sgemm_p -#define DGEMM_DEFAULT_P dgemm_p -#define CGEMM_DEFAULT_P cgemm_p -#define ZGEMM_DEFAULT_P zgemm_p +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_Q sgemm_q -#define DGEMM_DEFAULT_Q dgemm_q -#define CGEMM_DEFAULT_Q cgemm_q -#define ZGEMM_DEFAULT_Q zgemm_q +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 -#define SGEMM_DEFAULT_R sgemm_r -#define DGEMM_DEFAULT_R dgemm_r -#define CGEMM_DEFAULT_R cgemm_r -#define ZGEMM_DEFAULT_R zgemm_r +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 #define SYMV_P 16 #endif @@ -2720,20 +2720,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_P sgemm_p -#define DGEMM_DEFAULT_P dgemm_p -#define CGEMM_DEFAULT_P cgemm_p -#define ZGEMM_DEFAULT_P zgemm_p +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_Q sgemm_q -#define DGEMM_DEFAULT_Q dgemm_q -#define CGEMM_DEFAULT_Q cgemm_q -#define ZGEMM_DEFAULT_Q zgemm_q +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 -#define SGEMM_DEFAULT_R sgemm_r -#define DGEMM_DEFAULT_R dgemm_r -#define CGEMM_DEFAULT_R cgemm_r -#define ZGEMM_DEFAULT_R zgemm_r +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 #define SYMV_P 16 #endif From e7b66cd36e12845701aaae979c29120439294368 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Thu, 18 Oct 2018 05:13:02 -0700 Subject: [PATCH 12/25] ARM64: Fix DYNAMIC_ARCH compilation for cores which dont use GEMM3M 
--- kernel/Makefile | 4 ++ kernel/setparam-ref.c | 85 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/kernel/Makefile b/kernel/Makefile index a0a8fcd21..923ffc363 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -88,7 +88,11 @@ lsame.$(SUFFIX): $(KERNELDIR)/$(LSAME_KERNEL) $(CC) -c $(CFLAGS) -DF_INTERFACE $< -o $(@F) setparam$(TSUFFIX).$(SUFFIX): setparam$(TSUFFIX).c kernel$(TSUFFIX).h +ifeq ($(USE_GEMM3M), 1) + $(CC) -c $(CFLAGS) -DUSE_GEMM3M $< -o $@ +else $(CC) -c $(CFLAGS) $< -o $@ +endif setparam$(TSUFFIX).c : setparam-ref.c sed 's/TS/$(TSUFFIX)/g' $< > $(@F) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index f654de110..e035d5bda 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -294,6 +294,8 @@ gotoblas_t TABLE_NAME = { chemm_outcopyTS, chemm_oltcopyTS, 0, 0, 0, + +#if defined(USE_GEMM3M) #ifdef CGEMM3M_DEFAULT_UNROLL_M CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N, MAX(CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N), #else @@ -324,6 +326,33 @@ gotoblas_t TABLE_NAME = { chemm3m_oucopybTS, chemm3m_olcopybTS, chemm3m_oucopyrTS, chemm3m_olcopyrTS, chemm3m_oucopyiTS, chemm3m_olcopyiTS, +#else + 0, 0, 0, + + NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, +#endif #ifndef NO_LAPACK cneg_tcopyTS, claswp_ncopyTS, @@ -400,6 +429,7 @@ gotoblas_t TABLE_NAME = { zhemm_outcopyTS, zhemm_oltcopyTS, 0, 0, 0, +#if defined(USE_GEMM3M) #ifdef ZGEMM3M_DEFAULT_UNROLL_M ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N, MAX(ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N), #else @@ -430,6 +460,33 @@ gotoblas_t TABLE_NAME = { zhemm3m_oucopybTS, zhemm3m_olcopybTS, zhemm3m_oucopyrTS, zhemm3m_olcopyrTS, zhemm3m_oucopyiTS, zhemm3m_olcopyiTS, +#else + 0, 0, 0, + + NULL, + + NULL, 
NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, +#endif #ifndef NO_LAPACK zneg_tcopyTS, zlaswp_ncopyTS, @@ -503,6 +560,7 @@ gotoblas_t TABLE_NAME = { xhemm_outcopyTS, xhemm_oltcopyTS, 0, 0, 0, +#if defined(USE_GEMM3M) QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N, MAX(QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N), xgemm3m_kernelTS, @@ -528,6 +586,33 @@ gotoblas_t TABLE_NAME = { xhemm3m_oucopybTS, xhemm3m_olcopybTS, xhemm3m_oucopyrTS, xhemm3m_olcopyrTS, xhemm3m_oucopyiTS, xhemm3m_olcopyiTS, +#else + 0, 0, 0, + + NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, +#endif #ifndef NO_LAPACK xneg_tcopyTS, xlaswp_ncopyTS, From af2837c392344c54e03e517902ae4fa4983570c0 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Mon, 22 Oct 2018 01:49:16 -0700 Subject: [PATCH 13/25] ARM64: Remove #define ARMV8 for THUNDERX --- cpuid_arm64.c | 1 - 1 file changed, 1 deletion(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index a42346c88..17078fe7f 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -237,7 +237,6 @@ void get_cpuconfig(void) break; case CPU_THUNDERX: - printf("#define ARMV8\n"); printf("#define THUNDERX\n"); printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_LINESIZE 128\n"); From d5aeff636f2d8ba99d1e5ed511c3770970f440af Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Thu, 18 Oct 2018 05:15:45 -0700 Subject: [PATCH 14/25] ARM64: Enable DYNAMIC_ARCH Enable DYNAMIC_ARCH feature on ARM64. 
This patch uses the cpuid feature in linux kernel to detect the core type at runtime (https://www.kernel.org/doc/Documentation/arm64/cpu-feature-registers.txt). If this feature is missing in kernel, then the user should use the OPENBLAS_CORETYPE env variable to select the desired core type. --- Makefile.system | 7 ++ driver/others/Makefile | 8 ++ driver/others/dynamic_arm64.c | 198 +++++++++++++++++++++++++++++++ kernel/arm64/KERNEL.ARMV8 | 48 ++++---- kernel/arm64/KERNEL.CORTEXA57 | 32 ++--- kernel/arm64/KERNEL.THUNDERX | 16 +-- kernel/arm64/KERNEL.THUNDERX2T99 | 32 ++--- kernel/setparam-ref.c | 73 ++++++++++++ 8 files changed, 350 insertions(+), 64 deletions(-) create mode 100644 driver/others/dynamic_arm64.c diff --git a/Makefile.system b/Makefile.system index b4cd4222a..7847c7525 100644 --- a/Makefile.system +++ b/Makefile.system @@ -510,6 +510,13 @@ CCOMMON_OPT += $(XCCOMMON_OPT) #CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)' endif +ifeq ($(ARCH), arm64) +DYNAMIC_CORE = ARMV8 +DYNAMIC_CORE += CORTEXA57 +DYNAMIC_CORE += THUNDERX +DYNAMIC_CORE += THUNDERX2T99 +endif + # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty ifndef DYNAMIC_CORE override DYNAMIC_ARCH= diff --git a/driver/others/Makefile b/driver/others/Makefile index e61ba7bc8..3dc2e7c1b 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -15,7 +15,11 @@ endif # COMMONOBJS += info.$(SUFFIX) ifeq ($(DYNAMIC_ARCH), 1) +ifeq ($(ARCH),arm64) +COMMONOBJS += dynamic_arm64.$(SUFFIX) +else COMMONOBJS += dynamic.$(SUFFIX) +endif else COMMONOBJS += parameter.$(SUFFIX) endif @@ -71,7 +75,11 @@ BLAS_SERVER = blas_server.c endif ifeq ($(DYNAMIC_ARCH), 1) +ifeq ($(ARCH),arm64) +HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX) +else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) +endif else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) endif diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c 
new file mode 100644 index 000000000..b4ce6b67d --- /dev/null +++ b/driver/others/dynamic_arm64.c @@ -0,0 +1,198 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include <asm/hwcap.h> +#include <sys/auxv.h> + +extern gotoblas_t gotoblas_ARMV8; +extern gotoblas_t gotoblas_CORTEXA57; +extern gotoblas_t gotoblas_THUNDERX; +extern gotoblas_t gotoblas_THUNDERX2T99; + +extern void openblas_warning(int verbose, const char * msg); + +#define NUM_CORETYPES 4 + +/* + * In case asm/hwcap.h is outdated on the build system, make sure + * that HWCAP_CPUID is defined + */ +#ifndef HWCAP_CPUID +#define HWCAP_CPUID (1 << 11) +#endif + +#define get_cpu_ftr(id, var) ({ \ + asm("mrs %0, "#id : "=r" (var)); \ + }) + +static char *corename[] = { + "armv8", + "cortexa57", + "thunderx", + "thunderx2t99", + "unknown" +}; + +char *gotoblas_corename(void) { + if (gotoblas == &gotoblas_ARMV8) return corename[ 0]; + if (gotoblas == &gotoblas_CORTEXA57) return corename[ 1]; + if (gotoblas == &gotoblas_THUNDERX) return corename[ 2]; + if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 3]; + return corename[NUM_CORETYPES]; +} + +static gotoblas_t *force_coretype(char *coretype) { + int i ; + int found = -1; + char message[128]; + + for ( i=0 ; i < NUM_CORETYPES; i++) + { + if (!strncasecmp(coretype, corename[i], 20)) + { + found = i; + break; + } + } + + switch (found) + { + case 0: return (&gotoblas_ARMV8); + case 1: return (&gotoblas_CORTEXA57); + case 2: return (&gotoblas_THUNDERX); + case 3: return (&gotoblas_THUNDERX2T99); + } + snprintf(message, 128, "Core not found: %s\n", coretype); + openblas_warning(1, message); + return NULL; +} + +static gotoblas_t *get_coretype(void) { + int implementer, variant, part, arch, revision, midr_el1; + + if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) { + char coremsg[128]; + snprintf(coremsg, 128, "Kernel lacks cpuid feature support. 
Auto detection of core type failed !!!\n"); + openblas_warning(1, coremsg); + return NULL; + } + + get_cpu_ftr(MIDR_EL1, midr_el1); + /* + * MIDR_EL1 + * + * 31 24 23 20 19 16 15 4 3 0 + * ----------------------------------------------------------------- + * | Implementer | Variant | Architecture | Part Number | Revision | + * ----------------------------------------------------------------- + */ + implementer = (midr_el1 >> 24) & 0xFF; + part = (midr_el1 >> 4) & 0xFFF; + + switch(implementer) + { + case 0x41: // ARM + switch (part) + { + case 0xd07: // Cortex A57 + case 0xd08: // Cortex A72 + case 0xd03: // Cortex A53 + return &gotoblas_CORTEXA57; + } + break; + case 0x42: // Broadcom + switch (part) + { + case 0x516: // Vulcan + return &gotoblas_THUNDERX2T99; + } + break; + case 0x43: // Cavium + switch (part) + { + case 0x0a1: // ThunderX + return &gotoblas_THUNDERX; + case 0x0af: // ThunderX2 + return &gotoblas_THUNDERX2T99; + } + break; + } + return NULL; +} + +void gotoblas_dynamic_init(void) { + + char coremsg[128]; + char coren[22]; + char *p; + + if (gotoblas) return; + + p = getenv("OPENBLAS_CORETYPE"); + if ( p ) + { + gotoblas = force_coretype(p); + } + else + { + gotoblas = get_coretype(); + } + + if (gotoblas == NULL) + { + snprintf(coremsg, 128, "Falling back to generic ARMV8 core\n"); + openblas_warning(1, coremsg); + gotoblas = &gotoblas_ARMV8; + } + + if (gotoblas && gotoblas->init) { + strncpy(coren, gotoblas_corename(), 20); + sprintf(coremsg, "Core: %s\n", coren); + openblas_warning(2, coremsg); + gotoblas -> init(); + } else { + openblas_warning(0, "OpenBLAS : Architecture Initialization failed. 
No initialization function found.\n"); + exit(1); + } + +} + +void gotoblas_dynamic_quit(void) { + gotoblas = NULL; +} diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index 7e7a900fb..bcecd0026 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -113,13 +113,13 @@ STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S @@ -134,8 +134,8 @@ DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c endif -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) endif ifeq ($(DGEMM_UNROLL_N), 4) @@ -146,34 +146,34 @@ DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c endif -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c 
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) endif CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy.o -ZGEMMITCOPYOBJ = zgemm_itcopy.o +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S @@ -201,25 +201,25 @@ ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c SGEMMKERNEL = ../generic/gemmkernel_2x2.c SGEMMONCOPY = ../generic/gemm_ncopy_2.c SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = ../generic/gemmkernel_2x2.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) 
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) endif diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57 index 2fd2c3d87..04d6940d7 100644 --- a/kernel/arm64/KERNEL.CORTEXA57 +++ b/kernel/arm64/KERNEL.CORTEXA57 @@ -111,13 +111,13 @@ STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S @@ -132,8 +132,8 @@ DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c endif -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) endif ifeq ($(DGEMM_UNROLL_N), 4) @@ -144,32 +144,32 @@ DGEMMONCOPY = 
../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c endif -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) endif CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy.o -ZGEMMITCOPYOBJ = zgemm_itcopy.o +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/KERNEL.THUNDERX b/kernel/arm64/KERNEL.THUNDERX index e19655e8c..cb02c7bc5 100644 --- a/kernel/arm64/KERNEL.THUNDERX +++ b/kernel/arm64/KERNEL.THUNDERX @@ -89,26 +89,26 @@ ZTRMMKERNEL = 
../generic/ztrmmkernel_2x2.c SGEMMKERNEL = sgemm_kernel_4x4.S SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = ../generic/gemmkernel_2x2.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = ../generic/zgemmkernel_2x2.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/arm64/KERNEL.THUNDERX2T99 b/kernel/arm64/KERNEL.THUNDERX2T99 index a73d4cee8..a20d0d4a6 100644 --- a/kernel/arm64/KERNEL.THUNDERX2T99 +++ b/kernel/arm64/KERNEL.THUNDERX2T99 @@ -74,13 +74,13 @@ STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c 
-SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S @@ -94,8 +94,8 @@ DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c endif -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) endif ifeq ($(DGEMM_UNROLL_N), 4) @@ -106,32 +106,32 @@ DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c endif -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) endif CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy.o -ZGEMMITCOPYOBJ = zgemm_itcopy.o +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif ZGEMMONCOPY = 
../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) SASUMKERNEL = sasum_thunderx2t99.c DASUMKERNEL = dasum_thunderx2t99.c diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index e035d5bda..6d4028b0b 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -646,6 +646,78 @@ gotoblas_t TABLE_NAME = { }; +#if defined(ARCH_ARM64) +static void init_parameter(void) { + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; + + TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; + TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; + TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; + TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; + + TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; + TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; + TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; + TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; + +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; + TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q; + TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; + TABLE_NAME.qgemm_r = QGEMM_DEFAULT_R; + TABLE_NAME.xgemm_r = XGEMM_DEFAULT_R; +#endif + +#if defined(USE_GEMM3M) +#ifdef CGEMM3M_DEFAULT_P + TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; +#else + TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p; +#endif + +#ifdef ZGEMM3M_DEFAULT_P + TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P; +#else + TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p; +#endif + +#ifdef CGEMM3M_DEFAULT_Q + TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q; +#else + TABLE_NAME.cgemm3m_q = TABLE_NAME.sgemm_q; +#endif + +#ifdef ZGEMM3M_DEFAULT_Q + TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q; +#else + TABLE_NAME.zgemm3m_q = TABLE_NAME.dgemm_q; +#endif + +#ifdef CGEMM3M_DEFAULT_R + TABLE_NAME.cgemm3m_r = CGEMM3M_DEFAULT_R; +#else + 
TABLE_NAME.cgemm3m_r = TABLE_NAME.sgemm_r; +#endif + +#ifdef ZGEMM3M_DEFAULT_R + TABLE_NAME.zgemm3m_r = ZGEMM3M_DEFAULT_R; +#else + TABLE_NAME.zgemm3m_r = TABLE_NAME.dgemm_r; +#endif + +#ifdef EXPRECISION + TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p; + TABLE_NAME.xgemm3m_q = TABLE_NAME.qgemm_q; + TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r; +#endif +#endif + +} +#else // defined(ARCH_ARM64) #ifdef ARCH_X86 static int get_l2_size_old(void){ int i, eax, ebx, ecx, edx, cpuid_level; @@ -1231,3 +1303,4 @@ static void init_parameter(void) { } +#endif //defined(ARCH_ARM64) From 64ca44873bd9d960c63456a43fd565c56514e895 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Oct 2018 18:36:55 +0100 Subject: [PATCH 15/25] Fix detection of Ryzen2 (missing CORE_ZEN) --- cpuid_x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 512ad877b..8e4a7cb84 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -2009,6 +2009,8 @@ int get_coretype(void){ switch (model) { case 1: // AMD Ryzen + case 8: + // Ryzen 2 if(support_avx()) #ifndef NO_AVX2 return CORE_ZEN; From 38cf5d93647bf5ffb5fe3e17447eba0c157bb305 Mon Sep 17 00:00:00 2001 From: "Erik M. 
Bray" Date: Sun, 28 Oct 2018 21:16:52 +0000 Subject: [PATCH 16/25] ensure that threading has been initialized in the first place before calling openblas_set_num_threads --- driver/others/blas_server.c | 5 +++++ driver/others/blas_server_win32.c | 7 ++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 6a25e2d07..e5db1804f 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -850,6 +850,11 @@ void goto_set_num_threads(int num_threads) { long i; +#ifdef SMP_SERVER + // Handle lazy re-init of the thread-pool after a POSIX fork + if (unlikely(blas_server_avail == 0)) blas_thread_init(); +#endif + if (num_threads < 1) num_threads = blas_num_threads; #ifndef NO_AFFINITY diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 02a25ac39..bae344c59 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -478,7 +478,12 @@ int BLASFUNC(blas_thread_shutdown)(void){ void goto_set_num_threads(int num_threads) { - long i; + long i; + +#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) + // Handle lazy re-init of the thread-pool after a POSIX fork + if (unlikely(blas_server_avail == 0)) blas_thread_init(); +#endif if (num_threads < 1) num_threads = blas_cpu_number; From 326d394a0fbcc8226bb958f523ca1005696c33b6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Oct 2018 18:38:22 +0100 Subject: [PATCH 17/25] Add get_num_procs implementation for AIX (and copy HAIKU implementation to the non-TLS version of the code as well) --- driver/others/memory.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/driver/others/memory.c b/driver/others/memory.c index 4a8e6c067..25f198623 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -259,6 +259,16 @@ int get_num_procs(void) { } #endif +#ifdef OS_AIX +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = 
sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif + + + #ifdef OS_WINDOWS int get_num_procs(void) { @@ -1738,6 +1748,22 @@ int get_num_procs(void) { return nums; } #endif + +#ifdef OS_HAIKU +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif + +#ifdef OS_AIX +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif #ifdef OS_WINDOWS From 7b5aea52bb105c15d7e80e0749b80f6bfb0566b6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Oct 2018 21:50:34 +0100 Subject: [PATCH 18/25] Accommodate AIX install, which has different syntax for #1803 --- Makefile.install | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/Makefile.install b/Makefile.install index fa657beba..7aa477cf0 100644 --- a/Makefile.install +++ b/Makefile.install @@ -48,6 +48,7 @@ ifndef NO_CBLAS @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" endif +ifneq (($OSNAME), AIX) ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" @@ -72,6 +73,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif + ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @@ -93,6 +95,33 @@ ifeq ($(OSNAME), CYGWIN_NT) endif endif +else +#install on AIX has different options syntax +ifndef NO_LAPACKE + @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) + @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" + @-install -M 644
$(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" + @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" + @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" +endif + +#for install static library +ifndef NO_STATIC + @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) + @install -M 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ + ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) +endif +#for install shared library +ifndef NO_SHARED + @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) + @install -M 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ + ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ + ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) +endif + +endif #Generating openblas.pc @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" From dcc5d6291e7b02761acfb6161c04ba1f8f25b502 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 1 Nov 2018 01:42:09 +0000 Subject: [PATCH 19/25] skylakex: Make the sgemm/dgemm beta code robust for a N=0 or M=0 case. In the threading code there are cases where N or M can become 0, and the optimized beta code did not handle this well, leading to a crash. During the audit for the crash, a few edge conditions on the if statements were found and fixed as well. --- kernel/x86_64/dgemm_beta_skylakex.c | 6 ++++-- kernel/x86_64/sgemm_beta_skylakex.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/dgemm_beta_skylakex.c b/kernel/x86_64/dgemm_beta_skylakex.c index 384e9f60b..6a824c9b5 100644 --- a/kernel/x86_64/dgemm_beta_skylakex.c +++ b/kernel/x86_64/dgemm_beta_skylakex.c @@ -55,6 +55,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT
beta, return 0; } + if (m == 0 || n == 0) + return 0; c_offset = c; @@ -69,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, i = m; - while (i > 32) { + while (i >= 32) { _mm512_storeu_pd(c_offset1, z_zero); _mm512_storeu_pd(c_offset1 + 8, z_zero); _mm512_storeu_pd(c_offset1 + 16, z_zero); @@ -77,7 +79,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset1 += 32; i -= 32; } - while (i > 8) { + while (i >= 8) { _mm512_storeu_pd(c_offset1, z_zero); c_offset1 += 8; i -= 8; diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index 54f9664e9..4e40acadf 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -55,6 +55,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, return 0; } + if (n == 0 || m == 0) + return; c_offset = c; @@ -71,13 +73,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, i = m; - while (i > 32) { + while (i >= 32) { _mm512_storeu_ps(c_offset1, z_zero); _mm512_storeu_ps(c_offset1 + 16, z_zero); c_offset1 += 32; i -= 32; } - while (i > 8) { + while (i >= 8) { _mm256_storeu_ps(c_offset1, y_zero); c_offset1 += 8; i -= 8; From 5b708e5eb1b17af9c45e0da2993da8a4756cb912 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 1 Nov 2018 01:43:20 +0000 Subject: [PATCH 20/25] sgemm/dgemm: add a way for an arch kernel to specify preferred sizes The current gemm threading code can make very unfortunate choices, for example on my 10-core system a 1024x1024x1024 matrix multiply ends up chunking into blocks of 102... which is not a vector-friendly size and performance ends up horrible. This patch adds a helper define where an architecture can specify a preference for size multiples. This is different from existing defines that are minimum sizes and such. The performance increase with this patch for the 1024x1024x1024 sgemm is 2.3x (!!)
--- driver/level3/level3_thread.c | 22 ++++++++++++++++++++++ param.h | 1 + 2 files changed, 23 insertions(+) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index aeb5e6ed4..de29247d4 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -48,6 +48,10 @@ #define SWITCH_RATIO 2 #endif +#ifndef GEMM_PREFERED_SIZE +#define GEMM_PREFERED_SIZE 1 +#endif + //The array of job_t may overflow the stack. //Instead, use malloc to alloc job_t. #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD @@ -510,6 +514,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, return 0; } +static int round_up(int remainder, int width, int multiple) +{ + if (multiple > remainder || width <= multiple) + return width; + width = (width + multiple - 1) / multiple; + width = width * multiple; + return width; +} + + static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG nthreads_m, BLASLONG nthreads_n) { @@ -601,9 +615,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG num_parts = 0; while (m > 0){ width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts); + + width = round_up(m, width, GEMM_PREFERED_SIZE); + m -= width; + if (m < 0) width = width + m; range_M[num_parts + 1] = range_M[num_parts] + width; + num_parts ++; } for (i = num_parts; i < MAX_CPU_NUMBER; i++) { @@ -645,9 +664,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG if (width < SWITCH_RATIO) { width = SWITCH_RATIO; } + width = round_up(n, width, GEMM_PREFERED_SIZE); + n -= width; if (n < 0) width = width + n; range_N[num_parts + 1] = range_N[num_parts] + width; + num_parts ++; } for (j = num_parts; j < MAX_CPU_NUMBER; j++) { diff --git a/param.h b/param.h index e4ec1b2b5..d1b211584 100644 --- a/param.h +++ b/param.h @@ -1627,6 +1627,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 8 #define SWITCH_RATIO 32 +#define GEMM_PREFERED_SIZE 32 #ifdef ARCH_X86 From b0255231979ac40444fea06bc8958731fdcdef7a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Nov 2018 18:26:08 +0100 Subject: [PATCH 21/25] Use installbsd on AIX (and fix misplaced parenthesis from previous commit). See #1803 --- Makefile.install | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Makefile.install b/Makefile.install index 7aa477cf0..069c96c6a 100644 --- a/Makefile.install +++ b/Makefile.install @@ -48,7 +48,7 @@ ifndef NO_CBLAS @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" endif -ifneq (($OSNAME), AIX) +ifneq ($(OSNAME), AIX) ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" @@ -99,23 +99,23 @@ else #install on AIX has different options syntax ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" - @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" - @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" - @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" + @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" + @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" + @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" + 
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" endif #for install static library ifndef NO_STATIC @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @install -M 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif #for install shared library ifndef NO_SHARED @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @install -M 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) From 9c177d270b7ae78c4542a15ec02d8cab9cc7f367 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Nov 2018 18:50:25 +0100 Subject: [PATCH 22/25] Restore Android/ARMv7 build fix from #778 for #1811 --- lapack-netlib/LAPACKE/include/lapacke_config.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lapack-netlib/LAPACKE/include/lapacke_config.h b/lapack-netlib/LAPACKE/include/lapacke_config.h index 1e2509bf0..8262c3488 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_config.h +++ b/lapack-netlib/LAPACKE/include/lapacke_config.h @@ -34,6 +34,13 @@ #ifndef _LAPACKE_CONFIG_H_ #define _LAPACKE_CONFIG_H_ +// For Android prior to API 21 (no include) +#if defined(__ANDROID__) +#if __ANDROID_API__ < 21 +#define LAPACK_COMPLEX_STRUCTURE +#endif +#endif + #ifdef __cplusplus #if defined(LAPACK_COMPLEX_CPP) #include From a931afe269efc21a6710376254fb14d7bed085d8 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Tue, 6 Nov 2018 09:39:05 +0000 Subject: [PATCH 23/25] init From 3fd41313fc2c36ea55a5e3aaf02cf2734f2d18c5 Mon Sep 17 00:00:00 2001 From: Andrew 
<16061801+brada4@users.noreply.github.com> Date: Tue, 6 Nov 2018 09:40:13 +0000 Subject: [PATCH 24/25] add low bound for number of buffers --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index 6c3d5b15e..60da2416a 100644 --- a/common.h +++ b/common.h @@ -183,7 +183,7 @@ extern "C" { #define ALLOCA_ALIGN 63UL -#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) +#define NUM_BUFFERS MAX(64,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)) #ifdef NEEDBUNDERSCORE #define BLASFUNC(FUNC) FUNC##_ From 9531d0e1757dc0edd64c5c439d65fb236195410a Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Tue, 6 Nov 2018 17:51:24 +0000 Subject: [PATCH 25/25] lets fit it in one 4k page --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index 60da2416a..7fcd5e316 100644 --- a/common.h +++ b/common.h @@ -183,7 +183,7 @@ extern "C" { #define ALLOCA_ALIGN 63UL -#define NUM_BUFFERS MAX(64,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)) +#define NUM_BUFFERS MAX(50,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)) #ifdef NEEDBUNDERSCORE #define BLASFUNC(FUNC) FUNC##_