BLD: Support x86_64 and arm64 architectures

This commit is contained in:
Mateusz Sokół 2024-08-06 18:55:54 +02:00
parent dfa1e8f8f3
commit af0e7f1c8a
14 changed files with 2863 additions and 1011 deletions

View File

@ -25,7 +25,7 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-13]
os: [ubuntu-latest, macos-13, macos-latest]
build: [meson]
fortran: [gfortran]
openmp: [0]

View File

@ -24,19 +24,27 @@ _test_input_array = {
'sources': testl3_src,
'input_file': '?in3',
},
'l3_3m': {
'base': 'x?cblat3_3m',
'has_dat': true,
'types': ['c', 'z'],
'sources': testl3_3m_src,
'input_file': '?in3_3m',
},
}
# Test levels that the loop over `lvls` builds. 'l3_3m' is added only when
# conf_hdat has ARCH_X86_64 or ARCH_X86 set.
# NOTE(review): conf_hdat is presumably a configuration_data object populated
# elsewhere -- confirm where these keys are set.
lvls = ['l1', 'l2', 'l3']
if conf_hdat.has('ARCH_X86_64') or conf_hdat.has('ARCH_X86')
# Dictionary '+=' merges the extra entry into _test_input_array.
_test_input_array += {
'l3_3m': {
'base': 'x?cblat3_3m',
'has_dat': true,
# Only the 'c' and 'z' types are listed for this level.
'types': ['c', 'z'],
'sources': testl3_3m_src,
'input_file': '?in3_3m',
}
}
# '+=' with a string appends it to the list as a single element.
lvls += 'l3_3m'
endif
_test_runner = executable('test_runner', sources: ['test_runner.c'], install: false)
ctest_inc = _inc + [include_directories('.')]
foreach lvl : ['l1', 'l2', 'l3', 'l3_3m']
foreach lvl : lvls
details = _test_input_array[lvl]
foreach type : details['types']

59
kernel/arm64/meson.build Normal file
View File

@ -0,0 +1,59 @@
# Baseline kernel source table for arm64.
# Layout: routine pattern -> type key -> name suffix -> source path.
# NOTE(review): the '?' in each pattern is presumably replaced by the type key
# when kernel names are generated, and the paths look relative to the kernel/
# directory -- confirm against the code that consumes this dictionary.
arm64_base_dict = {
'?sum': {
's': {
'_k': 'arm64/sum.S',
},
'd': {
'_k': 'arm64/sum.S',
},
'c': {
'_k': 'arm64/csum.S'
},
'z': {
'_k': 'arm64/zsum.S',
},
},
# These entries point at the C sources under arm/, not arm64/.
'?nrm2': {
's': {
'_k': 'arm/nrm2.c',
},
'd': {
'_k': 'arm/nrm2.c',
},
'c': {
'_k': 'arm/znrm2.c',
},
'z': {
'_k': 'arm/znrm2.c',
},
},
# The suffix here is '1' with no leading underscore, unlike the '_k' entries.
'?cabs': {
's': {
'1': 'generic/cabs.c',
},
'd': {
'1': 'generic/cabs.c',
},
},
# Both the type key and the suffix are empty strings for this entry.
'?lsame': {
'': {
'': 'generic/lsame.c',
}
},
# Only the '_beta' suffix is listed here, and it uses the generic C sources.
'?gemm': {
's': {
'_beta': 'generic/gemm_beta.c',
},
'd': {
'_beta': 'generic/gemm_beta.c',
},
'c': {
'_beta': 'generic/zgemm_beta.c',
},
'z': {
'_beta': 'generic/zgemm_beta.c',
},
},
}
# Process the meson.build in the meson_armv8 subdirectory.
subdir('meson_armv8')

View File

@ -0,0 +1,374 @@
arm64_armv8_dict = {
'?amin': {
's': {
'_k': 'arm/amin.c',
},
'd': {
'_k': 'arm/amin.c',
},
'c': {
'_k': 'arm/zamin.c',
},
'z': {
'_k': 'arm/zamin.c',
},
},
'?max': {
's': {
'_k': 'arm/max.c',
},
'd': {
'_k': 'arm/max.c',
},
},
'?min': {
's': {
'_k': 'arm/min.c',
},
'd': {
'_k': 'arm/min.c',
},
},
'i?amin': {
's': {
'_k': 'arm/iamin.c',
},
'd': {
'_k': 'arm/iamin.c',
},
'c': {
'_k': 'arm/izamin.c',
},
'z': {
'_k': 'arm/izamin.c',
},
},
'i?max': {
's': {
'_k': 'arm/imax.c',
},
'd': {
'_k': 'arm/imax.c',
},
},
'i?min': {
's': {
'_k': 'arm/imin.c',
},
'd': {
'_k': 'arm/imin.c',
},
},
'?trsm_kernel': {
's': {
'_LN': 'generic/trsm_kernel_LN.c',
'_LT': 'generic/trsm_kernel_LT.c',
'_RN': 'generic/trsm_kernel_RN.c',
'_RT': 'generic/trsm_kernel_RT.c',
},
'd': {
'_LN': 'generic/trsm_kernel_LN.c',
'_LT': 'generic/trsm_kernel_LT.c',
'_RN': 'generic/trsm_kernel_RN.c',
'_RT': 'generic/trsm_kernel_RT.c',
},
'c': {
'_LN': 'generic/trsm_kernel_LN.c',
'_LT': 'generic/trsm_kernel_LT.c',
'_LR': 'generic/trsm_kernel_LN.c',
'_LC': 'generic/trsm_kernel_LT.c',
'_RN': 'generic/trsm_kernel_RN.c',
'_RT': 'generic/trsm_kernel_RT.c',
'_RR': 'generic/trsm_kernel_RN.c',
'_RC': 'generic/trsm_kernel_RT.c',
},
'z': {
'_LN': 'generic/trsm_kernel_LN.c',
'_LT': 'generic/trsm_kernel_LT.c',
'_LR': 'generic/trsm_kernel_LN.c',
'_LC': 'generic/trsm_kernel_LT.c',
'_RN': 'generic/trsm_kernel_RN.c',
'_RT': 'generic/trsm_kernel_RT.c',
'_RR': 'generic/trsm_kernel_RN.c',
'_RC': 'generic/trsm_kernel_RT.c',
},
},
'?amax': {
's': {
'_k': 'arm64/amax.S',
},
'd': {
'_k': 'arm64/amax.S',
},
'c': {
'_k': 'arm64/zamax.S',
},
'z': {
'_k': 'arm64/zamax.S',
},
},
'?axpy': {
's': {
'_k': 'arm64/axpy.S',
},
'd': {
'_k': 'arm64/axpy.S',
},
'c': {
'_k': 'arm64/zaxpy.S',
},
'z': {
'_k': 'arm64/zaxpy.S',
},
},
'?axpyc': {
'c': {
'_k': 'arm64/zaxpy.S',
},
'z': {
'_k': 'arm64/zaxpy.S',
},
},
'?rot': {
's': {
'_k': 'arm64/rot.S',
},
'd': {
'_k': 'arm64/rot.S',
},
'cs': {
'_k': 'arm64/zrot.S',
},
'zd': {
'_k': 'arm64/zrot.S',
}
},
'?scal': {
's': {
'_k': 'arm64/scal.S',
},
'd': {
'_k': 'arm64/scal.S',
},
'c': {
'_k': 'arm64/zscal.S',
},
'z': {
'_k': 'arm64/zscal.S',
},
},
'?gemv': {
's': {
'_n': 'arm64/gemv_n.S',
'_t': 'arm64/gemv_t.S',
},
'd': {
'_n': 'arm64/gemv_n.S',
'_t': 'arm64/gemv_t.S',
},
'c': {
'_n': 'arm64/zgemv_n.S',
'_t': 'arm64/zgemv_t.S',
'_r': 'arm64/zgemv_n.S',
'_c': 'arm64/zgemv_t.S',
'_o': 'arm64/zgemv_n.S',
'_u': 'arm64/zgemv_t.S',
'_s': 'arm64/zgemv_n.S',
'_d': 'arm64/zgemv_t.S',
},
'z': {
'_n': 'arm64/zgemv_n.S',
'_t': 'arm64/zgemv_t.S',
'_r': 'arm64/zgemv_n.S',
'_c': 'arm64/zgemv_t.S',
'_o': 'arm64/zgemv_n.S',
'_u': 'arm64/zgemv_t.S',
'_s': 'arm64/zgemv_n.S',
'_d': 'arm64/zgemv_t.S',
},
},
'?asum': {
's': {
'_k': 'arm64/asum.S',
},
'd': {
'_k': 'arm64/asum.S',
},
'c': {
'_k': 'arm64/casum.S',
},
'z': {
'_k': 'arm64/zasum.S',
},
},
'?copy': {
's': {
'_k': 'arm64/copy.S',
},
'd': {
'_k': 'arm64/copy.S',
},
'c': {
'_k': 'arm64/copy.S',
},
'z': {
'_k': 'arm64/copy.S',
},
},
'?swap': {
's': {
'_k': 'arm64/swap.S',
},
'd': {
'_k': 'arm64/swap.S',
},
'c': {
'_k': 'arm64/swap.S',
},
'z': {
'_k': 'arm64/swap.S',
},
},
'i?amax': {
's': {
'_k': 'arm64/iamax.S',
},
'd': {
'_k': 'arm64/iamax.S',
},
'c': {
'_k': 'arm64/izamax.S',
},
'z': {
'_k': 'arm64/izamax.S',
},
},
'?nrm2': {
's': {
'_k': 'arm64/nrm2.S',
},
'd': {
'_k': 'arm64/nrm2.S',
},
'c': {
'_k': 'arm64/znrm2.S',
},
'z': {
'_k': 'arm64/znrm2.S',
},
},
# Dot-product kernels. Single precision uses the generic C source while
# double precision uses the arm64 assembly source.
# NOTE(review): confirm the 's' -> generic/dot.c choice is intentional.
'?dot': {
's': {
'_k': 'generic/dot.c',
},
'd': {
'_k': 'arm64/dot.S',
},
},
# The conjugated and unconjugated complex dot products share arm64/zdot.S.
'?dotc': {
'c': {
'_k': 'arm64/zdot.S',
},
'z': {
'_k': 'arm64/zdot.S',
},
},
'?dotu': {
'c': {
'_k': 'arm64/zdot.S',
},
'z': {
'_k': 'arm64/zdot.S',
},
},
# Has an empty type key in addition to 's'.
# NOTE(review): presumably '' yields a kernel named without a type prefix
# (dsdot_k) and 's' yields sdsdot_k -- confirm against the consumer.
'?dsdot': {
's': {
'_k': 'arm64/dot.S',
},
'': {
'_k': 'arm64/dot.S',
},
},
'?gemm': {
's': {
'_beta': 'arm64/sgemm_beta.S',
'_incopy': 'generic/gemm_ncopy_16.c',
'_itcopy': 'arm64/sgemm_tcopy_16.S',
'_oncopy': 'arm64/sgemm_ncopy_4.S',
'_otcopy': 'generic/gemm_tcopy_4.c',
},
'd': {
'_beta': 'arm64/dgemm_beta.S',
'_incopy': 'arm64/dgemm_ncopy_8.S',
'_itcopy': 'arm64/dgemm_tcopy_8.S',
'_oncopy': 'arm64/dgemm_ncopy_4.S',
'_otcopy': 'arm64/dgemm_tcopy_4.S',
},
'c': {
'_incopy': 'generic/zgemm_ncopy_8.c',
'_itcopy': 'generic/zgemm_tcopy_8.c',
'_oncopy': 'generic/zgemm_ncopy_4.c',
'_otcopy': 'generic/zgemm_tcopy_4.c'
},
'z': {
'_incopy': 'generic/zgemm_ncopy_4.c',
'_itcopy': 'generic/zgemm_tcopy_4.c',
'_oncopy': 'generic/zgemm_ncopy_4.c',
'_otcopy': 'generic/zgemm_tcopy_4.c'
},
},
'?trmm_kernel': {
's': {
'_LN': 'arm64/strmm_kernel_16x4.S',
'_LT': 'arm64/strmm_kernel_16x4.S',
'_RN': 'arm64/strmm_kernel_16x4.S',
'_RT': 'arm64/strmm_kernel_16x4.S',
},
'd': {
'_LN': 'arm64/dtrmm_kernel_8x4.S',
'_LT': 'arm64/dtrmm_kernel_8x4.S',
'_RN': 'arm64/dtrmm_kernel_8x4.S',
'_RT': 'arm64/dtrmm_kernel_8x4.S',
},
'c': {
'_LN': 'arm64/ctrmm_kernel_8x4.S',
'_LT': 'arm64/ctrmm_kernel_8x4.S',
'_LR': 'arm64/ctrmm_kernel_8x4.S',
'_LC': 'arm64/ctrmm_kernel_8x4.S',
'_RN': 'arm64/ctrmm_kernel_8x4.S',
'_RT': 'arm64/ctrmm_kernel_8x4.S',
'_RR': 'arm64/ctrmm_kernel_8x4.S',
'_RC': 'arm64/ctrmm_kernel_8x4.S',
},
'z': {
'_LN': 'arm64/ztrmm_kernel_4x4.S',
'_LT': 'arm64/ztrmm_kernel_4x4.S',
'_LR': 'arm64/ztrmm_kernel_4x4.S',
'_LC': 'arm64/ztrmm_kernel_4x4.S',
'_RN': 'arm64/ztrmm_kernel_4x4.S',
'_RT': 'arm64/ztrmm_kernel_4x4.S',
'_RR': 'arm64/ztrmm_kernel_4x4.S',
'_RC': 'arm64/ztrmm_kernel_4x4.S',
},
},
'?gemm_kernel': {
's': {
'': 'arm64/sgemm_kernel_16x4.S',
},
'd': {
'': 'arm64/dgemm_kernel_8x4.S',
},
'c': {
'_n': 'arm64/cgemm_kernel_8x4.S',
'_l': 'arm64/cgemm_kernel_8x4.S',
'_r': 'arm64/cgemm_kernel_8x4.S',
'_b': 'arm64/cgemm_kernel_8x4.S',
},
'z': {
'_n': 'arm64/zgemm_kernel_4x4.S',
'_l': 'arm64/zgemm_kernel_4x4.S',
'_r': 'arm64/zgemm_kernel_4x4.S',
'_b': 'arm64/zgemm_kernel_4x4.S',
},
},
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,513 @@
base_dict = {
# Level 1 BLAS
'?axpy': {
's': {
'_k': 'arm/axpy.c',
},
'd': {
'_k': 'arm/axpy.c',
},
'c': {
'_k': 'arm/zaxpy.c',
},
'z': {
'_k': 'arm/zaxpy.c',
},
},
'?axpby': {
's': {
'_k': 'arm/axpby.c',
},
'd': {
'_k': 'arm/axpby.c',
},
'c': {
'_k': 'arm/zaxpby.c',
},
'z': {
'_k': 'arm/zaxpby.c',
},
},
# Level 2 BLAS
'?symv': {
's': {
'_U': 'generic/symv_k.c',
'_L': 'generic/symv_k.c',
},
'd': {
'_U': 'generic/symv_k.c',
'_L': 'generic/symv_k.c',
},
'c': {
'_U': 'generic/zsymv_k.c',
'_L': 'generic/zsymv_k.c',
},
'z': {
'_U': 'generic/zsymv_k.c',
'_L': 'generic/zsymv_k.c',
},
},
'?ger': {
's': {
'_k': 'generic/ger.c',
},
'd': {
'_k': 'generic/ger.c',
},
},
'?geru': {
'c': {
'_k': 'generic/zger.c',
},
'z': {
'_k': 'generic/zger.c',
},
},
'?gerc': {
'c': {
'_k': 'generic/zger.c',
},
'z': {
'_k': 'generic/zger.c',
},
},
'?gerv': {
'c': {
'_k': 'generic/zger.c',
},
'z': {
'_k': 'generic/zger.c',
},
},
'?hemv': {
'c': {
'_U': 'generic/zhemv_k.c',
'_L': 'generic/zhemv_k.c',
'_V': 'generic/zhemv_k.c',
'_M': 'generic/zhemv_k.c',
},
'z': {
'_U': 'generic/zhemv_k.c',
'_L': 'generic/zhemv_k.c',
'_V': 'generic/zhemv_k.c',
'_M': 'generic/zhemv_k.c',
},
},
# Level 3 BLAS
# GEMM helper sources that are not tied to an unroll size.
# NOTE(review): the 's' entries '_direct' and '_direct_performant' reference
# sources under x86_64/ although this table's name carries no architecture
# prefix -- confirm they are skipped or overridden on non-x86 targets.
'?gemm': {
's': {
'_direct': 'x86_64/sgemm_direct_skylakex.c',
'_direct_performant': 'x86_64/sgemm_direct_performant.c',
'_small_matrix_permit': 'generic/gemm_small_matrix_permit.c',
},
'd': {
'_small_matrix_permit': 'generic/gemm_small_matrix_permit.c',
},
'c': {
'_small_matrix_permit': 'generic/zgemm_small_matrix_permit.c',
},
'z': {
'_small_matrix_permit': 'generic/zgemm_small_matrix_permit.c',
},
},
# GEMM3M copy routines. The values are Meson f-strings: each @NAME@ is
# replaced by the value of the variable NAME when the string is evaluated, so
# the *GEMM3M_UNROLL_* variables must already be defined at this point.
# Both 'c' and 'z' use the zgemm3m_* generic sources; only the unroll
# variable differs (CGEMM3M_* vs ZGEMM3M_*). The '_o*' suffixes use UNROLL_N
# and the '_i*' suffixes use UNROLL_M.
'?gemm3m': {
'c': {
'_oncopyb': f'generic/zgemm3m_ncopy_@CGEMM3M_UNROLL_N@.c',
'_oncopyi': f'generic/zgemm3m_ncopy_@CGEMM3M_UNROLL_N@.c',
'_oncopyr': f'generic/zgemm3m_ncopy_@CGEMM3M_UNROLL_N@.c',
'_otcopyb': f'generic/zgemm3m_tcopy_@CGEMM3M_UNROLL_N@.c',
'_otcopyr': f'generic/zgemm3m_tcopy_@CGEMM3M_UNROLL_N@.c',
'_otcopyi': f'generic/zgemm3m_tcopy_@CGEMM3M_UNROLL_N@.c',
'_incopyb': f'generic/zgemm3m_ncopy_@CGEMM3M_UNROLL_M@.c',
'_incopyr': f'generic/zgemm3m_ncopy_@CGEMM3M_UNROLL_M@.c',
'_incopyi': f'generic/zgemm3m_ncopy_@CGEMM3M_UNROLL_M@.c',
'_itcopyb': f'generic/zgemm3m_tcopy_@CGEMM3M_UNROLL_M@.c',
'_itcopyr': f'generic/zgemm3m_tcopy_@CGEMM3M_UNROLL_M@.c',
'_itcopyi': f'generic/zgemm3m_tcopy_@CGEMM3M_UNROLL_M@.c',
},
'z': {
'_oncopyb': f'generic/zgemm3m_ncopy_@ZGEMM3M_UNROLL_N@.c',
'_oncopyi': f'generic/zgemm3m_ncopy_@ZGEMM3M_UNROLL_N@.c',
'_oncopyr': f'generic/zgemm3m_ncopy_@ZGEMM3M_UNROLL_N@.c',
'_otcopyb': f'generic/zgemm3m_tcopy_@ZGEMM3M_UNROLL_N@.c',
'_otcopyi': f'generic/zgemm3m_tcopy_@ZGEMM3M_UNROLL_N@.c',
'_otcopyr': f'generic/zgemm3m_tcopy_@ZGEMM3M_UNROLL_N@.c',
'_incopyb': f'generic/zgemm3m_ncopy_@ZGEMM3M_UNROLL_M@.c',
'_incopyi': f'generic/zgemm3m_ncopy_@ZGEMM3M_UNROLL_M@.c',
'_incopyr': f'generic/zgemm3m_ncopy_@ZGEMM3M_UNROLL_M@.c',
'_itcopyb': f'generic/zgemm3m_tcopy_@ZGEMM3M_UNROLL_M@.c',
'_itcopyi': f'generic/zgemm3m_tcopy_@ZGEMM3M_UNROLL_M@.c',
'_itcopyr': f'generic/zgemm3m_tcopy_@ZGEMM3M_UNROLL_M@.c',
}
},
'?trmm': {
's': {
'_iunucopy': f'generic/trmm_uncopy_@SGEMM_UNROLL_M@.c',
'_iunncopy': f'generic/trmm_uncopy_@SGEMM_UNROLL_M@.c',
'_ilnucopy': f'generic/trmm_lncopy_@SGEMM_UNROLL_M@.c',
'_ilnncopy': f'generic/trmm_lncopy_@SGEMM_UNROLL_M@.c',
'_iutucopy': f'generic/trmm_utcopy_@SGEMM_UNROLL_M@.c',
'_iutncopy': f'generic/trmm_utcopy_@SGEMM_UNROLL_M@.c',
'_iltucopy': f'generic/trmm_ltcopy_@SGEMM_UNROLL_M@.c',
'_iltncopy': f'generic/trmm_ltcopy_@SGEMM_UNROLL_M@.c',
'_ounucopy': f'generic/trmm_uncopy_@SGEMM_UNROLL_N@.c',
'_ounncopy': f'generic/trmm_uncopy_@SGEMM_UNROLL_N@.c',
'_olnucopy': f'generic/trmm_lncopy_@SGEMM_UNROLL_N@.c',
'_olnncopy': f'generic/trmm_lncopy_@SGEMM_UNROLL_N@.c',
'_outucopy': f'generic/trmm_utcopy_@SGEMM_UNROLL_N@.c',
'_outncopy': f'generic/trmm_utcopy_@SGEMM_UNROLL_N@.c',
'_oltucopy': f'generic/trmm_ltcopy_@SGEMM_UNROLL_N@.c',
'_oltncopy': f'generic/trmm_ltcopy_@SGEMM_UNROLL_N@.c',
},
'd': {
'_iunucopy': f'generic/trmm_uncopy_@DGEMM_UNROLL_M@.c',
'_iunncopy': f'generic/trmm_uncopy_@DGEMM_UNROLL_M@.c',
'_ilnucopy': f'generic/trmm_lncopy_@DGEMM_UNROLL_M@.c',
'_ilnncopy': f'generic/trmm_lncopy_@DGEMM_UNROLL_M@.c',
'_iutucopy': f'generic/trmm_utcopy_@DGEMM_UNROLL_M@.c',
'_iutncopy': f'generic/trmm_utcopy_@DGEMM_UNROLL_M@.c',
'_iltucopy': f'generic/trmm_ltcopy_@DGEMM_UNROLL_M@.c',
'_iltncopy': f'generic/trmm_ltcopy_@DGEMM_UNROLL_M@.c',
'_ounucopy': f'generic/trmm_uncopy_@DGEMM_UNROLL_N@.c',
'_ounncopy': f'generic/trmm_uncopy_@DGEMM_UNROLL_N@.c',
'_olnucopy': f'generic/trmm_lncopy_@DGEMM_UNROLL_N@.c',
'_olnncopy': f'generic/trmm_lncopy_@DGEMM_UNROLL_N@.c',
'_outucopy': f'generic/trmm_utcopy_@DGEMM_UNROLL_N@.c',
'_outncopy': f'generic/trmm_utcopy_@DGEMM_UNROLL_N@.c',
'_oltucopy': f'generic/trmm_ltcopy_@DGEMM_UNROLL_N@.c',
'_oltncopy': f'generic/trmm_ltcopy_@DGEMM_UNROLL_N@.c',
},
'c': {
'_iunucopy': f'generic/ztrmm_uncopy_@CGEMM_UNROLL_M@.c',
'_iunncopy': f'generic/ztrmm_uncopy_@CGEMM_UNROLL_M@.c',
'_ilnucopy': f'generic/ztrmm_lncopy_@CGEMM_UNROLL_M@.c',
'_ilnncopy': f'generic/ztrmm_lncopy_@CGEMM_UNROLL_M@.c',
'_iutucopy': f'generic/ztrmm_utcopy_@CGEMM_UNROLL_M@.c',
'_iutncopy': f'generic/ztrmm_utcopy_@CGEMM_UNROLL_M@.c',
'_iltucopy': f'generic/ztrmm_ltcopy_@CGEMM_UNROLL_M@.c',
'_iltncopy': f'generic/ztrmm_ltcopy_@CGEMM_UNROLL_M@.c',
'_ounucopy': f'generic/ztrmm_uncopy_@CGEMM_UNROLL_N@.c',
'_ounncopy': f'generic/ztrmm_uncopy_@CGEMM_UNROLL_N@.c',
'_olnucopy': f'generic/ztrmm_lncopy_@CGEMM_UNROLL_N@.c',
'_olnncopy': f'generic/ztrmm_lncopy_@CGEMM_UNROLL_N@.c',
'_outucopy': f'generic/ztrmm_utcopy_@CGEMM_UNROLL_N@.c',
'_outncopy': f'generic/ztrmm_utcopy_@CGEMM_UNROLL_N@.c',
'_oltucopy': f'generic/ztrmm_ltcopy_@CGEMM_UNROLL_N@.c',
'_oltncopy': f'generic/ztrmm_ltcopy_@CGEMM_UNROLL_N@.c',
},
'z': {
'_iunucopy': f'generic/ztrmm_uncopy_@ZGEMM_UNROLL_M@.c',
'_iunncopy': f'generic/ztrmm_uncopy_@ZGEMM_UNROLL_M@.c',
'_ilnucopy': f'generic/ztrmm_lncopy_@ZGEMM_UNROLL_M@.c',
'_ilnncopy': f'generic/ztrmm_lncopy_@ZGEMM_UNROLL_M@.c',
'_iutucopy': f'generic/ztrmm_utcopy_@ZGEMM_UNROLL_M@.c',
'_iutncopy': f'generic/ztrmm_utcopy_@ZGEMM_UNROLL_M@.c',
'_iltucopy': f'generic/ztrmm_ltcopy_@ZGEMM_UNROLL_M@.c',
'_iltncopy': f'generic/ztrmm_ltcopy_@ZGEMM_UNROLL_M@.c',
'_ounucopy': f'generic/ztrmm_uncopy_@ZGEMM_UNROLL_N@.c',
'_ounncopy': f'generic/ztrmm_uncopy_@ZGEMM_UNROLL_N@.c',
'_olnucopy': f'generic/ztrmm_lncopy_@ZGEMM_UNROLL_N@.c',
'_olnncopy': f'generic/ztrmm_lncopy_@ZGEMM_UNROLL_N@.c',
'_outucopy': f'generic/ztrmm_utcopy_@ZGEMM_UNROLL_N@.c',
'_outncopy': f'generic/ztrmm_utcopy_@ZGEMM_UNROLL_N@.c',
'_oltucopy': f'generic/ztrmm_ltcopy_@ZGEMM_UNROLL_N@.c',
'_oltncopy': f'generic/ztrmm_ltcopy_@ZGEMM_UNROLL_N@.c',
},
},
'?hemm': {
'c': {
'_iutcopy': f'generic/zhemm_utcopy_@CGEMM_UNROLL_M@.c',
'_iltcopy': f'generic/zhemm_ltcopy_@CGEMM_UNROLL_M@.c',
'_outcopy': f'generic/zhemm_utcopy_@CGEMM_UNROLL_N@.c',
'_oltcopy': f'generic/zhemm_ltcopy_@CGEMM_UNROLL_N@.c',
},
'z': {
'_iutcopy': f'generic/zhemm_utcopy_@ZGEMM_UNROLL_M@.c',
'_iltcopy': f'generic/zhemm_ltcopy_@ZGEMM_UNROLL_M@.c',
'_outcopy': f'generic/zhemm_utcopy_@ZGEMM_UNROLL_N@.c',
'_oltcopy': f'generic/zhemm_ltcopy_@ZGEMM_UNROLL_N@.c',
},
},
'?trsm': {
's': {
'_iunucopy': f'generic/trsm_uncopy_@SGEMM_UNROLL_M@.c',
'_iunncopy': f'generic/trsm_uncopy_@SGEMM_UNROLL_M@.c',
'_ilnucopy': f'generic/trsm_lncopy_@SGEMM_UNROLL_M@.c',
'_ilnncopy': f'generic/trsm_lncopy_@SGEMM_UNROLL_M@.c',
'_iutucopy': f'generic/trsm_utcopy_@SGEMM_UNROLL_M@.c',
'_iutncopy': f'generic/trsm_utcopy_@SGEMM_UNROLL_M@.c',
'_iltucopy': f'generic/trsm_ltcopy_@SGEMM_UNROLL_M@.c',
'_iltncopy': f'generic/trsm_ltcopy_@SGEMM_UNROLL_M@.c',
'_ounucopy': f'generic/trsm_uncopy_@SGEMM_UNROLL_N@.c',
'_ounncopy': f'generic/trsm_uncopy_@SGEMM_UNROLL_N@.c',
'_olnucopy': f'generic/trsm_lncopy_@SGEMM_UNROLL_N@.c',
'_olnncopy': f'generic/trsm_lncopy_@SGEMM_UNROLL_N@.c',
'_outucopy': f'generic/trsm_utcopy_@SGEMM_UNROLL_N@.c',
'_outncopy': f'generic/trsm_utcopy_@SGEMM_UNROLL_N@.c',
'_oltucopy': f'generic/trsm_ltcopy_@SGEMM_UNROLL_N@.c',
'_oltncopy': f'generic/trsm_ltcopy_@SGEMM_UNROLL_N@.c',
},
'd': {
'_iunucopy': f'generic/trsm_uncopy_@DGEMM_UNROLL_M@.c',
'_iunncopy': f'generic/trsm_uncopy_@DGEMM_UNROLL_M@.c',
'_ilnucopy': f'generic/trsm_lncopy_@DGEMM_UNROLL_M@.c',
'_ilnncopy': f'generic/trsm_lncopy_@DGEMM_UNROLL_M@.c',
'_iutucopy': f'generic/trsm_utcopy_@DGEMM_UNROLL_M@.c',
'_iutncopy': f'generic/trsm_utcopy_@DGEMM_UNROLL_M@.c',
'_iltucopy': f'generic/trsm_ltcopy_@DGEMM_UNROLL_M@.c',
'_iltncopy': f'generic/trsm_ltcopy_@DGEMM_UNROLL_M@.c',
'_ounucopy': f'generic/trsm_uncopy_@DGEMM_UNROLL_N@.c',
'_ounncopy': f'generic/trsm_uncopy_@DGEMM_UNROLL_N@.c',
'_olnucopy': f'generic/trsm_lncopy_@DGEMM_UNROLL_N@.c',
'_olnncopy': f'generic/trsm_lncopy_@DGEMM_UNROLL_N@.c',
'_outucopy': f'generic/trsm_utcopy_@DGEMM_UNROLL_N@.c',
'_outncopy': f'generic/trsm_utcopy_@DGEMM_UNROLL_N@.c',
'_oltucopy': f'generic/trsm_ltcopy_@DGEMM_UNROLL_N@.c',
'_oltncopy': f'generic/trsm_ltcopy_@DGEMM_UNROLL_N@.c',
},
'c': {
'_iunucopy': f'generic/ztrsm_uncopy_@CGEMM_UNROLL_M@.c',
'_iunncopy': f'generic/ztrsm_uncopy_@CGEMM_UNROLL_M@.c',
'_ilnucopy': f'generic/ztrsm_lncopy_@CGEMM_UNROLL_M@.c',
'_ilnncopy': f'generic/ztrsm_lncopy_@CGEMM_UNROLL_M@.c',
'_iutucopy': f'generic/ztrsm_utcopy_@CGEMM_UNROLL_M@.c',
'_iutncopy': f'generic/ztrsm_utcopy_@CGEMM_UNROLL_M@.c',
'_iltucopy': f'generic/ztrsm_ltcopy_@CGEMM_UNROLL_M@.c',
'_iltncopy': f'generic/ztrsm_ltcopy_@CGEMM_UNROLL_M@.c',
'_ounucopy': f'generic/ztrsm_uncopy_@CGEMM_UNROLL_N@.c',
'_ounncopy': f'generic/ztrsm_uncopy_@CGEMM_UNROLL_N@.c',
'_olnucopy': f'generic/ztrsm_lncopy_@CGEMM_UNROLL_N@.c',
'_olnncopy': f'generic/ztrsm_lncopy_@CGEMM_UNROLL_N@.c',
'_outucopy': f'generic/ztrsm_utcopy_@CGEMM_UNROLL_N@.c',
'_outncopy': f'generic/ztrsm_utcopy_@CGEMM_UNROLL_N@.c',
'_oltucopy': f'generic/ztrsm_ltcopy_@CGEMM_UNROLL_N@.c',
'_oltncopy': f'generic/ztrsm_ltcopy_@CGEMM_UNROLL_N@.c',
},
'z': {
'_iunucopy': f'generic/ztrsm_uncopy_@ZGEMM_UNROLL_M@.c',
'_iunncopy': f'generic/ztrsm_uncopy_@ZGEMM_UNROLL_M@.c',
'_ilnucopy': f'generic/ztrsm_lncopy_@ZGEMM_UNROLL_M@.c',
'_ilnncopy': f'generic/ztrsm_lncopy_@ZGEMM_UNROLL_M@.c',
'_iutucopy': f'generic/ztrsm_utcopy_@ZGEMM_UNROLL_M@.c',
'_iutncopy': f'generic/ztrsm_utcopy_@ZGEMM_UNROLL_M@.c',
'_iltucopy': f'generic/ztrsm_ltcopy_@ZGEMM_UNROLL_M@.c',
'_iltncopy': f'generic/ztrsm_ltcopy_@ZGEMM_UNROLL_M@.c',
'_ounucopy': f'generic/ztrsm_uncopy_@ZGEMM_UNROLL_N@.c',
'_ounncopy': f'generic/ztrsm_uncopy_@ZGEMM_UNROLL_N@.c',
'_olnucopy': f'generic/ztrsm_lncopy_@ZGEMM_UNROLL_N@.c',
'_olnncopy': f'generic/ztrsm_lncopy_@ZGEMM_UNROLL_N@.c',
'_outucopy': f'generic/ztrsm_utcopy_@ZGEMM_UNROLL_N@.c',
'_outncopy': f'generic/ztrsm_utcopy_@ZGEMM_UNROLL_N@.c',
'_oltucopy': f'generic/ztrsm_ltcopy_@ZGEMM_UNROLL_N@.c',
'_oltncopy': f'generic/ztrsm_ltcopy_@ZGEMM_UNROLL_N@.c',
},
},
'?symm': {
's': {
'_iutcopy': f'generic/symm_ucopy_@SGEMM_UNROLL_M@.c',
'_iltcopy': f'generic/symm_lcopy_@SGEMM_UNROLL_M@.c',
'_outcopy': f'generic/symm_ucopy_@SGEMM_UNROLL_N@.c',
'_oltcopy': f'generic/symm_lcopy_@SGEMM_UNROLL_N@.c',
},
'd': {
'_iutcopy': f'generic/symm_ucopy_@DGEMM_UNROLL_M@.c',
'_iltcopy': f'generic/symm_lcopy_@DGEMM_UNROLL_M@.c',
'_outcopy': f'generic/symm_ucopy_@DGEMM_UNROLL_N@.c',
'_oltcopy': f'generic/symm_lcopy_@DGEMM_UNROLL_N@.c',
},
'c': {
'_iutcopy': f'generic/zsymm_ucopy_@CGEMM_UNROLL_M@.c',
'_iltcopy': f'generic/zsymm_lcopy_@CGEMM_UNROLL_M@.c',
'_outcopy': f'generic/zsymm_ucopy_@CGEMM_UNROLL_N@.c',
'_oltcopy': f'generic/zsymm_lcopy_@CGEMM_UNROLL_N@.c',
},
'z': {
'_iutcopy': f'generic/zsymm_ucopy_@ZGEMM_UNROLL_M@.c',
'_iltcopy': f'generic/zsymm_lcopy_@ZGEMM_UNROLL_M@.c',
'_outcopy': f'generic/zsymm_ucopy_@ZGEMM_UNROLL_N@.c',
'_oltcopy': f'generic/zsymm_lcopy_@ZGEMM_UNROLL_N@.c',
},
},
'?omatcopy_k': {
's': {
'_cn': 'arm/omatcopy_cn.c',
'_rn': 'arm/omatcopy_rn.c',
'_ct': 'arm/omatcopy_ct.c',
'_rt': 'arm/omatcopy_rt.c',
},
'd': {
'_cn': 'arm/omatcopy_cn.c',
'_rn': 'arm/omatcopy_rn.c',
'_ct': 'arm/omatcopy_ct.c',
'_rt': 'arm/omatcopy_rt.c',
},
'c': {
'_cn': 'arm/zomatcopy_cn.c',
'_rn': 'arm/zomatcopy_rn.c',
'_ct': 'arm/zomatcopy_ct.c',
'_rt': 'arm/zomatcopy_rt.c',
'_cnc': 'arm/zomatcopy_cnc.c',
'_rnc': 'arm/zomatcopy_rnc.c',
'_ctc': 'arm/zomatcopy_ctc.c',
'_rtc': 'arm/zomatcopy_rtc.c',
},
'z': {
'_cn': 'arm/zomatcopy_cn.c',
'_rn': 'arm/zomatcopy_rn.c',
'_ct': 'arm/zomatcopy_ct.c',
'_rt': 'arm/zomatcopy_rt.c',
'_cnc': 'arm/zomatcopy_cnc.c',
'_rnc': 'arm/zomatcopy_rnc.c',
'_ctc': 'arm/zomatcopy_ctc.c',
'_rtc': 'arm/zomatcopy_rtc.c',
},
},
'?imatcopy_k': {
's': {
'_cn': 'generic/imatcopy_cn.c',
'_rn': 'generic/imatcopy_rn.c',
'_ct': 'generic/imatcopy_ct.c',
'_rt': 'generic/imatcopy_rt.c',
},
'd': {
'_cn': 'generic/imatcopy_cn.c',
'_rn': 'generic/imatcopy_rn.c',
'_ct': 'generic/imatcopy_ct.c',
'_rt': 'generic/imatcopy_rt.c',
},
'c': {
'_cn': 'generic/zimatcopy_cn.c',
'_rn': 'generic/zimatcopy_rn.c',
'_ct': 'generic/zimatcopy_ct.c',
'_rt': 'generic/zimatcopy_rt.c',
'_cnc': 'generic/zimatcopy_cnc.c',
'_rnc': 'generic/zimatcopy_rnc.c',
'_ctc': 'generic/zimatcopy_ctc.c',
'_rtc': 'generic/zimatcopy_rtc.c',
},
'z': {
'_cn': 'generic/zimatcopy_cn.c',
'_rn': 'generic/zimatcopy_rn.c',
'_ct': 'generic/zimatcopy_ct.c',
'_rt': 'generic/zimatcopy_rt.c',
'_cnc': 'generic/zimatcopy_cnc.c',
'_rnc': 'generic/zimatcopy_rnc.c',
'_ctc': 'generic/zimatcopy_ctc.c',
'_rtc': 'generic/zimatcopy_rtc.c',
},
},
'?geadd': {
's': {
'_k': 'generic/geadd.c',
},
'd': {
'_k': 'generic/geadd.c',
},
'c': {
'_k': 'generic/zgeadd.c',
},
'z': {
'_k': 'generic/zgeadd.c',
},
},
'?gemm_small_kernel': {
's': {
'_nn': 'generic/gemm_small_matrix_kernel_nn.c',
'_nt': 'generic/gemm_small_matrix_kernel_nt.c',
'_tn': 'generic/gemm_small_matrix_kernel_tn.c',
'_tt': 'generic/gemm_small_matrix_kernel_tt.c',
},
'd': {
'_nn': 'generic/gemm_small_matrix_kernel_nn.c',
'_nt': 'generic/gemm_small_matrix_kernel_nt.c',
'_tn': 'generic/gemm_small_matrix_kernel_tn.c',
'_tt': 'generic/gemm_small_matrix_kernel_tt.c',
},
'c': {
'_nn': 'generic/zgemm_small_matrix_kernel_nn.c',
'_nr': 'generic/zgemm_small_matrix_kernel_nn.c',
'_rn': 'generic/zgemm_small_matrix_kernel_nn.c',
'_rr': 'generic/zgemm_small_matrix_kernel_nn.c',
'_nt': 'generic/zgemm_small_matrix_kernel_nt.c',
'_nc': 'generic/zgemm_small_matrix_kernel_nt.c',
'_rt': 'generic/zgemm_small_matrix_kernel_nt.c',
'_rc': 'generic/zgemm_small_matrix_kernel_nt.c',
'_tn': 'generic/zgemm_small_matrix_kernel_tn.c',
'_tr': 'generic/zgemm_small_matrix_kernel_tn.c',
'_cn': 'generic/zgemm_small_matrix_kernel_tn.c',
'_cr': 'generic/zgemm_small_matrix_kernel_tn.c',
'_tt': 'generic/zgemm_small_matrix_kernel_tt.c',
'_tc': 'generic/zgemm_small_matrix_kernel_tt.c',
'_ct': 'generic/zgemm_small_matrix_kernel_tt.c',
'_cc': 'generic/zgemm_small_matrix_kernel_tt.c',
},
'z': {
'_nn': 'generic/zgemm_small_matrix_kernel_nn.c',
'_nr': 'generic/zgemm_small_matrix_kernel_nn.c',
'_rn': 'generic/zgemm_small_matrix_kernel_nn.c',
'_rr': 'generic/zgemm_small_matrix_kernel_nn.c',
'_nt': 'generic/zgemm_small_matrix_kernel_nt.c',
'_nc': 'generic/zgemm_small_matrix_kernel_nt.c',
'_rt': 'generic/zgemm_small_matrix_kernel_nt.c',
'_rc': 'generic/zgemm_small_matrix_kernel_nt.c',
'_tn': 'generic/zgemm_small_matrix_kernel_tn.c',
'_tr': 'generic/zgemm_small_matrix_kernel_tn.c',
'_cn': 'generic/zgemm_small_matrix_kernel_tn.c',
'_cr': 'generic/zgemm_small_matrix_kernel_tn.c',
'_tt': 'generic/zgemm_small_matrix_kernel_tt.c',
'_tc': 'generic/zgemm_small_matrix_kernel_tt.c',
'_ct': 'generic/zgemm_small_matrix_kernel_tt.c',
'_cc': 'generic/zgemm_small_matrix_kernel_tt.c',
},
},
'?gemm_small_kernel_b0': {
's': {
'_nn': 'generic/gemm_small_matrix_kernel_nn.c',
'_nt': 'generic/gemm_small_matrix_kernel_nt.c',
'_tn': 'generic/gemm_small_matrix_kernel_tn.c',
'_tt': 'generic/gemm_small_matrix_kernel_tt.c',
},
'd': {
'_nn': 'generic/gemm_small_matrix_kernel_nn.c',
'_nt': 'generic/gemm_small_matrix_kernel_nt.c',
'_tn': 'generic/gemm_small_matrix_kernel_tn.c',
'_tt': 'generic/gemm_small_matrix_kernel_tt.c',
},
'c': {
'_nn': 'generic/zgemm_small_matrix_kernel_nn.c',
'_nr': 'generic/zgemm_small_matrix_kernel_nn.c',
'_rn': 'generic/zgemm_small_matrix_kernel_nn.c',
'_rr': 'generic/zgemm_small_matrix_kernel_nn.c',
'_nt': 'generic/zgemm_small_matrix_kernel_nt.c',
'_nc': 'generic/zgemm_small_matrix_kernel_nt.c',
'_rt': 'generic/zgemm_small_matrix_kernel_nt.c',
'_rc': 'generic/zgemm_small_matrix_kernel_nt.c',
'_tn': 'generic/zgemm_small_matrix_kernel_tn.c',
'_tr': 'generic/zgemm_small_matrix_kernel_tn.c',
'_cn': 'generic/zgemm_small_matrix_kernel_tn.c',
'_cr': 'generic/zgemm_small_matrix_kernel_tn.c',
'_tt': 'generic/zgemm_small_matrix_kernel_tt.c',
'_tc': 'generic/zgemm_small_matrix_kernel_tt.c',
'_ct': 'generic/zgemm_small_matrix_kernel_tt.c',
'_cc': 'generic/zgemm_small_matrix_kernel_tt.c',
},
'z': {
'_nn': 'generic/zgemm_small_matrix_kernel_nn.c',
'_nr': 'generic/zgemm_small_matrix_kernel_nn.c',
'_rn': 'generic/zgemm_small_matrix_kernel_nn.c',
'_rr': 'generic/zgemm_small_matrix_kernel_nn.c',
'_nt': 'generic/zgemm_small_matrix_kernel_nt.c',
'_nc': 'generic/zgemm_small_matrix_kernel_nt.c',
'_rt': 'generic/zgemm_small_matrix_kernel_nt.c',
'_rc': 'generic/zgemm_small_matrix_kernel_nt.c',
'_tn': 'generic/zgemm_small_matrix_kernel_tn.c',
'_tr': 'generic/zgemm_small_matrix_kernel_tn.c',
'_cn': 'generic/zgemm_small_matrix_kernel_tn.c',
'_cr': 'generic/zgemm_small_matrix_kernel_tn.c',
'_tt': 'generic/zgemm_small_matrix_kernel_tt.c',
'_tc': 'generic/zgemm_small_matrix_kernel_tt.c',
'_ct': 'generic/zgemm_small_matrix_kernel_tt.c',
'_cc': 'generic/zgemm_small_matrix_kernel_tt.c',
},
},
}

View File

@ -0,0 +1,385 @@
x86_64_base_dict = {
# Level 1 BLAS
'?rot': {
's': {
'_k': 'x86_64/rot_sse.S',
},
'd': {
'_k': 'x86_64/rot_sse2.S',
},
'cs': {
'_k': 'x86_64/zrot_sse.S',
},
'zd': {
'_k': 'x86_64/zrot_sse2.S',
}
},
'?swap': {
's': {
'_k': 'x86_64/swap_sse.S',
},
'd': {
'_k': 'x86_64/swap_sse2.S',
},
'c': {
'_k': 'x86_64/zswap_sse.S',
},
'z': {
'_k': 'x86_64/zswap_sse2.S',
},
},
'?scal': {
's': {
'_k': 'x86_64/scal_sse.S',
},
'd': {
'_k': 'x86_64/scal_sse2.S',
},
'c': {
'_k': 'x86_64/zscal_sse.S',
},
'z': {
'_k': 'x86_64/zscal_sse2.S',
},
},
'?copy': {
's': {
'_k': 'x86_64/copy_sse.S',
},
'd': {
'_k': 'x86_64/copy_sse2.S',
},
'c': {
'_k': 'x86_64/zcopy_sse.S',
},
'z': {
'_k': 'x86_64/zcopy_sse2.S',
},
},
'?axpy': {
's': {
'_k': 'x86_64/axpy_sse.S',
},
'd': {
'_k': 'x86_64/axpy_sse2.S',
},
'c': {
'_k': 'x86_64/zaxpy_sse.S',
},
'z': {
'_k': 'x86_64/zaxpy_sse2.S',
},
},
'?axpyc': {
'c': {
'_k': 'x86_64/zaxpy_sse.S',
},
'z': {
'_k': 'x86_64/zaxpy_sse2.S',
},
},
'?dot': {
's': {
'_k': 'generic/dot.c',
},
'd': {
'_k': 'x86_64/dot_sse2.S',
},
},
'?dotc': {
'c': {
'_k': 'x86_64/zdot_sse.S',
},
'z': {
'_k': 'x86_64/zdot_sse2.S',
},
},
'?dotu': {
'c': {
'_k': 'x86_64/zdot_sse.S',
},
'z': {
'_k': 'x86_64/zdot_sse2.S',
},
},
'?dsdot': {
's': {
'_k': 'generic/dot.c',
},
'': {
'_k': 'generic/dot.c',
},
},
'?nrm2': {
's': {
'_k': 'x86_64/nrm2_sse.S',
},
'd': {
'_k': 'x86_64/nrm2.S',
},
'c': {
'_k': 'x86_64/znrm2_sse.S',
},
'z': {
'_k': 'x86_64/znrm2.S',
},
},
'?asum': {
's': {
'_k': 'x86_64/asum_sse.S',
},
'd': {
'_k': 'x86_64/asum_sse2.S',
},
'c': {
'_k': 'x86_64/zasum_sse.S',
},
'z': {
'_k': 'x86_64/zasum_sse2.S',
},
},
'?amax': {
's': {
'_k': 'x86_64/amax_sse.S',
},
'd': {
'_k': 'x86_64/amax_sse2.S',
},
'c': {
'_k': 'x86_64/zamax_sse.S',
},
'z': {
'_k': 'x86_64/zamax_sse2.S',
},
},
# Real types use the C source under arm/; complex types use the x86_64 SSE
# and SSE2 assembly sources.
# NOTE(review): confirm that arm/sum.c is the intended portable fallback here.
'?sum': {
's': {
'_k': 'arm/sum.c',
},
'd': {
'_k': 'arm/sum.c',
},
'c': {
'_k': 'x86_64/zsum_sse.S',
},
'z': {
'_k': 'x86_64/zsum_sse2.S',
},
},
# Lists the same source files as the '?amax' entry of this table.
# NOTE(review): presumably the min variant is selected by a preprocessor
# define when these sources are compiled -- confirm in the code that builds
# the kernels, otherwise amin would behave like amax.
'?amin': {
's': {
'_k': 'x86_64/amax_sse.S',
},
'd': {
'_k': 'x86_64/amax_sse2.S',
},
'c': {
'_k': 'x86_64/zamax_sse.S',
},
'z': {
'_k': 'x86_64/zamax_sse2.S',
},
},
'i?amax': {
's': {
'_k': 'x86_64/iamax_sse.S',
},
'd': {
'_k': 'x86_64/iamax_sse2.S',
},
'c': {
'_k': 'x86_64/izamax_sse.S',
},
'z': {
'_k': 'x86_64/izamax_sse2.S',
},
},
# The index variants below ('i?amin', 'i?max', 'i?min') list the same
# iamax/izamax sources as 'i?amax', and '?max'/'?min' list the amax sources.
# NOTE(review): presumably each variant is selected by compile-time defines
# applied to the shared source -- confirm in the code that builds the kernels.
'i?amin': {
's': {
'_k': 'x86_64/iamax_sse.S',
},
'd': {
'_k': 'x86_64/iamax_sse2.S',
},
'c': {
'_k': 'x86_64/izamax_sse.S',
},
'z': {
'_k': 'x86_64/izamax_sse2.S',
},
},
# Real types only.
'i?max': {
's': {
'_k': 'x86_64/iamax_sse.S',
},
'd': {
'_k': 'x86_64/iamax_sse2.S',
},
},
# Real types only.
'i?min': {
's': {
'_k': 'x86_64/iamax_sse.S',
},
'd': {
'_k': 'x86_64/iamax_sse2.S',
},
},
# Real types only.
'?max': {
's': {
'_k': 'x86_64/amax_sse.S',
},
'd': {
'_k': 'x86_64/amax_sse2.S',
},
},
# Real types only.
'?min': {
's': {
'_k': 'x86_64/amax_sse.S',
},
'd': {
'_k': 'x86_64/amax_sse2.S',
},
},
# All four types use the C sources under arm/ rather than x86_64/.
# NOTE(review): confirm these are meant as the portable implementation.
'?axpby': {
's': {
'_k': 'arm/axpby.c',
},
'd': {
'_k': 'arm/axpby.c',
},
'c': {
'_k': 'arm/zaxpby.c',
},
'z': {
'_k': 'arm/zaxpby.c',
},
},
# Level 2 BLAS
'?gemv': {
's': {
'_n': 'x86_64/sgemv_n.c',
'_t': 'x86_64/sgemv_t.c',
},
'd': {
'_n': 'x86_64/dgemv_n.S',
'_t': 'x86_64/dgemv_t_4.c',
},
'c': {
'_n': 'x86_64/cgemv_n_4.c',
'_t': 'x86_64/cgemv_t_4.c',
'_r': 'x86_64/cgemv_n_4.c',
'_c': 'x86_64/cgemv_t_4.c',
'_o': 'x86_64/cgemv_n_4.c',
'_u': 'x86_64/cgemv_t_4.c',
'_s': 'x86_64/cgemv_n_4.c',
'_d': 'x86_64/cgemv_t_4.c',
},
'z': {
'_n': 'x86_64/zgemv_n_4.c',
'_t': 'x86_64/zgemv_t_4.c',
'_r': 'x86_64/zgemv_n_4.c',
'_c': 'x86_64/zgemv_t_4.c',
'_o': 'x86_64/zgemv_n_4.c',
'_u': 'x86_64/zgemv_t_4.c',
'_s': 'x86_64/zgemv_n_4.c',
'_d': 'x86_64/zgemv_t_4.c',
},
},
'?symv': {
's': {
'_U': 'x86_64/symv_U_sse.S',
'_L': 'x86_64/symv_L_sse.S',
},
'd': {
'_U': 'x86_64/symv_U_sse2.S',
'_L': 'x86_64/symv_L_sse2.S',
},
'c': {
'_U': 'generic/zsymv_k.c',
'_L': 'generic/zsymv_k.c',
},
'z': {
'_U': 'x86_64/zsymv_U_sse2.S',
'_L': 'x86_64/zsymv_L_sse2.S',
},
},
'?lsame': {
'': {
'': 'x86_64/lsame.S',
}
},
'?cabs': {
's': {
'1': 'x86_64/cabs.S',
},
'd': {
'1': 'x86_64/cabs.S',
},
},
'?gemm3m': {
},
'?hemv': {
'z': {
'_U': 'x86_64/zsymv_U_sse2.S',
'_L': 'x86_64/zsymv_L_sse2.S',
},
},
# Level 3 BLAS
# The three kernel entries below are left empty in this base table.
# NOTE(review): the '# done' markers do not say what was done; presumably the
# kernels come from the per-microarchitecture tables (x86_64_haswell_dict
# defines all three keys) -- confirm and reword the markers.
'?gemm_kernel': {
# done
},
'?trmm_kernel': {
# done
},
'?trsm_kernel': {
# done
},
'?gemm': {
's': {
'_beta': 'x86_64/gemm_beta.S',
'_small_matrix_permit': 'generic/gemm_small_matrix_permit.c',
},
'd': {
'_beta': 'x86_64/gemm_beta.S',
},
'c': {
'_beta': 'x86_64/zgemm_beta.S',
},
'z': {
'_beta': 'x86_64/zgemm_beta.S',
},
},
# Empty entries: no x86_64-specific sources are listed for these routines.
# base_dict defines sources for each of these keys.
# NOTE(review): presumably the empty dictionaries exist so the keys are
# present when the tables are combined -- confirm against the consumer.
'?trmm': {
},
'?hemm': {
},
'?trsm': {
},
'?symm': {
},
'?omatcopy_k': {
},
'?imatcopy_k': {
},
'?geadd': {
},
'?gemm_small_kernel': {
},
'?gemm_small_kernel_b0': {
},
}
subdir('meson_haswell')
subdir('meson_skylakex')
subdir('meson_zen')
subdir('meson_sandybridge')

View File

@ -0,0 +1,238 @@
x86_64_haswell_dict = {
'?scal': {
's': {
'_k': 'x86_64/sscal.c',
},
'd': {
'_k': 'x86_64/dscal.c',
},
'c': {
'_k': 'x86_64/cscal.c',
},
'z': {
'_k': 'x86_64/zscal.c',
},
},
'?gemv': {
's': {
'_n': 'x86_64/sgemv_n_4.c',
'_t': 'x86_64/sgemv_t_4.c',
},
'd': {
'_n': 'x86_64/dgemv_n_4.c',
'_t': 'x86_64/dgemv_t_4.c',
},
'c': {
'_n': 'x86_64/cgemv_n_4.c',
'_t': 'x86_64/cgemv_t_4.c',
},
'z': {
'_n': 'x86_64/zgemv_n_4.c',
'_t': 'x86_64/zgemv_t_4.c',
},
},
'?symv': {
's': {
'_U': 'x86_64/ssymv_U.c',
'_L': 'x86_64/ssymv_L.c',
},
'd': {
'_U': 'x86_64/dsymv_U.c',
'_L': 'x86_64/dsymv_L.c',
},
},
'?dot': {
's': {
'_k': 'x86_64/sdot.c',
},
'd': {
'_k': 'x86_64/ddot.c',
},
},
'?dotc': {
'c': {
'_k': 'x86_64/cdot.c',
},
'z': {
'_k': 'x86_64/zdot.c',
},
},
'?dotu': {
'c': {
'_k': 'x86_64/cdot.c',
},
'z': {
'_k': 'x86_64/zdot.c',
},
},
'?dsdot': {
'': {
'_k': 'x86_64/sdot.c',
},
's': {
'_k': 'x86_64/sdot.c',
},
},
'?axpy': {
's': {
'_k': 'x86_64/saxpy.c',
},
'd': {
'_k': 'x86_64/daxpy.c',
},
'c': {
'_k': 'x86_64/caxpy.c',
},
'z': {
'_k': 'x86_64/zaxpy.c',
},
},
'?axpyc': {
'c': {
'_k': 'x86_64/caxpy.c',
},
'z': {
'_k': 'x86_64/zaxpy.c',
},
},
# Haswell GEMM micro-kernels. Real types use an empty suffix; complex types
# list the '_n', '_l', '_r' and '_b' suffixes, all pointing at one source.
# NOTE(review): 's' uses sgemm_kernel_8x4_haswell_2.c here while the TRMM
# entry below uses sgemm_kernel_8x4_haswell.c (no '_2') -- confirm the two
# are not swapped.
'?gemm_kernel': {
's': {
'': 'x86_64/sgemm_kernel_8x4_haswell_2.c',
},
'd': {
'': 'x86_64/dgemm_kernel_4x8_haswell.S',
},
'c': {
'_n': 'x86_64/cgemm_kernel_8x2_haswell.c',
'_l': 'x86_64/cgemm_kernel_8x2_haswell.c',
'_r': 'x86_64/cgemm_kernel_8x2_haswell.c',
'_b': 'x86_64/cgemm_kernel_8x2_haswell.c',
},
'z': {
'_n': 'x86_64/zgemm_kernel_4x2_haswell.c',
'_l': 'x86_64/zgemm_kernel_4x2_haswell.c',
'_r': 'x86_64/zgemm_kernel_4x2_haswell.c',
'_b': 'x86_64/zgemm_kernel_4x2_haswell.c',
},
},
# Haswell TRMM kernels. Every suffix of a given type points at one source.
# For 'c' and 'z' these are the .S files, whereas the GEMM entry above uses
# the .c files with the same base names.
# NOTE(review): confirm the .c/.S split between GEMM and TRMM is intended.
'?trmm_kernel': {
's': {
'_LN': 'x86_64/sgemm_kernel_8x4_haswell.c',
'_LT': 'x86_64/sgemm_kernel_8x4_haswell.c',
'_RN': 'x86_64/sgemm_kernel_8x4_haswell.c',
'_RT': 'x86_64/sgemm_kernel_8x4_haswell.c',
},
'd': {
'_LN': 'x86_64/dtrmm_kernel_4x8_haswell.c',
'_LT': 'x86_64/dtrmm_kernel_4x8_haswell.c',
'_RN': 'x86_64/dtrmm_kernel_4x8_haswell.c',
'_RT': 'x86_64/dtrmm_kernel_4x8_haswell.c',
},
'c': {
'_LN': 'x86_64/cgemm_kernel_8x2_haswell.S',
'_LT': 'x86_64/cgemm_kernel_8x2_haswell.S',
'_LR': 'x86_64/cgemm_kernel_8x2_haswell.S',
'_LC': 'x86_64/cgemm_kernel_8x2_haswell.S',
'_RN': 'x86_64/cgemm_kernel_8x2_haswell.S',
'_RT': 'x86_64/cgemm_kernel_8x2_haswell.S',
'_RR': 'x86_64/cgemm_kernel_8x2_haswell.S',
'_RC': 'x86_64/cgemm_kernel_8x2_haswell.S',
},
'z': {
'_LN': 'x86_64/zgemm_kernel_4x2_haswell.S',
'_LT': 'x86_64/zgemm_kernel_4x2_haswell.S',
'_LR': 'x86_64/zgemm_kernel_4x2_haswell.S',
'_LC': 'x86_64/zgemm_kernel_4x2_haswell.S',
'_RN': 'x86_64/zgemm_kernel_4x2_haswell.S',
'_RT': 'x86_64/zgemm_kernel_4x2_haswell.S',
'_RR': 'x86_64/zgemm_kernel_4x2_haswell.S',
'_RC': 'x86_64/zgemm_kernel_4x2_haswell.S',
},
},
'?trsm_kernel': {
's': {
'_LN': 'x86_64/strsm_kernel_8x4_haswell_LN.c',
'_LT': 'x86_64/strsm_kernel_8x4_haswell_LT.c',
'_RN': 'x86_64/strsm_kernel_8x4_haswell_RN.c',
'_RT': 'x86_64/strsm_kernel_8x4_haswell_RT.c',
},
'd': {
'_LN': 'generic/trsm_kernel_LN.c',
'_LT': 'generic/trsm_kernel_LT.c',
'_RN': 'x86_64/dtrsm_kernel_RN_haswell.c',
'_RT': 'generic/trsm_kernel_RT.c',
},
'c': {
'_LN': 'generic/trsm_kernel_LN.c',
'_LT': 'generic/trsm_kernel_LT.c',
'_LR': 'generic/trsm_kernel_LN.c',
'_LC': 'generic/trsm_kernel_LT.c',
'_RN': 'generic/trsm_kernel_RN.c',
'_RT': 'generic/trsm_kernel_RT.c',
'_RR': 'generic/trsm_kernel_RN.c',
'_RC': 'generic/trsm_kernel_RT.c',
},
'z': {
'_LN': 'generic/trsm_kernel_LN.c',
'_LT': 'generic/trsm_kernel_LT.c',
'_LR': 'generic/trsm_kernel_LN.c',
'_LC': 'generic/trsm_kernel_LT.c',
'_RN': 'generic/trsm_kernel_RN.c',
'_RT': 'generic/trsm_kernel_RT.c',
'_RR': 'generic/trsm_kernel_RN.c',
'_RC': 'generic/trsm_kernel_RT.c',
},
},
# Haswell GEMM beta and packing (copy) routines.
# The real-type '_beta' and '_oncopy' entries point at *_skylakex.c sources
# even though this is the Haswell table; the complex types list no '_beta'.
# NOTE(review): confirm the skylakex sources are safe to build for Haswell
# and that complex '_beta' is meant to come from another table.
'?gemm': {
's': {
'_beta': 'x86_64/sgemm_beta_skylakex.c',
'_incopy': 'generic/gemm_ncopy_8.c',
'_itcopy': 'generic/gemm_tcopy_8.c',
'_oncopy': 'x86_64/sgemm_ncopy_4_skylakex.c',
'_otcopy': 'generic/gemm_tcopy_4.c'
},
'd': {
'_beta': 'x86_64/dgemm_beta_skylakex.c',
'_incopy': 'generic/gemm_ncopy_4.c',
'_itcopy': 'generic/gemm_tcopy_4.c',
'_oncopy': 'x86_64/dgemm_ncopy_8_skylakex.c',
'_otcopy': 'generic/gemm_tcopy_8.c',
},
'c': {
'_incopy': 'generic/zgemm_ncopy_8.c',
'_itcopy': 'generic/zgemm_tcopy_8.c',
'_oncopy': 'generic/zgemm_ncopy_2.c',
'_otcopy': 'generic/zgemm_tcopy_2.c'
},
'z': {
'_incopy': 'generic/zgemm_ncopy_4.c',
'_itcopy': 'generic/zgemm_tcopy_4.c',
'_oncopy': 'generic/zgemm_ncopy_2.c',
'_otcopy': 'generic/zgemm_tcopy_2.c'
},
},
'?gemm3m': {
'c': {
'_kernel': 'x86_64/cgemm3m_kernel_8x4_haswell.c',
},
'z': {
'_kernel': 'x86_64/zgemm3m_kernel_4x4_haswell.c',
},
},
'?asum': {
's': {
'_k': 'x86_64/sasum.c',
},
'd': {
'_k': 'x86_64/dasum.c',
},
},
'?rot': {
's': {
'_k': 'x86_64/srot.c',
},
'd': {
'_k': 'x86_64/drot.c',
},
},
}

View File

@ -0,0 +1,213 @@
# Kernel source table for the x86_64 Sandy Bridge target.
#
# Layout: routine-name template -> precision key -> symbol suffix -> source
# file path.
# NOTE(review): presumably the '?' in each top-level key is replaced by the
# precision key ('s', 'd', 'c', 'z') and the paths are relative to the kernel/
# directory; the code that consumes this dict is not visible here -- confirm.
#
# Observations about the entries below:
#   * '?trmm_kernel' points at exactly the same source files as
#     '?gemm_kernel' for every precision.
#   * '?dotc' and '?dotu' share cdot.c / zdot.c, '?axpyc' reuses the '?axpy'
#     sources, and both '?dsdot' entries use sdot.c.
#   * Every '?trsm_kernel' entry uses the generic C implementation.
#   * The 'z' GEMM/TRMM kernels, the 'z' '_incopy'/'_itcopy' routines and both
#     '?gemm3m' kernels use Nehalem-named (or un-suffixed) sources rather than
#     Sandy Bridge specific ones.
#   * NOTE(review): the 'c' entry of '?gemm3m' references a z-prefixed file
#     (zgemm3m_kernel_4x8_nehalem.S); confirm this is intentional.
x86_64_sandybridge_dict = {
'?scal': {
'd': {
'_k': 'x86_64/dscal.c',
},
'c': {
'_k': 'x86_64/cscal.c',
},
},
'?ger': {
's': {
'_k': 'x86_64/sger.c',
},
'd': {
'_k': 'x86_64/dger.c',
},
},
'?gemv': {
's': {
'_n': 'x86_64/sgemv_n_4.c',
'_t': 'x86_64/sgemv_t_4.c',
},
'z': {
'_n': 'x86_64/zgemv_n_4.c',
},
},
'?symv': {
's': {
'_U': 'x86_64/ssymv_U.c',
'_L': 'x86_64/ssymv_L.c',
},
'd': {
'_U': 'x86_64/dsymv_U.c',
'_L': 'x86_64/dsymv_L.c',
},
},
'?dot': {
's': {
'_k': 'x86_64/sdot.c',
},
'd': {
'_k': 'x86_64/ddot.c',
},
},
'?dotc': {
'c': {
'_k': 'x86_64/cdot.c',
},
'z': {
'_k': 'x86_64/zdot.c',
},
},
'?dotu': {
'c': {
'_k': 'x86_64/cdot.c',
},
'z': {
'_k': 'x86_64/zdot.c',
},
},
'?dsdot': {
'': {
'_k': 'x86_64/sdot.c',
},
's': {
'_k': 'x86_64/sdot.c',
},
},
'?axpy': {
's': {
'_k': 'x86_64/saxpy.c',
},
'd': {
'_k': 'x86_64/daxpy.c',
},
'c': {
'_k': 'x86_64/caxpy.c',
},
'z': {
'_k': 'x86_64/zaxpy.c',
},
},
'?axpyc': {
'c': {
'_k': 'x86_64/caxpy.c',
},
'z': {
'_k': 'x86_64/zaxpy.c',
},
},
'?gemm_kernel': {
's': {
'': 'x86_64/sgemm_kernel_16x4_sandy.S',
},
'd': {
'': 'x86_64/dgemm_kernel_4x8_sandy.S',
},
'c': {
'_n': 'x86_64/cgemm_kernel_8x2_sandy.S',
'_l': 'x86_64/cgemm_kernel_8x2_sandy.S',
'_r': 'x86_64/cgemm_kernel_8x2_sandy.S',
'_b': 'x86_64/cgemm_kernel_8x2_sandy.S',
},
'z': {
'_n': 'x86_64/zgemm_kernel_1x4_nehalem.S',
'_l': 'x86_64/zgemm_kernel_1x4_nehalem.S',
'_r': 'x86_64/zgemm_kernel_1x4_nehalem.S',
'_b': 'x86_64/zgemm_kernel_1x4_nehalem.S',
},
},
'?trmm_kernel': {
's': {
'_LN': 'x86_64/sgemm_kernel_16x4_sandy.S',
'_LT': 'x86_64/sgemm_kernel_16x4_sandy.S',
'_RN': 'x86_64/sgemm_kernel_16x4_sandy.S',
'_RT': 'x86_64/sgemm_kernel_16x4_sandy.S',
},
'd': {
'_LN': 'x86_64/dgemm_kernel_4x8_sandy.S',
'_LT': 'x86_64/dgemm_kernel_4x8_sandy.S',
'_RN': 'x86_64/dgemm_kernel_4x8_sandy.S',
'_RT': 'x86_64/dgemm_kernel_4x8_sandy.S',
},
'c': {
'_LN': 'x86_64/cgemm_kernel_8x2_sandy.S',
'_LT': 'x86_64/cgemm_kernel_8x2_sandy.S',
'_LR': 'x86_64/cgemm_kernel_8x2_sandy.S',
'_LC': 'x86_64/cgemm_kernel_8x2_sandy.S',
'_RN': 'x86_64/cgemm_kernel_8x2_sandy.S',
'_RT': 'x86_64/cgemm_kernel_8x2_sandy.S',
'_RR': 'x86_64/cgemm_kernel_8x2_sandy.S',
'_RC': 'x86_64/cgemm_kernel_8x2_sandy.S',
},
'z': {
'_LN': 'x86_64/zgemm_kernel_1x4_nehalem.S',
'_LT': 'x86_64/zgemm_kernel_1x4_nehalem.S',
'_LR': 'x86_64/zgemm_kernel_1x4_nehalem.S',
'_LC': 'x86_64/zgemm_kernel_1x4_nehalem.S',
'_RN': 'x86_64/zgemm_kernel_1x4_nehalem.S',
'_RT': 'x86_64/zgemm_kernel_1x4_nehalem.S',
'_RR': 'x86_64/zgemm_kernel_1x4_nehalem.S',
'_RC': 'x86_64/zgemm_kernel_1x4_nehalem.S',
},
},
'?trsm_kernel': {
's': {
'_LN': 'generic/trsm_kernel_LN.c',
'_LT': 'generic/trsm_kernel_LT.c',
'_RN': 'generic/trsm_kernel_RN.c',
'_RT': 'generic/trsm_kernel_RT.c',
},
'd': {
'_LN': 'generic/trsm_kernel_LN.c',
'_LT': 'generic/trsm_kernel_LT.c',
'_RN': 'generic/trsm_kernel_RN.c',
'_RT': 'generic/trsm_kernel_RT.c',
},
'c': {
'_LN': 'generic/trsm_kernel_LN.c',
'_LT': 'generic/trsm_kernel_LT.c',
'_LR': 'generic/trsm_kernel_LN.c',
'_LC': 'generic/trsm_kernel_LT.c',
'_RN': 'generic/trsm_kernel_RN.c',
'_RT': 'generic/trsm_kernel_RT.c',
'_RR': 'generic/trsm_kernel_RN.c',
'_RC': 'generic/trsm_kernel_RT.c',
},
'z': {
'_LN': 'generic/trsm_kernel_LN.c',
'_LT': 'generic/trsm_kernel_LT.c',
'_LR': 'generic/trsm_kernel_LN.c',
'_LC': 'generic/trsm_kernel_LT.c',
'_RN': 'generic/trsm_kernel_RN.c',
'_RT': 'generic/trsm_kernel_RT.c',
'_RR': 'generic/trsm_kernel_RN.c',
'_RC': 'generic/trsm_kernel_RT.c',
},
},
'?gemm': {
's': {
'_incopy': 'generic/gemm_ncopy_16.c',
'_itcopy': 'generic/gemm_tcopy_16.c',
'_oncopy': 'generic/gemm_ncopy_4.c',
'_otcopy': 'generic/gemm_tcopy_4.c'
},
'd': {
'_incopy': 'generic/gemm_ncopy_8.c',
'_itcopy': 'generic/gemm_tcopy_8.c',
'_oncopy': 'generic/gemm_ncopy_4.c',
'_otcopy': 'generic/gemm_tcopy_4.c',
},
'c': {
'_incopy': 'generic/zgemm_ncopy_8.c',
'_itcopy': 'generic/zgemm_tcopy_8.c',
'_oncopy': 'generic/zgemm_ncopy_2.c',
'_otcopy': 'generic/zgemm_tcopy_2.c'
},
'z': {
'_incopy': 'x86_64/zgemm_ncopy_1.S',
'_itcopy': 'x86_64/zgemm_tcopy_1.S',
'_oncopy': 'generic/zgemm_ncopy_4.c',
'_otcopy': 'generic/zgemm_tcopy_4.c'
},
},
'?gemm3m': {
'c': {
'_kernel': 'x86_64/zgemm3m_kernel_4x8_nehalem.S',
},
'z': {
'_kernel': 'x86_64/zgemm3m_kernel_2x8_nehalem.S',
},
},
}

View File

@ -0,0 +1,109 @@
# Kernel source table for the x86_64 Skylake-X target.
#
# Layout: routine-name template -> precision key -> symbol suffix -> source
# file path.
# NOTE(review): presumably the '?' in each top-level key is replaced by the
# precision key ('s', 'd', 'c', 'z') and the paths are relative to the kernel/
# directory; the code that consumes this dict is not visible here -- confirm.
#
# Observations about the entries below:
#   * Only a subset of routines/precisions is listed: '?trmm_kernel',
#     '?trsm_kernel', the small-matrix kernels and '?gemm' have no 'c'/'z'
#     entries, and 'd' '?trsm_kernel' defines '_RN' only.
#     NOTE(review): this looks like an overlay on top of another table that
#     supplies the missing entries -- confirm against the consumer.
#   * 's' '?gemm_kernel' uses sgemm_kernel_16x4_skylakex_3.c while
#     's' '?trmm_kernel' uses sgemm_kernel_16x4_skylakex_2.c; the 'd' GEMM and
#     TRMM kernels share dgemm_kernel_16x2_skylakex.c.
#   * '?gemm_small_kernel' and '?gemm_small_kernel_b0' map to the same source
#     files.
#   * '?asum' and '?sum' are defined for the complex precisions only.
x86_64_skylakex_dict = {
'?gemm_kernel': {
's': {
'': 'x86_64/sgemm_kernel_16x4_skylakex_3.c',
},
'd': {
'': 'x86_64/dgemm_kernel_16x2_skylakex.c',
},
'c': {
'_n': 'x86_64/cgemm_kernel_8x2_skylakex.c',
'_l': 'x86_64/cgemm_kernel_8x2_skylakex.c',
'_r': 'x86_64/cgemm_kernel_8x2_skylakex.c',
'_b': 'x86_64/cgemm_kernel_8x2_skylakex.c',
},
'z': {
'_n': 'x86_64/zgemm_kernel_4x2_skylakex.c',
'_l': 'x86_64/zgemm_kernel_4x2_skylakex.c',
'_r': 'x86_64/zgemm_kernel_4x2_skylakex.c',
'_b': 'x86_64/zgemm_kernel_4x2_skylakex.c',
},
},
'?trmm_kernel': {
's': {
'_LN': 'x86_64/sgemm_kernel_16x4_skylakex_2.c',
'_LT': 'x86_64/sgemm_kernel_16x4_skylakex_2.c',
'_RN': 'x86_64/sgemm_kernel_16x4_skylakex_2.c',
'_RT': 'x86_64/sgemm_kernel_16x4_skylakex_2.c',
},
'd': {
'_LN': 'x86_64/dgemm_kernel_16x2_skylakex.c',
'_LT': 'x86_64/dgemm_kernel_16x2_skylakex.c',
'_RN': 'x86_64/dgemm_kernel_16x2_skylakex.c',
'_RT': 'x86_64/dgemm_kernel_16x2_skylakex.c',
},
},
'?trsm_kernel': {
's': {
'_LN': 'generic/trsm_kernel_LN.c',
'_LT': 'generic/trsm_kernel_LT.c',
'_RN': 'generic/trsm_kernel_RN.c',
'_RT': 'generic/trsm_kernel_RT.c',
},
'd': {
'_RN': 'generic/trsm_kernel_RN.c',
},
},
'?gemm_small_kernel': {
's': {
'_nn': 'x86_64/sgemm_small_kernel_nn_skylakex.c',
'_nt': 'x86_64/sgemm_small_kernel_nt_skylakex.c',
'_tn': 'x86_64/sgemm_small_kernel_tn_skylakex.c',
'_tt': 'x86_64/sgemm_small_kernel_tt_skylakex.c',
},
'd': {
'_nn': 'x86_64/dgemm_small_kernel_nn_skylakex.c',
'_nt': 'x86_64/dgemm_small_kernel_nt_skylakex.c',
'_tn': 'x86_64/dgemm_small_kernel_tn_skylakex.c',
'_tt': 'x86_64/dgemm_small_kernel_tt_skylakex.c',
},
},
'?gemm_small_kernel_b0': {
's': {
'_nn': 'x86_64/sgemm_small_kernel_nn_skylakex.c',
'_nt': 'x86_64/sgemm_small_kernel_nt_skylakex.c',
'_tn': 'x86_64/sgemm_small_kernel_tn_skylakex.c',
'_tt': 'x86_64/sgemm_small_kernel_tt_skylakex.c',
},
'd': {
'_nn': 'x86_64/dgemm_small_kernel_nn_skylakex.c',
'_nt': 'x86_64/dgemm_small_kernel_nt_skylakex.c',
'_tn': 'x86_64/dgemm_small_kernel_tn_skylakex.c',
'_tt': 'x86_64/dgemm_small_kernel_tt_skylakex.c',
},
},
'?gemm': {
's': {
'_small_matrix_permit': 'x86_64/sgemm_small_kernel_permit_skylakex.c',
'_beta': 'x86_64/sgemm_beta_skylakex.c',
'_incopy': 'generic/gemm_ncopy_16.c',
'_itcopy': 'x86_64/sgemm_tcopy_16_skylakex.c',
'_oncopy': 'x86_64/sgemm_ncopy_4_skylakex.c',
'_otcopy': 'generic/gemm_tcopy_4.c'
},
'd': {
'_small_matrix_permit': 'x86_64/dgemm_small_kernel_permit_skylakex.c',
'_beta': 'x86_64/dgemm_beta_skylakex.c',
'_incopy': 'generic/gemm_ncopy_16.c',
'_itcopy': 'x86_64/dgemm_tcopy_16_skylakex.c',
'_oncopy': 'generic/gemm_ncopy_2.c',
'_otcopy': 'generic/gemm_tcopy_2.c',
},
},
'?asum': {
'c': {
'_k': 'x86_64/casum.c',
},
'z': {
'_k': 'x86_64/zasum.c',
},
},
'?sum': {
'c': {
'_k': 'x86_64/csum.c',
},
'z': {
'_k': 'x86_64/zsum.c',
},
},
}

View File

@ -0,0 +1,228 @@
# Kernel source table for the x86_64 Zen target.
#
# Layout: routine-name template -> precision key -> symbol suffix -> source
# file path.
# NOTE(review): presumably the '?' in each top-level key is replaced by the
# precision key ('s', 'd', 'c', 'z') and the paths are relative to the kernel/
# directory; the code that consumes this dict is not visible here -- confirm.
#
# Observations about the entries below:
#   * The architecture-specific GEMM/TRMM/TRSM/GEMM3M entries all reference
#     Haswell-named sources; no file in this table carries a Zen suffix.
#   * GEMM and TRMM kernels differ per precision: 's' uses
#     sgemm_kernel_8x4_haswell_2.c vs sgemm_kernel_8x4_haswell.c, 'd' uses
#     dgemm_kernel_4x8_haswell.S vs dtrmm_kernel_4x8_haswell.c, and 'c'/'z'
#     use the .c file for GEMM but the .S file of the same base name for TRMM.
#   * '?dotc' and '?dotu' share cdot.c / zdot.c, '?axpyc' reuses the '?axpy'
#     sources, and both '?dsdot' entries use sdot.c.
#   * '?trsm_kernel' is generic C except for the four 's' variants and the
#     'd' '_RN' variant.
x86_64_zen_dict = {
'?scal': {
's': {
'_k': 'x86_64/sscal.c',
},
'd': {
'_k': 'x86_64/dscal.c',
},
'c': {
'_k': 'x86_64/cscal.c',
},
'z': {
'_k': 'x86_64/zscal.c',
},
},
'?gemv': {
's': {
'_n': 'x86_64/sgemv_n_4.c',
'_t': 'x86_64/sgemv_t_4.c',
},
'd': {
'_n': 'x86_64/dgemv_n_4.c',
'_t': 'x86_64/dgemv_t_4.c',
},
'c': {
'_n': 'x86_64/cgemv_n_4.c',
'_t': 'x86_64/cgemv_t_4.c',
},
'z': {
'_n': 'x86_64/zgemv_n_4.c',
'_t': 'x86_64/zgemv_t_4.c',
},
},
'?symv': {
's': {
'_U': 'x86_64/ssymv_U.c',
'_L': 'x86_64/ssymv_L.c',
},
'd': {
'_U': 'x86_64/dsymv_U.c',
'_L': 'x86_64/dsymv_L.c',
},
},
'?dot': {
's': {
'_k': 'x86_64/sdot.c',
},
'd': {
'_k': 'x86_64/ddot.c',
},
},
'?dotc': {
'c': {
'_k': 'x86_64/cdot.c',
},
'z': {
'_k': 'x86_64/zdot.c',
},
},
'?dotu': {
'c': {
'_k': 'x86_64/cdot.c',
},
'z': {
'_k': 'x86_64/zdot.c',
},
},
'?dsdot': {
'': {
'_k': 'x86_64/sdot.c',
},
's': {
'_k': 'x86_64/sdot.c',
},
},
'?axpy': {
's': {
'_k': 'x86_64/saxpy.c',
},
'd': {
'_k': 'x86_64/daxpy.c',
},
'c': {
'_k': 'x86_64/caxpy.c',
},
'z': {
'_k': 'x86_64/zaxpy.c',
},
},
'?axpyc': {
'c': {
'_k': 'x86_64/caxpy.c',
},
'z': {
'_k': 'x86_64/zaxpy.c',
},
},
'?gemm_kernel': {
's': {
'': 'x86_64/sgemm_kernel_8x4_haswell_2.c',
},
'd': {
'': 'x86_64/dgemm_kernel_4x8_haswell.S',
},
'c': {
'_n': 'x86_64/cgemm_kernel_8x2_haswell.c',
'_l': 'x86_64/cgemm_kernel_8x2_haswell.c',
'_r': 'x86_64/cgemm_kernel_8x2_haswell.c',
'_b': 'x86_64/cgemm_kernel_8x2_haswell.c',
},
'z': {
'_n': 'x86_64/zgemm_kernel_4x2_haswell.c',
'_l': 'x86_64/zgemm_kernel_4x2_haswell.c',
'_r': 'x86_64/zgemm_kernel_4x2_haswell.c',
'_b': 'x86_64/zgemm_kernel_4x2_haswell.c',
},
},
'?trmm_kernel': {
's': {
'_LN': 'x86_64/sgemm_kernel_8x4_haswell.c',
'_LT': 'x86_64/sgemm_kernel_8x4_haswell.c',
'_RN': 'x86_64/sgemm_kernel_8x4_haswell.c',
'_RT': 'x86_64/sgemm_kernel_8x4_haswell.c',
},
'd': {
'_LN': 'x86_64/dtrmm_kernel_4x8_haswell.c',
'_LT': 'x86_64/dtrmm_kernel_4x8_haswell.c',
'_RN': 'x86_64/dtrmm_kernel_4x8_haswell.c',
'_RT': 'x86_64/dtrmm_kernel_4x8_haswell.c',
},
'c': {
'_LN': 'x86_64/cgemm_kernel_8x2_haswell.S',
'_LT': 'x86_64/cgemm_kernel_8x2_haswell.S',
'_LR': 'x86_64/cgemm_kernel_8x2_haswell.S',
'_LC': 'x86_64/cgemm_kernel_8x2_haswell.S',
'_RN': 'x86_64/cgemm_kernel_8x2_haswell.S',
'_RT': 'x86_64/cgemm_kernel_8x2_haswell.S',
'_RR': 'x86_64/cgemm_kernel_8x2_haswell.S',
'_RC': 'x86_64/cgemm_kernel_8x2_haswell.S',
},
'z': {
'_LN': 'x86_64/zgemm_kernel_4x2_haswell.S',
'_LT': 'x86_64/zgemm_kernel_4x2_haswell.S',
'_LR': 'x86_64/zgemm_kernel_4x2_haswell.S',
'_LC': 'x86_64/zgemm_kernel_4x2_haswell.S',
'_RN': 'x86_64/zgemm_kernel_4x2_haswell.S',
'_RT': 'x86_64/zgemm_kernel_4x2_haswell.S',
'_RR': 'x86_64/zgemm_kernel_4x2_haswell.S',
'_RC': 'x86_64/zgemm_kernel_4x2_haswell.S',
},
},
'?trsm_kernel': {
's': {
'_LN': 'x86_64/strsm_kernel_8x4_haswell_LN.c',
'_LT': 'x86_64/strsm_kernel_8x4_haswell_LT.c',
'_RN': 'x86_64/strsm_kernel_8x4_haswell_RN.c',
'_RT': 'x86_64/strsm_kernel_8x4_haswell_RT.c',
},
'd': {
'_LN': 'generic/trsm_kernel_LN.c',
'_LT': 'generic/trsm_kernel_LT.c',
'_RN': 'x86_64/dtrsm_kernel_RN_haswell.c',
'_RT': 'generic/trsm_kernel_RT.c',
},
'c': {
'_LN': 'generic/trsm_kernel_LN.c',
'_LT': 'generic/trsm_kernel_LT.c',
'_LR': 'generic/trsm_kernel_LN.c',
'_LC': 'generic/trsm_kernel_LT.c',
'_RN': 'generic/trsm_kernel_RN.c',
'_RT': 'generic/trsm_kernel_RT.c',
'_RR': 'generic/trsm_kernel_RN.c',
'_RC': 'generic/trsm_kernel_RT.c',
},
'z': {
'_LN': 'generic/trsm_kernel_LN.c',
'_LT': 'generic/trsm_kernel_LT.c',
'_LR': 'generic/trsm_kernel_LN.c',
'_LC': 'generic/trsm_kernel_LT.c',
'_RN': 'generic/trsm_kernel_RN.c',
'_RT': 'generic/trsm_kernel_RT.c',
'_RR': 'generic/trsm_kernel_RN.c',
'_RC': 'generic/trsm_kernel_RT.c',
},
},
'?gemm': {
's': {
'_incopy': 'generic/gemm_ncopy_8.c',
'_itcopy': 'generic/gemm_tcopy_8.c',
'_oncopy': 'generic/gemm_ncopy_4.c',
'_otcopy': 'generic/gemm_tcopy_4.c'
},
'd': {
'_incopy': 'generic/gemm_ncopy_4.c',
'_itcopy': 'generic/gemm_tcopy_4.c',
'_oncopy': 'generic/gemm_ncopy_8.c',
'_otcopy': 'generic/gemm_tcopy_8.c',
},
'c': {
'_incopy': 'generic/zgemm_ncopy_8.c',
'_itcopy': 'generic/zgemm_tcopy_8.c',
'_oncopy': 'generic/zgemm_ncopy_2.c',
'_otcopy': 'generic/zgemm_tcopy_2.c'
},
'z': {
'_incopy': 'generic/zgemm_ncopy_4.c',
'_itcopy': 'generic/zgemm_tcopy_4.c',
'_oncopy': 'generic/zgemm_ncopy_2.c',
'_otcopy': 'generic/zgemm_tcopy_2.c'
},
},
'?gemm3m': {
'c': {
'_kernel': 'x86_64/cgemm3m_kernel_8x4_haswell.c',
},
'z': {
'_kernel': 'x86_64/zgemm3m_kernel_4x4_haswell.c',
},
},
'?rot': {
's': {
'_k': 'x86_64/srot.c',
},
'd': {
'_k': 'x86_64/drot.c',
},
},
}

View File

@ -463,16 +463,23 @@ symb_defs = {
# config.h file generation
_config_h = meson.current_build_dir() / 'config.h'
run_command('./c_check', 'Makefile.conf', _config_h, cc_id, check: true)
run_command('./f_check', 'Makefile.conf', _config_h, fc_id, check: true)
_makefile_conf = meson.current_build_dir() / 'Makefile.conf'
run_command('./c_check', _makefile_conf, _config_h, cc_id, check: true)
run_command('./f_check', _makefile_conf, _config_h, fc_id, check: true)
run_command(cc_id, '-o', 'getarch', 'getarch.c', 'cpuid.S', check: true)
_getarch_result = run_command('./getarch', '1', check: true, capture: true)
_getarch_1_result = run_command('./getarch', '1', check: true, capture: true)
run_command(py3,
'./write_to_file.py',
_getarch_result.stdout(),
_getarch_1_result.stdout(),
_config_h,
check: true)
_getarch_0_result = run_command('./getarch', '0', check: true, capture: true)
run_command(py3,
'./write_to_file.py',
_getarch_0_result.stdout(),
_makefile_conf,
check: true)
run_command(cc_id,
'-DGEMM_MULTITHREAD_THRESHOLD=4',
@ -481,12 +488,44 @@ run_command(cc_id,
'-o', 'getarch_2nd',
'getarch_2nd.c',
capture: true, check: true)
_getarch_2nd_result = run_command('./getarch_2nd', '1', check: true, capture: true)
_getarch_2nd_1_result = run_command('./getarch_2nd', '1', check: true, capture: true)
run_command(py3,
'./write_to_file.py',
_getarch_2nd_result.stdout(),
_getarch_2nd_1_result.stdout(),
_config_h,
check: true)
_getarch_2nd_0_result = run_command('./getarch_2nd', '0', check: true, capture: true)
run_command(py3,
'./write_to_file.py',
_getarch_2nd_0_result.stdout(),
_makefile_conf,
check: true)
# Expose the generated configuration to the rest of the Meson build.
# read_config.py is run on the generated config.h and turns its '#define'
# entries into KEY=VALUE lines in config.kconf, a format the keyval module
# can load.
_read_config_py = './read_config.py'
run_command(py3,
_read_config_py,
_config_h,
check: true)
keyval = import('keyval')
conf_kv = keyval.load(meson.current_build_dir() / 'config.kconf')
# NOTE(rg): conf_kv doesn't do any parsing, setup manually
# Copy every entry into a configuration_data() object so other build files can
# query it (e.g. with conf_hdat.has(...)). Keys whose name contains 'CHAR' are
# stored quoted; all other values are stored as loaded.
conf_hdat = configuration_data()
foreach key,val : conf_kv
if 'CHAR' in key
conf_hdat.set_quoted(key, val)
else
conf_hdat.set(key, val)
endif
endforeach
# Makefile.conf from the build directory is loaded the same way; its values
# are copied as-is, without any quoting.
makefile_conf_kv = keyval.load(meson.current_build_dir() / 'Makefile.conf')
makefile_conf_dat = configuration_data()
foreach key,val : makefile_conf_kv
makefile_conf_dat.set(key, val)
endforeach
# Ignoring other hostarch checks and conflicts for arch in BSD for now
_inc = [include_directories('.')]

39
read_config.py Normal file
View File

@ -0,0 +1,39 @@
#!/usr/bin/env python3
import argparse
def read_config_file(file_path: str) -> dict:
    """Parse the ``#define`` entries of a C header into a dictionary.

    Only lines whose first whitespace-separated token is exactly ``#define``
    are considered:

    * ``#define NAME`` maps ``NAME`` to ``1``.
    * ``#define NAME VALUE`` maps ``NAME`` to ``VALUE``. An all-digit value is
      converted to ``int``; a value wrapped in double quotes has the quotes
      removed; anything else is kept as the raw string.
    * A bare ``#define`` and definitions with more than one token after the
      name are skipped.

    When a name is defined more than once, the last definition wins.

    Args:
        file_path: Path of the header file to read.

    Returns:
        Mapping of macro name to its parsed value, in file order.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    config_data = {}
    with open(file_path, "r") as file:
        # Iterate lazily instead of materialising every line up front.
        for line in file:
            parts = line.split()
            # Compare the whole first token: a prefix test would also accept
            # e.g. "#defined" and then record the wrong key.
            if not parts or parts[0] != "#define":
                continue
            if len(parts) == 2:
                # Flag-style macro without a value.
                config_data[parts[1]] = 1
            elif len(parts) == 3:
                key, value = parts[1], parts[2]
                if value.isdigit():
                    value = int(value)
                elif value.startswith('"') and value.endswith('"'):
                    value = value.strip('"')
                config_data[key] = value
            # A bare "#define" (no name) or a multi-token value is ignored.
    return config_data
def main(argv=None):
    """Convert a ``config.h`` file into a ``config.kconf`` key/value file.

    The header is parsed with ``read_config_file`` and every entry is written
    as one ``KEY=VALUE`` line.

    Args:
        argv: Command-line arguments without the program name; ``None`` means
            ``sys.argv[1:]``.

    The output file defaults to ``config.kconf`` in the same directory as the
    input header, so the result lands next to ``config.h`` whatever the build
    directory is called. It is rewritten from scratch on every run so that a
    repeated invocation cannot leave stale or concatenated entries behind.
    """
    # Local import keeps this block self-contained.
    from pathlib import Path

    parser = argparse.ArgumentParser(description="Read a config.h file.")
    parser.add_argument("file1", help="Path to the config.h file.")
    parser.add_argument(
        "-o",
        "--output",
        default=None,
        help="Path of the key=value file to write "
             "(default: config.kconf next to the config.h file).",
    )
    args = parser.parse_args(argv)

    config_data = read_config_file(args.file1)

    if args.output is not None:
        output_path = Path(args.output)
    else:
        output_path = Path(args.file1).with_name("config.kconf")

    # One entry per line, each newline-terminated.
    lines = [f"{key}={value}\n" for key, value in config_data.items()]
    with open(output_path, "w") as kconf:
        kconf.writelines(lines)


if __name__ == "__main__":
    main()

View File

@ -8,13 +8,17 @@ _test_input_array = [
{'id': 'cblat1', 'has_dat': false},
{'id': 'cblat2', 'has_dat': true},
{'id': 'cblat3', 'has_dat': true},
{'id': 'cblat3_3m', 'has_dat': true},
{'id': 'zblat1', 'has_dat': false},
{'id': 'zblat2', 'has_dat': true},
{'id': 'zblat3', 'has_dat': true},
{'id': 'zblat3_3m', 'has_dat': true},
]
if conf_hdat.has('ARCH_X86_64') or conf_hdat.has('ARCH_X86')
_test_input_array += [
{'id': 'cblat3_3m', 'has_dat': true}, {'id': 'zblat3_3m', 'has_dat': true},
]
endif
_test_runner = executable('test_runner', sources: ['test_runner.c'], install: false)
foreach _test : _test_input_array