From af0e7f1c8a76937cf097e388f9b84cf372d4211b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Sok=C3=B3=C5=82?= Date: Tue, 6 Aug 2024 18:55:54 +0200 Subject: [PATCH] BLD: Support x86_64 and arm64 architectures --- .github/workflows/meson_linux_darwin.yml | 2 +- ctest/meson.build | 24 +- kernel/arm64/meson.build | 59 + kernel/arm64/meson_armv8/meson.build | 374 +++++ kernel/meson.build | 1631 ++++++++----------- kernel/meson_base/meson.build | 513 ++++++ kernel/x86_64/meson.build | 385 +++++ kernel/x86_64/meson_haswell/meson.build | 238 +++ kernel/x86_64/meson_sandybridge/meson.build | 213 +++ kernel/x86_64/meson_skylakex/meson.build | 109 ++ kernel/x86_64/meson_zen/meson.build | 228 +++ meson.build | 51 +- read_config.py | 39 + test/meson.build | 8 +- 14 files changed, 2863 insertions(+), 1011 deletions(-) create mode 100644 kernel/arm64/meson.build create mode 100644 kernel/arm64/meson_armv8/meson.build create mode 100644 kernel/meson_base/meson.build create mode 100644 kernel/x86_64/meson_haswell/meson.build create mode 100644 kernel/x86_64/meson_sandybridge/meson.build create mode 100644 kernel/x86_64/meson_skylakex/meson.build create mode 100644 kernel/x86_64/meson_zen/meson.build create mode 100644 read_config.py diff --git a/.github/workflows/meson_linux_darwin.yml b/.github/workflows/meson_linux_darwin.yml index ab63be52e..3f7b56b9c 100644 --- a/.github/workflows/meson_linux_darwin.yml +++ b/.github/workflows/meson_linux_darwin.yml @@ -25,7 +25,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-latest, macos-13] + os: [ubuntu-latest, macos-13, macos-latest] build: [meson] fortran: [gfortran] openmp: [0] diff --git a/ctest/meson.build b/ctest/meson.build index e7defe8c3..a6a5049b8 100644 --- a/ctest/meson.build +++ b/ctest/meson.build @@ -24,19 +24,27 @@ _test_input_array = { 'sources': testl3_src, 'input_file': '?in3', }, - 'l3_3m': { - 'base': 'x?cblat3_3m', - 'has_dat': true, - 'types': ['c', 'z'], - 'sources': testl3_3m_src, - 'input_file': 
'?in3_3m', - }, } +lvls = ['l1', 'l2', 'l3'] + +if conf_hdat.has('ARCH_X86_64') or conf_hdat.has('ARCH_X86') + _test_input_array += { + 'l3_3m': { + 'base': 'x?cblat3_3m', + 'has_dat': true, + 'types': ['c', 'z'], + 'sources': testl3_3m_src, + 'input_file': '?in3_3m', + } + } + lvls += 'l3_3m' +endif + _test_runner = executable('test_runner', sources: ['test_runner.c'], install: false) ctest_inc = _inc + [include_directories('.')] -foreach lvl : ['l1', 'l2', 'l3', 'l3_3m'] +foreach lvl : lvls details = _test_input_array[lvl] foreach type : details['types'] diff --git a/kernel/arm64/meson.build b/kernel/arm64/meson.build new file mode 100644 index 000000000..80208c6a6 --- /dev/null +++ b/kernel/arm64/meson.build @@ -0,0 +1,59 @@ +arm64_base_dict = { + '?sum': { + 's': { + '_k': 'arm64/sum.S', + }, + 'd': { + '_k': 'arm64/sum.S', + }, + 'c': { + '_k': 'arm64/csum.S' + }, + 'z': { + '_k': 'arm64/zsum.S', + }, + }, + '?nrm2': { + 's': { + '_k': 'arm/nrm2.c', + }, + 'd': { + '_k': 'arm/nrm2.c', + }, + 'c': { + '_k': 'arm/znrm2.c', + }, + 'z': { + '_k': 'arm/znrm2.c', + }, + }, + '?cabs': { + 's': { + '1': 'generic/cabs.c', + }, + 'd': { + '1': 'generic/cabs.c', + }, + }, + '?lsame': { + '': { + '': 'generic/lsame.c', + } + }, + '?gemm': { + 's': { + '_beta': 'generic/gemm_beta.c', + }, + 'd': { + '_beta': 'generic/gemm_beta.c', + }, + 'c': { + '_beta': 'generic/zgemm_beta.c', + }, + 'z': { + '_beta': 'generic/zgemm_beta.c', + }, + }, +} + +subdir('meson_armv8') diff --git a/kernel/arm64/meson_armv8/meson.build b/kernel/arm64/meson_armv8/meson.build new file mode 100644 index 000000000..8760be9f4 --- /dev/null +++ b/kernel/arm64/meson_armv8/meson.build @@ -0,0 +1,374 @@ +arm64_armv8_dict = { + '?amin': { + 's': { + '_k': 'arm/amin.c', + }, + 'd': { + '_k': 'arm/amin.c', + }, + 'c': { + '_k': 'arm/zamin.c', + }, + 'z': { + '_k': 'arm/zamin.c', + }, + }, + '?max': { + 's': { + '_k': 'arm/max.c', + }, + 'd': { + '_k': 'arm/max.c', + }, + }, + '?min': { + 's': { + '_k': 
'arm/min.c', + }, + 'd': { + '_k': 'arm/min.c', + }, + }, + 'i?amin': { + 's': { + '_k': 'arm/iamin.c', + }, + 'd': { + '_k': 'arm/iamin.c', + }, + 'c': { + '_k': 'arm/izamin.c', + }, + 'z': { + '_k': 'arm/izamin.c', + }, + }, + 'i?max': { + 's': { + '_k': 'arm/imax.c', + }, + 'd': { + '_k': 'arm/imax.c', + }, + }, + 'i?min': { + 's': { + '_k': 'arm/imin.c', + }, + 'd': { + '_k': 'arm/imin.c', + }, + }, + '?trsm_kernel': { + 's': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + }, + 'd': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + }, + 'c': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_LR': 'generic/trsm_kernel_LN.c', + '_LC': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + '_RR': 'generic/trsm_kernel_RN.c', + '_RC': 'generic/trsm_kernel_RT.c', + }, + 'z': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_LR': 'generic/trsm_kernel_LN.c', + '_LC': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + '_RR': 'generic/trsm_kernel_RN.c', + '_RC': 'generic/trsm_kernel_RT.c', + }, + }, + '?amax': { + 's': { + '_k': 'arm64/amax.S', + }, + 'd': { + '_k': 'arm64/amax.S', + }, + 'c': { + '_k': 'arm64/zamax.S', + }, + 'z': { + '_k': 'arm64/zamax.S', + }, + }, + '?axpy': { + 's': { + '_k': 'arm64/axpy.S', + }, + 'd': { + '_k': 'arm64/axpy.S', + }, + 'c': { + '_k': 'arm64/zaxpy.S', + }, + 'z': { + '_k': 'arm64/zaxpy.S', + }, + }, + '?axpyc': { + 'c': { + '_k': 'arm64/zaxpy.S', + }, + 'z': { + '_k': 'arm64/zaxpy.S', + }, + }, + '?rot': { + 's': { + '_k': 'arm64/rot.S', + }, + 'd': { + '_k': 'arm64/rot.S', + }, + 'cs': { + '_k': 'arm64/zrot.S', + }, + 'zd': { + '_k': 'arm64/zrot.S', + } + }, + '?scal': { + 
's': { + '_k': 'arm64/scal.S', + }, + 'd': { + '_k': 'arm64/scal.S', + }, + 'c': { + '_k': 'arm64/zscal.S', + }, + 'z': { + '_k': 'arm64/zscal.S', + }, + }, + '?gemv': { + 's': { + '_n': 'arm64/gemv_n.S', + '_t': 'arm64/gemv_t.S', + }, + 'd': { + '_n': 'arm64/gemv_n.S', + '_t': 'arm64/gemv_t.S', + }, + 'c': { + '_n': 'arm64/zgemv_n.S', + '_t': 'arm64/zgemv_t.S', + '_r': 'arm64/zgemv_n.S', + '_c': 'arm64/zgemv_t.S', + '_o': 'arm64/zgemv_n.S', + '_u': 'arm64/zgemv_t.S', + '_s': 'arm64/zgemv_n.S', + '_d': 'arm64/zgemv_t.S', + }, + 'z': { + '_n': 'arm64/zgemv_n.S', + '_t': 'arm64/zgemv_t.S', + '_r': 'arm64/zgemv_n.S', + '_c': 'arm64/zgemv_t.S', + '_o': 'arm64/zgemv_n.S', + '_u': 'arm64/zgemv_t.S', + '_s': 'arm64/zgemv_n.S', + '_d': 'arm64/zgemv_t.S', + }, + }, + '?asum': { + 's': { + '_k': 'arm64/asum.S', + }, + 'd': { + '_k': 'arm64/asum.S', + }, + 'c': { + '_k': 'arm64/casum.S', + }, + 'z': { + '_k': 'arm64/zasum.S', + }, + }, + '?copy': { + 's': { + '_k': 'arm64/copy.S', + }, + 'd': { + '_k': 'arm64/copy.S', + }, + 'c': { + '_k': 'arm64/copy.S', + }, + 'z': { + '_k': 'arm64/copy.S', + }, + }, + '?swap': { + 's': { + '_k': 'arm64/swap.S', + }, + 'd': { + '_k': 'arm64/swap.S', + }, + 'c': { + '_k': 'arm64/swap.S', + }, + 'z': { + '_k': 'arm64/swap.S', + }, + }, + 'i?amax': { + 's': { + '_k': 'arm64/iamax.S', + }, + 'd': { + '_k': 'arm64/iamax.S', + }, + 'c': { + '_k': 'arm64/izamax.S', + }, + 'z': { + '_k': 'arm64/izamax.S', + }, + }, + '?nrm2': { + 's': { + '_k': 'arm64/nrm2.S', + }, + 'd': { + '_k': 'arm64/nrm2.S', + }, + 'c': { + '_k': 'arm64/znrm2.S', + }, + 'z': { + '_k': 'arm64/znrm2.S', + }, + }, + '?dot': { + 's': { + '_k': 'generic/dot.c', + }, + 'd': { + '_k': 'arm64/dot.S', + }, + }, + '?dotc': { + 'c': { + '_k': 'arm64/zdot.S', + }, + 'z': { + '_k': 'arm64/zdot.S', + }, + }, + '?dotu': { + 'c': { + '_k': 'arm64/zdot.S', + }, + 'z': { + '_k': 'arm64/zdot.S', + }, + }, + '?dsdot': { + 's': { + '_k': 'arm64/dot.S', + }, + '': { + '_k': 'arm64/dot.S', + }, + 
}, + '?gemm': { + 's': { + '_beta': 'arm64/sgemm_beta.S', + '_incopy': 'generic/gemm_ncopy_16.c', + '_itcopy': 'arm64/sgemm_tcopy_16.S', + '_oncopy': 'arm64/sgemm_ncopy_4.S', + '_otcopy': 'generic/gemm_tcopy_4.c', + }, + 'd': { + '_beta': 'arm64/dgemm_beta.S', + '_incopy': 'arm64/dgemm_ncopy_8.S', + '_itcopy': 'arm64/dgemm_tcopy_8.S', + '_oncopy': 'arm64/dgemm_ncopy_4.S', + '_otcopy': 'arm64/dgemm_tcopy_4.S', + }, + 'c': { + '_incopy': 'generic/zgemm_ncopy_8.c', + '_itcopy': 'generic/zgemm_tcopy_8.c', + '_oncopy': 'generic/zgemm_ncopy_4.c', + '_otcopy': 'generic/zgemm_tcopy_4.c' + }, + 'z': { + '_incopy': 'generic/zgemm_ncopy_4.c', + '_itcopy': 'generic/zgemm_tcopy_4.c', + '_oncopy': 'generic/zgemm_ncopy_4.c', + '_otcopy': 'generic/zgemm_tcopy_4.c' + }, + }, + '?trmm_kernel': { + 's': { + '_LN': 'arm64/strmm_kernel_16x4.S', + '_LT': 'arm64/strmm_kernel_16x4.S', + '_RN': 'arm64/strmm_kernel_16x4.S', + '_RT': 'arm64/strmm_kernel_16x4.S', + }, + 'd': { + '_LN': 'arm64/dtrmm_kernel_8x4.S', + '_LT': 'arm64/dtrmm_kernel_8x4.S', + '_RN': 'arm64/dtrmm_kernel_8x4.S', + '_RT': 'arm64/dtrmm_kernel_8x4.S', + }, + 'c': { + '_LN': 'arm64/ctrmm_kernel_8x4.S', + '_LT': 'arm64/ctrmm_kernel_8x4.S', + '_LR': 'arm64/ctrmm_kernel_8x4.S', + '_LC': 'arm64/ctrmm_kernel_8x4.S', + '_RN': 'arm64/ctrmm_kernel_8x4.S', + '_RT': 'arm64/ctrmm_kernel_8x4.S', + '_RR': 'arm64/ctrmm_kernel_8x4.S', + '_RC': 'arm64/ctrmm_kernel_8x4.S', + }, + 'z': { + '_LN': 'arm64/ztrmm_kernel_4x4.S', + '_LT': 'arm64/ztrmm_kernel_4x4.S', + '_LR': 'arm64/ztrmm_kernel_4x4.S', + '_LC': 'arm64/ztrmm_kernel_4x4.S', + '_RN': 'arm64/ztrmm_kernel_4x4.S', + '_RT': 'arm64/ztrmm_kernel_4x4.S', + '_RR': 'arm64/ztrmm_kernel_4x4.S', + '_RC': 'arm64/ztrmm_kernel_4x4.S', + }, + }, + '?gemm_kernel': { + 's': { + '': 'arm64/sgemm_kernel_16x4.S', + }, + 'd': { + '': 'arm64/dgemm_kernel_8x4.S', + }, + 'c': { + '_n': 'arm64/cgemm_kernel_8x4.S', + '_l': 'arm64/cgemm_kernel_8x4.S', + '_r': 'arm64/cgemm_kernel_8x4.S', + '_b': 
'arm64/cgemm_kernel_8x4.S', + }, + 'z': { + '_n': 'arm64/zgemm_kernel_4x4.S', + '_l': 'arm64/zgemm_kernel_4x4.S', + '_r': 'arm64/zgemm_kernel_4x4.S', + '_b': 'arm64/zgemm_kernel_4x4.S', + }, + }, +} diff --git a/kernel/meson.build b/kernel/meson.build index 1b8abe8dd..b26ba9d1f 100644 --- a/kernel/meson.build +++ b/kernel/meson.build @@ -1,9 +1,28 @@ # Ordered As per https://netlib.org/blas/blasqr.pdf # NOTE: xROTG xROTMG xROTM have no kernels? # TODO: Actually test and set this -if true - fma3_flag = '-mfma' +fma3_flag = [] +if conf_hdat.has('HAVE_FMA3') + fma3_flag += '-mfma' endif + +SGEMM_UNROLL_N = makefile_conf_dat.get('SGEMM_UNROLL_N') +SGEMM_UNROLL_M = makefile_conf_dat.get('SGEMM_UNROLL_M') +DGEMM_UNROLL_N = makefile_conf_dat.get('DGEMM_UNROLL_N') +DGEMM_UNROLL_M = makefile_conf_dat.get('DGEMM_UNROLL_M') +CGEMM_UNROLL_N = makefile_conf_dat.get('CGEMM_UNROLL_N') +CGEMM_UNROLL_M = makefile_conf_dat.get('CGEMM_UNROLL_M') +ZGEMM_UNROLL_N = makefile_conf_dat.get('ZGEMM_UNROLL_N') +ZGEMM_UNROLL_M = makefile_conf_dat.get('ZGEMM_UNROLL_M') +CGEMM3M_UNROLL_N = makefile_conf_dat.get('CGEMM3M_UNROLL_N') +CGEMM3M_UNROLL_M = makefile_conf_dat.get('CGEMM3M_UNROLL_M') +ZGEMM3M_UNROLL_N = makefile_conf_dat.get('ZGEMM3M_UNROLL_N') +ZGEMM3M_UNROLL_M = makefile_conf_dat.get('ZGEMM3M_UNROLL_M') + +subdir('meson_base') +subdir('x86_64') +subdir('arm64') + # TODO: This is currently following x86_64 generic for src and dir, but it needs # to diversify # NOTE: The def and undefs are from Makefile.L1 @@ -14,187 +33,187 @@ base_kops = [ # Level 1 BLAS { 'base': '?rot', 'modes': { - 's': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'srot.c', 'addl': [fma3_flag]}}}, - 'd': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'drot.c', 'addl': [fma3_flag]}}}, - # 'q': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zrot.c'}}}, - 'cs': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zrot_sse.S'}}}, - 'zd': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zrot_sse2.S'}}}, - # 'xq': {'exts': {'_k': {'dir': 
'x86_64', 'kernel': 'zrot.S'}}}, + 's': {'exts': {'_k': {'addl': fma3_flag}}}, + 'd': {'exts': {'_k': {'addl': fma3_flag}}}, + # 'q': {'exts': {'_k': {}}}, + 'cs': {'exts': {'_k': {}}}, + 'zd': {'exts': {'_k': {}}}, + # 'xq': {'exts': {'_k': {}}}, }, }, { 'base': '?swap', 'modes': { - 's': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'swap_sse.S'}}}, - 'd': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'swap_sse2.S'}}}, - 'c': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zswap_sse.S'}}}, - 'z': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zswap_sse2.S'}}}, - # 'q': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'swap.S'}}}, - # 'x': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zswap.S'}}}, + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, + # 'x': {'exts': {'_k': {}}}, }, }, { 'base': '?scal', 'modes': { - 's': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'sscal.c'}}}, - 'd': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'dscal.c'}}}, - 'c': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'cscal.c'}}}, - 'z': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zscal.c'}}}, - # 'q': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'scal.S'}}}, - # 'x': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zscal.S'}}}, + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, + # 'x': {'exts': {'_k': {}}}, }, }, { 'base': '?copy', 'modes': { - 's': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'copy_sse.S'}}}, - 'd': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'copy_sse2.S'}}}, - 'c': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zcopy_sse.S'}}}, - 'z': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zcopy_sse2.S'}}}, - # 'q': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'copy.S'}}}, - # 'x': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zcopy.S'}}}, + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': 
{'_k': {}}}, + 'z': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, + # 'x': {'exts': {'_k': {}}}, }, }, { 'base': '?axpy', 'modes': { - 's': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'saxpy.c'}}}, - 'd': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'daxpy.c'}}}, - 'c': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'caxpy.c', 'addl': ['-UCONJ']}}}, - 'z': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zaxpy.c', 'addl': ['-UCONJ']}}}, - # 'q': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'axpy.S'}}}, - # 'x': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zaxpy.S'}}}, + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {'addl': ['-UCONJ']}}}, + 'z': {'exts': {'_k': {'addl': ['-UCONJ']}}}, + # 'q': {'exts': {'_k': {}}}, + # 'x': {'exts': {'_k': {}}}, }, }, { 'base': '?axpyc', 'modes': { - 'c': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'caxpy.c', 'addl': ['-DCONJ']}}}, - 'z': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zaxpy.c', 'addl': ['-DCONJ']}}}, + 'c': {'exts': {'_k': {'addl': ['-DCONJ']}}}, + 'z': {'exts': {'_k': {'addl': ['-DCONJ']}}}, }, }, { 'base': '?dot', 'modes': { - 's': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'sdot.c'}}}, - 'd': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'ddot.c'}}}, + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, }, }, { 'base': '?dotc', 'modes': { - 'c': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'cdot.c'}}}, - 'z': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zdot.c'}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, }, }, { 'base': '?dotu', 'modes': { - 'c': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'cdot.c'}}}, - 'z': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zdot.c'}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, }, }, # TODO(rg): Check? 
{ 'base': '?dsdot', 'modes': { - 's': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'sdot.c'}}}, - '': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'sdot.c', 'addl': ['-DDSDOT']}}}, + 's': {'exts': {'_k': {}}}, + '': {'exts': {'_k': {'addl': ['-DDSDOT']}}}, }, }, # TODO(rg): Add dsdotkernel conditionals # xDOTU xDOTC xxDOT aren't present { 'base': '?nrm2', 'modes': { - 's': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'nrm2_sse.S'}}}, - 'd': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'nrm2.S'}}}, - 'c': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'znrm2_sse.S'}}}, - 'z': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'znrm2.S'}}}, - # 'q': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'nrm2.S'}}}, - # 'x': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'znrm2.S'}}}, + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, + # 'x': {'exts': {'_k': {}}}, }, }, { 'base': '?asum', 'modes': { - 's': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'sasum.c'}}}, - 'd': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'dasum.c'}}}, - 'c': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zasum_sse.S'}}}, - 'z': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zasum_sse2.S'}}}, - # 'q': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'asum.S'}}}, - # 'x': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zasum.S'}}}, + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, + # 'x': {'exts': {'_k': {}}}, }, }, { 'base': '?amax', 'modes': { - 's': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'amax_sse.S'}}}, - 'd': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'amax_sse2.S'}}}, - 'c': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zamax_sse.S'}}}, - 'z': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zamax_sse2.S'}}}, - # 'q': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'amax.S'}}}, - # 'x': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 
'zamax.S'}}}, + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, + # 'x': {'exts': {'_k': {}}}, }, }, { 'base': '?sum', 'modes': { - 's': {'exts': {'_k': {'dir': 'arm', 'kernel': 'sum.c'}}}, - 'd': {'exts': {'_k': {'dir': 'arm', 'kernel': 'sum.c'}}}, - 'c': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zsum_sse.S'}}}, - 'z': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zsum_sse2.S'}}}, + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, }, }, { 'base': '?amin', 'modes': { - 's': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'amax_sse.S'}}}, - 'd': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'amax_sse2.S'}}}, - 'c': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zamax_sse.S'}}}, - 'z': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'zamax_sse2.S'}}}, - # 'q': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'iamax.S'}}}, - # 'x': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'izamax.S'}}}, + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, + # 'x': {'exts': {'_k': {}}}, }, }, { 'base': 'i?amax', 'modes': { - 's': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'iamax_sse.S'}}}, - 'd': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'iamax_sse2.S'}}}, - 'c': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'izamax_sse.S'}}}, - 'z': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'izamax_sse2.S'}}}, - # 'q': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'iamax.S'}}}, - # 'x': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'izamax.S'}}}, + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, + # 'x': {'exts': {'_k': {}}}, }, }, { 'base': 'i?amin', 'modes': { - 's': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'iamax_sse.S'}}}, - 'd': {'exts': {'_k': {'dir': 'arm', 
'kernel': 'iamin.c'}}}, - 'c': {'exts': {'_k': {'dir': 'arm', 'kernel': 'izamin.c'}}}, - 'z': {'exts': {'_k': {'dir': 'arm', 'kernel': 'izamin.c'}}}, + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, }, }, { 'base': 'i?max', 'modes': { - 's': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'iamax_sse.S'}}}, - 'd': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'iamax_sse2.S'}}}, - # 'q': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'iamax.S'}}}, + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, }, }, { 'base': 'i?min', 'modes': { - 's': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'iamax_sse.S'}}}, - 'd': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'iamax_sse2.S'}}}, - # 'q': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'iamax.S'}}}, + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, }, }, { 'base': '?max', 'modes': { - 's': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'amax_sse.S'}}}, - 'd': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'amax_sse2.S'}}}, - # 'q': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'amax.S'}}}, + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, }, }, { 'base': '?min', 'modes': { - 's': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'amax_sse.S'}}}, - 'd': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'amax_sse2.S'}}}, - # 'q': {'exts': {'_k': {'dir': 'x86_64', 'kernel': 'amax.S'}}}, + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, }, }, { 'base': '?axpby', 'modes': { - 's': {'exts': {'_k': {'dir': 'arm', 'kernel': 'axpby.c'}}}, - 'd': {'exts': {'_k': {'dir': 'arm', 'kernel': 'axpby.c'}}}, - 'c': {'exts': {'_k': {'dir': 'arm', 'kernel': 'zaxpby.c'}}}, - 'z': {'exts': {'_k': {'dir': 'arm', 'kernel': 'zaxpby.c'}}}, + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, }, }, # Level 
2 BLAS @@ -210,56 +229,56 @@ base_kops = [ # TODO(rg): Where are these coming from?? # Most of these have both generic defines and also per-folder defines.. # Makefile lists sgemv_n_4.c as the source, though there is a sgemv_n.c - '_n': {'dir': 'x86_64', 'kernel': 'sgemv_n_4.c'}, - '_t': {'dir': 'x86_64', 'kernel': 'sgemv_t_4.c'}, + '_n': {}, + '_t': {}, } }, 'd': { 'exts': { - '_n': {'dir': 'x86_64', 'kernel': 'dgemv_n_4.c'}, - '_t': {'dir': 'x86_64', 'kernel': 'dgemv_t_4.c'}, + '_n': {}, + '_t': {}, } }, # 'q': { # 'exts': { - # '_n': {'dir': 'x86_64', 'kernel': 'qgemv_n.S'}, - # '_t': {'dir': 'x86_64', 'kernel': 'qgemv_t.S'}, + # '_n': {}, + # '_t': {}, # } # }, 'c': { 'exts': { - '_n': {'dir': 'x86_64', 'kernel': 'cgemv_n_4.c'}, - '_t': {'dir': 'x86_64', 'kernel': 'cgemv_t_4.c'}, - '_r': {'dir': 'x86_64', 'kernel': 'cgemv_n_4.c'}, - '_c': {'dir': 'x86_64', 'kernel': 'cgemv_t_4.c'}, - '_o': {'dir': 'x86_64', 'kernel': 'cgemv_n_4.c'}, - '_u': {'dir': 'x86_64', 'kernel': 'cgemv_t_4.c'}, - '_s': {'dir': 'x86_64', 'kernel': 'cgemv_n_4.c'}, - '_d': {'dir': 'x86_64', 'kernel': 'cgemv_t_4.c'}, + '_n': {}, + '_t': {}, + '_r': {}, + '_c': {}, + '_o': {}, + '_u': {}, + '_s': {}, + '_d': {}, } }, 'z': { 'exts': { - '_n': {'dir': 'x86_64', 'kernel': 'zgemv_n_4.c'}, - '_t': {'dir': 'x86_64', 'kernel': 'zgemv_t_4.c'}, - '_r': {'dir': 'x86_64', 'kernel': 'zgemv_n_4.c'}, - '_c': {'dir': 'x86_64', 'kernel': 'zgemv_t_4.c'}, - '_o': {'dir': 'x86_64', 'kernel': 'zgemv_n_4.c'}, - '_u': {'dir': 'x86_64', 'kernel': 'zgemv_t_4.c'}, - '_s': {'dir': 'x86_64', 'kernel': 'zgemv_n_4.c'}, - '_d': {'dir': 'x86_64', 'kernel': 'zgemv_t_4.c'}, + '_n': {}, + '_t': {}, + '_r': {}, + '_c': {}, + '_o': {}, + '_u': {}, + '_s': {}, + '_d': {}, } }, # 'x': { # 'exts': { - # '_n': {'dir': 'arm', 'kernel': 'zgemv_n.c'}, - # '_t': {'dir': 'arm', 'kernel': 'zgemv_n.c'}, - # '_r': {'dir': 'arm', 'kernel': 'zgemv_n.c'}, - # '_c': {'dir': 'arm', 'kernel': 'zgemv_n.c'}, - # '_o': {'dir': 'arm', 'kernel': 
'zgemv_n.c'}, - # '_u': {'dir': 'arm', 'kernel': 'zgemv_n.c'}, - # '_s': {'dir': 'arm', 'kernel': 'zgemv_n.c'}, - # '_d': {'dir': 'arm', 'kernel': 'zgemv_n.c'}, + # '_n': {}, + # '_t': {}, + # '_r': {}, + # '_c': {}, + # '_o': {}, + # '_u': {}, + # '_s': {}, + # '_d': {}, # } # }, }, @@ -268,165 +287,139 @@ base_kops = [ 'modes': { 's': { 'exts': { - '_U': {'dir': 'x86_64', 'kernel': 'ssymv_U.c'}, - '_L': {'dir': 'x86_64', 'kernel': 'ssymv_L.c'}, + '_U': {}, + '_L': {}, } }, 'd': { 'exts': { - '_U': {'dir': 'generic', 'kernel': 'symv_k.c'}, - '_L': {'dir': 'generic', 'kernel': 'symv_k.c'}, + '_U': {}, + '_L': {}, } }, 'c': { 'exts': { - '_U': {'dir': 'generic', 'kernel': 'zsymv_k.c'}, - '_L': {'dir': 'generic', 'kernel': 'zsymv_k.c'}, + '_U': {}, + '_L': {}, } }, 'z': { 'exts': { - '_U': {'dir': 'x86_64', 'kernel': 'zsymv_U_sse2.S'}, - '_L': {'dir': 'x86_64', 'kernel': 'zsymv_L_sse2.S'}, + '_U': {}, + '_L': {}, } }, # 'q': { # 'exts': { - # '_U': {'dir': 'generic', 'kernel': 'symv_k.c'}, - # '_L': {'dir': 'generic', 'kernel': 'symv_k.c'}, + # '_U': {}, + # '_L': {}, # } # }, # 'x': { # 'exts': { - # '_U': {'dir': 'generic', 'kernel': 'zsymv_k.c'}, - # '_L': {'dir': 'generic', 'kernel': 'zsymv_k.c'}, + # '_U': {}, + # '_L': {}, # } # }, }, }, { 'base': '?lsame', 'modes': { - '': {'exts': {'': {'dir': 'x86_64', 'kernel': 'lsame.S'}}}, + '': {'exts': {'': {}}}, }, }, { 'base': '?cabs', 'modes': { - 's': {'exts': {'1': {'dir': 'x86_64', 'kernel': 'cabs.S'}}}, - 'd': {'exts': {'1': {'dir': 'x86_64', 'kernel': 'cabs.S'}}}, - # 'q': {'exts': {'': {'dir': 'generic', 'kernel': 'cabs.c'}}}, + 's': {'exts': {'1': {}}}, + 'd': {'exts': {'1': {}}}, + # 'q': {'exts': {'': {}}}, }, }, { 'base': '?gemm3m', 'modes': { 'c': {'exts': { - '_kernel': {'dir': 'x86_64', 'kernel': 'cgemm3m_kernel_8x4_haswell.c', - 'addl': ['-DICOPY', '-UUSE_ALPHA']}, - '_oncopyb': {'dir': 'generic', 'kernel': 'zgemm3m_ncopy_4.c', - 'addl': ['-DUSE_ALPHA']}, - '_otcopyb': {'dir': 'generic', 'kernel': 
'zgemm3m_tcopy_4.c', - 'addl': ['-DUSE_ALPHA']}, - '_itcopyb': {'dir': 'generic', 'kernel': 'zgemm3m_tcopy_8.c', - 'addl': ['-DICOPY', '-UUSE_ALPHA']}, - '_itcopyr': {'dir': 'generic', 'kernel': 'zgemm3m_tcopy_8.c', - 'addl': ['-DICOPY', '-UUSE_ALPHA', '-DREAL_ONLY']}, - '_itcopyi': {'dir': 'generic', 'kernel': 'zgemm3m_tcopy_8.c', - 'addl': ['-DICOPY', '-UUSE_ALPHA', '-DIMAGE_ONLY']}, - '_incopyb': {'dir': 'generic', 'kernel': 'zgemm3m_ncopy_8.c', - 'addl': ['-DICOPY', '-UUSE_ALPHA']}, - '_incopyr': {'dir': 'generic', 'kernel': 'zgemm3m_ncopy_8.c', - 'addl': ['-DICOPY', '-UUSE_ALPHA', '-DREAL_ONLY']}, - '_oncopyr': {'dir': 'generic', 'kernel': 'zgemm3m_ncopy_4.c', - 'addl': ['-DUSE_ALPHA', '-DREAL_ONLY']}, - '_otcopyr': {'dir': 'generic', 'kernel': 'zgemm3m_tcopy_4.c', - 'addl': ['-DUSE_ALPHA', '-DREAL_ONLY']}, - '_incopyi': {'dir': 'generic', 'kernel': 'zgemm3m_ncopy_8.c', - 'addl': ['-DICOPY', '-UUSE_ALPHA', '-DIMAGE_ONLY']}, - '_oncopyi': {'dir': 'generic', 'kernel': 'zgemm3m_ncopy_4.c', - 'addl': ['-DUSE_ALPHA', '-DIMAGE_ONLY']}, - '_otcopyi': {'dir': 'generic', 'kernel': 'zgemm3m_tcopy_4.c', - 'addl': ['-DUSE_ALPHA', '-DIMAGE_ONLY']}, + '_kernel': {'addl': ['-DICOPY', '-UUSE_ALPHA']}, + '_oncopyb': {'addl': ['-DUSE_ALPHA']}, + '_otcopyb': {'addl': ['-DUSE_ALPHA']}, + '_itcopyb': {'addl': ['-DICOPY', '-UUSE_ALPHA']}, + '_itcopyr': {'addl': ['-DICOPY', '-UUSE_ALPHA', '-DREAL_ONLY']}, + '_itcopyi': {'addl': ['-DICOPY', '-UUSE_ALPHA', '-DIMAGE_ONLY']}, + '_incopyb': {'addl': ['-DICOPY', '-UUSE_ALPHA']}, + '_incopyr': {'addl': ['-DICOPY', '-UUSE_ALPHA', '-DREAL_ONLY']}, + '_oncopyr': {'addl': ['-DUSE_ALPHA', '-DREAL_ONLY']}, + '_otcopyr': {'addl': ['-DUSE_ALPHA', '-DREAL_ONLY']}, + '_incopyi': {'addl': ['-DICOPY', '-UUSE_ALPHA', '-DIMAGE_ONLY']}, + '_oncopyi': {'addl': ['-DUSE_ALPHA', '-DIMAGE_ONLY']}, + '_otcopyi': {'addl': ['-DUSE_ALPHA', '-DIMAGE_ONLY']}, }}, 'z': {'exts': { - '_kernel': {'dir': 'x86_64', 'kernel': 'zgemm3m_kernel_4x4_haswell.c', - 'addl': 
['-DNN']}, - '_oncopyb': {'dir': 'generic', 'kernel': 'zgemm3m_ncopy_4.c', - 'addl': ['-DUSE_ALPHA']}, - '_otcopyb': {'dir': 'generic', 'kernel': 'zgemm3m_tcopy_4.c', - 'addl': ['-DUSE_ALPHA']}, - '_itcopyb': {'dir': 'generic', 'kernel': 'zgemm3m_tcopy_4.c', - 'addl': ['-DICOPY', '-UUSE_ALPHA']}, - '_itcopyr': {'dir': 'generic', 'kernel': 'zgemm3m_tcopy_4.c', - 'addl': ['-DICOPY', '-UUSE_ALPHA', '-DREAL_ONLY']}, - '_itcopyi': {'dir': 'generic', 'kernel': 'zgemm3m_tcopy_4.c', - 'addl': ['-DICOPY', '-UUSE_ALPHA', '-DIMAGE_ONLY']}, - '_incopyb': {'dir': 'generic', 'kernel': 'zgemm3m_ncopy_4.c', - 'addl': ['-DICOPY', '-UUSE_ALPHA']}, - '_incopyr': {'dir': 'generic', 'kernel': 'zgemm3m_ncopy_4.c', - 'addl': ['-DICOPY', '-UUSE_ALPHA', '-DREAL_ONLY']}, - '_oncopyr': {'dir': 'generic', 'kernel': 'zgemm3m_ncopy_4.c', - 'addl': ['-DUSE_ALPHA', '-DREAL_ONLY']}, - '_otcopyr': {'dir': 'generic', 'kernel': 'zgemm3m_tcopy_4.c', - 'addl': ['-DUSE_ALPHA', '-DREAL_ONLY']}, - '_incopyi': {'dir': 'generic', 'kernel': 'zgemm3m_ncopy_4.c', - 'addl': ['-DICOPY', '-UUSE_ALPHA', '-DIMAGE_ONLY']}, - '_oncopyi': {'dir': 'generic', 'kernel': 'zgemm3m_ncopy_4.c', - 'addl': ['-DUSE_ALPHA', '-DIMAGE_ONLY']}, - '_otcopyi': {'dir': 'generic', 'kernel': 'zgemm3m_tcopy_4.c', - 'addl': ['-DUSE_ALPHA', '-DIMAGE_ONLY']}, + '_kernel': {'addl': ['-DNN']}, + '_oncopyb': {'addl': ['-DUSE_ALPHA']}, + '_otcopyb': {'addl': ['-DUSE_ALPHA']}, + '_itcopyb': {'addl': ['-DICOPY', '-UUSE_ALPHA']}, + '_itcopyr': {'addl': ['-DICOPY', '-UUSE_ALPHA', '-DREAL_ONLY']}, + '_itcopyi': {'addl': ['-DICOPY', '-UUSE_ALPHA', '-DIMAGE_ONLY']}, + '_incopyb': {'addl': ['-DICOPY', '-UUSE_ALPHA']}, + '_incopyr': {'addl': ['-DICOPY', '-UUSE_ALPHA', '-DREAL_ONLY']}, + '_oncopyr': {'addl': ['-DUSE_ALPHA', '-DREAL_ONLY']}, + '_otcopyr': {'addl': ['-DUSE_ALPHA', '-DREAL_ONLY']}, + '_incopyi': {'addl': ['-DICOPY', '-UUSE_ALPHA', '-DIMAGE_ONLY']}, + '_oncopyi': {'addl': ['-DUSE_ALPHA', '-DIMAGE_ONLY']}, + '_otcopyi': {'addl': 
['-DUSE_ALPHA', '-DIMAGE_ONLY']}, }}, }, }, { 'base': '?ger', 'modes': { - 's': {'exts': {'_k': {'dir': 'generic', 'kernel': 'ger.c'}}}, - 'd': {'exts': {'_k': {'dir': 'generic', 'kernel': 'ger.c'}}}, - # 'q': {'exts': {'_k': {'dir': 'generic', 'kernel': 'ger.c'}}}, + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, }, }, { 'base': '?geru', 'modes': { - 'c': {'exts': {'_k': {'dir': 'generic', 'kernel': 'zger.c', 'addl': ['-UDOUBLE', '-DCOMPLEX', '-UCONJ']}}}, - 'z': {'exts': {'_k': {'dir': 'generic', 'kernel': 'zger.c', 'addl': ['-DDOUBLE', '-DCOMPLEX', '-UCONJ']}}}, - # 'x': {'exts': {'_k': {'dir': 'generic', 'kernel': 'zger.c'}}}, + 'c': {'exts': {'_k': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-UCONJ']}}}, + 'z': {'exts': {'_k': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-UCONJ']}}}, + # 'x': {'exts': {'_k': {}}}, }, }, { 'base': '?gerc', 'modes': { - 'c': {'exts': {'_k': {'dir': 'generic', 'kernel': 'zger.c', 'addl': ['-UDOUBLE', '-DCOMPLEX', '-DCONJ']}}}, - 'z': {'exts': {'_k': {'dir': 'generic', 'kernel': 'zger.c', 'addl': ['-DDOUBLE', '-DCOMPLEX', '-DCONJ']}}}, - # 'x': {'exts': {'_k': {'dir': 'generic', 'kernel': 'zger.c'}}}, + 'c': {'exts': {'_k': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-DCONJ']}}}, + 'z': {'exts': {'_k': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-DCONJ']}}}, + # 'x': {'exts': {'_k': {}}}, }, }, { 'base': '?gerv', 'modes': { - 'c': {'exts': {'_k': {'dir': 'generic', 'kernel': 'zger.c', 'addl': ['-UDOUBLE', '-DCOMPLEX', '-UCONJ', '-DXCONJ']}}}, - 'z': {'exts': {'_k': {'dir': 'generic', 'kernel': 'zger.c', 'addl': ['-DDOUBLE', '-DCOMPLEX', '-UCONJ', '-DXCONJ']}}}, + 'c': {'exts': {'_k': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-UCONJ', '-DXCONJ']}}}, + 'z': {'exts': {'_k': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-UCONJ', '-DXCONJ']}}}, }, }, { 'base': '?hemv', 'modes': { 'c': { 'exts': { - '_U': {'dir': 'generic', 'kernel': 'zhemv_k.c', 'addl': ['-DHEMV']}, - '_L': {'dir': 'generic', 'kernel': 'zhemv_k.c', 'addl': ['-DHEMV']}, - '_V': 
{'dir': 'generic', 'kernel': 'zhemv_k.c', 'addl': ['-DHEMV', '-DHEMVREV']}, - '_M': {'dir': 'generic', 'kernel': 'zhemv_k.c', 'addl': ['-DHEMV', '-DHEMVREV']}, + '_U': {'addl': ['-DHEMV']}, + '_L': {'addl': ['-DHEMV']}, + '_V': {'addl': ['-DHEMV', '-DHEMVREV']}, + '_M': {'addl': ['-DHEMV', '-DHEMVREV']}, } }, 'z': { 'exts': { - '_U': {'dir': 'x86_64', 'kernel': 'zsymv_U_sse2.S', 'addl': ['-DHEMV']}, - '_L': {'dir': 'x86_64', 'kernel': 'zsymv_L_sse2.S', 'addl': ['-DHEMV']}, - '_V': {'dir': 'generic', 'kernel': 'zhemv_k.c', 'addl': ['-DHEMV', '-DHEMVREV']}, - '_M': {'dir': 'generic', 'kernel': 'zhemv_k.c', 'addl': ['-DHEMV', '-DHEMVREV']}, + '_U': {'addl': ['-DHEMV']}, + '_L': {'addl': ['-DHEMV']}, + '_V': {'addl': ['-DHEMV', '-DHEMVREV']}, + '_M': {'addl': ['-DHEMV', '-DHEMVREV']}, } }, # 'x': { # 'exts': { - # '_U': {'dir': 'generic', 'kernel': 'zhemv_k.c'}, - # '_L': {'dir': 'generic', 'kernel': 'zhemv_k.c'}, - # '_V': {'dir': 'generic', 'kernel': 'zhemv_k.c'}, - # '_M': {'dir': 'generic', 'kernel': 'zhemv_k.c'}, + # '_U': {}, + # '_L': {}, + # '_V': {}, + # '_M': {}, # } # }, }, @@ -435,8 +428,8 @@ base_kops = [ # 'modes': { # 's': { # 'exts': { - # '_n': {'dir': 'x86_64', 'kernel': 'sbgemv_n.c'}, - # '_t': {'dir': 'x86_64', 'kernel': 'sbgemv_n.c'}, + # '_n': {}, + # '_t': {}, # } # } # }, @@ -444,85 +437,69 @@ base_kops = [ # Level 3 symbols { 'base': '?gemm_kernel', 'modes': { - 's': {'exts': {'': {'dir': 'x86_64', 'kernel': 'sgemm_kernel_8x4_haswell_2.c'}}}, - 'd': {'exts': {'': {'dir': 'x86_64', 'kernel': 'dgemm_kernel_4x8_haswell.S'}}}, + 's': {'exts': {'': {}}}, + 'd': {'exts': {'': {}}}, 'c': { 'exts': { - '_n': {'dir': 'x86_64', 'kernel': 'cgemm_kernel_8x2_haswell.c', 'addl': ['-DNN']}, - '_l': {'dir': 'x86_64', 'kernel': 'cgemm_kernel_8x2_haswell.c', 'addl': ['-DCN']}, + '_n': {'addl': ['-DNN']}, + '_l': {'addl': ['-DCN']}, # TODO(rg): What about _r conditionals? 
Makefile.L3:2969 - '_r': {'dir': 'x86_64', 'kernel': 'cgemm_kernel_8x2_haswell.c', 'addl': ['-DNC']}, - '_b': {'dir': 'x86_64', 'kernel': 'cgemm_kernel_8x2_haswell.c', 'addl': ['-DCC']}, + '_r': {'addl': ['-DNC']}, + '_b': {'addl': ['-DCC']}, } }, 'z': { 'exts': { - '_n': {'dir': 'x86_64', 'kernel': 'zgemm_kernel_4x2_haswell.c', 'addl': ['-DNN']}, - '_l': {'dir': 'x86_64', 'kernel': 'zgemm_kernel_4x2_haswell.c', 'addl': ['-DCN']}, - '_r': {'dir': 'x86_64', 'kernel': 'zgemm_kernel_4x2_haswell.c', 'addl': ['-DNC']}, - '_b': {'dir': 'x86_64', 'kernel': 'zgemm_kernel_4x2_haswell.c', 'addl': ['-DCC']}, + '_n': {'addl': ['-DNN']}, + '_l': {'addl': ['-DCN']}, + '_r': {'addl': ['-DNC']}, + '_b': {'addl': ['-DCC']}, } } - # 'q': {'exts': {'': {'dir': 'generic', 'kernel': 'gemm_beta.c'}}}, - # 'x': {'exts': {'': {'dir': 'generic', 'kernel': 'zgemm_beta.c'}}}, + # 'q': {'exts': {'': {}}}, + # 'x': {'exts': {'': {}}}, }, }, { 'base': '?trmm_kernel', 'modes': { 's': { 'exts': { - '_LN': {'dir': 'x86_64', 'kernel': 'sgemm_kernel_8x4_haswell.c'}, - '_LT': {'dir': 'x86_64', 'kernel': 'sgemm_kernel_8x4_haswell.c', 'addl': ['-DLEFT', '-DTRANSA']}, - '_RN': {'dir': 'x86_64', 'kernel': 'sgemm_kernel_8x4_haswell.c'}, - '_RT': {'dir': 'x86_64', 'kernel': 'sgemm_kernel_8x4_haswell.c'}, + '_LN': {}, + '_LT': {'addl': ['-DLEFT', '-DTRANSA']}, + '_RN': {}, + '_RT': {}, } }, 'd': { 'exts': { - '_LN': {'dir': 'x86_64', 'kernel': 'dtrmm_kernel_4x8_haswell.c'}, - '_LT': {'dir': 'x86_64', 'kernel': 'dtrmm_kernel_4x8_haswell.c', 'addl': ['-DLEFT', '-DTRANSA']}, - '_RN': {'dir': 'x86_64', 'kernel': 'dtrmm_kernel_4x8_haswell.c'}, - '_RT': {'dir': 'x86_64', 'kernel': 'dtrmm_kernel_4x8_haswell.c'}, + '_LN': {}, + '_LT': {'addl': ['-DLEFT', '-DTRANSA']}, + '_RN': {}, + '_RT': {}, } }, 'c': { 'exts': { - '_LN': {'dir': 'x86_64', 'kernel': 'cgemm_kernel_8x2_haswell.S', - 'addl': ['-DLEFT', '-UTRANSA', '-UCONJ', '-DNN']}, - '_LT': {'dir': 'x86_64', 'kernel': 'cgemm_kernel_8x2_haswell.S', - 'addl': 
['-DLEFT', '-DTRANSA', '-UCONJ', '-DNN']}, - '_LR': {'dir': 'x86_64', 'kernel': 'cgemm_kernel_8x2_haswell.S', - 'addl': ['-DLEFT', '-UTRANSA', '-DCONJ', '-DCN']}, - '_LC': {'dir': 'x86_64', 'kernel': 'cgemm_kernel_8x2_haswell.S', - 'addl': ['-DLEFT', '-DTRANSA', '-DCONJ', '-DCN']}, - '_RN': {'dir': 'x86_64', 'kernel': 'cgemm_kernel_8x2_haswell.S', - 'addl': ['-ULEFT', '-UTRANSA', '-UCONJ', '-DNN']}, - '_RT': {'dir': 'x86_64', 'kernel': 'cgemm_kernel_8x2_haswell.S', - 'addl': ['-ULEFT', '-DTRANSA', '-UCONJ', '-DNN']}, - '_RR': {'dir': 'x86_64', 'kernel': 'cgemm_kernel_8x2_haswell.S', - 'addl': ['-ULEFT', '-UTRANSA', '-DCONJ', '-DNC']}, - '_RC': {'dir': 'x86_64', 'kernel': 'cgemm_kernel_8x2_haswell.S', - 'addl': ['-ULEFT', '-DTRANSA', '-DCONJ', '-DNC']}, + '_LN': {'addl': ['-DLEFT', '-UTRANSA', '-UCONJ', '-DNN']}, + '_LT': {'addl': ['-DLEFT', '-DTRANSA', '-UCONJ', '-DNN']}, + '_LR': {'addl': ['-DLEFT', '-UTRANSA', '-DCONJ', '-DCN']}, + '_LC': {'addl': ['-DLEFT', '-DTRANSA', '-DCONJ', '-DCN']}, + '_RN': {'addl': ['-ULEFT', '-UTRANSA', '-UCONJ', '-DNN']}, + '_RT': {'addl': ['-ULEFT', '-DTRANSA', '-UCONJ', '-DNN']}, + '_RR': {'addl': ['-ULEFT', '-UTRANSA', '-DCONJ', '-DNC']}, + '_RC': {'addl': ['-ULEFT', '-DTRANSA', '-DCONJ', '-DNC']}, } }, 'z': { 'exts': { - '_LN': {'dir': 'x86_64', 'kernel': 'zgemm_kernel_4x2_haswell.S', - 'addl': ['-DLEFT', '-UTRANSA', '-UCONJ', '-DNN']}, - '_LT': {'dir': 'x86_64', 'kernel': 'zgemm_kernel_4x2_haswell.S', - 'addl': ['-DLEFT', '-DTRANSA', '-UCONJ', '-DNN']}, - '_LR': {'dir': 'x86_64', 'kernel': 'zgemm_kernel_4x2_haswell.S', - 'addl': ['-DLEFT', '-UTRANSA', '-DCONJ', '-DCN']}, - '_LC': {'dir': 'x86_64', 'kernel': 'zgemm_kernel_4x2_haswell.S', - 'addl': ['-DLEFT', '-DTRANSA', '-DCONJ', '-DCN']}, - '_RN': {'dir': 'x86_64', 'kernel': 'zgemm_kernel_4x2_haswell.S', - 'addl': ['-ULEFT', '-UTRANSA', '-UCONJ', '-DNN']}, - '_RT': {'dir': 'x86_64', 'kernel': 'zgemm_kernel_4x2_haswell.S', - 'addl': ['-ULEFT', '-DTRANSA', '-UCONJ', '-DNN']}, - 
'_RR': {'dir': 'x86_64', 'kernel': 'zgemm_kernel_4x2_haswell.S', - 'addl': ['-ULEFT', '-UTRANSA', '-DCONJ', '-DNC']}, - '_RC': {'dir': 'x86_64', 'kernel': 'zgemm_kernel_4x2_haswell.S', - 'addl': ['-ULEFT', '-DTRANSA', '-DCONJ', '-DNC']}, + '_LN': {'addl': ['-DLEFT', '-UTRANSA', '-UCONJ', '-DNN']}, + '_LT': {'addl': ['-DLEFT', '-DTRANSA', '-UCONJ', '-DNN']}, + '_LR': {'addl': ['-DLEFT', '-UTRANSA', '-DCONJ', '-DCN']}, + '_LC': {'addl': ['-DLEFT', '-DTRANSA', '-DCONJ', '-DCN']}, + '_RN': {'addl': ['-ULEFT', '-UTRANSA', '-UCONJ', '-DNN']}, + '_RT': {'addl': ['-ULEFT', '-DTRANSA', '-UCONJ', '-DNN']}, + '_RR': {'addl': ['-ULEFT', '-UTRANSA', '-DCONJ', '-DNC']}, + '_RC': {'addl': ['-ULEFT', '-DTRANSA', '-DCONJ', '-DNC']}, }, }, }, @@ -531,66 +508,42 @@ base_kops = [ 'modes': { 's': { 'exts': { - '_LN': {'dir': 'x86_64', 'kernel': 'strsm_kernel_8x4_haswell_LN.c', - 'addl': ['-DLN', '-DUPPER', '-UCONJ']}, - '_LT': {'dir': 'x86_64', 'kernel': 'strsm_kernel_8x4_haswell_LT.c', - 'addl': ['-DLT', '-UUPPER', '-UCONJ']}, - '_RN': {'dir': 'x86_64', 'kernel': 'strsm_kernel_8x4_haswell_RN.c', - 'addl': ['-DRN', '-DUPPER', '-UCONJ']}, - '_RT': {'dir': 'x86_64', 'kernel': 'strsm_kernel_8x4_haswell_RT.c', - 'addl': ['-DRT', '-UUPPER', '-UCONJ']}, + '_LN': {'addl': ['-DLN', '-DUPPER', '-UCONJ']}, + '_LT': {'addl': ['-DLT', '-UUPPER', '-UCONJ']}, + '_RN': {'addl': ['-DRN', '-DUPPER', '-UCONJ']}, + '_RT': {'addl': ['-DRT', '-UUPPER', '-UCONJ']}, }, }, 'd': { 'exts': { - '_LN': {'dir': 'generic', 'kernel': 'trsm_kernel_LN.c', - 'addl': ['-DLN', '-DUPPER', '-UCONJ']}, - '_LT': {'dir': 'generic', 'kernel': 'trsm_kernel_LT.c', - 'addl': ['-DLT', '-UUPPER', '-UCONJ']}, - '_RN': {'dir': 'x86_64', 'kernel': 'dtrsm_kernel_RN_haswell.c', - 'addl': ['-DRN', '-DUPPER', '-UCONJ']}, - '_RT': {'dir': 'generic', 'kernel': 'trsm_kernel_RT.c', - 'addl': ['-DRT', '-UUPPER', '-UCONJ']}, + '_LN': {'addl': ['-DLN', '-DUPPER', '-UCONJ']}, + '_LT': {'addl': ['-DLT', '-UUPPER', '-UCONJ']}, + '_RN': {'addl': 
['-DRN', '-DUPPER', '-UCONJ']}, + '_RT': {'addl': ['-DRT', '-UUPPER', '-UCONJ']}, }, }, 'c': { 'exts': { - '_LN': {'dir': 'generic', 'kernel': 'trsm_kernel_LN.c', - 'addl': ['-DLN', '-DUPPER', '-UCONJ']}, - '_LT': {'dir': 'generic', 'kernel': 'trsm_kernel_LT.c', - 'addl': ['-DLT', '-UUPPER', '-UCONJ']}, - '_LR': {'dir': 'generic', 'kernel': 'trsm_kernel_LN.c', - 'addl': ['-DLN', '-DUPPER', '-DCONJ']}, - '_LC': {'dir': 'generic', 'kernel': 'trsm_kernel_LT.c', - 'addl': ['-DLT', '-UUPPER', '-DCONJ']}, - '_RN': {'dir': 'generic', 'kernel': 'trsm_kernel_RN.c', - 'addl': ['-DRN', '-DUPPER', '-UCONJ']}, - '_RT': {'dir': 'generic', 'kernel': 'trsm_kernel_RT.c', - 'addl': ['-DRT', '-UUPPER', '-UCONJ']}, - '_RR': {'dir': 'generic', 'kernel': 'trsm_kernel_RN.c', - 'addl': ['-DRN', '-DUPPER', '-DCONJ']}, - '_RC': {'dir': 'generic', 'kernel': 'trsm_kernel_RT.c', - 'addl': ['-DRT', '-UUPPER', '-DCONJ']}, + '_LN': {'addl': ['-DLN', '-DUPPER', '-UCONJ']}, + '_LT': {'addl': ['-DLT', '-UUPPER', '-UCONJ']}, + '_LR': {'addl': ['-DLN', '-DUPPER', '-DCONJ']}, + '_LC': {'addl': ['-DLT', '-UUPPER', '-DCONJ']}, + '_RN': {'addl': ['-DRN', '-DUPPER', '-UCONJ']}, + '_RT': {'addl': ['-DRT', '-UUPPER', '-UCONJ']}, + '_RR': {'addl': ['-DRN', '-DUPPER', '-DCONJ']}, + '_RC': {'addl': ['-DRT', '-UUPPER', '-DCONJ']}, }, }, 'z': { 'exts': { - '_LN': {'dir': 'generic', 'kernel': 'trsm_kernel_LN.c', - 'addl': ['-DLN', '-DUPPER', '-UCONJ']}, - '_LT': {'dir': 'generic', 'kernel': 'trsm_kernel_LT.c', - 'addl': ['-DLT', '-UUPPER', '-UCONJ']}, - '_LR': {'dir': 'generic', 'kernel': 'trsm_kernel_LN.c', - 'addl': ['-DLN', '-DUPPER', '-DCONJ']}, - '_LC': {'dir': 'generic', 'kernel': 'trsm_kernel_LT.c', - 'addl': ['-DLT', '-UUPPER', '-DCONJ']}, - '_RN': {'dir': 'generic', 'kernel': 'trsm_kernel_RN.c', - 'addl': ['-DRN', '-DUPPER', '-UCONJ']}, - '_RT': {'dir': 'generic', 'kernel': 'trsm_kernel_RT.c', - 'addl': ['-DRT', '-UUPPER', '-UCONJ']}, - '_RR': {'dir': 'generic', 'kernel': 'trsm_kernel_RN.c', - 'addl': 
['-DRN', '-DUPPER', '-DCONJ']}, - '_RC': {'dir': 'generic', 'kernel': 'trsm_kernel_RT.c', - 'addl': ['-DRT', '-UUPPER', '-DCONJ']}, + '_LN': {'addl': ['-DLN', '-DUPPER', '-UCONJ']}, + '_LT': {'addl': ['-DLT', '-UUPPER', '-UCONJ']}, + '_LR': {'addl': ['-DLN', '-DUPPER', '-DCONJ']}, + '_LC': {'addl': ['-DLT', '-UUPPER', '-DCONJ']}, + '_RN': {'addl': ['-DRN', '-DUPPER', '-UCONJ']}, + '_RT': {'addl': ['-DRT', '-UUPPER', '-UCONJ']}, + '_RR': {'addl': ['-DRN', '-DUPPER', '-DCONJ']}, + '_RC': {'addl': ['-DRT', '-UUPPER', '-DCONJ']}, }, }, }, @@ -598,44 +551,40 @@ base_kops = [ { 'base': '?gemm', 'modes': { 's': {'exts': { - '_beta': {'dir': 'x86_64', 'kernel': 'sgemm_beta_skylakex.c'}, - '_small_matrix_permit': {'dir': 'generic', - 'kernel': 'gemm_small_matrix_permit.c'}, + '_beta': {}, + '_small_matrix_permit': {}, # TODO(rg): the _NUM prefixes are arch dependent - '_incopy': {'dir': 'generic', 'kernel': 'gemm_ncopy_8.c'}, - '_itcopy': {'dir': 'generic', 'kernel': 'gemm_tcopy_8.c'}, - '_oncopy': {'dir': 'x86_64', 'kernel': 'sgemm_ncopy_4_skylakex.c'}, - '_otcopy': {'dir': 'generic', 'kernel': 'gemm_tcopy_4.c'}, + '_incopy': {}, + '_itcopy': {}, + '_oncopy': {}, + '_otcopy': {}, # TODO(rg): direct and direct_performant are built only conditionally - '_direct': {'dir': 'x86_64', 'kernel': 'sgemm_direct_skylakex.c'}, - '_direct_performant': {'dir': 'x86_64', 'kernel': 'sgemm_direct_performant.c'}, + '_direct': {}, + '_direct_performant': {}, }}, 'd': {'exts': { - '_beta': {'dir': 'x86_64', 'kernel': 'dgemm_beta_skylakex.c'}, - '_small_matrix_permit': {'dir': 'generic', - 'kernel': 'gemm_small_matrix_permit.c'}, - '_incopy': {'dir': 'generic', 'kernel': 'gemm_ncopy_4.c'}, - '_itcopy': {'dir': 'generic', 'kernel': 'gemm_tcopy_4.c'}, - '_oncopy': {'dir': 'x86_64', 'kernel': 'dgemm_ncopy_8_skylakex.c'}, - '_otcopy': {'dir': 'generic', 'kernel': 'gemm_tcopy_8.c'}, + '_beta': {}, + '_small_matrix_permit': {}, + '_incopy': {}, + '_itcopy': {}, + '_oncopy': {}, + '_otcopy': {}, }}, 
'c': {'exts': { - '_beta': {'dir': 'x86_64', 'kernel': 'zgemm_beta.S'}, - '_small_matrix_permit': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_permit.c'}, - '_incopy': {'dir': 'generic', 'kernel': 'zgemm_ncopy_8.c'}, - '_itcopy': {'dir': 'generic', 'kernel': 'zgemm_tcopy_8.c'}, - '_oncopy': {'dir': 'generic', 'kernel': 'zgemm_ncopy_2.c'}, - '_otcopy': {'dir': 'generic', 'kernel': 'zgemm_tcopy_2.c'}, + '_beta': {}, + '_small_matrix_permit': {}, + '_incopy': {}, + '_itcopy': {}, + '_oncopy': {}, + '_otcopy': {}, }}, 'z': {'exts': { - '_beta': {'dir': 'x86_64', 'kernel': 'zgemm_beta.S'}, - '_small_matrix_permit': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_permit.c'}, - '_incopy': {'dir': 'generic', 'kernel': 'zgemm_ncopy_4.c'}, - '_itcopy': {'dir': 'generic', 'kernel': 'zgemm_tcopy_4.c'}, - '_oncopy': {'dir': 'generic', 'kernel': 'zgemm_ncopy_2.c'}, - '_otcopy': {'dir': 'generic', 'kernel': 'zgemm_tcopy_2.c'}, + '_beta': {}, + '_small_matrix_permit': {}, + '_incopy': {}, + '_itcopy': {}, + '_oncopy': {}, + '_otcopy': {}, }}, }, }, @@ -643,164 +592,92 @@ base_kops = [ 'modes': { 's': {'exts': { # TODO(rg): These actually use $(SGEMM_UNROLL_M) to choose the size - '_iunucopy': {'dir': 'generic', 'kernel': 'trmm_uncopy_8.c', - 'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-DUNIT']}, - '_iunncopy': {'dir': 'generic', 'kernel': 'trmm_uncopy_8.c', - 'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-UUNIT']}, - '_ilnucopy': {'dir': 'generic', 'kernel': 'trmm_lncopy_8.c', - 'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-DUNIT']}, - '_ilnncopy': {'dir': 'generic', 'kernel': 'trmm_lncopy_8.c', - 'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-UUNIT']}, - '_iutucopy': {'dir': 'generic', 'kernel': 'trmm_utcopy_8.c', - 'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-DUNIT']}, - '_iutncopy': {'dir': 'generic', 'kernel': 'trmm_utcopy_8.c', - 'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-UUNIT']}, - 
'_iltucopy': {'dir': 'generic', 'kernel': 'trmm_ltcopy_8.c', - 'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-DUNIT']}, - '_iltncopy': {'dir': 'generic', 'kernel': 'trmm_ltcopy_8.c', - 'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-UUNIT']}, - '_ounucopy': {'dir': 'generic', 'kernel': 'trmm_uncopy_4.c', - 'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-DUNIT']}, - '_ounncopy': {'dir': 'generic', 'kernel': 'trmm_uncopy_4.c', - 'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-UUNIT']}, - '_olnucopy': {'dir': 'generic', 'kernel': 'trmm_lncopy_4.c', - 'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-DUNIT']}, - '_olnncopy': {'dir': 'generic', 'kernel': 'trmm_lncopy_4.c', - 'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-UUNIT']}, - '_outucopy': {'dir': 'generic', 'kernel': 'trmm_utcopy_4.c', - 'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-DUNIT']}, - '_outncopy': {'dir': 'generic', 'kernel': 'trmm_utcopy_4.c', - 'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-UUNIT']}, - '_oltucopy': {'dir': 'generic', 'kernel': 'trmm_ltcopy_4.c', - 'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-DUNIT']}, - '_oltncopy': {'dir': 'generic', 'kernel': 'trmm_ltcopy_4.c', - 'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-UUNIT']}, + '_iunucopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-DUNIT']}, + '_iunncopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-UUNIT']}, + '_ilnucopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-DUNIT']}, + '_ilnncopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-UUNIT']}, + '_iutucopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-DUNIT']}, + '_iutncopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-UUNIT']}, + '_iltucopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-DUNIT']}, + '_iltncopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-UUNIT']}, 
+ '_ounucopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-DUNIT']}, + '_ounncopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-UUNIT']}, + '_olnucopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-DUNIT']}, + '_olnncopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-UUNIT']}, + '_outucopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-DUNIT']}, + '_outncopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-UUNIT']}, + '_oltucopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-DUNIT']}, + '_oltncopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-UUNIT']}, }}, 'd': {'exts': { - '_iunucopy': {'dir': 'generic', 'kernel': 'trmm_uncopy_4.c', - 'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-DUNIT']}, - '_iunncopy': {'dir': 'generic', 'kernel': 'trmm_uncopy_4.c', - 'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-UUNIT']}, - '_ilnucopy': {'dir': 'generic', 'kernel': 'trmm_lncopy_4.c', - 'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-DUNIT']}, - '_ilnncopy': {'dir': 'generic', 'kernel': 'trmm_lncopy_4.c', - 'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-UUNIT']}, - '_iutucopy': {'dir': 'generic', 'kernel': 'trmm_utcopy_4.c', - 'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-DUNIT']}, - '_iutncopy': {'dir': 'generic', 'kernel': 'trmm_utcopy_4.c', - 'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-UUNIT']}, - '_iltucopy': {'dir': 'generic', 'kernel': 'trmm_ltcopy_4.c', - 'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-DUNIT']}, - '_iltncopy': {'dir': 'generic', 'kernel': 'trmm_ltcopy_4.c', - 'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-UUNIT']}, - '_ounucopy': {'dir': 'generic', 'kernel': 'trmm_uncopy_8.c', - 'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-DUNIT']}, - '_ounncopy': {'dir': 'generic', 'kernel': 'trmm_uncopy_8.c', - 'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', 
'-ULOWER', '-UUNIT']}, - '_olnucopy': {'dir': 'generic', 'kernel': 'trmm_lncopy_8.c', - 'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-DUNIT']}, - '_olnncopy': {'dir': 'generic', 'kernel': 'trmm_lncopy_8.c', - 'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-UUNIT']}, - '_outucopy': {'dir': 'generic', 'kernel': 'trmm_utcopy_8.c', - 'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-DUNIT']}, - '_outncopy': {'dir': 'generic', 'kernel': 'trmm_utcopy_8.c', - 'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-UUNIT']}, - '_oltucopy': {'dir': 'generic', 'kernel': 'trmm_ltcopy_8.c', - 'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-DUNIT']}, - '_oltncopy': {'dir': 'generic', 'kernel': 'trmm_ltcopy_8.c', - 'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-UUNIT']}, + '_iunucopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-DUNIT']}, + '_iunncopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-UUNIT']}, + '_ilnucopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-DUNIT']}, + '_ilnncopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-UUNIT']}, + '_iutucopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-DUNIT']}, + '_iutncopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-UUNIT']}, + '_iltucopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-DUNIT']}, + '_iltncopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-UUNIT']}, + '_ounucopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-DUNIT']}, + '_ounncopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-UUNIT']}, + '_olnucopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-DUNIT']}, + '_olnncopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-UUNIT']}, + '_outucopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-DUNIT']}, + '_outncopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-UUNIT']}, + 
'_oltucopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-DUNIT']}, + '_oltncopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-UUNIT']}, }}, 'c': {'exts': { - '_iunucopy': {'dir': 'generic', 'kernel': 'ztrmm_uncopy_8.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER', '-DUNIT']}, - '_iunncopy': {'dir': 'generic', 'kernel': 'ztrmm_uncopy_8.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER', '-UUNIT']}, - '_ilnucopy': {'dir': 'generic', 'kernel': 'ztrmm_lncopy_8.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER', '-DUNIT']}, - '_ilnncopy': {'dir': 'generic', 'kernel': 'ztrmm_lncopy_8.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER', '-UUNIT']}, - '_iutucopy': {'dir': 'generic', 'kernel': 'ztrmm_utcopy_8.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER', '-DUNIT']}, - '_iutncopy': {'dir': 'generic', 'kernel': 'ztrmm_utcopy_8.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER', '-UUNIT']}, - '_iltucopy': {'dir': 'generic', 'kernel': 'ztrmm_ltcopy_8.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER', '-DUNIT']}, - '_iltncopy': {'dir': 'generic', 'kernel': 'ztrmm_ltcopy_8.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER', '-UUNIT']}, - '_ounucopy': {'dir': 'generic', 'kernel': 'ztrmm_uncopy_2.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER', '-DUNIT']}, - '_ounncopy': {'dir': 'generic', 'kernel': 'ztrmm_uncopy_2.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER', '-UUNIT']}, - '_olnucopy': {'dir': 'generic', 'kernel': 'ztrmm_lncopy_2.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER', '-DUNIT']}, - '_olnncopy': {'dir': 'generic', 'kernel': 'ztrmm_lncopy_2.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER', '-UUNIT']}, - '_outucopy': {'dir': 'generic', 'kernel': 'ztrmm_utcopy_2.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER', '-DUNIT']}, - '_outncopy': {'dir': 'generic', 'kernel': 'ztrmm_utcopy_2.c', - 
'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER', '-UUNIT']}, - '_oltucopy': {'dir': 'generic', 'kernel': 'ztrmm_ltcopy_2.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER', '-DUNIT']}, - '_oltncopy': {'dir': 'generic', 'kernel': 'ztrmm_ltcopy_2.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER', '-UUNIT']}, + '_iunucopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER', '-DUNIT']}, + '_iunncopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER', '-UUNIT']}, + '_ilnucopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER', '-DUNIT']}, + '_ilnncopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER', '-UUNIT']}, + '_iutucopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER', '-DUNIT']}, + '_iutncopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER', '-UUNIT']}, + '_iltucopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER', '-DUNIT']}, + '_iltncopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER', '-UUNIT']}, + '_ounucopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER', '-DUNIT']}, + '_ounncopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER', '-UUNIT']}, + '_olnucopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER', '-DUNIT']}, + '_olnncopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER', '-UUNIT']}, + '_outucopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER', '-DUNIT']}, + '_outncopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER', '-UUNIT']}, + '_oltucopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER', '-DUNIT']}, + '_oltncopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER', '-UUNIT']}, }}, 'z': {'exts': { - '_iunucopy': {'dir': 'generic', 'kernel': 'ztrmm_uncopy_4.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER', '-DUNIT']}, - '_iunncopy': {'dir': 'generic', 'kernel': 'ztrmm_uncopy_4.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER', '-UUNIT']}, - '_ilnucopy': {'dir': 
'generic', 'kernel': 'ztrmm_lncopy_4.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER', '-DUNIT']}, - '_ilnncopy': {'dir': 'generic', 'kernel': 'ztrmm_lncopy_4.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER', '-UUNIT']}, - '_iutucopy': {'dir': 'generic', 'kernel': 'ztrmm_utcopy_4.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER', '-DUNIT']}, - '_iutncopy': {'dir': 'generic', 'kernel': 'ztrmm_utcopy_4.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER', '-UUNIT']}, - '_iltucopy': {'dir': 'generic', 'kernel': 'ztrmm_ltcopy_4.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER', '-DUNIT']}, - '_iltncopy': {'dir': 'generic', 'kernel': 'ztrmm_ltcopy_4.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER', '-UUNIT']}, - '_ounucopy': {'dir': 'generic', 'kernel': 'ztrmm_uncopy_2.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER', '-DUNIT']}, - '_ounncopy': {'dir': 'generic', 'kernel': 'ztrmm_uncopy_2.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER', '-UUNIT']}, - '_olnucopy': {'dir': 'generic', 'kernel': 'ztrmm_lncopy_2.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER', '-DUNIT']}, - '_olnncopy': {'dir': 'generic', 'kernel': 'ztrmm_lncopy_2.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER', '-UUNIT']}, - '_outucopy': {'dir': 'generic', 'kernel': 'ztrmm_utcopy_2.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER', '-DUNIT']}, - '_outncopy': {'dir': 'generic', 'kernel': 'ztrmm_utcopy_2.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER', '-UUNIT']}, - '_oltucopy': {'dir': 'generic', 'kernel': 'ztrmm_ltcopy_2.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER', '-DUNIT']}, - '_oltncopy': {'dir': 'generic', 'kernel': 'ztrmm_ltcopy_2.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER', '-UUNIT']}, + '_iunucopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER', '-DUNIT']}, + '_iunncopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', 
'-ULOWER', '-UUNIT']}, + '_ilnucopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER', '-DUNIT']}, + '_ilnncopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER', '-UUNIT']}, + '_iutucopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER', '-DUNIT']}, + '_iutncopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER', '-UUNIT']}, + '_iltucopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER', '-DUNIT']}, + '_iltncopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER', '-UUNIT']}, + '_ounucopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER', '-DUNIT']}, + '_ounncopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER', '-UUNIT']}, + '_olnucopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER', '-DUNIT']}, + '_olnncopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER', '-UUNIT']}, + '_outucopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER', '-DUNIT']}, + '_outncopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER', '-UUNIT']}, + '_oltucopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER', '-DUNIT']}, + '_oltncopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER', '-UUNIT']}, }}, }, }, { 'base': '?hemm', 'modes': { 'c': {'exts': { - '_iutcopy': {'dir': 'generic', 'kernel': 'zhemm_utcopy_8.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER']}, - '_iltcopy': {'dir': 'generic', 'kernel': 'zhemm_ltcopy_8.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER']}, - '_outcopy': {'dir': 'generic', 'kernel': 'zhemm_utcopy_2.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER']}, - '_oltcopy': {'dir': 'generic', 'kernel': 'zhemm_ltcopy_2.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER']}, + '_iutcopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER']}, + '_iltcopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER']}, + '_outcopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER']}, + '_oltcopy': {'addl': 
['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER']}, }}, 'z': {'exts': { - '_iutcopy': {'dir': 'generic', 'kernel': 'zhemm_utcopy_4.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER']}, - '_iltcopy': {'dir': 'generic', 'kernel': 'zhemm_ltcopy_4.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER']}, - '_outcopy': {'dir': 'generic', 'kernel': 'zhemm_utcopy_2.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER']}, - '_oltcopy': {'dir': 'generic', 'kernel': 'zhemm_ltcopy_2.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER']}, + '_iutcopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER']}, + '_iltcopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER']}, + '_outcopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER']}, + '_oltcopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER']}, }}, }, }, @@ -808,178 +685,98 @@ base_kops = [ 'modes': { 's': {'exts': { # TODO(rg): These actually use $(SGEMM_UNROLL_M) to choose the size - '_iunucopy': {'dir': 'generic', 'kernel': 'trsm_uncopy_8.c', - 'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, - '_iunncopy': {'dir': 'generic', 'kernel': 'trsm_uncopy_8.c', - 'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, - '_ilnucopy': {'dir': 'generic', 'kernel': 'trsm_lncopy_8.c', - 'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, - '_ilnncopy': {'dir': 'generic', 'kernel': 'trsm_lncopy_8.c', - 'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, - '_iutucopy': {'dir': 'generic', 'kernel': 'trsm_utcopy_8.c', - 'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, - '_iutncopy': {'dir': 'generic', 'kernel': 'trsm_utcopy_8.c', - 'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, - '_iltucopy': {'dir': 'generic', 'kernel': 'trsm_ltcopy_8.c', - 'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, - '_iltncopy': {'dir': 'generic', 'kernel': 'trsm_ltcopy_8.c', - 'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, - '_ounucopy': {'dir': 'generic', 'kernel': 'trsm_uncopy_4.c', - 'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, - '_ounncopy': {'dir': 
'generic', 'kernel': 'trsm_uncopy_4.c', - 'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, - '_olnucopy': {'dir': 'generic', 'kernel': 'trsm_lncopy_4.c', - 'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, - '_olnncopy': {'dir': 'generic', 'kernel': 'trsm_lncopy_4.c', - 'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, - '_outucopy': {'dir': 'generic', 'kernel': 'trsm_utcopy_4.c', - 'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, - '_outncopy': {'dir': 'generic', 'kernel': 'trsm_utcopy_4.c', - 'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, - '_oltucopy': {'dir': 'generic', 'kernel': 'trsm_ltcopy_4.c', - 'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, - '_oltncopy': {'dir': 'generic', 'kernel': 'trsm_ltcopy_4.c', - 'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + '_iunucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iunncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_ilnucopy': {'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + '_ilnncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_iutucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iutncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_iltucopy': {'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + '_iltncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_ounucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_ounncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_olnucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_olnncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + '_outucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_outncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_oltucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_oltncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, }}, 'd': {'exts': { # TODO(rg): These actually use $(SGEMM_UNROLL_M) to choose the size - '_iunucopy': {'dir': 'generic', 'kernel': 'trsm_uncopy_4.c', - 'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, - '_iunncopy': {'dir': 'generic', 'kernel': 'trsm_uncopy_4.c', - 'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, - '_ilnucopy': 
{'dir': 'generic', 'kernel': 'trsm_lncopy_4.c', - 'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, - '_ilnncopy': {'dir': 'generic', 'kernel': 'trsm_lncopy_4.c', - 'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, - '_iutucopy': {'dir': 'generic', 'kernel': 'trsm_utcopy_4.c', - 'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, - '_iutncopy': {'dir': 'generic', 'kernel': 'trsm_utcopy_4.c', - 'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, - '_iltucopy': {'dir': 'generic', 'kernel': 'trsm_ltcopy_4.c', - 'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, - '_iltncopy': {'dir': 'generic', 'kernel': 'trsm_ltcopy_4.c', - 'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, - '_ounucopy': {'dir': 'generic', 'kernel': 'trsm_uncopy_8.c', - 'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, - '_ounncopy': {'dir': 'generic', 'kernel': 'trsm_uncopy_8.c', - 'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, - '_olnucopy': {'dir': 'generic', 'kernel': 'trsm_lncopy_8.c', - 'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, - '_olnncopy': {'dir': 'generic', 'kernel': 'trsm_lncopy_8.c', - 'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, - '_outucopy': {'dir': 'generic', 'kernel': 'trsm_utcopy_8.c', - 'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, - '_outncopy': {'dir': 'generic', 'kernel': 'trsm_utcopy_8.c', - 'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, - '_oltucopy': {'dir': 'generic', 'kernel': 'trsm_ltcopy_8.c', - 'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, - '_oltncopy': {'dir': 'generic', 'kernel': 'trsm_ltcopy_8.c', - 'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + '_iunucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iunncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_ilnucopy': {'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + '_ilnncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_iutucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iutncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_iltucopy': {'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + '_iltncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_ounucopy': {'addl': 
['-DOUTER', '-ULOWER', '-DUNIT']}, + '_ounncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_olnucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_olnncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + '_outucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_outncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_oltucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_oltncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, }}, 'q': {'exts': { # TODO(rg): These actually use $(SGEMM_UNROLL_M) to choose the size - '_iunucopy': {'dir': 'generic', 'kernel': 'trsm_uncopy_8.c', - 'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, - '_iunncopy': {'dir': 'generic', 'kernel': 'trsm_uncopy_8.c', - 'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, - '_ilnucopy': {'dir': 'generic', 'kernel': 'trsm_lncopy_8.c', - 'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, - '_ilnncopy': {'dir': 'generic', 'kernel': 'trsm_lncopy_8.c', - 'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, - '_iutucopy': {'dir': 'generic', 'kernel': 'trsm_utcopy_8.c', - 'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, - '_iutncopy': {'dir': 'generic', 'kernel': 'trsm_utcopy_8.c', - 'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, - '_iltucopy': {'dir': 'generic', 'kernel': 'trsm_ltcopy_8.c', - 'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, - '_iltncopy': {'dir': 'generic', 'kernel': 'trsm_ltcopy_8.c', - 'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, - '_ounucopy': {'dir': 'generic', 'kernel': 'trsm_uncopy_4.c', - 'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, - '_ounncopy': {'dir': 'generic', 'kernel': 'trsm_uncopy_4.c', - 'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, - '_olnucopy': {'dir': 'generic', 'kernel': 'trsm_lncopy_4.c', - 'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, - '_olnncopy': {'dir': 'generic', 'kernel': 'trsm_lncopy_4.c', - 'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, - '_outucopy': {'dir': 'generic', 'kernel': 'trsm_utcopy_4.c', - 'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, - '_outncopy': {'dir': 'generic', 'kernel': 
'trsm_utcopy_4.c', - 'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, - '_oltucopy': {'dir': 'generic', 'kernel': 'trsm_ltcopy_4.c', - 'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, - '_oltncopy': {'dir': 'generic', 'kernel': 'trsm_ltcopy_4.c', - 'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + '_iunucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iunncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_ilnucopy': {'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + '_ilnncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_iutucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iutncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_iltucopy': {'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + '_iltncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_ounucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_ounncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_olnucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_olnncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + '_outucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_outncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_oltucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_oltncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, }}, 'c': {'exts': { # TODO(rg): These actually use $(SGEMM_UNROLL_M) to choose the size - '_iunucopy': {'dir': 'generic', 'kernel': 'ztrsm_uncopy_8.c', - 'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, - '_iunncopy': {'dir': 'generic', 'kernel': 'ztrsm_uncopy_8.c', - 'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, - '_ilnucopy': {'dir': 'generic', 'kernel': 'ztrsm_lncopy_8.c', - 'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, - '_ilnncopy': {'dir': 'generic', 'kernel': 'ztrsm_lncopy_8.c', - 'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, - '_iutucopy': {'dir': 'generic', 'kernel': 'ztrsm_utcopy_8.c', - 'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, - '_iutncopy': {'dir': 'generic', 'kernel': 'ztrsm_utcopy_8.c', - 'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, - '_iltucopy': {'dir': 'generic', 
'kernel': 'ztrsm_ltcopy_8.c', - 'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, - '_iltncopy': {'dir': 'generic', 'kernel': 'ztrsm_ltcopy_8.c', - 'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, - '_ounucopy': {'dir': 'generic', 'kernel': 'ztrsm_uncopy_2.c', - 'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, - '_ounncopy': {'dir': 'generic', 'kernel': 'ztrsm_uncopy_2.c', - 'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, - '_olnucopy': {'dir': 'generic', 'kernel': 'ztrsm_lncopy_2.c', - 'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, - '_olnncopy': {'dir': 'generic', 'kernel': 'ztrsm_lncopy_2.c', - 'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, - '_outucopy': {'dir': 'generic', 'kernel': 'ztrsm_utcopy_2.c', - 'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, - '_outncopy': {'dir': 'generic', 'kernel': 'ztrsm_utcopy_2.c', - 'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, - '_oltucopy': {'dir': 'generic', 'kernel': 'ztrsm_ltcopy_2.c', - 'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, - '_oltncopy': {'dir': 'generic', 'kernel': 'ztrsm_ltcopy_2.c', - 'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + '_iunucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iunncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_ilnucopy': {'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + '_ilnncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_iutucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iutncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_iltucopy': {'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + '_iltncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_ounucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_ounncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_olnucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_olnncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + '_outucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_outncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_oltucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_oltncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, 
}}, 'z': {'exts': { # TODO(rg): These actually use $(SGEMM_UNROLL_M) to choose the size - '_iunucopy': {'dir': 'generic', 'kernel': 'ztrsm_uncopy_4.c', - 'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, - '_iunncopy': {'dir': 'generic', 'kernel': 'ztrsm_uncopy_4.c', - 'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, - '_ilnucopy': {'dir': 'generic', 'kernel': 'ztrsm_lncopy_4.c', - 'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, - '_ilnncopy': {'dir': 'generic', 'kernel': 'ztrsm_lncopy_4.c', - 'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, - '_iutucopy': {'dir': 'generic', 'kernel': 'ztrsm_utcopy_4.c', - 'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, - '_iutncopy': {'dir': 'generic', 'kernel': 'ztrsm_utcopy_4.c', - 'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, - '_iltucopy': {'dir': 'generic', 'kernel': 'ztrsm_ltcopy_4.c', - 'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, - '_iltncopy': {'dir': 'generic', 'kernel': 'ztrsm_ltcopy_4.c', - 'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, - '_ounucopy': {'dir': 'generic', 'kernel': 'ztrsm_uncopy_2.c', - 'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, - '_ounncopy': {'dir': 'generic', 'kernel': 'ztrsm_uncopy_2.c', - 'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, - '_olnucopy': {'dir': 'generic', 'kernel': 'ztrsm_lncopy_2.c', - 'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, - '_olnncopy': {'dir': 'generic', 'kernel': 'ztrsm_lncopy_2.c', - 'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, - '_outucopy': {'dir': 'generic', 'kernel': 'ztrsm_utcopy_2.c', - 'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, - '_outncopy': {'dir': 'generic', 'kernel': 'ztrsm_utcopy_2.c', - 'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, - '_oltucopy': {'dir': 'generic', 'kernel': 'ztrsm_ltcopy_2.c', - 'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, - '_oltncopy': {'dir': 'generic', 'kernel': 'ztrsm_ltcopy_2.c', - 'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + '_iunucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iunncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_ilnucopy': {'addl': ['-UOUTER', '-DLOWER', 
'-DUNIT']}, + '_ilnncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_iutucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iutncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_iltucopy': {'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + '_iltncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_ounucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_ounncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_olnucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_olnncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + '_outucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_outncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_oltucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_oltncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, }}, }, }, @@ -987,280 +784,172 @@ base_kops = [ 'modes': { 's': {'exts': { # TODO(rg): These actually use $(SGEMM_UNROLL_M) to choose the size - '_iutcopy': {'dir': 'generic', 'kernel': 'symm_ucopy_8.c', - 'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER']}, - '_iltcopy': {'dir': 'generic', 'kernel': 'symm_lcopy_8.c', - 'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER']}, - '_outcopy': {'dir': 'generic', 'kernel': 'symm_ucopy_4.c', - 'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER']}, - '_oltcopy': {'dir': 'generic', 'kernel': 'symm_lcopy_4.c', - 'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER']}, + '_iutcopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER']}, + '_iltcopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER']}, + '_outcopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER']}, + '_oltcopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER']}, }}, 'd': {'exts': { - '_iutcopy': {'dir': 'generic', 'kernel': 'symm_ucopy_4.c', - 'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER']}, - '_iltcopy': {'dir': 'generic', 'kernel': 'symm_lcopy_4.c', - 'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER']}, - '_outcopy': {'dir': 'generic', 
'kernel': 'symm_ucopy_8.c', - 'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER']}, - '_oltcopy': {'dir': 'generic', 'kernel': 'symm_lcopy_8.c', - 'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER']}, + '_iutcopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER']}, + '_iltcopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER']}, + '_outcopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER']}, + '_oltcopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER']}, }}, 'c': {'exts': { - '_iutcopy': {'dir': 'generic', 'kernel': 'zsymm_ucopy_8.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER']}, - '_iltcopy': {'dir': 'generic', 'kernel': 'zsymm_lcopy_8.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER']}, - '_outcopy': {'dir': 'generic', 'kernel': 'zsymm_ucopy_2.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER']}, - '_oltcopy': {'dir': 'generic', 'kernel': 'zsymm_lcopy_2.c', - 'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER']}, + '_iutcopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER']}, + '_iltcopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER']}, + '_outcopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER']}, + '_oltcopy': {'addl': ['-UDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER']}, }}, 'z': {'exts': { - '_iutcopy': {'dir': 'generic', 'kernel': 'zsymm_ucopy_4.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER']}, - '_iltcopy': {'dir': 'generic', 'kernel': 'zsymm_lcopy_4.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER']}, - '_outcopy': {'dir': 'generic', 'kernel': 'zsymm_ucopy_2.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-ULOWER']}, - '_oltcopy': {'dir': 'generic', 'kernel': 'zsymm_lcopy_2.c', - 'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER']}, + '_iutcopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-ULOWER']}, + '_iltcopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-UOUTER', '-DLOWER']}, + '_outcopy': {'addl': ['-DDOUBLE', 
'-DCOMPLEX', '-DOUTER', '-ULOWER']}, + '_oltcopy': {'addl': ['-DDOUBLE', '-DCOMPLEX', '-DOUTER', '-DLOWER']}, }}, }, }, { 'base': '?omatcopy_k', 'modes': { 's': {'exts': { - '_cn': {'dir': 'arm', 'kernel': 'omatcopy_cn.c', 'addl': ['-UROWM']}, - '_rn': {'dir': 'arm', 'kernel': 'omatcopy_rn.c', 'addl': ['-DROWM']}, - '_ct': {'dir': 'arm', 'kernel': 'omatcopy_ct.c', 'addl': ['-UROWM']}, - '_rt': {'dir': 'arm', 'kernel': 'omatcopy_rt.c', 'addl': ['-DROWM']}, + '_cn': {'addl': ['-UROWM']}, + '_rn': {'addl': ['-DROWM']}, + '_ct': {'addl': ['-UROWM']}, + '_rt': {'addl': ['-DROWM']}, }}, 'd': {'exts': { - '_cn': {'dir': 'arm', 'kernel': 'omatcopy_cn.c', 'addl': ['-UROWM']}, - '_rn': {'dir': 'arm', 'kernel': 'omatcopy_rn.c', 'addl': ['-DROWM']}, - '_ct': {'dir': 'arm', 'kernel': 'omatcopy_ct.c', 'addl': ['-UROWM']}, - '_rt': {'dir': 'arm', 'kernel': 'omatcopy_rt.c', 'addl': ['-DROWM']}, + '_cn': {'addl': ['-UROWM']}, + '_rn': {'addl': ['-DROWM']}, + '_ct': {'addl': ['-UROWM']}, + '_rt': {'addl': ['-DROWM']}, }}, 'c': {'exts': { - '_cn': {'dir': 'arm', 'kernel': 'zomatcopy_cn.c', 'addl': ['-UROWM', '-UCONJ']}, - '_rn': {'dir': 'arm', 'kernel': 'zomatcopy_rn.c', 'addl': ['-DROWM', '-UCONJ']}, - '_ct': {'dir': 'arm', 'kernel': 'zomatcopy_ct.c', 'addl': ['-UROWM', '-UCONJ']}, - '_rt': {'dir': 'arm', 'kernel': 'zomatcopy_rt.c', 'addl': ['-DROWM', '-UCONJ']}, - '_cnc': {'dir': 'arm', 'kernel': 'zomatcopy_cnc.c', 'addl': ['-UROWM', '-DCONJ']}, - '_rnc': {'dir': 'arm', 'kernel': 'zomatcopy_rnc.c', 'addl': ['-DROWM', '-DCONJ']}, - '_ctc': {'dir': 'arm', 'kernel': 'zomatcopy_ctc.c', 'addl': ['-UROWM', '-DCONJ']}, - '_rtc': {'dir': 'arm', 'kernel': 'zomatcopy_rtc.c', 'addl': ['-DROWM', '-DCONJ']}, + '_cn': {'addl': ['-UROWM', '-UCONJ']}, + '_rn': {'addl': ['-DROWM', '-UCONJ']}, + '_ct': {'addl': ['-UROWM', '-UCONJ']}, + '_rt': {'addl': ['-DROWM', '-UCONJ']}, + '_cnc': {'addl': ['-UROWM', '-DCONJ']}, + '_rnc': {'addl': ['-DROWM', '-DCONJ']}, + '_ctc': {'addl': ['-UROWM', '-DCONJ']}, + 
'_rtc': {'addl': ['-DROWM', '-DCONJ']}, }}, 'z': {'exts': { - '_cn': {'dir': 'arm', 'kernel': 'zomatcopy_cn.c', 'addl': ['-UROWM', '-UCONJ']}, - '_rn': {'dir': 'arm', 'kernel': 'zomatcopy_rn.c', 'addl': ['-DROWM', '-UCONJ']}, - '_ct': {'dir': 'arm', 'kernel': 'zomatcopy_ct.c', 'addl': ['-UROWM', '-UCONJ']}, - '_rt': {'dir': 'arm', 'kernel': 'zomatcopy_rt.c', 'addl': ['-DROWM', '-UCONJ']}, - '_cnc': {'dir': 'arm', 'kernel': 'zomatcopy_cnc.c', 'addl': ['-UROWM', '-DCONJ']}, - '_rnc': {'dir': 'arm', 'kernel': 'zomatcopy_rnc.c', 'addl': ['-DROWM', '-DCONJ']}, - '_ctc': {'dir': 'arm', 'kernel': 'zomatcopy_ctc.c', 'addl': ['-UROWM', '-DCONJ']}, - '_rtc': {'dir': 'arm', 'kernel': 'zomatcopy_rtc.c', 'addl': ['-DROWM', '-DCONJ']}, + '_cn': {'addl': ['-UROWM', '-UCONJ']}, + '_rn': {'addl': ['-DROWM', '-UCONJ']}, + '_ct': {'addl': ['-UROWM', '-UCONJ']}, + '_rt': {'addl': ['-DROWM', '-UCONJ']}, + '_cnc': {'addl': ['-UROWM', '-DCONJ']}, + '_rnc': {'addl': ['-DROWM', '-DCONJ']}, + '_ctc': {'addl': ['-UROWM', '-DCONJ']}, + '_rtc': {'addl': ['-DROWM', '-DCONJ']}, }}, }, }, { 'base': '?imatcopy_k', 'modes': { 's': {'exts': { - '_cn': {'dir': 'generic', 'kernel': 'imatcopy_cn.c', 'addl': ['-UROWM']}, - '_rn': {'dir': 'generic', 'kernel': 'imatcopy_rn.c', 'addl': ['-DROWM']}, - '_ct': {'dir': 'generic', 'kernel': 'imatcopy_ct.c', 'addl': ['-UROWM']}, - '_rt': {'dir': 'generic', 'kernel': 'imatcopy_rt.c', 'addl': ['-DROWM']}, + '_cn': {'addl': ['-UROWM']}, + '_rn': {'addl': ['-DROWM']}, + '_ct': {'addl': ['-UROWM']}, + '_rt': {'addl': ['-DROWM']}, }}, 'd': {'exts': { - '_cn': {'dir': 'generic', 'kernel': 'imatcopy_cn.c', 'addl': ['-UROWM']}, - '_rn': {'dir': 'generic', 'kernel': 'imatcopy_rn.c', 'addl': ['-DROWM']}, - '_ct': {'dir': 'generic', 'kernel': 'imatcopy_ct.c', 'addl': ['-UROWM']}, - '_rt': {'dir': 'generic', 'kernel': 'imatcopy_rt.c', 'addl': ['-DROWM']}, + '_cn': {'addl': ['-UROWM']}, + '_rn': {'addl': ['-DROWM']}, + '_ct': {'addl': ['-UROWM']}, + '_rt': {'addl': 
['-DROWM']}, }}, 'c': {'exts': { - '_cn': {'dir': 'generic', 'kernel': 'zimatcopy_cn.c', 'addl': ['-UROWM', '-UCONJ']}, - '_rn': {'dir': 'generic', 'kernel': 'zimatcopy_rn.c', 'addl': ['-DROWM', '-UCONJ']}, - '_ct': {'dir': 'generic', 'kernel': 'zimatcopy_ct.c', 'addl': ['-UROWM', '-UCONJ']}, - '_rt': {'dir': 'generic', 'kernel': 'zimatcopy_rt.c', 'addl': ['-DROWM', '-UCONJ']}, - '_cnc': {'dir': 'generic', 'kernel': 'zimatcopy_cnc.c', 'addl': ['-UROWM', '-DCONJ']}, - '_rnc': {'dir': 'generic', 'kernel': 'zimatcopy_rnc.c', 'addl': ['-DROWM', '-DCONJ']}, - '_ctc': {'dir': 'generic', 'kernel': 'zimatcopy_ctc.c', 'addl': ['-UROWM', '-DCONJ']}, - '_rtc': {'dir': 'generic', 'kernel': 'zimatcopy_rtc.c', 'addl': ['-DROWM', '-DCONJ']}, + '_cn': {'addl': ['-UROWM', '-UCONJ']}, + '_rn': {'addl': ['-DROWM', '-UCONJ']}, + '_ct': {'addl': ['-UROWM', '-UCONJ']}, + '_rt': {'addl': ['-DROWM', '-UCONJ']}, + '_cnc': {'addl': ['-UROWM', '-DCONJ']}, + '_rnc': {'addl': ['-DROWM', '-DCONJ']}, + '_ctc': {'addl': ['-UROWM', '-DCONJ']}, + '_rtc': {'addl': ['-DROWM', '-DCONJ']}, }}, 'z': {'exts': { - '_cn': {'dir': 'generic', 'kernel': 'zimatcopy_cn.c', 'addl': ['-UROWM', '-UCONJ']}, - '_rn': {'dir': 'generic', 'kernel': 'zimatcopy_rn.c', 'addl': ['-DROWM', '-UCONJ']}, - '_ct': {'dir': 'generic', 'kernel': 'zimatcopy_ct.c', 'addl': ['-UROWM', '-UCONJ']}, - '_rt': {'dir': 'generic', 'kernel': 'zimatcopy_rt.c', 'addl': ['-DROWM', '-UCONJ']}, - '_cnc': {'dir': 'generic', 'kernel': 'zimatcopy_cnc.c', 'addl': ['-UROWM', '-DCONJ']}, - '_rnc': {'dir': 'generic', 'kernel': 'zimatcopy_rnc.c', 'addl': ['-DROWM', '-DCONJ']}, - '_ctc': {'dir': 'generic', 'kernel': 'zimatcopy_ctc.c', 'addl': ['-UROWM', '-DCONJ']}, - '_rtc': {'dir': 'generic', 'kernel': 'zimatcopy_rtc.c', 'addl': ['-DROWM', '-DCONJ']}, + '_cn': {'addl': ['-UROWM', '-UCONJ']}, + '_rn': {'addl': ['-DROWM', '-UCONJ']}, + '_ct': {'addl': ['-UROWM', '-UCONJ']}, + '_rt': {'addl': ['-DROWM', '-UCONJ']}, + '_cnc': {'addl': ['-UROWM', '-DCONJ']}, 
+ '_rnc': {'addl': ['-DROWM', '-DCONJ']}, + '_ctc': {'addl': ['-UROWM', '-DCONJ']}, + '_rtc': {'addl': ['-DROWM', '-DCONJ']}, }}, }, }, { 'base': '?geadd', 'modes': { - 's': {'exts': {'_k': {'dir': 'generic', 'kernel': 'geadd.c', 'addl': ['-UROWM']}}}, - 'd': {'exts': {'_k': {'dir': 'generic', 'kernel': 'geadd.c', 'addl': ['-UROWM']}}}, - 'c': {'exts': {'_k': {'dir': 'generic', 'kernel': 'zgeadd.c', 'addl': ['-UROWM']}}}, - 'z': {'exts': {'_k': {'dir': 'generic', 'kernel': 'zgeadd.c', 'addl': ['-UROWM']}}}, + 's': {'exts': {'_k': {'addl': ['-UROWM']}}}, + 'd': {'exts': {'_k': {'addl': ['-UROWM']}}}, + 'c': {'exts': {'_k': {'addl': ['-UROWM']}}}, + 'z': {'exts': {'_k': {'addl': ['-UROWM']}}}, }, }, { 'base': '?gemm_small_kernel', 'modes': { 's': { 'exts': { - '_nn': {'dir': 'generic', - 'kernel': 'gemm_small_matrix_kernel_nn.c', - }, - '_nt': {'dir': 'generic', - 'kernel': 'gemm_small_matrix_kernel_nt.c', - }, - '_tn': {'dir': 'generic', - 'kernel': 'gemm_small_matrix_kernel_tn.c', - }, - '_tt': {'dir': 'generic', - 'kernel': 'gemm_small_matrix_kernel_tt.c', - }, - # '_b0_nn': {'dir': 'generic', - # 'kernel': 'gemm_small_matrix_kernel_nn.c', - # 'addl': ['-DB0'], - # }, - # '_b0_nt': {'dir': 'generic', - # 'kernel': 'gemm_small_matrix_kernel_nt.c', - # 'addl': ['-DB0'], - # }, - # '_b0_tn': {'dir': 'generic', - # 'kernel': 'gemm_small_matrix_kernel_tn.c', - # 'addl': ['-DB0'], - # }, - # '_b0_tt': {'dir': 'generic', - # 'kernel': 'gemm_small_matrix_kernel_tt.c', - # 'addl': ['-DB0'], - # }, + '_nn': {}, + '_nt': {}, + '_tn': {}, + '_tt': {}, + # '_b0_nn': {'addl': ['-DB0']}, + # '_b0_nt': {'addl': ['-DB0']}, + # '_b0_tn': {'addl': ['-DB0']}, + # '_b0_tt': {'addl': ['-DB0']}, } }, 'd': { 'exts': { - '_nn': {'dir': 'generic', - 'kernel': 'gemm_small_matrix_kernel_nn.c', - }, - '_nt': {'dir': 'generic', - 'kernel': 'gemm_small_matrix_kernel_nt.c', - }, - '_tn': {'dir': 'generic', - 'kernel': 'gemm_small_matrix_kernel_tn.c', - }, - '_tt': {'dir': 'generic', - 'kernel': 
'gemm_small_matrix_kernel_tt.c', - }, + '_nn': {}, + '_nt': {}, + '_tn': {}, + '_tt': {}, } }, 'c': { 'exts': { - '_nn': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nn.c', - }, - '_nr': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nn.c', - }, - '_rn': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nn.c', - }, - '_rr': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nn.c', - }, - '_nt': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nt.c', - }, - '_nc': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nt.c', - }, - '_rt': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nt.c', - }, - '_rc': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nt.c', - }, - '_tn': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tn.c', - }, - '_tr': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tn.c', - }, - '_cn': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tn.c', - }, - '_cr': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tn.c', - }, - '_tt': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tt.c', - }, - '_tc': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tt.c', - }, - '_ct': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tt.c', - }, - '_cc': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tt.c', - }, + '_nn': {}, + '_nr': {}, + '_rn': {}, + '_rr': {}, + '_nt': {}, + '_nc': {}, + '_rt': {}, + '_rc': {}, + '_tn': {}, + '_tr': {}, + '_cn': {}, + '_cr': {}, + '_tt': {}, + '_tc': {}, + '_ct': {}, + '_cc': {}, } }, 'z': { 'exts': { - '_nn': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nn.c', - }, - '_nr': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nn.c', - }, - '_rn': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nn.c', - }, - '_rr': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nn.c', - }, - '_nt': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nt.c', - 
}, - '_nc': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nt.c', - }, - '_rt': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nt.c', - }, - '_rc': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nt.c', - }, - '_tn': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tn.c', - }, - '_tr': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tn.c', - }, - '_cn': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tn.c', - }, - '_cr': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tn.c', - }, - '_tt': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tt.c', - }, - '_tc': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tt.c', - }, - '_ct': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tt.c', - }, - '_cc': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tt.c', - }, + '_nn': {}, + '_nr': {}, + '_rn': {}, + '_rr': {}, + '_nt': {}, + '_nc': {}, + '_rt': {}, + '_rc': {}, + '_tn': {}, + '_tr': {}, + '_cn': {}, + '_cr': {}, + '_tt': {}, + '_tc': {}, + '_ct': {}, + '_cc': {}, } }, }, @@ -1269,144 +958,84 @@ base_kops = [ 'modes': { 's': { 'exts': { - '_nn': {'dir': 'generic', - 'kernel': 'gemm_small_matrix_kernel_nn.c', - }, - '_nt': {'dir': 'generic', - 'kernel': 'gemm_small_matrix_kernel_nt.c', - }, - '_tn': {'dir': 'generic', - 'kernel': 'gemm_small_matrix_kernel_tn.c', - }, - '_tt': {'dir': 'generic', - 'kernel': 'gemm_small_matrix_kernel_tt.c', - }, + '_nn': {}, + '_nt': {}, + '_tn': {}, + '_tt': {}, } }, 'd': { 'exts': { - '_nn': {'dir': 'generic', - 'kernel': 'gemm_small_matrix_kernel_nn.c', - }, - '_nt': {'dir': 'generic', - 'kernel': 'gemm_small_matrix_kernel_nt.c', - }, - '_tn': {'dir': 'generic', - 'kernel': 'gemm_small_matrix_kernel_tn.c', - }, - '_tt': {'dir': 'generic', - 'kernel': 'gemm_small_matrix_kernel_tt.c', - }, + '_nn': {}, + '_nt': {}, + '_tn': {}, + '_tt': {}, } }, 'c': { 'exts': { - '_nn': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nn.c', 
- }, - '_nr': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nn.c', - }, - '_rn': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nn.c', - }, - '_rr': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nn.c', - }, - '_nt': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nt.c', - }, - '_nc': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nt.c', - }, - '_rt': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nt.c', - }, - '_rc': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nt.c', - }, - '_tn': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tn.c', - }, - '_tr': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tn.c', - }, - '_cn': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tn.c', - }, - '_cr': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tn.c', - }, - '_tt': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tt.c', - }, - '_tc': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tt.c', - }, - '_ct': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tt.c', - }, - '_cc': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tt.c', - }, + '_nn': {}, + '_nr': {}, + '_rn': {}, + '_rr': {}, + '_nt': {}, + '_nc': {}, + '_rt': {}, + '_rc': {}, + '_tn': {}, + '_tr': {}, + '_cn': {}, + '_cr': {}, + '_tt': {}, + '_tc': {}, + '_ct': {}, + '_cc': {}, } }, 'z': { 'exts': { - '_nn': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nn.c', - }, - '_nr': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nn.c', - }, - '_rn': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nn.c', - }, - '_rr': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nn.c', - }, - '_nt': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nt.c', - }, - '_nc': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nt.c', - }, - '_rt': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_nt.c', - }, - '_rc': {'dir': 
'generic', - 'kernel': 'zgemm_small_matrix_kernel_nt.c', - }, - '_tn': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tn.c', - }, - '_tr': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tn.c', - }, - '_cn': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tn.c', - }, - '_cr': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tn.c', - }, - '_tt': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tt.c', - }, - '_tc': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tt.c', - }, - '_ct': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tt.c', - }, - '_cc': {'dir': 'generic', - 'kernel': 'zgemm_small_matrix_kernel_tt.c', - }, + '_nn': {}, + '_nr': {}, + '_rn': {}, + '_rr': {}, + '_nt': {}, + '_nc': {}, + '_rt': {}, + '_rc': {}, + '_tn': {}, + '_tr': {}, + '_cn': {}, + '_cr': {}, + '_tt': {}, + '_tc': {}, + '_ct': {}, + '_cc': {}, } }, }, }, ] +search_order = [base_dict] + +if conf_hdat.has('ARCH_X86_64') + search_order = [x86_64_base_dict] + search_order + if conf_hdat.has('HASWELL') + search_order = [x86_64_haswell_dict] + search_order + elif conf_hdat.has('SKYLAKEX') + search_order = [x86_64_skylakex_dict, x86_64_haswell_dict] + search_order + elif conf_hdat.has('ZEN') + search_order = [x86_64_zen_dict] + search_order + elif conf_hdat.has('SANDYBRIDGE') + search_order = [x86_64_sandybridge_dict] + search_order + endif +elif conf_hdat.has('ARCH_ARM64') + search_order = [arm64_base_dict] + search_order + if conf_hdat.has('ARMV8') + search_order = [arm64_armv8_dict] + search_order + endif +endif + kernel_confs = [] foreach _kop : base_kops base = _kop['base'] @@ -1479,7 +1108,21 @@ foreach _kop : base_kops endforeach endif - src = join_paths(extdat['dir'], extdat['kernel']) + if base == '?gemm3m' and conf_hdat.has('ARCH_ARM64') + continue + endif + + src = '' + foreach dict : search_order + if dict.has_key(base) and dict[base].has_key(mode) and dict[base][mode].has_key(ext) + src = dict[base][mode][ext] + 
break + endif + endforeach + if src == '' + error(f'Missing src file for @base@ @mode@ @ext@', search_order, conf_hdat.keys()) + endif + if extdat.has_key('addl') _ext_cargs += extdat['addl'] endif diff --git a/kernel/meson_base/meson.build b/kernel/meson_base/meson.build new file mode 100644 index 000000000..943e00c32 --- /dev/null +++ b/kernel/meson_base/meson.build @@ -0,0 +1,513 @@ +base_dict = { + # Level 1 BLAS + '?axpy': { + 's': { + '_k': 'arm/axpy.c', + }, + 'd': { + '_k': 'arm/axpy.c', + }, + 'c': { + '_k': 'arm/zaxpy.c', + }, + 'z': { + '_k': 'arm/zaxpy.c', + }, + }, + '?axpby': { + 's': { + '_k': 'arm/axpby.c', + }, + 'd': { + '_k': 'arm/axpby.c', + }, + 'c': { + '_k': 'arm/zaxpby.c', + }, + 'z': { + '_k': 'arm/zaxpby.c', + }, + }, + # Level 2 BLAS + '?symv': { + 's': { + '_U': 'generic/symv_k.c', + '_L': 'generic/symv_k.c', + }, + 'd': { + '_U': 'generic/symv_k.c', + '_L': 'generic/symv_k.c', + }, + 'c': { + '_U': 'generic/zsymv_k.c', + '_L': 'generic/zsymv_k.c', + }, + 'z': { + '_U': 'generic/zsymv_k.c', + '_L': 'generic/zsymv_k.c', + }, + }, + '?ger': { + 's': { + '_k': 'generic/ger.c', + }, + 'd': { + '_k': 'generic/ger.c', + }, + }, + '?geru': { + 'c': { + '_k': 'generic/zger.c', + }, + 'z': { + '_k': 'generic/zger.c', + }, + }, + '?gerc': { + 'c': { + '_k': 'generic/zger.c', + }, + 'z': { + '_k': 'generic/zger.c', + }, + }, + '?gerv': { + 'c': { + '_k': 'generic/zger.c', + }, + 'z': { + '_k': 'generic/zger.c', + }, + }, + '?hemv': { + 'c': { + '_U': 'generic/zhemv_k.c', + '_L': 'generic/zhemv_k.c', + '_V': 'generic/zhemv_k.c', + '_M': 'generic/zhemv_k.c', + }, + 'z': { + '_U': 'generic/zhemv_k.c', + '_L': 'generic/zhemv_k.c', + '_V': 'generic/zhemv_k.c', + '_M': 'generic/zhemv_k.c', + }, + }, + # Level 3 BLAS + '?gemm': { + 's': { + '_direct': 'x86_64/sgemm_direct_skylakex.c', + '_direct_performant': 'x86_64/sgemm_direct_performant.c', + '_small_matrix_permit': 'generic/gemm_small_matrix_permit.c', + }, + 'd': { + '_small_matrix_permit': 
'generic/gemm_small_matrix_permit.c', + }, + 'c': { + '_small_matrix_permit': 'generic/zgemm_small_matrix_permit.c', + }, + 'z': { + '_small_matrix_permit': 'generic/zgemm_small_matrix_permit.c', + }, + }, + '?gemm3m': { + 'c': { + '_oncopyb': f'generic/zgemm3m_ncopy_@CGEMM3M_UNROLL_N@.c', + '_oncopyi': f'generic/zgemm3m_ncopy_@CGEMM3M_UNROLL_N@.c', + '_oncopyr': f'generic/zgemm3m_ncopy_@CGEMM3M_UNROLL_N@.c', + '_otcopyb': f'generic/zgemm3m_tcopy_@CGEMM3M_UNROLL_N@.c', + '_otcopyr': f'generic/zgemm3m_tcopy_@CGEMM3M_UNROLL_N@.c', + '_otcopyi': f'generic/zgemm3m_tcopy_@CGEMM3M_UNROLL_N@.c', + '_incopyb': f'generic/zgemm3m_ncopy_@CGEMM3M_UNROLL_M@.c', + '_incopyr': f'generic/zgemm3m_ncopy_@CGEMM3M_UNROLL_M@.c', + '_incopyi': f'generic/zgemm3m_ncopy_@CGEMM3M_UNROLL_M@.c', + '_itcopyb': f'generic/zgemm3m_tcopy_@CGEMM3M_UNROLL_M@.c', + '_itcopyr': f'generic/zgemm3m_tcopy_@CGEMM3M_UNROLL_M@.c', + '_itcopyi': f'generic/zgemm3m_tcopy_@CGEMM3M_UNROLL_M@.c', + }, + 'z': { + '_oncopyb': f'generic/zgemm3m_ncopy_@ZGEMM3M_UNROLL_N@.c', + '_oncopyi': f'generic/zgemm3m_ncopy_@ZGEMM3M_UNROLL_N@.c', + '_oncopyr': f'generic/zgemm3m_ncopy_@ZGEMM3M_UNROLL_N@.c', + '_otcopyb': f'generic/zgemm3m_tcopy_@ZGEMM3M_UNROLL_N@.c', + '_otcopyi': f'generic/zgemm3m_tcopy_@ZGEMM3M_UNROLL_N@.c', + '_otcopyr': f'generic/zgemm3m_tcopy_@ZGEMM3M_UNROLL_N@.c', + '_incopyb': f'generic/zgemm3m_ncopy_@ZGEMM3M_UNROLL_M@.c', + '_incopyi': f'generic/zgemm3m_ncopy_@ZGEMM3M_UNROLL_M@.c', + '_incopyr': f'generic/zgemm3m_ncopy_@ZGEMM3M_UNROLL_M@.c', + '_itcopyb': f'generic/zgemm3m_tcopy_@ZGEMM3M_UNROLL_M@.c', + '_itcopyi': f'generic/zgemm3m_tcopy_@ZGEMM3M_UNROLL_M@.c', + '_itcopyr': f'generic/zgemm3m_tcopy_@ZGEMM3M_UNROLL_M@.c', + } + }, + '?trmm': { + 's': { + '_iunucopy': f'generic/trmm_uncopy_@SGEMM_UNROLL_M@.c', + '_iunncopy': f'generic/trmm_uncopy_@SGEMM_UNROLL_M@.c', + '_ilnucopy': f'generic/trmm_lncopy_@SGEMM_UNROLL_M@.c', + '_ilnncopy': f'generic/trmm_lncopy_@SGEMM_UNROLL_M@.c', + '_iutucopy': 
f'generic/trmm_utcopy_@SGEMM_UNROLL_M@.c', + '_iutncopy': f'generic/trmm_utcopy_@SGEMM_UNROLL_M@.c', + '_iltucopy': f'generic/trmm_ltcopy_@SGEMM_UNROLL_M@.c', + '_iltncopy': f'generic/trmm_ltcopy_@SGEMM_UNROLL_M@.c', + '_ounucopy': f'generic/trmm_uncopy_@SGEMM_UNROLL_N@.c', + '_ounncopy': f'generic/trmm_uncopy_@SGEMM_UNROLL_N@.c', + '_olnucopy': f'generic/trmm_lncopy_@SGEMM_UNROLL_N@.c', + '_olnncopy': f'generic/trmm_lncopy_@SGEMM_UNROLL_N@.c', + '_outucopy': f'generic/trmm_utcopy_@SGEMM_UNROLL_N@.c', + '_outncopy': f'generic/trmm_utcopy_@SGEMM_UNROLL_N@.c', + '_oltucopy': f'generic/trmm_ltcopy_@SGEMM_UNROLL_N@.c', + '_oltncopy': f'generic/trmm_ltcopy_@SGEMM_UNROLL_N@.c', + }, + 'd': { + '_iunucopy': f'generic/trmm_uncopy_@DGEMM_UNROLL_M@.c', + '_iunncopy': f'generic/trmm_uncopy_@DGEMM_UNROLL_M@.c', + '_ilnucopy': f'generic/trmm_lncopy_@DGEMM_UNROLL_M@.c', + '_ilnncopy': f'generic/trmm_lncopy_@DGEMM_UNROLL_M@.c', + '_iutucopy': f'generic/trmm_utcopy_@DGEMM_UNROLL_M@.c', + '_iutncopy': f'generic/trmm_utcopy_@DGEMM_UNROLL_M@.c', + '_iltucopy': f'generic/trmm_ltcopy_@DGEMM_UNROLL_M@.c', + '_iltncopy': f'generic/trmm_ltcopy_@DGEMM_UNROLL_M@.c', + '_ounucopy': f'generic/trmm_uncopy_@DGEMM_UNROLL_N@.c', + '_ounncopy': f'generic/trmm_uncopy_@DGEMM_UNROLL_N@.c', + '_olnucopy': f'generic/trmm_lncopy_@DGEMM_UNROLL_N@.c', + '_olnncopy': f'generic/trmm_lncopy_@DGEMM_UNROLL_N@.c', + '_outucopy': f'generic/trmm_utcopy_@DGEMM_UNROLL_N@.c', + '_outncopy': f'generic/trmm_utcopy_@DGEMM_UNROLL_N@.c', + '_oltucopy': f'generic/trmm_ltcopy_@DGEMM_UNROLL_N@.c', + '_oltncopy': f'generic/trmm_ltcopy_@DGEMM_UNROLL_N@.c', + }, + 'c': { + '_iunucopy': f'generic/ztrmm_uncopy_@CGEMM_UNROLL_M@.c', + '_iunncopy': f'generic/ztrmm_uncopy_@CGEMM_UNROLL_M@.c', + '_ilnucopy': f'generic/ztrmm_lncopy_@CGEMM_UNROLL_M@.c', + '_ilnncopy': f'generic/ztrmm_lncopy_@CGEMM_UNROLL_M@.c', + '_iutucopy': f'generic/ztrmm_utcopy_@CGEMM_UNROLL_M@.c', + '_iutncopy': f'generic/ztrmm_utcopy_@CGEMM_UNROLL_M@.c', + 
'_iltucopy': f'generic/ztrmm_ltcopy_@CGEMM_UNROLL_M@.c', + '_iltncopy': f'generic/ztrmm_ltcopy_@CGEMM_UNROLL_M@.c', + '_ounucopy': f'generic/ztrmm_uncopy_@CGEMM_UNROLL_N@.c', + '_ounncopy': f'generic/ztrmm_uncopy_@CGEMM_UNROLL_N@.c', + '_olnucopy': f'generic/ztrmm_lncopy_@CGEMM_UNROLL_N@.c', + '_olnncopy': f'generic/ztrmm_lncopy_@CGEMM_UNROLL_N@.c', + '_outucopy': f'generic/ztrmm_utcopy_@CGEMM_UNROLL_N@.c', + '_outncopy': f'generic/ztrmm_utcopy_@CGEMM_UNROLL_N@.c', + '_oltucopy': f'generic/ztrmm_ltcopy_@CGEMM_UNROLL_N@.c', + '_oltncopy': f'generic/ztrmm_ltcopy_@CGEMM_UNROLL_N@.c', + }, + 'z': { + '_iunucopy': f'generic/ztrmm_uncopy_@ZGEMM_UNROLL_M@.c', + '_iunncopy': f'generic/ztrmm_uncopy_@ZGEMM_UNROLL_M@.c', + '_ilnucopy': f'generic/ztrmm_lncopy_@ZGEMM_UNROLL_M@.c', + '_ilnncopy': f'generic/ztrmm_lncopy_@ZGEMM_UNROLL_M@.c', + '_iutucopy': f'generic/ztrmm_utcopy_@ZGEMM_UNROLL_M@.c', + '_iutncopy': f'generic/ztrmm_utcopy_@ZGEMM_UNROLL_M@.c', + '_iltucopy': f'generic/ztrmm_ltcopy_@ZGEMM_UNROLL_M@.c', + '_iltncopy': f'generic/ztrmm_ltcopy_@ZGEMM_UNROLL_M@.c', + '_ounucopy': f'generic/ztrmm_uncopy_@ZGEMM_UNROLL_N@.c', + '_ounncopy': f'generic/ztrmm_uncopy_@ZGEMM_UNROLL_N@.c', + '_olnucopy': f'generic/ztrmm_lncopy_@ZGEMM_UNROLL_N@.c', + '_olnncopy': f'generic/ztrmm_lncopy_@ZGEMM_UNROLL_N@.c', + '_outucopy': f'generic/ztrmm_utcopy_@ZGEMM_UNROLL_N@.c', + '_outncopy': f'generic/ztrmm_utcopy_@ZGEMM_UNROLL_N@.c', + '_oltucopy': f'generic/ztrmm_ltcopy_@ZGEMM_UNROLL_N@.c', + '_oltncopy': f'generic/ztrmm_ltcopy_@ZGEMM_UNROLL_N@.c', + }, + }, + '?hemm': { + 'c': { + '_iutcopy': f'generic/zhemm_utcopy_@CGEMM_UNROLL_M@.c', + '_iltcopy': f'generic/zhemm_ltcopy_@CGEMM_UNROLL_M@.c', + '_outcopy': f'generic/zhemm_utcopy_@CGEMM_UNROLL_N@.c', + '_oltcopy': f'generic/zhemm_ltcopy_@CGEMM_UNROLL_N@.c', + }, + 'z': { + '_iutcopy': f'generic/zhemm_utcopy_@ZGEMM_UNROLL_M@.c', + '_iltcopy': f'generic/zhemm_ltcopy_@ZGEMM_UNROLL_M@.c', + '_outcopy': f'generic/zhemm_utcopy_@ZGEMM_UNROLL_N@.c', + 
'_oltcopy': f'generic/zhemm_ltcopy_@ZGEMM_UNROLL_N@.c', + }, + }, + '?trsm': { + 's': { + '_iunucopy': f'generic/trsm_uncopy_@SGEMM_UNROLL_M@.c', + '_iunncopy': f'generic/trsm_uncopy_@SGEMM_UNROLL_M@.c', + '_ilnucopy': f'generic/trsm_lncopy_@SGEMM_UNROLL_M@.c', + '_ilnncopy': f'generic/trsm_lncopy_@SGEMM_UNROLL_M@.c', + '_iutucopy': f'generic/trsm_utcopy_@SGEMM_UNROLL_M@.c', + '_iutncopy': f'generic/trsm_utcopy_@SGEMM_UNROLL_M@.c', + '_iltucopy': f'generic/trsm_ltcopy_@SGEMM_UNROLL_M@.c', + '_iltncopy': f'generic/trsm_ltcopy_@SGEMM_UNROLL_M@.c', + '_ounucopy': f'generic/trsm_uncopy_@SGEMM_UNROLL_N@.c', + '_ounncopy': f'generic/trsm_uncopy_@SGEMM_UNROLL_N@.c', + '_olnucopy': f'generic/trsm_lncopy_@SGEMM_UNROLL_N@.c', + '_olnncopy': f'generic/trsm_lncopy_@SGEMM_UNROLL_N@.c', + '_outucopy': f'generic/trsm_utcopy_@SGEMM_UNROLL_N@.c', + '_outncopy': f'generic/trsm_utcopy_@SGEMM_UNROLL_N@.c', + '_oltucopy': f'generic/trsm_ltcopy_@SGEMM_UNROLL_N@.c', + '_oltncopy': f'generic/trsm_ltcopy_@SGEMM_UNROLL_N@.c', + }, + 'd': { + '_iunucopy': f'generic/trsm_uncopy_@DGEMM_UNROLL_M@.c', + '_iunncopy': f'generic/trsm_uncopy_@DGEMM_UNROLL_M@.c', + '_ilnucopy': f'generic/trsm_lncopy_@DGEMM_UNROLL_M@.c', + '_ilnncopy': f'generic/trsm_lncopy_@DGEMM_UNROLL_M@.c', + '_iutucopy': f'generic/trsm_utcopy_@DGEMM_UNROLL_M@.c', + '_iutncopy': f'generic/trsm_utcopy_@DGEMM_UNROLL_M@.c', + '_iltucopy': f'generic/trsm_ltcopy_@DGEMM_UNROLL_M@.c', + '_iltncopy': f'generic/trsm_ltcopy_@DGEMM_UNROLL_M@.c', + '_ounucopy': f'generic/trsm_uncopy_@DGEMM_UNROLL_N@.c', + '_ounncopy': f'generic/trsm_uncopy_@DGEMM_UNROLL_N@.c', + '_olnucopy': f'generic/trsm_lncopy_@DGEMM_UNROLL_N@.c', + '_olnncopy': f'generic/trsm_lncopy_@DGEMM_UNROLL_N@.c', + '_outucopy': f'generic/trsm_utcopy_@DGEMM_UNROLL_N@.c', + '_outncopy': f'generic/trsm_utcopy_@DGEMM_UNROLL_N@.c', + '_oltucopy': f'generic/trsm_ltcopy_@DGEMM_UNROLL_N@.c', + '_oltncopy': f'generic/trsm_ltcopy_@DGEMM_UNROLL_N@.c', + }, + 'c': { + '_iunucopy': 
f'generic/ztrsm_uncopy_@CGEMM_UNROLL_M@.c', + '_iunncopy': f'generic/ztrsm_uncopy_@CGEMM_UNROLL_M@.c', + '_ilnucopy': f'generic/ztrsm_lncopy_@CGEMM_UNROLL_M@.c', + '_ilnncopy': f'generic/ztrsm_lncopy_@CGEMM_UNROLL_M@.c', + '_iutucopy': f'generic/ztrsm_utcopy_@CGEMM_UNROLL_M@.c', + '_iutncopy': f'generic/ztrsm_utcopy_@CGEMM_UNROLL_M@.c', + '_iltucopy': f'generic/ztrsm_ltcopy_@CGEMM_UNROLL_M@.c', + '_iltncopy': f'generic/ztrsm_ltcopy_@CGEMM_UNROLL_M@.c', + '_ounucopy': f'generic/ztrsm_uncopy_@CGEMM_UNROLL_N@.c', + '_ounncopy': f'generic/ztrsm_uncopy_@CGEMM_UNROLL_N@.c', + '_olnucopy': f'generic/ztrsm_lncopy_@CGEMM_UNROLL_N@.c', + '_olnncopy': f'generic/ztrsm_lncopy_@CGEMM_UNROLL_N@.c', + '_outucopy': f'generic/ztrsm_utcopy_@CGEMM_UNROLL_N@.c', + '_outncopy': f'generic/ztrsm_utcopy_@CGEMM_UNROLL_N@.c', + '_oltucopy': f'generic/ztrsm_ltcopy_@CGEMM_UNROLL_N@.c', + '_oltncopy': f'generic/ztrsm_ltcopy_@CGEMM_UNROLL_N@.c', + }, + 'z': { + '_iunucopy': f'generic/ztrsm_uncopy_@ZGEMM_UNROLL_M@.c', + '_iunncopy': f'generic/ztrsm_uncopy_@ZGEMM_UNROLL_M@.c', + '_ilnucopy': f'generic/ztrsm_lncopy_@ZGEMM_UNROLL_M@.c', + '_ilnncopy': f'generic/ztrsm_lncopy_@ZGEMM_UNROLL_M@.c', + '_iutucopy': f'generic/ztrsm_utcopy_@ZGEMM_UNROLL_M@.c', + '_iutncopy': f'generic/ztrsm_utcopy_@ZGEMM_UNROLL_M@.c', + '_iltucopy': f'generic/ztrsm_ltcopy_@ZGEMM_UNROLL_M@.c', + '_iltncopy': f'generic/ztrsm_ltcopy_@ZGEMM_UNROLL_M@.c', + '_ounucopy': f'generic/ztrsm_uncopy_@ZGEMM_UNROLL_N@.c', + '_ounncopy': f'generic/ztrsm_uncopy_@ZGEMM_UNROLL_N@.c', + '_olnucopy': f'generic/ztrsm_lncopy_@ZGEMM_UNROLL_N@.c', + '_olnncopy': f'generic/ztrsm_lncopy_@ZGEMM_UNROLL_N@.c', + '_outucopy': f'generic/ztrsm_utcopy_@ZGEMM_UNROLL_N@.c', + '_outncopy': f'generic/ztrsm_utcopy_@ZGEMM_UNROLL_N@.c', + '_oltucopy': f'generic/ztrsm_ltcopy_@ZGEMM_UNROLL_N@.c', + '_oltncopy': f'generic/ztrsm_ltcopy_@ZGEMM_UNROLL_N@.c', + }, + }, + '?symm': { + 's': { + '_iutcopy': f'generic/symm_ucopy_@SGEMM_UNROLL_M@.c', + '_iltcopy': 
f'generic/symm_lcopy_@SGEMM_UNROLL_M@.c', + '_outcopy': f'generic/symm_ucopy_@SGEMM_UNROLL_N@.c', + '_oltcopy': f'generic/symm_lcopy_@SGEMM_UNROLL_N@.c', + }, + 'd': { + '_iutcopy': f'generic/symm_ucopy_@DGEMM_UNROLL_M@.c', + '_iltcopy': f'generic/symm_lcopy_@DGEMM_UNROLL_M@.c', + '_outcopy': f'generic/symm_ucopy_@DGEMM_UNROLL_N@.c', + '_oltcopy': f'generic/symm_lcopy_@DGEMM_UNROLL_N@.c', + }, + 'c': { + '_iutcopy': f'generic/zsymm_ucopy_@CGEMM_UNROLL_M@.c', + '_iltcopy': f'generic/zsymm_lcopy_@CGEMM_UNROLL_M@.c', + '_outcopy': f'generic/zsymm_ucopy_@CGEMM_UNROLL_N@.c', + '_oltcopy': f'generic/zsymm_lcopy_@CGEMM_UNROLL_N@.c', + }, + 'z': { + '_iutcopy': f'generic/zsymm_ucopy_@ZGEMM_UNROLL_M@.c', + '_iltcopy': f'generic/zsymm_lcopy_@ZGEMM_UNROLL_M@.c', + '_outcopy': f'generic/zsymm_ucopy_@ZGEMM_UNROLL_N@.c', + '_oltcopy': f'generic/zsymm_lcopy_@ZGEMM_UNROLL_N@.c', + }, + }, + '?omatcopy_k': { + 's': { + '_cn': 'arm/omatcopy_cn.c', + '_rn': 'arm/omatcopy_rn.c', + '_ct': 'arm/omatcopy_ct.c', + '_rt': 'arm/omatcopy_rt.c', + }, + 'd': { + '_cn': 'arm/omatcopy_cn.c', + '_rn': 'arm/omatcopy_rn.c', + '_ct': 'arm/omatcopy_ct.c', + '_rt': 'arm/omatcopy_rt.c', + }, + 'c': { + '_cn': 'arm/zomatcopy_cn.c', + '_rn': 'arm/zomatcopy_rn.c', + '_ct': 'arm/zomatcopy_ct.c', + '_rt': 'arm/zomatcopy_rt.c', + '_cnc': 'arm/zomatcopy_cnc.c', + '_rnc': 'arm/zomatcopy_rnc.c', + '_ctc': 'arm/zomatcopy_ctc.c', + '_rtc': 'arm/zomatcopy_rtc.c', + }, + 'z': { + '_cn': 'arm/zomatcopy_cn.c', + '_rn': 'arm/zomatcopy_rn.c', + '_ct': 'arm/zomatcopy_ct.c', + '_rt': 'arm/zomatcopy_rt.c', + '_cnc': 'arm/zomatcopy_cnc.c', + '_rnc': 'arm/zomatcopy_rnc.c', + '_ctc': 'arm/zomatcopy_ctc.c', + '_rtc': 'arm/zomatcopy_rtc.c', + }, + }, + '?imatcopy_k': { + 's': { + '_cn': 'generic/imatcopy_cn.c', + '_rn': 'generic/imatcopy_rn.c', + '_ct': 'generic/imatcopy_ct.c', + '_rt': 'generic/imatcopy_rt.c', + }, + 'd': { + '_cn': 'generic/imatcopy_cn.c', + '_rn': 'generic/imatcopy_rn.c', + '_ct': 'generic/imatcopy_ct.c', + 
'_rt': 'generic/imatcopy_rt.c', + }, + 'c': { + '_cn': 'generic/zimatcopy_cn.c', + '_rn': 'generic/zimatcopy_rn.c', + '_ct': 'generic/zimatcopy_ct.c', + '_rt': 'generic/zimatcopy_rt.c', + '_cnc': 'generic/zimatcopy_cnc.c', + '_rnc': 'generic/zimatcopy_rnc.c', + '_ctc': 'generic/zimatcopy_ctc.c', + '_rtc': 'generic/zimatcopy_rtc.c', + }, + 'z': { + '_cn': 'generic/zimatcopy_cn.c', + '_rn': 'generic/zimatcopy_rn.c', + '_ct': 'generic/zimatcopy_ct.c', + '_rt': 'generic/zimatcopy_rt.c', + '_cnc': 'generic/zimatcopy_cnc.c', + '_rnc': 'generic/zimatcopy_rnc.c', + '_ctc': 'generic/zimatcopy_ctc.c', + '_rtc': 'generic/zimatcopy_rtc.c', + }, + }, + '?geadd': { + 's': { + '_k': 'generic/geadd.c', + }, + 'd': { + '_k': 'generic/geadd.c', + }, + 'c': { + '_k': 'generic/zgeadd.c', + }, + 'z': { + '_k': 'generic/zgeadd.c', + }, + }, + '?gemm_small_kernel': { + 's': { + '_nn': 'generic/gemm_small_matrix_kernel_nn.c', + '_nt': 'generic/gemm_small_matrix_kernel_nt.c', + '_tn': 'generic/gemm_small_matrix_kernel_tn.c', + '_tt': 'generic/gemm_small_matrix_kernel_tt.c', + }, + 'd': { + '_nn': 'generic/gemm_small_matrix_kernel_nn.c', + '_nt': 'generic/gemm_small_matrix_kernel_nt.c', + '_tn': 'generic/gemm_small_matrix_kernel_tn.c', + '_tt': 'generic/gemm_small_matrix_kernel_tt.c', + }, + 'c': { + '_nn': 'generic/zgemm_small_matrix_kernel_nn.c', + '_nr': 'generic/zgemm_small_matrix_kernel_nn.c', + '_rn': 'generic/zgemm_small_matrix_kernel_nn.c', + '_rr': 'generic/zgemm_small_matrix_kernel_nn.c', + '_nt': 'generic/zgemm_small_matrix_kernel_nt.c', + '_nc': 'generic/zgemm_small_matrix_kernel_nt.c', + '_rt': 'generic/zgemm_small_matrix_kernel_nt.c', + '_rc': 'generic/zgemm_small_matrix_kernel_nt.c', + '_tn': 'generic/zgemm_small_matrix_kernel_tn.c', + '_tr': 'generic/zgemm_small_matrix_kernel_tn.c', + '_cn': 'generic/zgemm_small_matrix_kernel_tn.c', + '_cr': 'generic/zgemm_small_matrix_kernel_tn.c', + '_tt': 'generic/zgemm_small_matrix_kernel_tt.c', + '_tc': 
'generic/zgemm_small_matrix_kernel_tt.c', + '_ct': 'generic/zgemm_small_matrix_kernel_tt.c', + '_cc': 'generic/zgemm_small_matrix_kernel_tt.c', + }, + 'z': { + '_nn': 'generic/zgemm_small_matrix_kernel_nn.c', + '_nr': 'generic/zgemm_small_matrix_kernel_nn.c', + '_rn': 'generic/zgemm_small_matrix_kernel_nn.c', + '_rr': 'generic/zgemm_small_matrix_kernel_nn.c', + '_nt': 'generic/zgemm_small_matrix_kernel_nt.c', + '_nc': 'generic/zgemm_small_matrix_kernel_nt.c', + '_rt': 'generic/zgemm_small_matrix_kernel_nt.c', + '_rc': 'generic/zgemm_small_matrix_kernel_nt.c', + '_tn': 'generic/zgemm_small_matrix_kernel_tn.c', + '_tr': 'generic/zgemm_small_matrix_kernel_tn.c', + '_cn': 'generic/zgemm_small_matrix_kernel_tn.c', + '_cr': 'generic/zgemm_small_matrix_kernel_tn.c', + '_tt': 'generic/zgemm_small_matrix_kernel_tt.c', + '_tc': 'generic/zgemm_small_matrix_kernel_tt.c', + '_ct': 'generic/zgemm_small_matrix_kernel_tt.c', + '_cc': 'generic/zgemm_small_matrix_kernel_tt.c', + }, + }, + '?gemm_small_kernel_b0': { + 's': { + '_nn': 'generic/gemm_small_matrix_kernel_nn.c', + '_nt': 'generic/gemm_small_matrix_kernel_nt.c', + '_tn': 'generic/gemm_small_matrix_kernel_tn.c', + '_tt': 'generic/gemm_small_matrix_kernel_tt.c', + }, + 'd': { + '_nn': 'generic/gemm_small_matrix_kernel_nn.c', + '_nt': 'generic/gemm_small_matrix_kernel_nt.c', + '_tn': 'generic/gemm_small_matrix_kernel_tn.c', + '_tt': 'generic/gemm_small_matrix_kernel_tt.c', + }, + 'c': { + '_nn': 'generic/zgemm_small_matrix_kernel_nn.c', + '_nr': 'generic/zgemm_small_matrix_kernel_nn.c', + '_rn': 'generic/zgemm_small_matrix_kernel_nn.c', + '_rr': 'generic/zgemm_small_matrix_kernel_nn.c', + '_nt': 'generic/zgemm_small_matrix_kernel_nt.c', + '_nc': 'generic/zgemm_small_matrix_kernel_nt.c', + '_rt': 'generic/zgemm_small_matrix_kernel_nt.c', + '_rc': 'generic/zgemm_small_matrix_kernel_nt.c', + '_tn': 'generic/zgemm_small_matrix_kernel_tn.c', + '_tr': 'generic/zgemm_small_matrix_kernel_tn.c', + '_cn': 
'generic/zgemm_small_matrix_kernel_tn.c', + '_cr': 'generic/zgemm_small_matrix_kernel_tn.c', + '_tt': 'generic/zgemm_small_matrix_kernel_tt.c', + '_tc': 'generic/zgemm_small_matrix_kernel_tt.c', + '_ct': 'generic/zgemm_small_matrix_kernel_tt.c', + '_cc': 'generic/zgemm_small_matrix_kernel_tt.c', + }, + 'z': { + '_nn': 'generic/zgemm_small_matrix_kernel_nn.c', + '_nr': 'generic/zgemm_small_matrix_kernel_nn.c', + '_rn': 'generic/zgemm_small_matrix_kernel_nn.c', + '_rr': 'generic/zgemm_small_matrix_kernel_nn.c', + '_nt': 'generic/zgemm_small_matrix_kernel_nt.c', + '_nc': 'generic/zgemm_small_matrix_kernel_nt.c', + '_rt': 'generic/zgemm_small_matrix_kernel_nt.c', + '_rc': 'generic/zgemm_small_matrix_kernel_nt.c', + '_tn': 'generic/zgemm_small_matrix_kernel_tn.c', + '_tr': 'generic/zgemm_small_matrix_kernel_tn.c', + '_cn': 'generic/zgemm_small_matrix_kernel_tn.c', + '_cr': 'generic/zgemm_small_matrix_kernel_tn.c', + '_tt': 'generic/zgemm_small_matrix_kernel_tt.c', + '_tc': 'generic/zgemm_small_matrix_kernel_tt.c', + '_ct': 'generic/zgemm_small_matrix_kernel_tt.c', + '_cc': 'generic/zgemm_small_matrix_kernel_tt.c', + }, + }, +} diff --git a/kernel/x86_64/meson.build b/kernel/x86_64/meson.build index e69de29bb..ff1180c7d 100644 --- a/kernel/x86_64/meson.build +++ b/kernel/x86_64/meson.build @@ -0,0 +1,385 @@ +x86_64_base_dict = { + # Level 1 BLAS + '?rot': { + 's': { + '_k': 'x86_64/rot_sse.S', + }, + 'd': { + '_k': 'x86_64/rot_sse2.S', + }, + 'cs': { + '_k': 'x86_64/zrot_sse.S', + }, + 'zd': { + '_k': 'x86_64/zrot_sse2.S', + } + }, + '?swap': { + 's': { + '_k': 'x86_64/swap_sse.S', + }, + 'd': { + '_k': 'x86_64/swap_sse2.S', + }, + 'c': { + '_k': 'x86_64/zswap_sse.S', + }, + 'z': { + '_k': 'x86_64/zswap_sse2.S', + }, + }, + '?scal': { + 's': { + '_k': 'x86_64/scal_sse.S', + }, + 'd': { + '_k': 'x86_64/scal_sse2.S', + }, + 'c': { + '_k': 'x86_64/zscal_sse.S', + }, + 'z': { + '_k': 'x86_64/zscal_sse2.S', + }, + }, + '?copy': { + 's': { + '_k': 'x86_64/copy_sse.S', + }, + 
'd': { + '_k': 'x86_64/copy_sse2.S', + }, + 'c': { + '_k': 'x86_64/zcopy_sse.S', + }, + 'z': { + '_k': 'x86_64/zcopy_sse2.S', + }, + }, + '?axpy': { + 's': { + '_k': 'x86_64/axpy_sse.S', + }, + 'd': { + '_k': 'x86_64/axpy_sse2.S', + }, + 'c': { + '_k': 'x86_64/zaxpy_sse.S', + }, + 'z': { + '_k': 'x86_64/zaxpy_sse2.S', + }, + }, + '?axpyc': { + 'c': { + '_k': 'x86_64/zaxpy_sse.S', + }, + 'z': { + '_k': 'x86_64/zaxpy_sse2.S', + }, + }, + '?dot': { + 's': { + '_k': 'generic/dot.c', + }, + 'd': { + '_k': 'x86_64/dot_sse2.S', + }, + }, + '?dotc': { + 'c': { + '_k': 'x86_64/zdot_sse.S', + }, + 'z': { + '_k': 'x86_64/zdot_sse2.S', + }, + }, + '?dotu': { + 'c': { + '_k': 'x86_64/zdot_sse.S', + }, + 'z': { + '_k': 'x86_64/zdot_sse2.S', + }, + }, + '?dsdot': { + 's': { + '_k': 'generic/dot.c', + }, + '': { + '_k': 'generic/dot.c', + }, + }, + '?nrm2': { + 's': { + '_k': 'x86_64/nrm2_sse.S', + }, + 'd': { + '_k': 'x86_64/nrm2.S', + }, + 'c': { + '_k': 'x86_64/znrm2_sse.S', + }, + 'z': { + '_k': 'x86_64/znrm2.S', + }, + }, + '?asum': { + 's': { + '_k': 'x86_64/asum_sse.S', + }, + 'd': { + '_k': 'x86_64/asum_sse2.S', + }, + 'c': { + '_k': 'x86_64/zasum_sse.S', + }, + 'z': { + '_k': 'x86_64/zasum_sse2.S', + }, + }, + '?amax': { + 's': { + '_k': 'x86_64/amax_sse.S', + }, + 'd': { + '_k': 'x86_64/amax_sse2.S', + }, + 'c': { + '_k': 'x86_64/zamax_sse.S', + }, + 'z': { + '_k': 'x86_64/zamax_sse2.S', + }, + }, + '?sum': { + 's': { + '_k': 'arm/sum.c', + }, + 'd': { + '_k': 'arm/sum.c', + }, + 'c': { + '_k': 'x86_64/zsum_sse.S', + }, + 'z': { + '_k': 'x86_64/zsum_sse2.S', + }, + }, + '?amin': { + 's': { + '_k': 'x86_64/amax_sse.S', + }, + 'd': { + '_k': 'x86_64/amax_sse2.S', + }, + 'c': { + '_k': 'x86_64/zamax_sse.S', + }, + 'z': { + '_k': 'x86_64/zamax_sse2.S', + }, + }, + 'i?amax': { + 's': { + '_k': 'x86_64/iamax_sse.S', + }, + 'd': { + '_k': 'x86_64/iamax_sse2.S', + }, + 'c': { + '_k': 'x86_64/izamax_sse.S', + }, + 'z': { + '_k': 'x86_64/izamax_sse2.S', + }, + }, + 'i?amin': { + 
's': { + '_k': 'x86_64/iamax_sse.S', + }, + 'd': { + '_k': 'x86_64/iamax_sse2.S', + }, + 'c': { + '_k': 'x86_64/izamax_sse.S', + }, + 'z': { + '_k': 'x86_64/izamax_sse2.S', + }, + }, + 'i?max': { + 's': { + '_k': 'x86_64/iamax_sse.S', + }, + 'd': { + '_k': 'x86_64/iamax_sse2.S', + }, + }, + 'i?min': { + 's': { + '_k': 'x86_64/iamax_sse.S', + }, + 'd': { + '_k': 'x86_64/iamax_sse2.S', + }, + }, + '?max': { + 's': { + '_k': 'x86_64/amax_sse.S', + }, + 'd': { + '_k': 'x86_64/amax_sse2.S', + }, + }, + '?min': { + 's': { + '_k': 'x86_64/amax_sse.S', + }, + 'd': { + '_k': 'x86_64/amax_sse2.S', + }, + }, + '?axpby': { + 's': { + '_k': 'arm/axpby.c', + }, + 'd': { + '_k': 'arm/axpby.c', + }, + 'c': { + '_k': 'arm/zaxpby.c', + }, + 'z': { + '_k': 'arm/zaxpby.c', + }, + }, + # Level 2 BLAS + '?gemv': { + 's': { + '_n': 'x86_64/sgemv_n.c', + '_t': 'x86_64/sgemv_t.c', + }, + 'd': { + '_n': 'x86_64/dgemv_n.S', + '_t': 'x86_64/dgemv_t_4.c', + }, + 'c': { + '_n': 'x86_64/cgemv_n_4.c', + '_t': 'x86_64/cgemv_t_4.c', + '_r': 'x86_64/cgemv_n_4.c', + '_c': 'x86_64/cgemv_t_4.c', + '_o': 'x86_64/cgemv_n_4.c', + '_u': 'x86_64/cgemv_t_4.c', + '_s': 'x86_64/cgemv_n_4.c', + '_d': 'x86_64/cgemv_t_4.c', + }, + 'z': { + '_n': 'x86_64/zgemv_n_4.c', + '_t': 'x86_64/zgemv_t_4.c', + '_r': 'x86_64/zgemv_n_4.c', + '_c': 'x86_64/zgemv_t_4.c', + '_o': 'x86_64/zgemv_n_4.c', + '_u': 'x86_64/zgemv_t_4.c', + '_s': 'x86_64/zgemv_n_4.c', + '_d': 'x86_64/zgemv_t_4.c', + }, + }, + '?symv': { + 's': { + '_U': 'x86_64/symv_U_sse.S', + '_L': 'x86_64/symv_L_sse.S', + }, + 'd': { + '_U': 'x86_64/symv_U_sse2.S', + '_L': 'x86_64/symv_L_sse2.S', + }, + 'c': { + '_U': 'generic/zsymv_k.c', + '_L': 'generic/zsymv_k.c', + }, + 'z': { + '_U': 'x86_64/zsymv_U_sse2.S', + '_L': 'x86_64/zsymv_L_sse2.S', + }, + }, + '?lsame': { + '': { + '': 'x86_64/lsame.S', + } + }, + '?cabs': { + 's': { + '1': 'x86_64/cabs.S', + }, + 'd': { + '1': 'x86_64/cabs.S', + }, + }, + '?gemm3m': { + + }, + '?hemv': { + 'z': { + '_U': 
'x86_64/zsymv_U_sse2.S', + '_L': 'x86_64/zsymv_L_sse2.S', + }, + }, + # Level 3 BLAS + '?gemm_kernel': { + # done + }, + '?trmm_kernel': { + # done + }, + '?trsm_kernel': { + # done + }, + '?gemm': { + 's': { + '_beta': 'x86_64/gemm_beta.S', + '_small_matrix_permit': 'generic/gemm_small_matrix_permit.c', + }, + 'd': { + '_beta': 'x86_64/gemm_beta.S', + }, + 'c': { + '_beta': 'x86_64/zgemm_beta.S', + }, + 'z': { + '_beta': 'x86_64/zgemm_beta.S', + }, + }, + '?trmm': { + + }, + '?hemm': { + + }, + '?trsm': { + + }, + '?symm': { + + }, + '?omatcopy_k': { + + }, + '?imatcopy_k': { + + }, + '?geadd': { + + }, + '?gemm_small_kernel': { + + }, + '?gemm_small_kernel_b0': { + + }, +} + +subdir('meson_haswell') +subdir('meson_skylakex') +subdir('meson_zen') +subdir('meson_sandybridge') diff --git a/kernel/x86_64/meson_haswell/meson.build b/kernel/x86_64/meson_haswell/meson.build new file mode 100644 index 000000000..45ccce05c --- /dev/null +++ b/kernel/x86_64/meson_haswell/meson.build @@ -0,0 +1,238 @@ +x86_64_haswell_dict = { + '?scal': { + 's': { + '_k': 'x86_64/sscal.c', + }, + 'd': { + '_k': 'x86_64/dscal.c', + }, + 'c': { + '_k': 'x86_64/cscal.c', + }, + 'z': { + '_k': 'x86_64/zscal.c', + }, + }, + '?gemv': { + 's': { + '_n': 'x86_64/sgemv_n_4.c', + '_t': 'x86_64/sgemv_t_4.c', + }, + 'd': { + '_n': 'x86_64/dgemv_n_4.c', + '_t': 'x86_64/dgemv_t_4.c', + }, + 'c': { + '_n': 'x86_64/cgemv_n_4.c', + '_t': 'x86_64/cgemv_t_4.c', + }, + 'z': { + '_n': 'x86_64/zgemv_n_4.c', + '_t': 'x86_64/zgemv_t_4.c', + }, + }, + '?symv': { + 's': { + '_U': 'x86_64/ssymv_U.c', + '_L': 'x86_64/ssymv_L.c', + }, + 'd': { + '_U': 'x86_64/dsymv_U.c', + '_L': 'x86_64/dsymv_L.c', + }, + }, + '?dot': { + 's': { + '_k': 'x86_64/sdot.c', + }, + 'd': { + '_k': 'x86_64/ddot.c', + }, + }, + '?dotc': { + 'c': { + '_k': 'x86_64/cdot.c', + }, + 'z': { + '_k': 'x86_64/zdot.c', + }, + }, + '?dotu': { + 'c': { + '_k': 'x86_64/cdot.c', + }, + 'z': { + '_k': 'x86_64/zdot.c', + }, + }, + '?dsdot': { + '': { + '_k': 
'x86_64/sdot.c', + }, + 's': { + '_k': 'x86_64/sdot.c', + }, + }, + '?axpy': { + 's': { + '_k': 'x86_64/saxpy.c', + }, + 'd': { + '_k': 'x86_64/daxpy.c', + }, + 'c': { + '_k': 'x86_64/caxpy.c', + }, + 'z': { + '_k': 'x86_64/zaxpy.c', + }, + }, + '?axpyc': { + 'c': { + '_k': 'x86_64/caxpy.c', + }, + 'z': { + '_k': 'x86_64/zaxpy.c', + }, + }, + '?gemm_kernel': { + 's': { + '': 'x86_64/sgemm_kernel_8x4_haswell_2.c', + }, + 'd': { + '': 'x86_64/dgemm_kernel_4x8_haswell.S', + }, + 'c': { + '_n': 'x86_64/cgemm_kernel_8x2_haswell.c', + '_l': 'x86_64/cgemm_kernel_8x2_haswell.c', + '_r': 'x86_64/cgemm_kernel_8x2_haswell.c', + '_b': 'x86_64/cgemm_kernel_8x2_haswell.c', + }, + 'z': { + '_n': 'x86_64/zgemm_kernel_4x2_haswell.c', + '_l': 'x86_64/zgemm_kernel_4x2_haswell.c', + '_r': 'x86_64/zgemm_kernel_4x2_haswell.c', + '_b': 'x86_64/zgemm_kernel_4x2_haswell.c', + }, + }, + '?trmm_kernel': { + 's': { + '_LN': 'x86_64/sgemm_kernel_8x4_haswell.c', + '_LT': 'x86_64/sgemm_kernel_8x4_haswell.c', + '_RN': 'x86_64/sgemm_kernel_8x4_haswell.c', + '_RT': 'x86_64/sgemm_kernel_8x4_haswell.c', + }, + 'd': { + '_LN': 'x86_64/dtrmm_kernel_4x8_haswell.c', + '_LT': 'x86_64/dtrmm_kernel_4x8_haswell.c', + '_RN': 'x86_64/dtrmm_kernel_4x8_haswell.c', + '_RT': 'x86_64/dtrmm_kernel_4x8_haswell.c', + }, + 'c': { + '_LN': 'x86_64/cgemm_kernel_8x2_haswell.S', + '_LT': 'x86_64/cgemm_kernel_8x2_haswell.S', + '_LR': 'x86_64/cgemm_kernel_8x2_haswell.S', + '_LC': 'x86_64/cgemm_kernel_8x2_haswell.S', + '_RN': 'x86_64/cgemm_kernel_8x2_haswell.S', + '_RT': 'x86_64/cgemm_kernel_8x2_haswell.S', + '_RR': 'x86_64/cgemm_kernel_8x2_haswell.S', + '_RC': 'x86_64/cgemm_kernel_8x2_haswell.S', + }, + 'z': { + '_LN': 'x86_64/zgemm_kernel_4x2_haswell.S', + '_LT': 'x86_64/zgemm_kernel_4x2_haswell.S', + '_LR': 'x86_64/zgemm_kernel_4x2_haswell.S', + '_LC': 'x86_64/zgemm_kernel_4x2_haswell.S', + '_RN': 'x86_64/zgemm_kernel_4x2_haswell.S', + '_RT': 'x86_64/zgemm_kernel_4x2_haswell.S', + '_RR': 
'x86_64/zgemm_kernel_4x2_haswell.S', + '_RC': 'x86_64/zgemm_kernel_4x2_haswell.S', + }, + }, + '?trsm_kernel': { + 's': { + '_LN': 'x86_64/strsm_kernel_8x4_haswell_LN.c', + '_LT': 'x86_64/strsm_kernel_8x4_haswell_LT.c', + '_RN': 'x86_64/strsm_kernel_8x4_haswell_RN.c', + '_RT': 'x86_64/strsm_kernel_8x4_haswell_RT.c', + }, + 'd': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_RN': 'x86_64/dtrsm_kernel_RN_haswell.c', + '_RT': 'generic/trsm_kernel_RT.c', + }, + 'c': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_LR': 'generic/trsm_kernel_LN.c', + '_LC': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + '_RR': 'generic/trsm_kernel_RN.c', + '_RC': 'generic/trsm_kernel_RT.c', + }, + 'z': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_LR': 'generic/trsm_kernel_LN.c', + '_LC': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + '_RR': 'generic/trsm_kernel_RN.c', + '_RC': 'generic/trsm_kernel_RT.c', + }, + }, + '?gemm': { + 's': { + '_beta': 'x86_64/sgemm_beta_skylakex.c', + '_incopy': 'generic/gemm_ncopy_8.c', + '_itcopy': 'generic/gemm_tcopy_8.c', + '_oncopy': 'x86_64/sgemm_ncopy_4_skylakex.c', + '_otcopy': 'generic/gemm_tcopy_4.c' + }, + 'd': { + '_beta': 'x86_64/dgemm_beta_skylakex.c', + '_incopy': 'generic/gemm_ncopy_4.c', + '_itcopy': 'generic/gemm_tcopy_4.c', + '_oncopy': 'x86_64/dgemm_ncopy_8_skylakex.c', + '_otcopy': 'generic/gemm_tcopy_8.c', + }, + 'c': { + '_incopy': 'generic/zgemm_ncopy_8.c', + '_itcopy': 'generic/zgemm_tcopy_8.c', + '_oncopy': 'generic/zgemm_ncopy_2.c', + '_otcopy': 'generic/zgemm_tcopy_2.c' + }, + 'z': { + '_incopy': 'generic/zgemm_ncopy_4.c', + '_itcopy': 'generic/zgemm_tcopy_4.c', + '_oncopy': 'generic/zgemm_ncopy_2.c', + '_otcopy': 'generic/zgemm_tcopy_2.c' + }, + }, + '?gemm3m': { + 'c': { + '_kernel': 
'x86_64/cgemm3m_kernel_8x4_haswell.c', + }, + 'z': { + '_kernel': 'x86_64/zgemm3m_kernel_4x4_haswell.c', + }, + }, + '?asum': { + 's': { + '_k': 'x86_64/sasum.c', + }, + 'd': { + '_k': 'x86_64/dasum.c', + }, + }, + '?rot': { + 's': { + '_k': 'x86_64/srot.c', + }, + 'd': { + '_k': 'x86_64/drot.c', + }, + }, +} diff --git a/kernel/x86_64/meson_sandybridge/meson.build b/kernel/x86_64/meson_sandybridge/meson.build new file mode 100644 index 000000000..1c83a58fc --- /dev/null +++ b/kernel/x86_64/meson_sandybridge/meson.build @@ -0,0 +1,213 @@ +x86_64_sandybridge_dict = { + '?scal': { + 'd': { + '_k': 'x86_64/dscal.c', + }, + 'c': { + '_k': 'x86_64/cscal.c', + }, + }, + '?ger': { + 's': { + '_k': 'x86_64/sger.c', + }, + 'd': { + '_k': 'x86_64/dger.c', + }, + }, + '?gemv': { + 's': { + '_n': 'x86_64/sgemv_n_4.c', + '_t': 'x86_64/sgemv_t_4.c', + }, + 'z': { + '_n': 'x86_64/zgemv_n_4.c', + }, + }, + '?symv': { + 's': { + '_U': 'x86_64/ssymv_U.c', + '_L': 'x86_64/ssymv_L.c', + }, + 'd': { + '_U': 'x86_64/dsymv_U.c', + '_L': 'x86_64/dsymv_L.c', + }, + }, + '?dot': { + 's': { + '_k': 'x86_64/sdot.c', + }, + 'd': { + '_k': 'x86_64/ddot.c', + }, + }, + '?dotc': { + 'c': { + '_k': 'x86_64/cdot.c', + }, + 'z': { + '_k': 'x86_64/zdot.c', + }, + }, + '?dotu': { + 'c': { + '_k': 'x86_64/cdot.c', + }, + 'z': { + '_k': 'x86_64/zdot.c', + }, + }, + '?dsdot': { + '': { + '_k': 'x86_64/sdot.c', + }, + 's': { + '_k': 'x86_64/sdot.c', + }, + }, + '?axpy': { + 's': { + '_k': 'x86_64/saxpy.c', + }, + 'd': { + '_k': 'x86_64/daxpy.c', + }, + 'c': { + '_k': 'x86_64/caxpy.c', + }, + 'z': { + '_k': 'x86_64/zaxpy.c', + }, + }, + '?axpyc': { + 'c': { + '_k': 'x86_64/caxpy.c', + }, + 'z': { + '_k': 'x86_64/zaxpy.c', + }, + }, + '?gemm_kernel': { + 's': { + '': 'x86_64/sgemm_kernel_16x4_sandy.S', + }, + 'd': { + '': 'x86_64/dgemm_kernel_4x8_sandy.S', + }, + 'c': { + '_n': 'x86_64/cgemm_kernel_8x2_sandy.S', + '_l': 'x86_64/cgemm_kernel_8x2_sandy.S', + '_r': 'x86_64/cgemm_kernel_8x2_sandy.S', + '_b': 
'x86_64/cgemm_kernel_8x2_sandy.S', + }, + 'z': { + '_n': 'x86_64/zgemm_kernel_1x4_nehalem.S', + '_l': 'x86_64/zgemm_kernel_1x4_nehalem.S', + '_r': 'x86_64/zgemm_kernel_1x4_nehalem.S', + '_b': 'x86_64/zgemm_kernel_1x4_nehalem.S', + }, + }, + '?trmm_kernel': { + 's': { + '_LN': 'x86_64/sgemm_kernel_16x4_sandy.S', + '_LT': 'x86_64/sgemm_kernel_16x4_sandy.S', + '_RN': 'x86_64/sgemm_kernel_16x4_sandy.S', + '_RT': 'x86_64/sgemm_kernel_16x4_sandy.S', + }, + 'd': { + '_LN': 'x86_64/dgemm_kernel_4x8_sandy.S', + '_LT': 'x86_64/dgemm_kernel_4x8_sandy.S', + '_RN': 'x86_64/dgemm_kernel_4x8_sandy.S', + '_RT': 'x86_64/dgemm_kernel_4x8_sandy.S', + }, + 'c': { + '_LN': 'x86_64/cgemm_kernel_8x2_sandy.S', + '_LT': 'x86_64/cgemm_kernel_8x2_sandy.S', + '_LR': 'x86_64/cgemm_kernel_8x2_sandy.S', + '_LC': 'x86_64/cgemm_kernel_8x2_sandy.S', + '_RN': 'x86_64/cgemm_kernel_8x2_sandy.S', + '_RT': 'x86_64/cgemm_kernel_8x2_sandy.S', + '_RR': 'x86_64/cgemm_kernel_8x2_sandy.S', + '_RC': 'x86_64/cgemm_kernel_8x2_sandy.S', + }, + 'z': { + '_LN': 'x86_64/zgemm_kernel_1x4_nehalem.S', + '_LT': 'x86_64/zgemm_kernel_1x4_nehalem.S', + '_LR': 'x86_64/zgemm_kernel_1x4_nehalem.S', + '_LC': 'x86_64/zgemm_kernel_1x4_nehalem.S', + '_RN': 'x86_64/zgemm_kernel_1x4_nehalem.S', + '_RT': 'x86_64/zgemm_kernel_1x4_nehalem.S', + '_RR': 'x86_64/zgemm_kernel_1x4_nehalem.S', + '_RC': 'x86_64/zgemm_kernel_1x4_nehalem.S', + }, + }, + '?trsm_kernel': { + 's': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + }, + 'd': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + }, + 'c': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_LR': 'generic/trsm_kernel_LN.c', + '_LC': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + '_RR': 
'generic/trsm_kernel_RN.c', + '_RC': 'generic/trsm_kernel_RT.c', + }, + 'z': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_LR': 'generic/trsm_kernel_LN.c', + '_LC': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + '_RR': 'generic/trsm_kernel_RN.c', + '_RC': 'generic/trsm_kernel_RT.c', + }, + }, + '?gemm': { + 's': { + '_incopy': 'generic/gemm_ncopy_16.c', + '_itcopy': 'generic/gemm_tcopy_16.c', + '_oncopy': 'generic/gemm_ncopy_4.c', + '_otcopy': 'generic/gemm_tcopy_4.c' + }, + 'd': { + '_incopy': 'generic/gemm_ncopy_8.c', + '_itcopy': 'generic/gemm_tcopy_8.c', + '_oncopy': 'generic/gemm_ncopy_4.c', + '_otcopy': 'generic/gemm_tcopy_4.c', + }, + 'c': { + '_incopy': 'generic/zgemm_ncopy_8.c', + '_itcopy': 'generic/zgemm_tcopy_8.c', + '_oncopy': 'generic/zgemm_ncopy_2.c', + '_otcopy': 'generic/zgemm_tcopy_2.c' + }, + 'z': { + '_incopy': 'x86_64/zgemm_ncopy_1.S', + '_itcopy': 'x86_64/zgemm_tcopy_1.S', + '_oncopy': 'generic/zgemm_ncopy_4.c', + '_otcopy': 'generic/zgemm_tcopy_4.c' + }, + }, + '?gemm3m': { + 'c': { + '_kernel': 'x86_64/zgemm3m_kernel_4x8_nehalem.S', + }, + 'z': { + '_kernel': 'x86_64/zgemm3m_kernel_2x8_nehalem.S', + }, + }, +} diff --git a/kernel/x86_64/meson_skylakex/meson.build b/kernel/x86_64/meson_skylakex/meson.build new file mode 100644 index 000000000..8f28e6929 --- /dev/null +++ b/kernel/x86_64/meson_skylakex/meson.build @@ -0,0 +1,109 @@ +x86_64_skylakex_dict = { + '?gemm_kernel': { + 's': { + '': 'x86_64/sgemm_kernel_16x4_skylakex_3.c', + }, + 'd': { + '': 'x86_64/dgemm_kernel_16x2_skylakex.c', + }, + 'c': { + '_n': 'x86_64/cgemm_kernel_8x2_skylakex.c', + '_l': 'x86_64/cgemm_kernel_8x2_skylakex.c', + '_r': 'x86_64/cgemm_kernel_8x2_skylakex.c', + '_b': 'x86_64/cgemm_kernel_8x2_skylakex.c', + }, + 'z': { + '_n': 'x86_64/zgemm_kernel_4x2_skylakex.c', + '_l': 'x86_64/zgemm_kernel_4x2_skylakex.c', + '_r': 'x86_64/zgemm_kernel_4x2_skylakex.c', + '_b': 
'x86_64/zgemm_kernel_4x2_skylakex.c', + }, + }, + '?trmm_kernel': { + 's': { + '_LN': 'x86_64/sgemm_kernel_16x4_skylakex_2.c', + '_LT': 'x86_64/sgemm_kernel_16x4_skylakex_2.c', + '_RN': 'x86_64/sgemm_kernel_16x4_skylakex_2.c', + '_RT': 'x86_64/sgemm_kernel_16x4_skylakex_2.c', + }, + 'd': { + '_LN': 'x86_64/dgemm_kernel_16x2_skylakex.c', + '_LT': 'x86_64/dgemm_kernel_16x2_skylakex.c', + '_RN': 'x86_64/dgemm_kernel_16x2_skylakex.c', + '_RT': 'x86_64/dgemm_kernel_16x2_skylakex.c', + }, + }, + '?trsm_kernel': { + 's': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + }, + 'd': { + '_RN': 'generic/trsm_kernel_RN.c', + }, + }, + '?gemm_small_kernel': { + 's': { + '_nn': 'x86_64/sgemm_small_kernel_nn_skylakex.c', + '_nt': 'x86_64/sgemm_small_kernel_nt_skylakex.c', + '_tn': 'x86_64/sgemm_small_kernel_tn_skylakex.c', + '_tt': 'x86_64/sgemm_small_kernel_tt_skylakex.c', + }, + 'd': { + '_nn': 'x86_64/dgemm_small_kernel_nn_skylakex.c', + '_nt': 'x86_64/dgemm_small_kernel_nt_skylakex.c', + '_tn': 'x86_64/dgemm_small_kernel_tn_skylakex.c', + '_tt': 'x86_64/dgemm_small_kernel_tt_skylakex.c', + }, + }, + '?gemm_small_kernel_b0': { + 's': { + '_nn': 'x86_64/sgemm_small_kernel_nn_skylakex.c', + '_nt': 'x86_64/sgemm_small_kernel_nt_skylakex.c', + '_tn': 'x86_64/sgemm_small_kernel_tn_skylakex.c', + '_tt': 'x86_64/sgemm_small_kernel_tt_skylakex.c', + }, + 'd': { + '_nn': 'x86_64/dgemm_small_kernel_nn_skylakex.c', + '_nt': 'x86_64/dgemm_small_kernel_nt_skylakex.c', + '_tn': 'x86_64/dgemm_small_kernel_tn_skylakex.c', + '_tt': 'x86_64/dgemm_small_kernel_tt_skylakex.c', + }, + }, + '?gemm': { + 's': { + '_small_matrix_permit': 'x86_64/sgemm_small_kernel_permit_skylakex.c', + '_beta': 'x86_64/sgemm_beta_skylakex.c', + '_incopy': 'generic/gemm_ncopy_16.c', + '_itcopy': 'x86_64/sgemm_tcopy_16_skylakex.c', + '_oncopy': 'x86_64/sgemm_ncopy_4_skylakex.c', + '_otcopy': 'generic/gemm_tcopy_4.c' + 
}, + 'd': { + '_small_matrix_permit': 'x86_64/dgemm_small_kernel_permit_skylakex.c', + '_beta': 'x86_64/dgemm_beta_skylakex.c', + '_incopy': 'generic/gemm_ncopy_16.c', + '_itcopy': 'x86_64/dgemm_tcopy_16_skylakex.c', + '_oncopy': 'generic/gemm_ncopy_2.c', + '_otcopy': 'generic/gemm_tcopy_2.c', + }, + }, + '?asum': { + 'c': { + '_k': 'x86_64/casum.c', + }, + 'z': { + '_k': 'x86_64/zasum.c', + }, + }, + '?sum': { + 'c': { + '_k': 'x86_64/csum.c', + }, + 'z': { + '_k': 'x86_64/zsum.c', + }, + }, +}
diff --git a/kernel/x86_64/meson_zen/meson.build b/kernel/x86_64/meson_zen/meson.build
new file mode 100644
index 000000000..ff4ce2f7a
--- /dev/null
+++ b/kernel/x86_64/meson_zen/meson.build
@@ -0,0 +1,231 @@
+# Kernel source table for the x86_64 Zen target.
+# Nesting: routine template -> type-prefix key -> symbol suffix -> source file.
+# NOTE(review): paths look relative to kernel/ -- confirm against kernel/meson.build.
+x86_64_zen_dict = {
+  '?scal': {
+    's': {
+      '_k': 'x86_64/sscal.c',
+    },
+    'd': {
+      '_k': 'x86_64/dscal.c',
+    },
+    'c': {
+      '_k': 'x86_64/cscal.c',
+    },
+    'z': {
+      '_k': 'x86_64/zscal.c',
+    },
+  },
+  '?gemv': {
+    's': {
+      '_n': 'x86_64/sgemv_n_4.c',
+      '_t': 'x86_64/sgemv_t_4.c',
+    },
+    'd': {
+      '_n': 'x86_64/dgemv_n_4.c',
+      '_t': 'x86_64/dgemv_t_4.c',
+    },
+    'c': {
+      '_n': 'x86_64/cgemv_n_4.c',
+      '_t': 'x86_64/cgemv_t_4.c',
+    },
+    'z': {
+      '_n': 'x86_64/zgemv_n_4.c',
+      '_t': 'x86_64/zgemv_t_4.c',
+    },
+  },
+  '?symv': {
+    's': {
+      '_U': 'x86_64/ssymv_U.c',
+      '_L': 'x86_64/ssymv_L.c',
+    },
+    'd': {
+      '_U': 'x86_64/dsymv_U.c',
+      '_L': 'x86_64/dsymv_L.c',
+    },
+  },
+  '?dot': {
+    's': {
+      '_k': 'x86_64/sdot.c',
+    },
+    'd': {
+      '_k': 'x86_64/ddot.c',
+    },
+  },
+  '?dotc': {
+    'c': {
+      '_k': 'x86_64/cdot.c',
+    },
+    'z': {
+      '_k': 'x86_64/zdot.c',
+    },
+  },
+  '?dotu': {
+    'c': {
+      '_k': 'x86_64/cdot.c',
+    },
+    'z': {
+      '_k': 'x86_64/zdot.c',
+    },
+  },
+  '?dsdot': {
+    '': {
+      '_k': 'x86_64/sdot.c',
+    },
+    's': {
+      '_k': 'x86_64/sdot.c',
+    },
+  },
+  '?axpy': {
+    's': {
+      '_k': 'x86_64/saxpy.c',
+    },
+    'd': {
+      '_k': 'x86_64/daxpy.c',
+    },
+    'c': {
+      '_k': 'x86_64/caxpy.c',
+    },
+    'z': {
+      '_k': 'x86_64/zaxpy.c',
+    },
+  },
+  '?axpyc': {
+    'c': {
+      '_k': 'x86_64/caxpy.c',
+    },
+    'z': {
+      '_k': 'x86_64/zaxpy.c',
+    },
+  },
+  '?gemm_kernel': {
+    's': {
+      '': 'x86_64/sgemm_kernel_8x4_haswell_2.c',
+    },
+    'd': {
+      '': 'x86_64/dgemm_kernel_4x8_haswell.S',
+    },
+    'c': {
+      '_n': 'x86_64/cgemm_kernel_8x2_haswell.c',
+      '_l': 'x86_64/cgemm_kernel_8x2_haswell.c',
+      '_r': 'x86_64/cgemm_kernel_8x2_haswell.c',
+      '_b': 'x86_64/cgemm_kernel_8x2_haswell.c',
+    },
+    'z': {
+      '_n': 'x86_64/zgemm_kernel_4x2_haswell.c',
+      '_l': 'x86_64/zgemm_kernel_4x2_haswell.c',
+      '_r': 'x86_64/zgemm_kernel_4x2_haswell.c',
+      '_b': 'x86_64/zgemm_kernel_4x2_haswell.c',
+    },
+  },
+  '?trmm_kernel': {
+    's': {
+      '_LN': 'x86_64/sgemm_kernel_8x4_haswell.c',
+      '_LT': 'x86_64/sgemm_kernel_8x4_haswell.c',
+      '_RN': 'x86_64/sgemm_kernel_8x4_haswell.c',
+      '_RT': 'x86_64/sgemm_kernel_8x4_haswell.c',
+    },
+    'd': {
+      '_LN': 'x86_64/dtrmm_kernel_4x8_haswell.c',
+      '_LT': 'x86_64/dtrmm_kernel_4x8_haswell.c',
+      '_RN': 'x86_64/dtrmm_kernel_4x8_haswell.c',
+      '_RT': 'x86_64/dtrmm_kernel_4x8_haswell.c',
+    },
+    'c': {
+      '_LN': 'x86_64/cgemm_kernel_8x2_haswell.S',
+      '_LT': 'x86_64/cgemm_kernel_8x2_haswell.S',
+      '_LR': 'x86_64/cgemm_kernel_8x2_haswell.S',
+      '_LC': 'x86_64/cgemm_kernel_8x2_haswell.S',
+      '_RN': 'x86_64/cgemm_kernel_8x2_haswell.S',
+      '_RT': 'x86_64/cgemm_kernel_8x2_haswell.S',
+      '_RR': 'x86_64/cgemm_kernel_8x2_haswell.S',
+      '_RC': 'x86_64/cgemm_kernel_8x2_haswell.S',
+    },
+    'z': {
+      '_LN': 'x86_64/zgemm_kernel_4x2_haswell.S',
+      '_LT': 'x86_64/zgemm_kernel_4x2_haswell.S',
+      '_LR': 'x86_64/zgemm_kernel_4x2_haswell.S',
+      '_LC': 'x86_64/zgemm_kernel_4x2_haswell.S',
+      '_RN': 'x86_64/zgemm_kernel_4x2_haswell.S',
+      '_RT': 'x86_64/zgemm_kernel_4x2_haswell.S',
+      '_RR': 'x86_64/zgemm_kernel_4x2_haswell.S',
+      '_RC': 'x86_64/zgemm_kernel_4x2_haswell.S',
+    },
+  },
+  '?trsm_kernel': {
+    's': {
+      '_LN': 'x86_64/strsm_kernel_8x4_haswell_LN.c',
+      '_LT': 'x86_64/strsm_kernel_8x4_haswell_LT.c',
+      '_RN': 'x86_64/strsm_kernel_8x4_haswell_RN.c',
+      '_RT': 'x86_64/strsm_kernel_8x4_haswell_RT.c',
+    },
+    'd': {
+      '_LN': 'generic/trsm_kernel_LN.c',
+      '_LT': 'generic/trsm_kernel_LT.c',
+      '_RN': 'x86_64/dtrsm_kernel_RN_haswell.c',
+      '_RT': 'generic/trsm_kernel_RT.c',
+    },
+    'c': {
+      '_LN': 'generic/trsm_kernel_LN.c',
+      '_LT': 'generic/trsm_kernel_LT.c',
+      '_LR': 'generic/trsm_kernel_LN.c',
+      '_LC': 'generic/trsm_kernel_LT.c',
+      '_RN': 'generic/trsm_kernel_RN.c',
+      '_RT': 'generic/trsm_kernel_RT.c',
+      '_RR': 'generic/trsm_kernel_RN.c',
+      '_RC': 'generic/trsm_kernel_RT.c',
+    },
+    'z': {
+      '_LN': 'generic/trsm_kernel_LN.c',
+      '_LT': 'generic/trsm_kernel_LT.c',
+      '_LR': 'generic/trsm_kernel_LN.c',
+      '_LC': 'generic/trsm_kernel_LT.c',
+      '_RN': 'generic/trsm_kernel_RN.c',
+      '_RT': 'generic/trsm_kernel_RT.c',
+      '_RR': 'generic/trsm_kernel_RN.c',
+      '_RC': 'generic/trsm_kernel_RT.c',
+    },
+  },
+  '?gemm': {
+    's': {
+      '_incopy': 'generic/gemm_ncopy_8.c',
+      '_itcopy': 'generic/gemm_tcopy_8.c',
+      '_oncopy': 'generic/gemm_ncopy_4.c',
+      '_otcopy': 'generic/gemm_tcopy_4.c'
+    },
+    'd': {
+      '_incopy': 'generic/gemm_ncopy_4.c',
+      '_itcopy': 'generic/gemm_tcopy_4.c',
+      '_oncopy': 'generic/gemm_ncopy_8.c',
+      '_otcopy': 'generic/gemm_tcopy_8.c',
+    },
+    'c': {
+      '_incopy': 'generic/zgemm_ncopy_8.c',
+      '_itcopy': 'generic/zgemm_tcopy_8.c',
+      '_oncopy': 'generic/zgemm_ncopy_2.c',
+      '_otcopy': 'generic/zgemm_tcopy_2.c'
+    },
+    'z': {
+      '_incopy': 'generic/zgemm_ncopy_4.c',
+      '_itcopy': 'generic/zgemm_tcopy_4.c',
+      '_oncopy': 'generic/zgemm_ncopy_2.c',
+      '_otcopy': 'generic/zgemm_tcopy_2.c'
+    },
+  },
+  '?gemm3m': {
+    'c': {
+      '_kernel': 'x86_64/cgemm3m_kernel_8x4_haswell.c',
+    },
+    'z': {
+      '_kernel': 'x86_64/zgemm3m_kernel_4x4_haswell.c',
+    },
+  },
+  '?rot': {
+    's': {
+      '_k': 'x86_64/srot.c',
+    },
+    'd': {
+      '_k': 'x86_64/drot.c',
+    },
+  },
+}
diff --git a/meson.build b/meson.build index ef6770e1a..54aec362c 100644 --- a/meson.build +++ b/meson.build @@ -463,16 +463,23 @@ symb_defs = { # config.h file generation _config_h = meson.current_build_dir() / 'config.h' -run_command('./c_check',
'Makefile.conf', _config_h, cc_id, check: true) -run_command('./f_check', 'Makefile.conf', _config_h, fc_id, check: true) +_makefile_conf = meson.current_build_dir() / 'Makefile.conf' +run_command('./c_check', _makefile_conf, _config_h, cc_id, check: true) +run_command('./f_check', _makefile_conf, _config_h, fc_id, check: true) run_command(cc_id, '-o', 'getarch', 'getarch.c', 'cpuid.S', check: true) -_getarch_result = run_command('./getarch', '1', check: true, capture: true) +_getarch_1_result = run_command('./getarch', '1', check: true, capture: true) run_command(py3, './write_to_file.py', - _getarch_result.stdout(), + _getarch_1_result.stdout(), _config_h, check: true) +_getarch_0_result = run_command('./getarch', '0', check: true, capture: true) +run_command(py3, + './write_to_file.py', + _getarch_0_result.stdout(), + _makefile_conf, + check: true) run_command(cc_id, '-DGEMM_MULTITHREAD_THRESHOLD=4', @@ -481,12 +488,45 @@ run_command(cc_id, '-o', 'getarch_2nd', 'getarch_2nd.c', capture: true, check: true) -_getarch_2nd_result = run_command('./getarch_2nd', '1', check: true, capture: true) +_getarch_2nd_1_result = run_command('./getarch_2nd', '1', check: true, capture: true) run_command(py3, './write_to_file.py', - _getarch_2nd_result.stdout(), + _getarch_2nd_1_result.stdout(), _config_h, check: true) +_getarch_2nd_0_result = run_command('./getarch_2nd', '0', check: true, capture: true) +run_command(py3, + './write_to_file.py', + _getarch_2nd_0_result.stdout(), + _makefile_conf, + check: true) + + +_read_config_py = './read_config.py' + +run_command(py3, + _read_config_py, + _config_h, + meson.current_build_dir() / 'config.kconf', + check: true) + +keyval = import('keyval') +conf_kv = keyval.load(meson.current_build_dir() / 'config.kconf') +# NOTE(rg): conf_kv doesn't do any parsing, setup manually +conf_hdat = configuration_data() +foreach key,val : conf_kv + if 'CHAR' in key + conf_hdat.set_quoted(key, val) + else + conf_hdat.set(key, val) + endif +endforeach + +makefile_conf_kv =
keyval.load(meson.current_build_dir() / 'Makefile.conf') +makefile_conf_dat = configuration_data() +foreach key,val : makefile_conf_kv + makefile_conf_dat.set(key, val) +endforeach # Ignoring other hostarch checks and conflicts for arch in BSD for now _inc = [include_directories('.')]
diff --git a/read_config.py b/read_config.py
new file mode 100644
index 000000000..acb8abb84
--- /dev/null
+++ b/read_config.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+"""Convert the ``#define`` entries of a config.h file into KEY=VALUE lines."""
+
+import argparse
+
+# Historical output location; kept as the default so that callers passing
+# only the config.h path behave as before.
+DEFAULT_OUTPUT = "./build/config.kconf"
+
+
+def read_config_file(file_path: str) -> dict:
+    """Return the ``#define`` entries of *file_path* as a dict.
+
+    ``#define NAME`` maps to 1, an all-digit value maps to an int, and a
+    double-quoted value has its quotes stripped.  Lines that are not
+    ``#define`` directives, and directives with more than one value token,
+    are ignored.
+    """
+    config_data = {}
+    with open(file_path, "r") as file:
+        for line in file:
+            parts = line.split()
+            # Skip non-directives and a bare "#define" that has no name.
+            if len(parts) < 2 or parts[0] != "#define":
+                continue
+            key = parts[1]
+            if len(parts) == 2:
+                config_data[key] = 1
+            elif len(parts) == 3:
+                value = parts[2]
+                if value.isdigit():
+                    value = int(value)
+                elif value.startswith('"') and value.endswith('"'):
+                    value = value.strip('"')
+                config_data[key] = value
+    return config_data
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Read a config.h file.")
+    parser.add_argument("file1", help="Path to the config.h file.")
+    parser.add_argument(
+        "output",
+        nargs="?",
+        default=DEFAULT_OUTPUT,
+        help="Path of the KEY=VALUE file to write (default: %(default)s).",
+    )
+    args = parser.parse_args()
+
+    config_data = read_config_file(args.file1)
+    # Truncate rather than append: the file is regenerated on every run, and
+    # appending would duplicate entries and, because the old output had no
+    # trailing newline, glue the first new entry onto the last old one.
+    with open(args.output, "w") as out:
+        out.writelines(f"{k}={v}\n" for k, v in config_data.items())
diff --git a/test/meson.build b/test/meson.build index 80e6450a4..71f620de2 100644 --- a/test/meson.build +++ b/test/meson.build @@ -8,13 +8,17 @@ _test_input_array = [ {'id': 'cblat1', 'has_dat': false}, {'id': 'cblat2', 'has_dat': true}, {'id': 'cblat3', 'has_dat': true}, - {'id': 'cblat3_3m', 'has_dat': true}, {'id': 'zblat1', 'has_dat': false}, {'id': 'zblat2', 'has_dat': true}, {'id': 'zblat3', 'has_dat': true}, - {'id': 'zblat3_3m', 'has_dat': true}, ] +if conf_hdat.has('ARCH_X86_64') or conf_hdat.has('ARCH_X86') + _test_input_array += [ + {'id': 'cblat3_3m', 'has_dat':
true}, {'id': 'zblat3_3m', 'has_dat': true}, + ] +endif + _test_runner = executable('test_runner', sources: ['test_runner.c'], install: false) foreach _test : _test_input_array