diff --git a/.github/workflows/codspeed-bench.yml b/.github/workflows/codspeed-bench.yml new file mode 100644 index 000000000..04befefa9 --- /dev/null +++ b/.github/workflows/codspeed-bench.yml @@ -0,0 +1,150 @@ +name: Run codspeed benchmarks + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read # to fetch code (actions/checkout) + +jobs: + benchmarks: + if: "github.repository == 'OpenMathLib/OpenBLAS'" + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + fortran: [gfortran] + build: [make] + pyver: ["3.12"] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.pyver }} + + - name: Print system information + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + cat /proc/cpuinfo + fi + + - name: Install Dependencies + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + sudo apt-get update + sudo apt-get install -y gfortran cmake ccache libtinfo5 + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + # We include the commit sha in the cache key, as new cache entries are + # only created if there is no existing entry for the key yet. + # GNU make and cmake call the compilers differently. It looks like + # that causes the cache to mismatch. Keep the ccache for both build + # tools separate to avoid polluting each other. + key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }} + # Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler. + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }} + ccache-${{ runner.os }}-${{ matrix.build }} + + - name: Write out the .pc + run: | + cd benchmark/pybench + cat > openblas.pc << EOF + libdir=${{ github.workspace }} + includedir= ${{ github.workspace }} + openblas_config= OpenBLAS 0.3.27 DYNAMIC_ARCH NO_AFFINITY Haswell MAX_THREADS=64 + version=0.0.99 + extralib=-lm -lpthread -lgfortran -lquadmath -L${{ github.workspace }} -lopenblas + Name: openblas + Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version + Version: ${version} + URL: https://github.com/xianyi/OpenBLAS + Libs: ${{ github.workspace }}/libopenblas.so -Wl,-rpath,${{ github.workspace }} + Libs.private: -lm -lpthread -lgfortran -lquadmath -L${{ github.workspace }} -lopenblas + Cflags: -I${{ github.workspace}} + EOF + cat openblas.pc + + - name: Configure ccache + run: | + if [ "${{ matrix.build }}" = "make" ]; then + # Add ccache to path + if [ "$RUNNER_OS" = "Linux" ]; then + echo "/usr/lib/ccache" >> $GITHUB_PATH + elif [ "$RUNNER_OS" = "macOS" ]; then + echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + fi + # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB). + test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: Build OpenBLAS + run: | + case "${{ matrix.build }}" in + "make") + make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}" + ;; + "cmake") + mkdir build && cd build + cmake -DDYNAMIC_ARCH=1 \ + -DNOFORTRAN=0 \ + -DBUILD_WITHOUT_LAPACK=0 \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ + .. + cmake --build . + ;; + *) + echo "::error::Configuration not supported" + exit 1 + ;; + esac + + - name: Show ccache status + continue-on-error: true + run: ccache -s + + - name: Install benchmark dependencies + run: pip install meson ninja numpy pytest pytest-codspeed --user + + - name: Build the wrapper + run: | + cd benchmark/pybench + export PKG_CONFIG_PATH=$PWD + meson setup build --prefix=$PWD/build-install + meson install -C build + # + # sanity check + cd build/openblas_wrap + python -c'import _flapack; print(dir(_flapack))' + + - name: Run benchmarks + uses: CodSpeedHQ/action@v2 + with: + token: ${{ secrets.CODSPEED_TOKEN }} + run: | + cd benchmark/pybench + export PYTHONPATH=$PWD/build-install/lib/python${{matrix.pyver}}/site-packages/ + OPENBLAS_NUM_THREADS=1 pytest benchmarks/bench_blas.py --codspeed + diff --git a/.gitignore b/.gitignore index dc6804f1e..8294da4d4 100644 --- a/.gitignore +++ b/.gitignore @@ -109,3 +109,4 @@ benchmark/smallscaling CMakeCache.txt CMakeFiles/* .vscode +**/__pycache__ diff --git a/benchmark/pybench/benchmarks/bench_blas.py b/benchmark/pybench/benchmarks/bench_blas.py new file mode 100644 index 000000000..064be1ead --- /dev/null +++ b/benchmark/pybench/benchmarks/bench_blas.py @@ -0,0 +1,185 @@ +import pytest +import numpy as np +from openblas_wrap import ( + # level 1 + dnrm2, ddot, daxpy, + # level 3 + dgemm, dsyrk, + # lapack + dgesv, # linalg.solve + dgesdd, dgesdd_lwork, # linalg.svd + dsyev, dsyev_lwork, # linalg.eigh +) + +# ### BLAS level 1 ### + +# dnrm2 + +dnrm2_sizes = [100, 1000] + +def run_dnrm2(n, x, incx): + res = dnrm2(x, n, incx=incx) + return res + + +@pytest.mark.parametrize('n', dnrm2_sizes) +def test_nrm2(benchmark, n): + rndm = np.random.RandomState(1234) + x = np.array(rndm.uniform(size=(n,)), dtype=float) + result = benchmark(run_dnrm2, n, x, 1) + + +# ddot + +ddot_sizes = [100, 1000] + +def run_ddot(x, y,): + res = ddot(x, y) + return res + + +@pytest.mark.parametrize('n', ddot_sizes) +def test_dot(benchmark, n): + rndm = np.random.RandomState(1234) + x = np.array(rndm.uniform(size=(n,)), dtype=float) + y = np.array(rndm.uniform(size=(n,)), dtype=float) + result = benchmark(run_ddot, x, y) + + +# daxpy + +daxpy_sizes = [100, 1000] + +def run_daxpy(x, y,): + res = daxpy(x, y, a=2.0) + return res + + +@pytest.mark.parametrize('n', daxpy_sizes) +def test_daxpy(benchmark, n): + rndm = np.random.RandomState(1234) + x = np.array(rndm.uniform(size=(n,)), dtype=float) + y = np.array(rndm.uniform(size=(n,)), dtype=float) + result = benchmark(run_daxpy, x, y) + + + + +# ### BLAS level 3 ### + +# dgemm + +gemm_sizes = [100, 1000] + +def run_gemm(a, b, c): + alpha = 1.0 + res = dgemm(alpha, a, b, c=c, overwrite_c=True) + return res + + +@pytest.mark.parametrize('n', gemm_sizes) +def test_gemm(benchmark, n): + rndm = np.random.RandomState(1234) + a = np.array(rndm.uniform(size=(n, n)), dtype=float, order='F') + b = np.array(rndm.uniform(size=(n, n)), dtype=float, order='F') + c = np.empty((n, n), dtype=float, order='F') + result = benchmark(run_gemm, a, b, c) + assert result is c + + +# dsyrk + +syrk_sizes = [100, 1000] + + +def run_syrk(a, c): + res = dsyrk(1.0, a, c=c, overwrite_c=True) + return res + + +@pytest.mark.parametrize('n', syrk_sizes) +def test_syrk(benchmark, n): + rndm = np.random.RandomState(1234) + a = np.array(rndm.uniform(size=(n, n)), dtype=float, order='F') + c = np.empty((n, n), dtype=float, order='F') + result = benchmark(run_syrk, a, c) + assert result is c + + +# ### LAPACK ### + +# linalg.solve + +gesv_sizes = [100, 1000] + + +def run_gesv(a, b): + res = dgesv(a, b, overwrite_a=True, overwrite_b=True) + return res + + +@pytest.mark.parametrize('n', gesv_sizes) +def test_gesv(benchmark, n): + rndm = np.random.RandomState(1234) + a = (np.array(rndm.uniform(size=(n, n)), dtype=float, order='F') + + np.eye(n, order='F')) + b = np.array(rndm.uniform(size=(n, 1)), order='F') + lu, piv, x, info = benchmark(run_gesv, a, b) + assert lu is a + assert x is b + assert info == 0 + + +# linalg.svd + +gesdd_sizes = [(100, 5), (1000, 222)] + + +def run_gesdd(a, lwork): + res = dgesdd(a, lwork=lwork, full_matrices=False, overwrite_a=False) + return res + + +@pytest.mark.parametrize('mn', gesdd_sizes) +def test_gesdd(benchmark, mn): + m, n = mn + rndm = np.random.RandomState(1234) + a = np.array(rndm.uniform(size=(m, n)), dtype=float, order='F') + + lwork, info = dgesdd_lwork(m, n) + lwork = int(lwork) + assert info == 0 + + u, s, vt, info = benchmark(run_gesdd, a, lwork) + + assert info == 0 + np.testing.assert_allclose(u @ np.diag(s) @ vt, a, atol=1e-13) + + +# linalg.eigh + +syev_sizes = [50, 200] + + +def run_syev(a, lwork): + res = dsyev(a, lwork=lwork, overwrite_a=True) + return res + + +@pytest.mark.parametrize('n', syev_sizes) +def test_syev(benchmark, n): + rndm = np.random.RandomState(1234) + a = rndm.uniform(size=(n, n)) + a = np.asarray(a + a.T, dtype=float, order='F') + a_ = a.copy() + + lwork, info = dsyev_lwork(n) + lwork = int(lwork) + assert info == 0 + + w, v, info = benchmark(run_syev, a, lwork) + + assert info == 0 + assert a is v # overwrite_a=True + + diff --git a/benchmark/pybench/meson.build b/benchmark/pybench/meson.build new file mode 100644 index 000000000..5d921c9ed --- /dev/null +++ b/benchmark/pybench/meson.build @@ -0,0 +1,48 @@ +# +# Taken from SciPy (of course) +# +project( + 'openblas-wrap', + 'c', 'fortran', + version: '0.1', + license: 'BSD-3', + meson_version: '>= 1.1.0', + default_options: [ + 'buildtype=debugoptimized', + 'b_ndebug=if-release', + 'c_std=c17', + 'fortran_std=legacy', + ], +) + +py3 = import('python').find_installation(pure: false) +py3_dep = py3.dependency() + +cc = meson.get_compiler('c') + +_global_c_args = cc.get_supported_arguments( + '-Wno-unused-but-set-variable', + '-Wno-unused-function', + '-Wno-conversion', + '-Wno-misleading-indentation', +) +add_project_arguments(_global_c_args, language : 'c') + +# We need -lm for all C code (assuming it uses math functions, which is safe to +# assume for SciPy). For C++ it isn't needed, because libstdc++/libc++ is +# guaranteed to depend on it. For Fortran code, Meson already adds `-lm`. +m_dep = cc.find_library('m', required : false) +if m_dep.found() + add_project_link_arguments('-lm', language : 'c') +endif + +generate_f2pymod = find_program('openblas_wrap/generate_f2pymod.py') + +openblas = dependency('openblas', method: 'pkg-config', required: true) +openblas_dep = declare_dependency( + dependencies: openblas, + compile_args: [] +) + + +subdir('openblas_wrap') diff --git a/benchmark/pybench/openblas_wrap/__init__.py b/benchmark/pybench/openblas_wrap/__init__.py new file mode 100644 index 000000000..06e16a665 --- /dev/null +++ b/benchmark/pybench/openblas_wrap/__init__.py @@ -0,0 +1,28 @@ +""" +Trampoline to hide the LAPACK details (scipy.lapack.linalg or scipy_openblas32 or...) +from benchmarking. +""" + +__version__ = "0.1" + + +#from scipy.linalg.blas import ( +from ._flapack import ( + # level 1 + dnrm2 as dnrm2, + ddot as ddot, + daxpy as daxpy, + # level 3 + dgemm as dgemm, + dsyrk as dsyrk, +) + +#from scipy.linalg.lapack import ( +from openblas_wrap._flapack import ( + # linalg.solve + dgesv as dgesv, + # linalg.svd + dgesdd as dgesdd, dgesdd_lwork as dgesdd_lwork, + # linalg.eigh + dsyev as dsyev, dsyev_lwork as dsyev_lwork +) diff --git a/benchmark/pybench/openblas_wrap/blas_lapack.pyf.src b/benchmark/pybench/openblas_wrap/blas_lapack.pyf.src new file mode 100644 index 000000000..d2d94baa0 --- /dev/null +++ b/benchmark/pybench/openblas_wrap/blas_lapack.pyf.src @@ -0,0 +1,326 @@ +! +! Taken from scipy/linalg +! +! Shorthand notations +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! Level 1 BLAS +! + + +python module _flapack + usercode ''' +#define F_INT int +''' + +interface + + +subroutine axpy(n,a,x,offx,incx,y,offy,incy) + ! Calculate z = a*x+y, where a is scalar. + + callstatement (*f2py_func)(&n,&a,x+offx,&incx,y+offy,&incy) + callprotoargument F_INT*,*,*,F_INT*,*,F_INT* + + dimension(*), intent(in) :: x + dimension(*), intent(in,out,out=z) :: y + optional, intent(in):: a=<1.0,\0,(1.0\,0.0),\2> + integer optional, intent(in),check(incx>0||incx<0) :: incx = 1 + integer optional, intent(in),check(incy>0||incy<0) :: incy = 1 + integer optional, intent(in),depend(x) :: offx=0 + integer optional, intent(in),depend(y) :: offy=0 + check(offx>=0 && offx=0 && offy(n-1)*abs(incx)) :: n + check(len(y)-offy>(n-1)*abs(incy)) :: n + +end subroutine axpy + +function ddot(n,x,offx,incx,y,offy,incy) result (xy) + ! Computes a vector-vector dot product. + + callstatement ddot_return_value = (*f2py_func)(&n,x+offx,&incx,y+offy,&incy) + callprotoargument F_INT*,double*,F_INT*,double*,F_INT* + intent(c) ddot + fortranname F_FUNC(ddot,DDOT) + + double precision dimension(*), intent(in) :: x + double precision dimension(*), intent(in) :: y + double precision ddot,xy + integer optional, intent(in),check(incx>0||incx<0) :: incx = 1 + integer optional, intent(in),check(incy>0||incy<0) :: incy = 1 + integer optional, intent(in),depend(x) :: offx=0 + integer optional, intent(in),depend(y) :: offy=0 + check(offx>=0 && offx=0 && offy(n-1)*abs(incx)) :: n + check(len(y)-offy>(n-1)*abs(incy)) :: n + +end function ddot + + +function nrm2(n,x,offx,incx) result(n2) + + nrm2, n2 + + callstatement nrm2_return_value = (*f2py_func)(&n,x+offx,&incx) + callprotoargument F_INT*,*,F_INT* + intent(c) nrm2 + fortranname F_FUNC(nrm2,NRM2) + + dimension(*),intent(in) :: x + + integer optional, intent(in),check(incx>0) :: incx = 1 + + integer optional,intent(in),depend(x) :: offx=0 + check(offx>=0 && offx(n-1)*abs(incx)) :: n + +end function nrm2 + +! +! Level 3 BLAS +! + + +subroutine gemm(m,n,k,alpha,a,b,beta,c,trans_a,trans_b,lda,ka,ldb,kb) + ! Computes a scalar-matrix-matrix product and adds the result to a + ! scalar-matrix product. + ! + ! c = gemm(alpha,a,b,beta=0,c=0,trans_a=0,trans_b=0,overwrite_c=0) + ! Calculate C <- alpha * op(A) * op(B) + beta * C + + callstatement (*f2py_func)((trans_a?(trans_a==2?"C":"T"):"N"), & + (trans_b?(trans_b==2?"C":"T"):"N"),&m,&n,&k,&alpha,a,&lda,b,&ldb,&beta,c,&m) + callprotoargument char*,char*,F_INT*,F_INT*,F_INT*,*,*,F_INT*,*, & + F_INT*,*,*,F_INT* + + integer optional,intent(in),check(trans_a>=0 && trans_a <=2) :: trans_a = 0 + integer optional,intent(in),check(trans_b>=0 && trans_b <=2) :: trans_b = 0 + intent(in) :: alpha + intent(in),optional :: beta = <0.0,\0,(0.0\,0.0),\2> + + dimension(lda,ka),intent(in) :: a + dimension(ldb,kb),intent(in) :: b + dimension(m,n),intent(in,out,copy),depend(m,n),optional :: c + check(shape(c,0)==m && shape(c,1)==n) :: c + + integer depend(a),intent(hide) :: lda = shape(a,0) + integer depend(a),intent(hide) :: ka = shape(a,1) + integer depend(b),intent(hide) :: ldb = shape(b,0) + integer depend(b),intent(hide) :: kb = shape(b,1) + + integer depend(a,trans_a,ka,lda),intent(hide):: m = (trans_a?ka:lda) + integer depend(a,trans_a,ka,lda),intent(hide):: k = (trans_a?lda:ka) + integer depend(b,trans_b,kb,ldb,k),intent(hide),check(trans_b?kb==k:ldb==k) :: & + n = (trans_b?ldb:kb) + +end subroutine gemm + + +subroutine rk(n,k,alpha,a,beta,c,trans,lower,lda,ka) + ! performs one of the symmetric rank k operations + ! C := alpha*A*A**T + beta*C, or C := alpha*A**T*A + beta*C, + ! + ! c = syrk(alpha,a,beta=0,c=0,trans=0,lower=0,overwrite_c=0) + ! + callstatement (*f2py_func)((lower?"L":"U"), & + (trans?(trans==2?"C":"T"):"N"), &n,&k,&alpha,a,&lda,&beta,c,&n) + callprotoargument char*,char*,F_INT*,F_INT*,*,*,F_INT*,*, & + *,F_INT* + + integer optional, intent(in),check(lower==0||lower==1) :: lower = 0 + integer optional,intent(in),check(trans>=0 && trans <=2) :: trans = 0 + + intent(in) :: alpha + intent(in),optional :: beta = <0.0,\0,(0.0\,0.0),\2,\2,\2> + + dimension(lda,ka),intent(in) :: a + dimension(n,n),intent(in,out,copy),depend(n),optional :: c + check(shape(c,0)==n && shape(c,1)==n) :: c + + integer depend(a),intent(hide) :: lda = shape(a,0) + integer depend(a),intent(hide) :: ka = shape(a,1) + + integer depend(a, trans, ka, lda), intent(hide) :: n = (trans ? ka : lda) + integer depend(a, trans, ka, lda), intent(hide) :: k = (trans ? lda : ka) + +end subroutine rk + + +! +! LAPACK +! + +subroutine gesv(n,nrhs,a,piv,b,info) + ! lu,piv,x,info = gesv(a,b,overwrite_a=0,overwrite_b=0) + ! Solve A * X = B. + ! A = P * L * U + ! U is upper diagonal triangular, L is unit lower triangular, + ! piv pivots columns. + + callstatement {F_INT i;(*f2py_func)(&n,&nrhs,a,&n,piv,b,&n,&info);for(i=0;i\*,F_INT*,F_INT*,*,F_INT*,F_INT* + + integer depend(a),intent(hide):: n = shape(a,0) + integer depend(b),intent(hide):: nrhs = shape(b,1) + dimension(n,n),check(shape(a,0)==shape(a,1)) :: a + integer dimension(n),depend(n),intent(out) :: piv + dimension(n,nrhs),check(shape(a,0)==shape(b,0)),depend(n) :: b + integer intent(out)::info + intent(in,out,copy,out=x) b + intent(in,out,copy,out=lu) a +end subroutine gesv + + +subroutine gesdd(m,n,minmn,u0,u1,vt0,vt1,a,compute_uv,full_matrices,u,s,vt,work,lwork,iwork,info) + ! u,s,vt,info = gesdd(a,compute_uv=1,lwork=..,overwrite_a=0) + ! Compute the singular value decomposition (SVD) using divide and conquer: + ! A = U * SIGMA * transpose(V) + ! A - M x N matrix + ! U - M x M matrix or min(M,N) x N if full_matrices=False + ! SIGMA - M x N zero matrix with a main diagonal filled with min(M,N) + ! singular values + ! transpose(V) - N x N matrix or N x min(M,N) if full_matrices=False + + callstatement (*f2py_func)((compute_uv?(full_matrices?"A":"S"):"N"),&m,&n,a,&m,s,u,&u0,vt,&vt0,work,&lwork,iwork,&info) + callprotoargument char*,F_INT*,F_INT*,*,F_INT*,*,*,F_INT*,*,F_INT*,*,F_INT*,F_INT*,F_INT* + + integer intent(in),optional,check(compute_uv==0||compute_uv==1):: compute_uv = 1 + integer intent(in),optional,check(full_matrices==0||full_matrices==1):: full_matrices = 1 + integer intent(hide),depend(a):: m = shape(a,0) + integer intent(hide),depend(a):: n = shape(a,1) + integer intent(hide),depend(m,n):: minmn = MIN(m,n) + integer intent(hide),depend(compute_uv,minmn) :: u0 = (compute_uv?m:1) + integer intent(hide),depend(compute_uv,minmn, full_matrices) :: u1 = (compute_uv?(full_matrices?m:minmn):1) + integer intent(hide),depend(compute_uv,minmn, full_matrices) :: vt0 = (compute_uv?(full_matrices?n:minmn):1) + integer intent(hide),depend(compute_uv,minmn) :: vt1 = (compute_uv?n:1) + dimension(m,n),intent(in,copy,aligned8) :: a + dimension(minmn),intent(out),depend(minmn) :: s + dimension(u0,u1),intent(out),depend(u0, u1) :: u + dimension(vt0,vt1),intent(out),depend(vt0, vt1) :: vt + dimension(lwork),intent(hide,cache),depend(lwork) :: work + integer optional,intent(in),depend(minmn,compute_uv) & + :: lwork = max((compute_uv?4*minmn*minmn+MAX(m,n)+9*minmn:MAX(14*minmn+4,10*minmn+2+25*(25+8))+MAX(m,n)),1) + integer intent(hide,cache),dimension(8*minmn),depend(minmn) :: iwork + integer intent(out)::info + +end subroutine gesdd + +subroutine gesdd_lwork(m,n,minmn,u0,vt0,a,compute_uv,full_matrices,u,s,vt,work,lwork,iwork,info) + ! LWORK computation for (S/D)GESDD + + fortranname gesdd + callstatement (*f2py_func)((compute_uv?(full_matrices?"A":"S"):"N"),&m,&n,&a,&m,&s,&u,&u0,&vt,&vt0,&work,&lwork,&iwork,&info) + callprotoargument char*,F_INT*,F_INT*,*,F_INT*,*,*,F_INT*,*,F_INT*,*,F_INT*,F_INT*,F_INT* + + integer intent(in),optional,check(compute_uv==0||compute_uv==1):: compute_uv = 1 + integer intent(in),optional,check(full_matrices==0||full_matrices==1):: full_matrices = 1 + integer intent(in) :: m + integer intent(in) :: n + integer intent(hide),depend(m,n):: minmn = MIN(m,n) + integer intent(hide),depend(compute_uv,minmn) :: u0 = (compute_uv?m:1) + integer intent(hide),depend(compute_uv,minmn, full_matrices) :: vt0 = (compute_uv?(full_matrices?n:minmn):1) + intent(hide) :: a + intent(hide) :: s + intent(hide) :: u + intent(hide) :: vt + intent(out) :: work + integer intent(hide) :: lwork = -1 + integer intent(hide) :: iwork + integer intent(out) :: info + +end subroutine gesdd_lwork + + +subroutine syev(compute_v,lower,n,w,a,lda,work,lwork,info) + ! w,v,info = syev(a,compute_v=1,lower=0,lwork=3*n-1,overwrite_a=0) + ! Compute all eigenvalues and, optionally, eigenvectors of a + ! real symmetric matrix A. + ! + ! Performance tip: + ! If compute_v=0 then set also overwrite_a=1. + + callstatement (*f2py_func)((compute_v?"V":"N"),(lower?"L":"U"),&n,a,&lda,w,work,&lwork,&info) + callprotoargument char*,char*,F_INT*,*,F_INT*,*,*,F_INT*,F_INT* + + integer optional,intent(in):: compute_v = 1 + check(compute_v==1||compute_v==0) compute_v + integer optional,intent(in),check(lower==0||lower==1) :: lower = 0 + + integer intent(hide),depend(a):: n = shape(a,0) + integer intent(hide),depend(a):: lda = MAX(1,shape(a,0)) + dimension(n,n),check(shape(a,0)==shape(a,1)) :: a + intent(in,copy,out,out=v) :: a + + dimension(n),intent(out),depend(n) :: w + + integer optional,intent(in),depend(n) :: lwork=max(3*n-1,1) + check(lwork>=3*n-1) :: lwork + dimension(lwork),intent(hide),depend(lwork) :: work + + integer intent(out) :: info + +end subroutine syev + + +subroutine syev_lwork(lower,n,w,a,lda,work,lwork,info) + ! LWORK routines for syev + + fortranname syev + + callstatement (*f2py_func)("N",(lower?"L":"U"),&n,&a,&lda,&w,&work,&lwork,&info) + callprotoargument char*,char*,F_INT*,*,F_INT*,*,*,F_INT*,F_INT* + + integer intent(in):: n + integer optional,intent(in),check(lower==0||lower==1) :: lower = 0 + + integer intent(hide),depend(n):: lda = MAX(1, n) + intent(hide):: a + intent(hide):: w + integer intent(hide):: lwork = -1 + + intent(out):: work + integer intent(out):: info + +end subroutine syev_lwork + +end interface + +end python module _flapack + + + diff --git a/benchmark/pybench/openblas_wrap/generate_f2pymod.py b/benchmark/pybench/openblas_wrap/generate_f2pymod.py new file mode 100644 index 000000000..5a8ba1389 --- /dev/null +++ b/benchmark/pybench/openblas_wrap/generate_f2pymod.py @@ -0,0 +1,299 @@ +#!/usr/bin/env python3 +""" +Process f2py template files (`filename.pyf.src` -> `filename.pyf`) + +Usage: python generate_pyf.py filename.pyf.src -o filename.pyf +""" + +import os +import sys +import re +import subprocess +import argparse + + +# START OF CODE VENDORED FROM `numpy.distutils.from_template` +############################################################# +""" +process_file(filename) + + takes templated file .xxx.src and produces .xxx file where .xxx + is .pyf .f90 or .f using the following template rules: + + '<..>' denotes a template. + + All function and subroutine blocks in a source file with names that + contain '<..>' will be replicated according to the rules in '<..>'. + + The number of comma-separated words in '<..>' will determine the number of + replicates. + + '<..>' may have two different forms, named and short. For example, + + named: + where anywhere inside a block '

' will be replaced with + 'd', 's', 'z', and 'c' for each replicate of the block. + + <_c> is already defined: <_c=s,d,c,z> + <_t> is already defined: <_t=real,double precision,complex,double complex> + + short: + , a short form of the named, useful when no

appears inside + a block. + + In general, '<..>' contains a comma separated list of arbitrary + expressions. If these expression must contain a comma|leftarrow|rightarrow, + then prepend the comma|leftarrow|rightarrow with a backslash. + + If an expression matches '\\' then it will be replaced + by -th expression. + + Note that all '<..>' forms in a block must have the same number of + comma-separated entries. + + Predefined named template rules: + + + + + +""" + +routine_start_re = re.compile( + r'(\n|\A)(( (\$|\*))|)\s*(subroutine|function)\b', + re.I +) +routine_end_re = re.compile(r'\n\s*end\s*(subroutine|function)\b.*(\n|\Z)', re.I) +function_start_re = re.compile(r'\n (\$|\*)\s*function\b', re.I) + +def parse_structure(astr): + """ Return a list of tuples for each function or subroutine each + tuple is the start and end of a subroutine or function to be + expanded. + """ + + spanlist = [] + ind = 0 + while True: + m = routine_start_re.search(astr, ind) + if m is None: + break + start = m.start() + if function_start_re.match(astr, start, m.end()): + while True: + i = astr.rfind('\n', ind, start) + if i==-1: + break + start = i + if astr[i:i+7]!='\n $': + break + start += 1 + m = routine_end_re.search(astr, m.end()) + ind = end = m and m.end()-1 or len(astr) + spanlist.append((start, end)) + return spanlist + +template_re = re.compile(r"<\s*(\w[\w\d]*)\s*>") +named_re = re.compile(r"<\s*(\w[\w\d]*)\s*=\s*(.*?)\s*>") +list_re = re.compile(r"<\s*((.*?))\s*>") + +def find_repl_patterns(astr): + reps = named_re.findall(astr) + names = {} + for rep in reps: + name = rep[0].strip() or unique_key(names) + repl = rep[1].replace(r'\,', '@comma@') + thelist = conv(repl) + names[name] = thelist + return names + +def find_and_remove_repl_patterns(astr): + names = find_repl_patterns(astr) + astr = re.subn(named_re, '', astr)[0] + return astr, names + +item_re = re.compile(r"\A\\(?P\d+)\Z") +def conv(astr): + b = astr.split(',') + l = [x.strip() for x in b] + for i in range(len(l)): + m = item_re.match(l[i]) + if m: + j = int(m.group('index')) + l[i] = l[j] + return ','.join(l) + +def unique_key(adict): + """ Obtain a unique key given a dictionary.""" + allkeys = list(adict.keys()) + done = False + n = 1 + while not done: + newkey = '__l%s' % (n) + if newkey in allkeys: + n += 1 + else: + done = True + return newkey + + +template_name_re = re.compile(r'\A\s*(\w[\w\d]*)\s*\Z') +def expand_sub(substr, names): + substr = substr.replace(r'\>', '@rightarrow@') + substr = substr.replace(r'\<', '@leftarrow@') + lnames = find_repl_patterns(substr) + substr = named_re.sub(r"<\1>", substr) # get rid of definition templates + + def listrepl(mobj): + thelist = conv(mobj.group(1).replace(r'\,', '@comma@')) + if template_name_re.match(thelist): + return "<%s>" % (thelist) + name = None + for key in lnames.keys(): # see if list is already in dictionary + if lnames[key] == thelist: + name = key + if name is None: # this list is not in the dictionary yet + name = unique_key(lnames) + lnames[name] = thelist + return "<%s>" % name + + substr = list_re.sub(listrepl, substr) # convert all lists to named templates + # newnames are constructed as needed + + numsubs = None + base_rule = None + rules = {} + for r in template_re.findall(substr): + if r not in rules: + thelist = lnames.get(r, names.get(r, None)) + if thelist is None: + raise ValueError('No replicates found for <%s>' % (r)) + if r not in names and not thelist.startswith('_'): + names[r] = thelist + rule = [i.replace('@comma@', ',') for i in thelist.split(',')] + num = len(rule) + + if numsubs is None: + numsubs = num + rules[r] = rule + base_rule = r + elif num == numsubs: + rules[r] = rule + else: + print("Mismatch in number of replacements (base <{}={}>) " + "for <{}={}>. Ignoring." + .format(base_rule, ','.join(rules[base_rule]), r, thelist)) + if not rules: + return substr + + def namerepl(mobj): + name = mobj.group(1) + return rules.get(name, (k+1)*[name])[k] + + newstr = '' + for k in range(numsubs): + newstr += template_re.sub(namerepl, substr) + '\n\n' + + newstr = newstr.replace('@rightarrow@', '>') + newstr = newstr.replace('@leftarrow@', '<') + return newstr + +def process_str(allstr): + newstr = allstr + writestr = '' + + struct = parse_structure(newstr) + + oldend = 0 + names = {} + names.update(_special_names) + for sub in struct: + cleanedstr, defs = find_and_remove_repl_patterns(newstr[oldend:sub[0]]) + writestr += cleanedstr + names.update(defs) + writestr += expand_sub(newstr[sub[0]:sub[1]], names) + oldend = sub[1] + writestr += newstr[oldend:] + + return writestr + +include_src_re = re.compile( + r"(\n|\A)\s*include\s*['\"](?P[\w\d./\\]+\.src)['\"]", + re.I +) + +def resolve_includes(source): + d = os.path.dirname(source) + with open(source) as fid: + lines = [] + for line in fid: + m = include_src_re.match(line) + if m: + fn = m.group('name') + if not os.path.isabs(fn): + fn = os.path.join(d, fn) + if os.path.isfile(fn): + lines.extend(resolve_includes(fn)) + else: + lines.append(line) + else: + lines.append(line) + return lines + +def process_file(source): + lines = resolve_includes(source) + return process_str(''.join(lines)) + +_special_names = find_repl_patterns(''' +<_c=s,d,c,z> +<_t=real,double precision,complex,double complex> + + + + + +''') + +# END OF CODE VENDORED FROM `numpy.distutils.from_template` +########################################################### + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("infile", type=str, + help="Path to the input file") + parser.add_argument("-o", "--outdir", type=str, + help="Path to the output directory") + args = parser.parse_args() + + if not args.infile.endswith(('.pyf', '.pyf.src', '.f.src')): + raise ValueError(f"Input file has unknown extension: {args.infile}") + + outdir_abs = os.path.join(os.getcwd(), args.outdir) + + # Write out the .pyf/.f file + if args.infile.endswith(('.pyf.src', '.f.src')): + code = process_file(args.infile) + fname_pyf = os.path.join(args.outdir, + os.path.splitext(os.path.split(args.infile)[1])[0]) + + with open(fname_pyf, 'w') as f: + f.write(code) + else: + fname_pyf = args.infile + + # Now invoke f2py to generate the C API module file + if args.infile.endswith(('.pyf.src', '.pyf')): + p = subprocess.Popen([sys.executable, '-m', 'numpy.f2py', fname_pyf, + '--build-dir', outdir_abs], #'--quiet'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + cwd=os.getcwd()) + out, err = p.communicate() + if not (p.returncode == 0): + raise RuntimeError(f"Writing {args.outfile} with f2py failed!\n" + f"{out}\n" + r"{err}") + + +if __name__ == "__main__": + main() diff --git a/benchmark/pybench/openblas_wrap/meson.build b/benchmark/pybench/openblas_wrap/meson.build new file mode 100644 index 000000000..9f1b71787 --- /dev/null +++ b/benchmark/pybench/openblas_wrap/meson.build @@ -0,0 +1,50 @@ +# find numpy & f2py includes +inc_numpy = run_command(py3, + ['-c', 'import os; os.chdir(".."); import numpy; print(numpy.get_include())'], + check : true +).stdout().strip() + +inc_f2py = run_command(py3, + ['-c', 'import os; os.chdir(".."); import numpy.f2py; print(numpy.f2py.get_include())'], + check : true +).stdout().strip() + + +inc_np = include_directories(inc_numpy, inc_f2py) +fortranobject_c = inc_f2py / 'fortranobject.c' + + +fortranobject_lib = static_library('_fortranobject', + fortranobject_c, +# c_args: numpy_nodepr_api, + dependencies: py3_dep, + include_directories: [inc_np, inc_f2py], + gnu_symbol_visibility: 'hidden', +) +fortranobject_dep = declare_dependency( + link_with: fortranobject_lib, + include_directories: [inc_np, inc_f2py], +) + + +# f2py generated wrappers + +flapack_module = custom_target('flapack_module', + output: ['_flapackmodule.c'], + input: 'blas_lapack.pyf.src', + command: [generate_f2pymod, '@INPUT@', '-o', '@OUTDIR@'], +) + +py3.extension_module('_flapack', + flapack_module, + link_args: [], # version_link_args, + dependencies: [openblas_dep, fortranobject_dep], + install: true, + subdir: 'openblas_wrap' +) + + +py3.install_sources( + ['__init__.py'], + subdir: 'openblas_wrap' +) diff --git a/benchmark/pybench/scipy_openblas.pc b/benchmark/pybench/scipy_openblas.pc new file mode 100644 index 000000000..2348fac62 --- /dev/null +++ b/benchmark/pybench/scipy_openblas.pc @@ -0,0 +1,12 @@ +libdir=/home/br/repos/OpenBLAS/ +includedir=/home/br/repos/OpenBLAS/ +openblas_config= OpenBLAS 0.3.27 DYNAMIC_ARCH NO_AFFINITY Haswell MAX_THREADS=64 +version=0.3.27 +extralib=-lm -lpthread -lgfortran -lquadmath -L${libdir} -lopenblas +Name: openblas +Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version +Version: ${version} +URL: https://github.com/xianyi/OpenBLAS +Libs: -L${libdir} -lopenblas +Libs.private: ${extralib} +Cflags: -I${includedir}