Merge pull request #4961 from h-vetinari/flang

explicitly link to OpenMP
2024-10-31 17:46:55 +01:00 · 2024-11-01 00:55:50 +11:00 · 2024-10-30 18:37:26 +01:00 · 2024-10-30 16:04:00 +01:00 · 2024-10-30 14:35:57 +01:00 · 2024-10-30 12:56:16 +01:00
391 changed files with 38001 additions and 7903 deletions
--- a/.cirrus.yml
+++ b/.cirrus.yml
@ -41,7 +41,7 @@ macos_instance:
 #  - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
  
 macos_instance:
-  image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
+  image: ghcr.io/cirruslabs/macos-sonoma-xcode:latest
 task:
  name: AppleM1/LLVM x86_64 xbuild
  compile_script:
@ -58,8 +58,8 @@ task:
  - export VALID_ARCHS="i386 x86_64"
  - xcrun --sdk macosx --show-sdk-path
  - xcodebuild -version
-  - export CC=/Applications/Xcode-15.3.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
-  - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-15.3.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.4.sdk -arch x86_64"
+  - export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+  - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.5.sdk -arch x86_64"
  - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
  always:
    config_artifacts:
@ -70,7 +70,7 @@ task:
 #    type: application/octet-stream

 macos_instance:
-  image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
+  image: ghcr.io/cirruslabs/macos-sonoma-xcode:latest
 task:
  name: AppleM1/LLVM armv8-ios xbuild
  compile_script:
@ -78,8 +78,10 @@ task:
  - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
  - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
  - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" 
-  - export CC=/Applications/Xcode-15.3.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
-  - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-15.3.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.4.sdk -arch arm64 -miphoneos-version-min=10.0"
+  - export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+  - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk -arch arm64 -miphoneos-version-min=10.0"
+  - xcrun --sdk iphoneos --show-sdk-path
+  - ls -l /Applications
  - make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1
  always:
    config_artifacts:
@ -87,20 +89,13 @@ task:
      type: text/plain

 macos_instance:
-  image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
+  image: ghcr.io/cirruslabs/macos-sonoma-xcode:latest
 task:
  name: AppleM1/LLVM armv7-androidndk xbuild
  compile_script:
-  - brew install android-ndk
-  - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
-  - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
-  - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" 
-  - ls /System/Volumes/Data/opt/homebrew
-  - ls -l /System/Volumes/Data/opt/homebrew/Caskroom/
-  - find /System/Volumes/Data/opt/homebrew -name "armv7a-linux-androideabi*-ranlib"
-  - #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
-  - #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
-  - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/26c/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
+  - brew install --cask android-ndk
+  - export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk"
+  - export CC=/opt/homebrew/share/android-ndk/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
  - make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
  always:
    config_artifacts:
@ -132,7 +127,7 @@ task:
 FreeBSD_task:
  name: FreeBSD-gcc12
  freebsd_instance:
-    image_family: freebsd-13-2
+    image_family: freebsd-13-3
  install_script:
  - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
  compile_script:
@ -143,7 +138,7 @@ FreeBSD_task:
 FreeBSD_task:
  name: freebsd-gcc12-ilp64
  freebsd_instance:
-    image_family: freebsd-13-2
+    image_family: freebsd-13-3
  install_script:
  - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
  compile_script:
@ -153,10 +148,10 @@ FreeBSD_task:
 FreeBSD_task:
  name: FreeBSD-clang-openmp
  freebsd_instance:
-    image_family: freebsd-13-2
+    image_family: freebsd-13-3
  install_script:
  - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc 
-  - ln -s /usr/local/lib/gcc12/libgfortran.so.5.0.0 /usr/lib/libgfortran.so
+  - ln -s /usr/local/lib/gcc13/libgfortran.so.5.0.0 /usr/lib/libgfortran.so
  compile_script:
  - gmake CC=clang FC=gfortran USE_OPENMP=1 CPP_THREAD_SAFETY_TEST=1

--- a/.github/workflows/codspeed-bench.yml
+++ b/.github/workflows/codspeed-bench.yml
@ -0,0 +1,157 @@
+name: Run codspeed benchmarks
+
+on: [push, pull_request]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read # to fetch code (actions/checkout)
+
+jobs:
+  benchmarks:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        fortran: [gfortran]
+        build: [make]
+        pyver: ["3.12"]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v3
+        with:
+            python-version: ${{ matrix.pyver }}
+
+      - name: Print system information
+        run: |
+          if [ "$RUNNER_OS" == "Linux" ]; then
+            cat /proc/cpuinfo
+          fi
+
+      - name: Install Dependencies
+        run: |
+          if [ "$RUNNER_OS" == "Linux" ]; then
+            sudo apt-get update
+            sudo apt-get install -y gfortran cmake ccache libtinfo5
+          else
+            echo "::error::$RUNNER_OS not supported"
+            exit 1
+          fi
+
+      - name: Compilation cache
+        uses: actions/cache@v3
+        with:
+          path: ~/.ccache
+          # We include the commit sha in the cache key, as new cache entries are
+          # only created if there is no existing entry for the key yet.
+          # GNU make and cmake call the compilers differently. It looks like
+          # that causes the cache to mismatch. Keep the ccache for both build
+          # tools separate to avoid polluting each other.
+          key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }}
+          # Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler.
+          restore-keys: |
+            ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}
+            ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}
+            ccache-${{ runner.os }}-${{ matrix.build }}
+
+      # Generates a hand-written pkg-config file in benchmark/pybench that
+      # points at the in-tree build of libopenblas in the workspace root.
+      - name: Write out the .pc
+        run: |
+             cd benchmark/pybench
+             # The heredoc delimiter is quoted so that the shell does not expand
+             # the pkg-config variable reference on the "Version:" line. With an
+             # unquoted delimiter it was replaced by the (unset) shell variable
+             # of the same name, leaving the Version field empty. The GitHub
+             # workspace expressions are substituted before the shell runs, so
+             # quoting the delimiter does not affect them.
+             cat > openblas.pc << 'EOF'
+             libdir=${{ github.workspace }}
+             includedir= ${{ github.workspace }}
+             openblas_config= OpenBLAS 0.3.27 DYNAMIC_ARCH NO_AFFINITY Haswell MAX_THREADS=64
+             version=0.0.99
+             extralib=-lm -lpthread -lgfortran -lquadmath -L${{ github.workspace }} -lopenblas
+             Name: openblas
+             Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
+             Version: ${version}
+             URL: https://github.com/xianyi/OpenBLAS
+             Libs: ${{ github.workspace }}/libopenblas.so -Wl,-rpath,${{ github.workspace }}
+             Libs.private: -lm -lpthread -lgfortran -lquadmath -L${{ github.workspace }} -lopenblas
+             Cflags: -I${{ github.workspace}}
+             EOF
+             # Print the generated file so its contents show up in the job log.
+             cat openblas.pc
+
+      - name: Configure ccache
+        run: |
+          if [ "${{ matrix.build }}" = "make" ]; then
+            # Add ccache to path
+            if [ "$RUNNER_OS" = "Linux" ]; then
+              echo "/usr/lib/ccache" >> $GITHUB_PATH
+            elif [ "$RUNNER_OS" = "macOS" ]; then
+              echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH
+            else
+              echo "::error::$RUNNER_OS not supported"
+              exit 1
+            fi
+          fi
+          # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB).
+          test -d ~/.ccache || mkdir -p ~/.ccache
+          echo "max_size = 300M" > ~/.ccache/ccache.conf
+          echo "compression = true" >> ~/.ccache/ccache.conf
+          ccache -s
+
+      - name: Build OpenBLAS
+        run: |
+          case "${{ matrix.build }}" in
+            "make")
+              make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}"
+              ;;
+            "cmake")
+              mkdir build && cd build
+              cmake -DDYNAMIC_ARCH=1 \
+                    -DNOFORTRAN=0 \
+                    -DBUILD_WITHOUT_LAPACK=0 \
+                    -DCMAKE_VERBOSE_MAKEFILE=ON \
+                    -DCMAKE_BUILD_TYPE=Release \
+                    -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \
+                    -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+                    -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
+                    ..
+              cmake --build .
+              ;;
+            *)
+              echo "::error::Configuration not supported"
+              exit 1
+              ;;
+          esac
+
+      - name: Show ccache status
+        continue-on-error: true
+        run: ccache -s
+
+      - name: Install benchmark dependencies
+        run: pip install meson ninja numpy pytest pytest-codspeed --user
+
+      - name: Build the wrapper
+        run: |
+          cd benchmark/pybench
+          export PKG_CONFIG_PATH=$PWD
+          meson setup build  --prefix=$PWD/build-install
+          meson install -C build
+          #
+          # sanity check
+          cd build/openblas_wrap
+          python -c'import _flapack; print(dir(_flapack))'
+
+      - name: Run benchmarks under pytest-benchmark
+        run: |
+          cd benchmark/pybench
+          pip install pytest-benchmark
+          export PYTHONPATH=$PWD/build-install/lib/python${{matrix.pyver}}/site-packages/
+          OPENBLAS_NUM_THREADS=1 pytest benchmarks/bench_blas.py -k 'gesdd'
+
+      - name: Run benchmarks
+        uses: CodSpeedHQ/action@v2
+        with:
+          token: ${{ secrets.CODSPEED_TOKEN }}
+          run: |
+            cd benchmark/pybench
+            export PYTHONPATH=$PWD/build-install/lib/python${{matrix.pyver}}/site-packages/
+            OPENBLAS_NUM_THREADS=1 pytest benchmarks/bench_blas.py --codspeed
+
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@ -1,22 +1,38 @@
 name: Publish docs via GitHub Pages
+
 on:
  push:
    branches:
      - develop
+  pull_request:
+    branches:
+      - develop
+
 jobs:
  build:
    name: Deploy docs
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - uses: actions/setup-python@v5
        with:
          python-version: "3.10"
-      - run: pip install mkdocs mkdocs-material
-      # mkdocs gh-deploy command only builds to the top-level, hence building then deploying ourselves
-      - run: mkdocs build
+
+      - name: Install MkDocs and doc theme packages
+        run: pip install mkdocs mkdocs-material mkdocs-git-revision-date-localized-plugin
+
+      - name: Build docs site
+        run: mkdocs build
+
+      # mkdocs gh-deploy command only builds to the top-level, hence deploying
+      # with this action instead.
+      # Deploys to http://www.openmathlib.org/OpenBLAS/docs/
      - name: Deploy docs
-        uses: peaceiris/actions-gh-pages@v3
+        uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
        if: ${{ github.ref == 'refs/heads/develop' }}
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/loongarch64.yml
+++ b/.github/workflows/loongarch64.yml
@ -9,22 +9,31 @@ concurrency:
 jobs:
  TEST:
    if: "github.repository == 'OpenMathLib/OpenBLAS'"
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
    strategy:
      fail-fast: false
      matrix:
        include:
          - target: LOONGSONGENERIC
-            triple:  loongarch64-unknown-linux-gnu
+            triple:  loongarch64-linux-gnu
            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSONGENERIC
          - target: LOONGSON3R5
-            triple: loongarch64-unknown-linux-gnu
+            triple: loongarch64-linux-gnu
            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5
          - target: LOONGSON2K1000
-            triple: loongarch64-unknown-linux-gnu
+            triple: loongarch64-linux-gnu
            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000
+          - target: LA64_GENERIC
+            triple:  loongarch64-linux-gnu
+            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA64_GENERIC
+          - target: LA464
+            triple: loongarch64-linux-gnu
+            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA464
+          - target: LA264
+            triple: loongarch64-linux-gnu
+            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA264
          - target: DYNAMIC_ARCH
-            triple: loongarch64-unknown-linux-gnu
+            triple: loongarch64-linux-gnu
            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC

    steps:
@ -33,21 +42,9 @@ jobs:

      - name: Install APT deps
        run: |
-          sudo add-apt-repository ppa:savoury1/virtualisation
-          sudo apt-get update
-          sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
-          qemu-user-static
-
-      - name: Download and install loongarch64-toolchain
-        run: |
-          wget https://github.com/sunhaiyong1978/CLFS-for-LoongArch/releases/download/8.1/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz
-          #wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz
-          tar -xf CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz -C /opt
-
-      - name: Set env
-        run: |
-          echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
-          echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> $GITHUB_ENV
+          sudo apt-get update && \
+          sudo apt-get install autoconf automake autotools-dev ninja-build make ccache qemu-user-static \
+                               gcc-14-loongarch64-linux-gnu g++-14-loongarch64-linux-gnu gfortran-14-loongarch64-linux-gnu

      - name: Compilation cache
        uses: actions/cache@v3
@ -68,11 +65,13 @@ jobs:
      - name: Disable utest dsdot:dsdot_n_1
        run: |
          echo -n > utest/test_dsdot.c
-          echo "Due to the qemu versions 7.2 causing utest cases to fail,"
+          echo "Due to the current version of qemu causing utest cases to fail,"
          echo "the utest dsdot:dsdot_n_1 have been temporarily disabled."

      - name: Build OpenBLAS
-        run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)
+        run: |
+          make CC='ccache ${{ matrix.triple }}-gcc-14 -static' FC='ccache ${{ matrix.triple }}-gfortran-14 -static' \
+          RANLIB='ccache ${{ matrix.triple }}-gcc-ranlib-14' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)

      - name: Test
        run: |
--- a/.github/workflows/loongarch64_clang.yml
+++ b/.github/workflows/loongarch64_clang.yml
@ -0,0 +1,141 @@
+name: loongarch64 clang qemu test
+
+on: [push, pull_request]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  TEST:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - target: LOONGSONGENERIC
+            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSONGENERIC
+          - target: LOONGSON3R5
+            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5
+          - target: LOONGSON2K1000
+            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000
+          - target: LA64_GENERIC
+            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA64_GENERIC
+          - target: LA464
+            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA464
+          - target: LA264
+            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA264
+          - target: DYNAMIC_ARCH
+            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Install libffi6
+        run: |
+          wget http://ftp.ca.debian.org/debian/pool/main/libf/libffi/libffi6_3.2.1-9_amd64.deb
+          sudo dpkg -i libffi6_3.2.1-9_amd64.deb
+
+      - name: Install APT deps
+        run: |
+          sudo apt-get update
+          sudo apt-get install autoconf automake autotools-dev ninja-build make ccache
+
+      - name: Download and install loongarch64-toolchain
+        run: |
+          wget https://github.com/XiWeiGu/loongarch64_toolchain/releases/download/V0.1/clang+llvm_8.0.1-6_amd64-linux-gnu_debian-10.tar.gz
+          wget https://github.com/XiWeiGu/loongarch64_toolchain/releases/download/V0.1/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.3.tar.xz
+          tar -xf clang+llvm_8.0.1-6_amd64-linux-gnu_debian-10.tar.gz -C /opt
+          tar -xf loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.3.tar.xz -C /opt
+
+      - name: Checkout qemu
+        uses: actions/checkout@v3
+        with:
+          repository: qemu/qemu
+          path: qemu
+          ref: master
+
+      - name: Install qemu
+        run: |
+          cd qemu
+          ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=loongarch64-linux-user --disable-system --static
+          make -j$(nproc)
+          make install
+
+      - name: Set env
+        run: |
+          echo "PATH=$GITHUB_WORKSPACE:/opt/clang+llvm_8.0.1-6_amd64-linux-gnu_debian-10/bin:/opt/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.3/bin:$PATH" >> $GITHUB_ENV
+
+      - name: Compilation cache
+        uses: actions/cache@v3
+        with:
+          path: ~/.ccache
+          key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
+          restore-keys: |
+            ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
+            ccache-${{ runner.os }}-${{ matrix.target }}
+
+      - name: Configure ccache
+        run: |
+          test -d ~/.ccache || mkdir -p ~/.ccache
+          echo "max_size = 300M" > ~/.ccache/ccache.conf
+          echo "compression = true" >> ~/.ccache/ccache.conf
+          ccache -s
+
+      # Empties utest/test_dsdot.c before the build and logs why.
+      # The message no longer names a fixed qemu release: this workflow checks
+      # out qemu from its master branch, so "7.2" did not describe what is run.
+      - name: Disable utest dsdot:dsdot_n_1
+        run: |
+          echo -n > utest/test_dsdot.c
+          echo "Due to the current version of qemu causing utest cases to fail,"
+          echo "the utest dsdot:dsdot_n_1 have been temporarily disabled."
+
+      - name: Build OpenBLAS
+        run: make CC='ccache clang --target=loongarch64-linux-gnu --sysroot=/opt/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.3/loongarch64-linux-gnu/sysroot/ -static' FC='ccache loongarch64-linux-gnu-gfortran -static' HOSTCC='ccache clang' CROSS_SUFFIX=llvm-  NO_SHARED=1 ${{ matrix.opts }} -j$(nproc)
+
+      - name: Test
+        run: |
+          export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
+          qemu-loongarch64 ./utest/openblas_utest
+          qemu-loongarch64 ./utest/openblas_utest_ext
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat2 < ./ctest/sin2
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat2 < ./ctest/din2
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat2 < ./ctest/cin2
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat2 < ./ctest/zin2
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat3 < ./ctest/sin3
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat3 < ./ctest/din3
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat3 < ./ctest/cin3
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat3 < ./ctest/zin3
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat1
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat1
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat1
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/zblat1
+          rm -f ./test/?BLAT2.SUMM
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat2 < ./test/sblat2.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat2 < ./test/dblat2.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat2 < ./test/cblat2.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat2 < ./test/zblat2.dat
+          rm -f ./test/?BLAT2.SUMM
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat2 < ./test/sblat2.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat2 < ./test/dblat2.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat2 < ./test/cblat2.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/zblat2 < ./test/zblat2.dat
+          rm -f ./test/?BLAT3.SUMM
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat3 < ./test/sblat3.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat3 < ./test/dblat3.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat3 < ./test/cblat3.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat3 < ./test/zblat3.dat
+          rm -f ./test/?BLAT3.SUMM
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat3 < ./test/sblat3.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat3 < ./test/dblat3.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat3 < ./test/cblat3.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/zblat3 < ./test/zblat3.dat
+
--- a/.github/workflows/nightly-Homebrew-build.yml
+++ b/.github/workflows/nightly-Homebrew-build.yml
@ -69,7 +69,7 @@ jobs:
          mv *.bottle.tar.gz bottles

      - name: Upload bottle
-        uses: actions/upload-artifact@v1
+        uses: actions/upload-artifact@v3
        with:
          name: openblas--HEAD.catalina.bottle.tar.gz
          path: bottles
--- a/.github/workflows/riscv64_vector.yml
+++ b/.github/workflows/riscv64_vector.yml
@ -28,6 +28,9 @@ jobs:
          - target: RISCV64_ZVL256B
            opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64
            qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64
+          - target: DYNAMIC_ARCH=1
+            opts: TARGET=RISCV64_GENERIC BINARY=64 ARCH=riscv64 DYNAMIC_ARCH=1
+            qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64

    steps:
      - name: Checkout repository
--- a/.gitignore
+++ b/.gitignore
@ -109,3 +109,4 @@ benchmark/smallscaling
 CMakeCache.txt
 CMakeFiles/*
 .vscode
+**/__pycache__
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -2,13 +2,13 @@
 ## Author: Hank Anderson <hank@statease.com>
 ##

-cmake_minimum_required(VERSION 2.8.5)
+cmake_minimum_required(VERSION 3.16.0)

 project(OpenBLAS C ASM)

 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 27.dev)
+set(OpenBLAS_PATCH_VERSION 28.dev)

 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")

@ -22,6 +22,8 @@ option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS

 option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON)

+set(LAPACK_STRLEN "" CACHE STRING "When building LAPACK, use this type (e.g. \"int\") for character lengths (defaults to size_t)")
+
 option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)

 option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF)
@ -30,7 +32,7 @@ option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OF

 option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)

-option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
+option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64, ppc or RISCV64-RVV1.0 only)" OFF)

 option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)

@ -100,6 +102,10 @@ endif()

 message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")

+if (USE_OPENMP)
+  find_package(OpenMP REQUIRED)
+endif ()
+
 include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
 include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")

@ -256,6 +262,15 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago
  endif()
 endif()

+if (USE_OPENMP)
+  if(BUILD_STATIC_LIBS)
+    target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C)
+  endif()
+  if(BUILD_SHARED_LIBS)
+    target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C)
+  endif()
+endif()
+
 # Seems that this hack isn't required since macOS 11 Big Sur
 if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
  set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@ -198,6 +198,9 @@ In chronological order:
 * PingTouGe Semiconductor Co., Ltd.
  * [2020-10] Add RISC-V Vector (0.7.1) support. Optimize BLAS kernels for Xuantie C910

+* Jake Arkinstall <https://github.com/jake-arkinstall>
+  * [2021-02-10] Remove in-source configure_file to enable builds in read-only contexts (issue #3100, PR #3101)
+
 * River Dillon <oss@outerpassage.net>
  * [2021-07-10] fix compilation with musl libc

@ -223,3 +226,6 @@ In chronological order:

 * Dirreke <https://github.com/mseminatore>
  * [2024-01-16] Add basic support for the CSKY architecture
+
+* Christopher Daley <https://github.com/cdaley>
+  * [2024-01-24] Optimize GEMV forwarding on ARM64 systems
--- a/Changelog.txt
+++ b/Changelog.txt
@ -1,4 +1,127 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.28
+ 8-Aug-2024
+
+general:
+- Reworked the unfinished implementation of HUGETLB from GotoBLAS
+  for allocating huge memory pages as buffers on suitable systems
+- Changed the unfinished implementation of GEMM3M for the generic
+  target on all architectures to at least forward to regular GEMM
+- Improved multithreaded GEMM performance for large non-skinny matrices
+- Improved BLAS3 performance on larger multicore systems through improved
+  parallelism
+- Improved performance of the initial memory allocation by reducing
+  locking overhead
+- Improved performance of GBMV at small problem sizes by introducing
+  a size barrier for the switch to multithreading
+- Added an implementation of the CBLAS_GEMM_BATCH extension
+- Fixed miscompilation of CAXPYC and ZAXPYC on all architectures in 
+  CMAKE builds (error introduced in 0.3.27)
+- Fixed corner cases involving the handling of NAN and INFINITY
+  arguments in ?SCAL on all architectures
+- Added support for cross-compiling to WASM with CMAKE (in addition
+  to the already present makefile support)
+- Fixed NAN handling and potential accuracy issues in compilations with
+  Intel ICX by supplying a suitable fp-model option by default
+- The contents of the github project wiki have been converted into
+  a new set of documentation included with the source code.
+- It is now possible to register a callback function that replaces
+  the built-in support for multithreading with an external backend
+  like TBB (openblas_set_threads_callback_function)
+- Fixed potential duplication of suffixes in shared library naming
+- Improved C compiler detection by the build system to tolerate more
+  naming variants for gcc builds
+- Fixed an unnecessary dependency of the utest on CBLAS
+- Fixed spurious error reports from the BLAS extensions utest
+- Fixed unwanted invocation of the GEMM3M tests in cross-compilation
+- Fixed a flaw in the makefile build that could lead to the pkgconfig
+  file containing an entry of UNKNOWN for the target cpu after installing
+- Integrated fixes from the Reference-LAPACK project:
+  - Fixed uninitialized variables in the LAPACK tests for ?QP3RK (PR 961)
+  - Fixed potential bounds error in ?UNHR_COL/?ORHR_COL (PR 1018)
+  - Fixed potential infinite loop in the LAPACK testsuite (PR 1024)
+  - Make the variable type used for hidden length arguments configurable (PR 1025)  
+  - Fixed SYTRD workspace computation and various typos (PR 1030)
+  - Prevent compiler use of FMA that could increase numerical error in ?GEEVX (PR 1033)
+
+x86-64:
+- reverted thread management under Windows to its state before 0.3.26
+  due to signs of race conditions in some circumstances now under study
+- fixed accidental selection of the unoptimized generic SBGEMM kernel
+  in CMAKE builds for CooperLake and SapphireRapids targets
+- fixed a potential thread buffer overrun in SBSTOBF16 on small systems
+- fixed an accuracy issue in ZSCAL introduced in 0.3.26
+- fixed compilation with CMAKE and recent releases of LLVM
+- added support for Intel Emerald Rapids and Meteor Lake cpus
+- added autodetection support for the Zhaoxin KX-7000 cpu
+- fixed autodetection of Intel Prescott (probably broken since 0.3.19)
+- fixed compilation for older targets with the Yocto SDK
+- fixed compilation of the converter-generated C versions
+  of the LAPACK sources with gcc-14
+- improved compiler options when building with CMAKE and LLVM for
+  AVX512-capable targets
+- added support for supplying the L2 cache size via an environment
+  variable (OPENBLAS_L2_SIZE) in case it is not correctly reported
+  (as in some VM configurations)
+- improved the error message shown when thread creation fails on startup
+- fixed setting the rpath entry of the dylib in CMAKE builds on MacOS
+
+arm:
+- fixed building for baremetal targets with make
+
+arm64:
+- Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1
+  matrix to the corresponding GEMV kernel 
+- added optimized SGEMV and DGEMV kernels for A64FX
+- added optimized SVE kernels for small-matrix GEMM
+- added A64FX to the cpu list for DYNAMIC_ARCH
+- fixed building with support for cpu affinity
+- worked around accuracy problems with C/ZNRM2 on NeoverseN1 and
+  Apple M targets
+- improved GEMM performance on Neoverse V1
+- fixed compilation for NEOVERSEN2 with older compilers
+- fixed potential miscompilation of the SVE SDOT and DDOT kernels
+- fixed potential miscompilation of the non-SVE CDOT and ZDOT kernels
+- fixed a potential overflow when using very large user-defined BUFFERSIZE
+- fixed setting the rpath entry of the dylib in CMAKE builds on MacOS
+
+power:
+- Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1
+  matrix to the corresponding GEMV kernel 
+- significantly improved performance of SBGEMM on POWER10
+- fixed compilation with OpenMP and the XLF compiler
+- fixed building of the BLAS extension utests under AIX
+- fixed building of parts of the LAPACK testsuite with XLF
+- fixed CSWAP/ZSWAP on big-endian POWER10 targets
+- fixed a performance regression in SAXPY on POWER10 with OpenXL
+- fixed accuracy issues in CSCAL/ZSCAL when compiled with LLVM
+- fixed building for POWER9 under FreeBSD
+- fixed a potential overflow when using very large user-defined BUFFERSIZE
+- fixed an accuracy issue in the POWER6 kernels for GEMM and GEMV
+
+riscv64:
+- Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1
+  matrix to the corresponding GEMV kernel 
+- fixed building for RISCV64_GENERIC with OpenMP enabled
+- added DYNAMIC_ARCH support (comprising RISCV64_GENERIC and the two
+  RVV 1.0 targets with vector length of 128 and 256)
+- worked around the ZVL128B kernels for AXPBY mishandling the special
+  case of zero Y increment
+
+loongarch64:
+- improved GEMM performance on servers of the 3C5000 generation
+- improved performance and stability of DGEMM
+- improved GEMV and TRSM kernels for LSX and LASX vector ABIs
+- fixed CMAKE compilation with the INTERFACE64 option set
+- fixed compilation with CMAKE
+- worked around spurious errors flagged by the BLAS3 tests
+- worked around a miscompilation of the POTRS utest by gcc 14.1
+
+mips64:
+- fixed ASUM and SUM kernels to accept negative step sizes in X
+- fixed complex GEMV kernels for MSA
+
 ====================================================================
 Version 0.3.27
 4-Apr-2024
--- a/Jenkinsfile.pwr
+++ b/Jenkinsfile.pwr
@ -1,7 +1,7 @@
 pipeline {
    agent { 
        docker {
-            image 'osuosl/ubuntu-ppc64le'
+            image 'osuosl/ubuntu-ppc64le:18.04'
        }
    }
    stages {
--- a/Makefile
+++ b/Makefile
@ -45,6 +45,10 @@ else
 LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
 endif

+ifdef LAPACK_STRLEN
+LAPACK_FFLAGS += -DLAPACK_STRLEN=$(LAPACK_STRLEN)
+endif
+
 SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test

 .PHONY : all libs netlib $(RELA) test ctest shared install
--- a/Makefile.arm64
+++ b/Makefile.arm64
@ -176,6 +176,16 @@ endif
 endif
 endif

+# Detect ARM Neoverse V2.
+ifeq ($(CORE), NEOVERSEV2)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ12) $(ISCLANG)))
+CCOMMON_OPT += -march=armv9-a -mtune=neoverse-v2
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=armv9-a -mtune=neoverse-v2
+endif
+endif
+endif
+
 # Use a53 tunings because a55 is only available in GCC>=8.1
 ifeq ($(CORE), CORTEXA55)
 ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
@ -266,12 +276,19 @@ endif
 endif
 endif

-ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
 ifeq ($(CORE), A64FX)
+ifneq (,$(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG))) # true if ANY listed flag is 1
+ifneq (,$(filter 1,$(GCCMINORVERSIONGTEQ3) $(GCCVERSIONGTEQ11) $(ISCLANG))) # filter may yield "1 1", so test for non-empty
 CCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx
 endif
+else
+CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-n1
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-n1
+endif
+endif
 endif
 endif

--- a/Makefile.install
+++ b/Makefile.install
@ -14,6 +14,9 @@ endif
 ifeq ($(INTERFACE64),1)
 USE_64BITINT=1
 endif
+ifeq ($(USE_OPENMP),1)
+	FOMP_OPT:= -fopenmp
+endif

 PREFIX ?= /opt/OpenBLAS

@ -72,18 +75,18 @@ ifndef NO_CBLAS
 	@echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
 	@cp cblas.h cblas.tmp
 ifdef SYMBOLPREFIX
-	@sed 's/cblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp > cblas.tmp2
-	@sed 's/openblas[^( ]*/$(SYMBOLPREFIX)&/g'  cblas.tmp2 > cblas.tmp
+	@sed 's/cblas[^() ]*/$(SYMBOLPREFIX)&/g' cblas.tmp > cblas.tmp2
+	@sed 's/openblas[^() ]*/$(SYMBOLPREFIX)&/g'  cblas.tmp2 > cblas.tmp
 	#change back any openblas_complex_float and double that got hit
 	@sed 's/$(SYMBOLPREFIX)openblas_complex_/openblas_complex_/g'  cblas.tmp > cblas.tmp2
-	@sed 's/goto[^( ]*/$(SYMBOLPREFIX)&/g'  cblas.tmp2 > cblas.tmp
+	@sed 's/goto[^() ]*/$(SYMBOLPREFIX)&/g'  cblas.tmp2 > cblas.tmp
 endif
 ifdef SYMBOLSUFFIX
-	@sed 's/cblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp > cblas.tmp2
-	@sed 's/openblas[^( ]*/&$(SYMBOLSUFFIX)/g'  cblas.tmp2 > cblas.tmp
+	@sed 's/cblas[^() ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp > cblas.tmp2
+	@sed 's/openblas[^() ]*/&$(SYMBOLSUFFIX)/g'  cblas.tmp2 > cblas.tmp
 	#change back any openblas_complex_float and double that got hit
 	@sed 's/\(openblas_complex_\)\([^ ]*\)$(SYMBOLSUFFIX)/\1\2 /g'  cblas.tmp > cblas.tmp2
-	@sed 's/goto[^( ]*/&$(SYMBOLSUFFIX)/g'  cblas.tmp2 > cblas.tmp
+	@sed 's/goto[^() ]*/&$(SYMBOLSUFFIX)/g'  cblas.tmp2 > cblas.tmp
 endif
 	@sed 's/common/openblas_config/g' cblas.tmp > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
 endif
@ -178,7 +181,8 @@ endif
 	@echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)"
 	@echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)"
 	@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)"
-	@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
+	@echo 'omp_opt='$(FOMP_OPT) >> "$(PKGFILE)"
+	@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(TARGET) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
 	@echo 'version='$(VERSION) >> "$(PKGFILE)"
 	@echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)"
 	@cat openblas.pc.in >> "$(PKGFILE)"
--- a/Makefile.riscv64
+++ b/Makefile.riscv64
@ -8,13 +8,13 @@ FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
 endif
 ifeq ($(CORE), RISCV64_ZVL256B)
 CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d
-FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
+FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
 endif
 ifeq ($(CORE), RISCV64_ZVL128B)
 CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d 
-FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
+FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
 endif
 ifeq ($(CORE), RISCV64_GENERIC)
 CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
-FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static
+FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
 endif
--- a/Makefile.rule
+++ b/Makefile.rule
@ -3,7 +3,7 @@
 #

 # This library's version
-VERSION = 0.3.27.dev
+VERSION = 0.3.28.dev

 # If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a
 # and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library
@ -134,6 +134,12 @@ VERSION = 0.3.27.dev
 # Build LAPACK Deprecated functions since LAPACK 3.6.0
 BUILD_LAPACK_DEPRECATED = 1

+# The variable type assumed for the length of character arguments when passing
+# data between Fortran LAPACK and C BLAS (defaults to "size_t", but older GCC
+# versions used "int"). Mismatches will not cause runtime failures but may result
+# in build warnings or errors when building with link-time optimization (LTO)
+# LAPACK_STRLEN=int
+
 # Build RecursiveLAPACK on top of LAPACK
 # BUILD_RELAPACK = 1
 # Have RecursiveLAPACK actually replace standard LAPACK routines instead of 
@ -173,6 +179,10 @@ NO_AFFINITY = 1
 # If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
 # BIGNUMA = 1

+# If you are compiling for an embedded system ("bare metal") like Cortex M series
+# Note that you will have to provide implementations of malloc() and free() in this case
+# EMBEDDED = 1
+
 # Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers
 # and OS. However, the performance is low.
 # NO_AVX = 1
@ -215,6 +225,16 @@ NO_AFFINITY = 1
 # to the user space. If bigphysarea is enabled, it will use it.
 # DEVICEDRIVER_ALLOCATION = 1

+# Use large page allocation (called hugepage support in Linux context)
+# for the thread buffers (with access by shared memory operations)
+# HUGETLB_ALLOCATION = 1
+
+# Use large page allocation (called hugepages in Linux) based on mmap accessing
+# a memory-backed pseudofile (requires hugetlbfs to be mounted in the system,
+# the example below has it mounted on /hugepages. OpenBLAS will create the backing
+# file as gotoblas.processid in that path)
+# HUGETLBFILE_ALLOCATION = /hugepages
+
 # If you need to synchronize FP CSR between threads (for x86/x86_64 and aarch64 only).
 # CONSISTENT_FPCSR = 1

--- a/Makefile.system
+++ b/Makefile.system
@ -268,10 +268,34 @@ SMALL_MATRIX_OPT = 1
 else ifeq ($(ARCH), power)
 SMALL_MATRIX_OPT = 1
 BUILD_BFLOAT16 = 1
+else ifeq ($(ARCH), arm64)
+SMALL_MATRIX_OPT = 1
 endif
+ifeq ($(ARCH), loongarch64)
+SMALL_MATRIX_OPT = 1
+endif
+ifeq ($(ARCH), arm64)
+GEMM_GEMV_FORWARD = 1
+endif
+ifeq ($(ARCH), riscv64) # RISC-V builds use ARCH=riscv64 (see the DYNAMIC_CORE setup), never plain "riscv"
+GEMM_GEMV_FORWARD = 1
+endif
+ifeq ($(ARCH), power)
+GEMM_GEMV_FORWARD = 1
+GEMM_GEMV_FORWARD_BF16 = 1
+endif
+
 ifeq ($(SMALL_MATRIX_OPT), 1)
 CCOMMON_OPT += -DSMALL_MATRIX_OPT
 endif
+ifneq ($(ONLY_CBLAS), 1)
+ifeq ($(GEMM_GEMV_FORWARD), 1)
+CCOMMON_OPT += -DGEMM_GEMV_FORWARD
+endif
+ifeq ($(GEMM_GEMV_FORWARD_BF16), 1)
+CCOMMON_OPT += -DGEMM_GEMV_FORWARD_BF16
+endif
+endif

 # This operation is expensive, so execution should be once.
 ifndef GOTOBLAS_MAKEFILE
@ -683,6 +707,7 @@ ifneq ($(NO_SVE), 1)
 DYNAMIC_CORE += NEOVERSEV1
 DYNAMIC_CORE += NEOVERSEN2
 DYNAMIC_CORE += ARMV8SVE
+DYNAMIC_CORE += A64FX
 endif
 DYNAMIC_CORE += THUNDERX
 DYNAMIC_CORE += THUNDERX2T99
@ -706,7 +731,18 @@ endif
 endif

 ifeq ($(ARCH), loongarch64)
-DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC
+DYNAMIC_CORE = LA64_GENERIC LA264 LA464
+endif
+
+ifeq ($(ARCH), riscv64)
+DYNAMIC_CORE = RISCV64_GENERIC
+DYNAMIC_CORE += RISCV64_ZVL128B
+DYNAMIC_CORE += RISCV64_ZVL256B
+ifdef DYNAMIC_LIST
+override DYNAMIC_CORE = RISCV64_GENERIC $(DYNAMIC_LIST)
+XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_RISCV64_GENERIC
+XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
+endif
 endif

 ifeq ($(ARCH), zarch)
@ -811,8 +847,12 @@ ifeq ($(ARCH), arm)
 NO_BINARY_MODE  = 1
 BINARY_DEFINED  = 1

+ifneq ($(EMBEDDED), 1)
 CCOMMON_OPT += -marm
 FCOMMON_OPT += -marm
+else
+CCOMMON_OPT += -DOS_EMBEDDED -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16
+endif

 # If softfp abi is mentioned on the command line, force it.
 ifeq ($(ARM_SOFTFP_ABI), 1)
@ -955,12 +995,18 @@ endif

 ifeq ($(ARCH), loongarch64)
 LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d)
+LA64_ARCH=$(shell $(CC) -march=loongarch64 -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo loongarch64)
 ifneq ($(LA64_ABI), lp64d)
 LA64_ABI=lp64
 endif
+ifneq ($(LA64_ARCH), loongarch64)
+CCOMMON_OPT += -mabi=$(LA64_ABI)
+FCOMMON_OPT += -mabi=$(LA64_ABI)
+else
 CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
 FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
 endif
+endif

 endif

@ -1579,13 +1625,23 @@ ifdef FUNCTION_PROFILE
 CCOMMON_OPT	+= -DFUNCTION_PROFILE
 endif

+ifdef SHMEM_ALLOCATION
+ifneq ($(SHMEM_ALLOCATION), 0)
+CCOMMON_OPT	+= -DALLOC_SHM
+endif
+endif
+
 ifdef HUGETLB_ALLOCATION
+ifneq ($(HUGETLB_ALLOCATION), 0)
 CCOMMON_OPT	+= -DALLOC_HUGETLB
 endif
+endif

 ifdef HUGETLBFILE_ALLOCATION
+ifneq ($(HUGETLBFILE_ALLOCATION), 0)
 CCOMMON_OPT	+= -DALLOC_HUGETLBFILE -DHUGETLB_FILE_NAME=$(HUGETLBFILE_ALLOCATION)
 endif
+endif

 ifdef STATIC_ALLOCATION
 CCOMMON_OPT	+= -DALLOC_STATIC
@ -1668,8 +1724,8 @@ LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx
 override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
 endif
 ifeq ($(F_COMPILER),FLANGNEW)
-LAPACK_FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
-override FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
+LAPACK_FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 -mtune=% -mabi=% ,$(FFLAGS))
+override FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 -mtune=% -mabi=% ,$(FFLAGS))
 endif

 LAPACK_CFLAGS = $(CFLAGS)
--- a/Makefile.x86_64
+++ b/Makefile.x86_64
@ -8,6 +8,11 @@ endif
 endif
 endif

+ifeq ($(C_COMPILER), CLANG)
+ifeq ($(findstring icx,$(CC)),icx)
+CCOMMON_OPT += -fp-model=consistent
+endif
+endif

 ifneq ($(DYNAMIC_ARCH),1)
 ADD_CPUFLAGS = 1
--- a/README.md
+++ b/README.md
@ -2,12 +2,8 @@

 [![Join the chat at https://gitter.im/xianyi/OpenBLAS](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/xianyi/OpenBLAS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)

-Travis CI: [![Build Status](https://travis-ci.com/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.com/xianyi/OpenBLAS)
-
-AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
-
 Cirrus CI: [![Build Status](https://api.cirrus-ci.com/github/xianyi/OpenBLAS.svg?branch=develop)](https://cirrus-ci.com/github/xianyi/OpenBLAS)
-<!-- Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/status.svg?branch=develop)](https://cloud.drone.io/xianyi/OpenBLAS/)-->
+


 [![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
@ -19,7 +15,7 @@ OSUOSL IBMZ-CI [![Build Status](http://ibmz-ci.osuosl.org/buildStatus/icon?job=O

 OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version.

-Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
+Please read the documentation in the docs folder of the OpenBLAS repository: <https://github.com/OpenMathLib/OpenBLAS/tree/develop/docs>.

 For a general introduction to the BLAS routines, please refer to the extensive documentation of their reference implementation hosted at netlib:
 <https://www.netlib.org/blas>. On that site you will likewise find documentation for the reference implementation of the higher-level library LAPACK - the **L**inear **A**lgebra **Pack**age that comes included with OpenBLAS. If you are looking for a general primer or refresher on Linear Algebra, the set of six
@ -31,12 +27,12 @@ We provide official binary packages for the following platform:

  * Windows x86/x86_64

-You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the Releases section of the github project page, [https://github.com/xianyi/OpenBLAS/releases](https://github.com/xianyi/OpenBLAS/releases).
+You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the Releases section of the github project page, [https://github.com/OpenMathLib/OpenBLAS/releases](https://github.com/OpenMathLib/OpenBLAS/releases).

 ## Installation from Source

-Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
-using Git from https://github.com/xianyi/OpenBLAS.git. (If you want the most up to date version, be
+Download from project homepage, https://github.com/OpenMathLib/OpenBLAS/, or check out the code
+using Git from https://github.com/OpenMathLib/OpenBLAS.git. (If you want the most up to date version, be
 sure to use the develop branch - master is several years out of date due to a change of maintainership.)
 Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option.
 Most can also be given directly on the make or cmake command line.
@ -45,10 +41,10 @@ Most can also be given directly on the make or cmake command line.

 Building OpenBLAS requires the following to be installed:

-* GNU Make
+* GNU Make or CMake
 * A C compiler, e.g. GCC or Clang
 * A Fortran compiler (optional, for LAPACK)
-* IBM MASS (optional, see below)
+

 ### Normal compile

@ -66,24 +62,24 @@ build options you plan to set.

 ### Cross compile

-Set `CC` and `FC` to point to the cross toolchains, and set `HOSTCC` to your host C compiler.
+Set `CC` and `FC` to point to the cross toolchains, and if you use `make`, also set `HOSTCC` to your host C compiler.
 The target must be specified explicitly when cross compiling.

 Examples:

-* On an x86 box, compile this library for a loongson3a CPU:
+* On a Linux system, cross-compiling to an older MIPS64 router board:
  ```sh
-  make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
+  make BINARY=64 CC=mipsisa64r6el-linux-gnuabi64-gcc FC=mipsisa64r6el-linux-gnuabi64-gfortran HOSTCC=gcc TARGET=P6600
  ```
-  or same with the newer mips-crosscompiler put out by Loongson that defaults to the 32bit ABI:
+* or to a 32-bit Windows target:
  ```sh
-  make HOSTCC=gcc CC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gcc -mabi=64' FC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gfortran -mabi=64' TARGET=LOONGSON3A
+  make CC="i686-w64-mingw32-gcc -Bstatic" FC="i686-w64-mingw32-gfortran -static-libgfortran" TARGET=HASWELL BINARY=32 CROSS=1 NUM_THREADS=20 CONSISTENT_FPCSR=1 HOSTCC=gcc
  ```

-* On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler:
-  ```sh
-  make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu-   NO_LAPACKE=1 NO_SHARED=1 BINARY=32
-  ```
+You can find instructions for other cases both in the "Supported Systems" section below and in the docs folder. The .yml scripts included with the sources (which contain the
+build scripts for the "continuous integration" (CI) build tests automatically run on every proposed change to the sources) may also provide additional hints.
+
+When compiling for a more modern CPU TARGET of the same architecture, e.g. TARGET=SKYLAKEX on a HASWELL host, option "CROSS=1" can be used to suppress the automatic invocation of the tests at the end of the build.

 ### Debug version

@ -188,7 +184,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th

 - **AIX**: Dynamic architecture with OpenXL and OpenMP.
  ```sh
-  make CC=ibm-clang_r FC=xlf TARGET=POWER7 BINARY=64 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 USE_THREAD=1
+  make CC=ibm-clang_r FC=xlf_r TARGET=POWER7 BINARY=64 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 USE_THREAD=1
  ```

 #### IBM zEnterprise System
@ -219,6 +215,26 @@ e.g.:
    HOSTCC=gcc HOSTFC=gfortran -j
  ```

+#### LOONGARCH64
+
+- **LA64_GENERIC**: Optimized Level-3, Level-2 and Level-1 BLAS with scalar instructions
+  ```sh
+  make HOSTCC=gcc TARGET=LA64_GENERIC CC=loongarch64-unknown-linux-gnu-gcc FC=loongarch64-unknown-linux-gnu-gfortran USE_SIMPLE_THREADED_LEVEL3=1
+  ```
+  The old-style TARGET=LOONGSONGENERIC is still supported
+
+- **LA264**: Optimized Level-3, Level-2 and Level-1 BLAS with LSX instructions
+  ```sh
+  make HOSTCC=gcc TARGET=LA264 CC=loongarch64-unknown-linux-gnu-gcc FC=loongarch64-unknown-linux-gnu-gfortran USE_SIMPLE_THREADED_LEVEL3=1
+  ```
+  The old-style TARGET=LOONGSON2K1000 is still supported
+
+- **LA464**: Optimized Level-3, Level-2 and Level-1 BLAS with LASX instructions
+  ```sh
+  make HOSTCC=gcc TARGET=LA464 CC=loongarch64-unknown-linux-gnu-gcc FC=loongarch64-unknown-linux-gnu-gfortran USE_SIMPLE_THREADED_LEVEL3=1
+  ```
+  The old-style TARGET=LOONGSON3R5 is still supported
+
 ### Support for multiple targets in a single library

 OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
@ -234,8 +250,14 @@ For **POWER**, the list encompasses POWER6, POWER8 and POWER9. POWER10 is additi

 on **ZARCH** it comprises Z13 and Z14 as well as generic zarch support.

-The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the
-common code in the library, usually you will want to set this to the oldest model you expect to encounter.
+On **riscv64**, DYNAMIC_ARCH enables support for riscv64_zvl128b and riscv64_zvl256b in addition to generic riscv64 support.  A compiler that supports RVV 1.0 is required to build OpenBLAS for riscv64 when DYNAMIC_ARCH is enabled.
+
+On **LoongArch64**, it comprises LA264 and LA464 as well as generic LoongArch64 support.
+
+The `TARGET` option can - and usually **should** - be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the common code in the library; usually you will want to set this to the oldest model you expect to encounter.
+Failure to specify this may lead to advanced instructions being used by the compiler, just because the build host happens to support them. This is most likely to happen when aggressive optimization options are in effect, and the resulting library may then crash with an
+illegal instruction error on weaker hardware, before it even reaches the BLAS routines specifically included for that cpu.
+
 Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library.

 ### Supported OS
@ -289,20 +311,21 @@ If you compile this library with `USE_OPENMP=1`, you should use the above functi

 ## Reporting bugs

-Please submit an issue in https://github.com/xianyi/OpenBLAS/issues.
+Please submit an issue in https://github.com/OpenMathLib/OpenBLAS/issues.

 ## Contact

+* Use GitHub discussions: https://github.com/OpenMathLib/OpenBLAS/discussions
 * OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users
 * OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev

 ## Change log

-Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 1.13 BSD version.
+Please see Changelog.txt.

 ## Troubleshooting

-* Please read the [FAQ](https://github.com/xianyi/OpenBLAS/wiki/Faq) first.
+* Please read the [FAQ](https://github.com/OpenMathLib/OpenBLAS/blob/develop/docs/faq.md) in the docs folder first.
 * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
 * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
  Clang 3.0 will generate the wrong AVX binary code.
@ -319,9 +342,9 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2

 ## Contributing

-1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue
+1. [Check for open issues](https://github.com/OpenMathLib/OpenBLAS/issues) or open a fresh issue
   to start a discussion around a feature idea or a bug.
-2. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
+2. Fork the [OpenBLAS](https://github.com/OpenMathLib/OpenBLAS) repository to start making your changes.
 3. Write a test which shows that the bug was fixed or that the feature works as expected.
 4. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.

--- a/TargetList.txt
+++ b/TargetList.txt
@ -126,9 +126,17 @@ x280
 RISCV64_ZVL256B

 11.LOONGARCH64:
+// LOONGSONGENERIC/LOONGSON2K1000/LOONGSON3R5 are legacy names,
+// and it is recommended to use the more standardized naming conventions
+// LA64_GENERIC/LA264/LA464. You can still specify TARGET as
+// LOONGSONGENERIC/LOONGSON2K1000/LOONGSON3R5 during compilation or runtime,
+// and they will be internally relocated to LA64_GENERIC/LA264/LA464.
 LOONGSONGENERIC
-LOONGSON3R5
 LOONGSON2K1000
+LOONGSON3R5
+LA64_GENERIC
+LA264
+LA464

 12. Elbrus E2000:
 E2K
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@ -133,29 +133,29 @@ jobs:
      mkdir build 
      cd build
      call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
-      cmake -G "Ninja" -DCMAKE_C_COMPILER=cl -DCMAKE_Fortran_COMPILER=flang -DC_LAPACK=1 -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
+      cmake -G "Ninja" -DCMAKE_C_COMPILER=cl -DCMAKE_Fortran_COMPILER=flang-new -DC_LAPACK=1 -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
      cmake --build . --config Release
      ctest
-
+      ctest --rerun-failed --output-on-failure


 - job: OSX_OpenMP
  pool:
-     vmImage: 'macOS-11'
+     vmImage: 'macOS-12'
  steps:   
  - script: |
      brew update
-      make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10
-      make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 PREFIX=../blasinst install
+      make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-13 FC=gfortran-13
+      make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-13 FC=gfortran-13 PREFIX=../blasinst install
      ls -lR ../blasinst
     
 - job: OSX_GCC_Nothreads
  pool:
-     vmImage: 'macOS-11'
+     vmImage: 'macOS-12'
  steps:   
  - script: |
      brew update
-      make USE_THREADS=0 CC=gcc-10 FC=gfortran-10
+      make USE_THREADS=0 CC=gcc-13 FC=gfortran-13
     
 - job: OSX_GCC12
  pool:
@ -195,7 +195,7 @@ jobs:
      
 - job: OSX_dynarch_cmake
  pool:
-     vmImage: 'macOS-11'
+     vmImage: 'macOS-12'
  variables:
     LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
     LIBRARY_PATH: /usr/local/opt/llvm/lib
@ -203,7 +203,7 @@ jobs:
  - script: |
      mkdir build
      cd build
-      cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DDYNAMIC_LIST='NEHALEM HASWELL SKYLAKEX' -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON ..
+      cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DDYNAMIC_LIST='NEHALEM HASWELL SKYLAKEX' -DCMAKE_C_COMPILER=gcc-13 -DCMAKE_Fortran_COMPILER=gfortran-13 -DBUILD_SHARED_LIBS=ON ..
      cmake --build .
      ctest

@ -212,7 +212,7 @@ jobs:
     vmImage: 'macOS-latest'
  variables:
     LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
-     MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg
+     MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/edb4dc2f-266f-47f2-8d56-21bc7764e119/m_HPCKit_p_2023.2.0.49443.dmg
     LIBRARY_PATH: /usr/local/opt/llvm/lib
     MACOS_FORTRAN_COMPONENTS: intel.oneapi.mac.ifort-compiler
  steps:   
@ -242,7 +242,7 @@ jobs:
 
 - job: OSX_NDK_ARMV7
  pool:
-     vmImage: 'macOS-11'
+     vmImage: 'macOS-12'
  steps:   
  - script: | 
      brew update
@ -252,35 +252,35 @@ jobs:

 - job: OSX_IOS_ARMV8
  pool:
-     vmImage: 'macOS-11'
+     vmImage: 'macOS-12'
  variables:
-     CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
-     CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch arm64 -miphoneos-version-min=10.0
+     CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+     CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk -arch arm64 -miphoneos-version-min=10.0
  steps:
  - script: |
     make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1

 - job: OSX_IOS_ARMV7
  pool:
-     vmImage: 'macOS-11'
+     vmImage: 'macOS-12'
  variables:
-     CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
-     CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch armv7 -miphoneos-version-min=5.1
+     CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+     CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk -arch armv7 -miphoneos-version-min=5.1
  steps:
  - script: |
     make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1

 - job: OSX_xbuild_DYNAMIC_ARM64
   pool:
-     vmImage: 'macOS-11'
+     vmImage: 'macOS-12'
   variables:
-     CC: /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
-     CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX11.3.sdk -arch arm64
+     CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+     CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX13.1.sdk -arch arm64
   steps:
   - script: |
-     ls /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs
-     /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus
-     /Applications/Xcode_11.7.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version
+     ls /Applications/Xcode_14.2.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs
+     /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus
+     /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version
      make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1

 - job: ALPINE_MUSL
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@ -103,6 +103,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
       sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
       spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
       ssymm.goto dsymm.goto csymm.goto zsymm.goto \
+       somatcopy.goto domatcopy.goto comatcopy.goto zomatcopy.goto \
       saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS)

 acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
@ -276,6 +277,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
       samin.goto damin.goto camin.goto zamin.goto \
       smin.goto dmin.goto \
       saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \
+       somatcopy.goto domatcopy.goto comatcopy.goto zomatcopy.goto \
       snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS)

 acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
@ -2906,6 +2908,29 @@ dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME)
 dznrm2.atlas : dznrm2.$(SUFFIX)
 	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

+###################################################################################################
+
+############################################ SOMATCOPY ############################################
+somatcopy.goto : somatcopy.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+###################################################################################################
+
+############################################ DOMATCOPY ############################################
+domatcopy.goto : domatcopy.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+###################################################################################################
+
+############################################ COMATCOPY ############################################
+comatcopy.goto : comatcopy.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+###################################################################################################
+
+############################################ ZOMATCOPY ############################################
+zomatcopy.goto : zomatcopy.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

 ###################################################################################################

@ -3435,6 +3460,18 @@ scnrm2.$(SUFFIX) : nrm2.c
 dznrm2.$(SUFFIX) : nrm2.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

+somatcopy.$(SUFFIX) : omatcopy.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+domatcopy.$(SUFFIX) : omatcopy.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+comatcopy.$(SUFFIX) : omatcopy.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+zomatcopy.$(SUFFIX) : omatcopy.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+

 smallscaling: smallscaling.c ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread
@ -3442,4 +3479,4 @@ smallscaling: smallscaling.c ../$(LIBNAME)
 clean ::
 	@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling

-include $(TOPDIR)/Makefile.tail
+include $(TOPDIR)/Makefile.tail
--- a/benchmark/omatcopy.c
+++ b/benchmark/omatcopy.c
@ -0,0 +1,122 @@
+/***************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "bench.h"
+
+/* Drop any earlier definition so the variant chosen below is the one used. */
+#undef OMATCOPY
+
+/*
+ * Pick the routine to benchmark from the build flags:
+ *   COMPLEX undefined -> somatcopy (default) or domatcopy (with DOUBLE)
+ *   COMPLEX defined   -> comatcopy (default) or zomatcopy (with DOUBLE)
+ */
+#ifndef COMPLEX
+#ifdef DOUBLE
+#define OMATCOPY BLASFUNC(domatcopy)
+#else
+#define OMATCOPY BLASFUNC(somatcopy)
+#endif
+#else
+#ifdef DOUBLE
+#define OMATCOPY BLASFUNC(zomatcopy)
+#else
+#define OMATCOPY BLASFUNC(comatcopy)
+#endif
+#endif
+/*
+ * Benchmark driver for the ?omatcopy routine selected by the OMATCOPY macro.
+ *
+ * Usage: <prog> [from [to [step]]]
+ *   from, to, step - range of square matrix sizes to time (defaults 1, 200, 1);
+ *                    `to` is raised to `from` if smaller, and a non-positive
+ *                    `step` is replaced by 1.
+ * Environment:
+ *   OPENBLAS_TRANS - first character is passed as the TRANS argument (default 'N')
+ *   OPENBLAS_ORDER - first character is passed as the ORDER argument (default 'C')
+ *   OPENBLAS_LOOPS - number of calls timed per size (default 1; values < 1 become 1)
+ *
+ * For every size one line is written to stderr with the computed rate and the
+ * total time of all loops. Returns 0 on success; exits with status 1 when a
+ * buffer cannot be allocated.
+ */
+int main(int argc, char *argv[]){
+  FLOAT *a, *b;
+  FLOAT alpha[] = {1.0, 0.0};
+  char trans = 'N';
+  char order = 'C';
+  blasint crows, ccols, clda, cldb;
+  int loops = 1;
+  char *p;
+
+  int from =   1;
+  int to   = 200;
+  int step =   1;
+  int i, j;
+  size_t k, nelem;
+
+  double time1, timeg;
+
+  argc--;argv++;
+
+  if (argc > 0) { from = atol(*argv);            argc--; argv++; }
+  if (argc > 0) { to   = MAX(atol(*argv), from); argc--; argv++; }
+  if (argc > 0) { step = atol(*argv);            argc--; argv++; }
+
+  /* A zero or negative step would never move the size loop towards `to`. */
+  if (step < 1) step = 1;
+
+  if ((p = getenv("OPENBLAS_TRANS"))) {
+    trans=*p;
+  }
+  if ((p = getenv("OPENBLAS_ORDER"))) {
+    order=*p;
+  }
+  TOUPPER(trans);
+  TOUPPER(order);
+  fprintf(stderr, "From : %3d  To : %3d Step=%d : Trans=%c : Order=%c\n", from, to, step, trans, order);
+  p = getenv("OPENBLAS_LOOPS");
+  if ( p != NULL ) {
+    loops = atoi(p);
+  }
+  /* loops is the divisor of the per-call time below; keep it positive. */
+  if (loops < 1) loops = 1;
+
+  /* Element count of one to-by-to matrix, computed in size_t so that large
+     sizes do not overflow int arithmetic in the fill loops. */
+  nelem = (size_t)to * (size_t)to * COMPSIZE;
+
+  if (( a = (FLOAT *)malloc(sizeof(FLOAT) * nelem)) == NULL) {
+    fprintf(stderr,"Out of Memory!!\n");exit(1);
+  }
+  if (( b = (FLOAT *)malloc(sizeof(FLOAT) * nelem)) == NULL) {
+    fprintf(stderr,"Out of Memory!!\n");exit(1);
+  }
+
+#ifdef __linux
+  srandom(getpid());
+#endif
+
+  /* Fill both buffers with values in [-0.5, 0.5]. */
+  for (k = 0; k < nelem; k++) {
+    a[k] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+  }
+  for (k = 0; k < nelem; k++) {
+    b[k] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+  }
+
+  fprintf(stderr, "          SIZE                   Flops             Time\n");
+  for (i = from; i <= to; i += step) {
+    /* Square problem: rows, columns and both leading dimensions equal i. */
+    cldb = clda = crows = ccols = i;
+    fprintf(stderr, " ROWS=%4d, COLS=%4d : ", (int)crows, (int)ccols);
+    begin();
+
+    for (j=0; j<loops; j++) {
+      OMATCOPY (&order, &trans, &crows, &ccols, alpha, a, &clda, b, &cldb);
+    }
+
+    end();
+    time1 = getsec();
+
+    /* Rate uses the average time per call; the printed time is the total. */
+    timeg = time1/loops;
+    fprintf(stderr,
+            " %10.2f MFlops %10.6f sec\n",
+            COMPSIZE * COMPSIZE * (double)ccols * (double)crows / timeg * 1.e-6, time1);
+  }
+
+  free(a);
+  free(b);
+
+  return 0;
+}
--- a/benchmark/pybench/README.md
+++ b/benchmark/pybench/README.md
@ -0,0 +1,49 @@
+# Continuous benchmarking of OpenBLAS performance
+
+We run a set of benchmarks covering a subset of OpenBLAS functionality.
+
+## Benchmark runner
+
+[![CodSpeed Badge](https://img.shields.io/endpoint?url=https://codspeed.io/badge.json)](https://codspeed.io/OpenMathLib/OpenBLAS/)
+
+Click on [benchmarks](https://codspeed.io/OpenMathLib/OpenBLAS/benchmarks) to see the performance of a particular benchmark over time;
+Click on [branches](https://codspeed.io/OpenMathLib/OpenBLAS/branches/) and then on the last PR link to see the flamegraphs.
+
+## What are the benchmarks
+
+We run raw BLAS/LAPACK subroutines, via f2py-generated python wrappers. The wrappers themselves are equivalent to [those from SciPy](https://docs.scipy.org/doc/scipy/reference/linalg.lapack.html).
+In fact, the wrappers _are_ from SciPy, we take a small subset simply to avoid having to build the whole SciPy for each CI run.
+
+
+## Adding a new benchmark
+
+`.github/workflows/codspeed-bench.yml` does all the orchestration on CI.
+
+Benchmarks live in the `benchmark/pybench` directory. It is organized as follows:
+
+- benchmarks themselves live in the `benchmarks` folder. Note that the LAPACK routines are imported from the `openblas_wrap` package.
+- the `openblas_wrap` package is a simple trampoline: it contains an f2py extension, `_flapack`, which talks to OpenBLAS, and exports the python names in its `__init__.py`.
+This way, the `openblas_wrap` package shields the benchmarks from the details of where a particular LAPACK function comes from. If desired, you can for instance replace the `_flapack` extension with
+`scipy.linalg.blas` and `scipy.linalg.lapack`.
+
+To change parameters of an existing benchmark, edit python files in the `benchmark/pybench/benchmarks` directory.
+
+To add a benchmark for a new BLAS or LAPACK function, you need to:
+
+- add an f2py wrapper for the bare LAPACK function. You can simply copy a wrapper from SciPy (look for `*.pyf.src` files in https://github.com/scipy/scipy/tree/main/scipy/linalg)
+- add an import to `benchmark/pybench/openblas_wrap/__init__.py`
+
+
+## Running benchmarks locally
+
+This benchmarking layer is orchestrated from python, therefore you'll need
+everything it takes to build OpenBLAS from source, plus `python` and
+
+```
+$ python -mpip install numpy meson ninja pytest pytest-benchmark
+```
+
+The benchmark syntax is consistent with that of the `pytest-benchmark` framework. The incantation to run the suite locally is `$ pytest benchmark/pybench/benchmarks/bench_blas.py`.
+
+An ASV compatible benchmark suite is planned but currently not implemented.
+
--- a/benchmark/pybench/benchmarks/bench_blas.py
+++ b/benchmark/pybench/benchmarks/bench_blas.py
@ -0,0 +1,274 @@
+import pytest
+import numpy as np
+import openblas_wrap as ow
+
+# Maps the routine-name prefix ("variant") used to look up a wrapper to the
+# NumPy dtype of the arrays passed to it. 'dz' is the prefix of the complex
+# nrm2 variant benchmarked in test_nrm2, whose input is complex128.
+dtype_map = {
+    's': np.float32,
+    'd': np.float64,
+    'c': np.complex64,
+    'z': np.complex128,
+    'dz': np.complex128,
+}
+
+
+# ### BLAS level 1 ###
+
+# dnrm2
+
+dnrm2_sizes = [100, 1000]
+
+def run_dnrm2(n, x, incx, func):
+    res = func(x, n, incx=incx)
+    return res
+
+
+@pytest.mark.parametrize('variant', ['d', 'dz'])
+@pytest.mark.parametrize('n', dnrm2_sizes)
+def test_nrm2(benchmark, n, variant):
+    rndm = np.random.RandomState(1234)
+    dtyp = dtype_map[variant]
+
+    x = np.array(rndm.uniform(size=(n,)), dtype=dtyp)
+    nrm2 = ow.get_func('nrm2', variant)
+    result = benchmark(run_dnrm2, n, x, 1, nrm2)
+
+
+# ddot
+
+ddot_sizes = [100, 1000]
+
+def run_ddot(x, y, func):
+    res = func(x, y)
+    return res
+
+
+@pytest.mark.parametrize('n', ddot_sizes)
+def test_dot(benchmark, n):
+    rndm = np.random.RandomState(1234)
+
+    x = np.array(rndm.uniform(size=(n,)), dtype=float)
+    y = np.array(rndm.uniform(size=(n,)), dtype=float)
+    dot = ow.get_func('dot', 'd')
+    result = benchmark(run_ddot, x, y, dot)
+
+
+# daxpy
+
+daxpy_sizes = [100, 1000]
+
+def run_daxpy(x, y, func):
+    res = func(x, y, a=2.0)
+    return res
+
+
+@pytest.mark.parametrize('variant', ['s', 'd', 'c', 'z'])
+@pytest.mark.parametrize('n', daxpy_sizes)
+def test_daxpy(benchmark, n, variant):
+    rndm = np.random.RandomState(1234)
+    dtyp = dtype_map[variant]
+
+    x = np.array(rndm.uniform(size=(n,)), dtype=dtyp)
+    y = np.array(rndm.uniform(size=(n,)), dtype=dtyp)
+    axpy = ow.get_func('axpy', variant)
+    result = benchmark(run_daxpy, x, y, axpy)
+
+
+# ### BLAS level 2 ###
+
+gemv_sizes = [100, 1000]
+
+def run_gemv(a, x, y, func):
+    res = func(1.0, a, x, y=y, overwrite_y=True)
+    return res
+
+
+@pytest.mark.parametrize('variant', ['s', 'd', 'c', 'z'])
+@pytest.mark.parametrize('n', gemv_sizes)
+def test_dgemv(benchmark, n, variant):
+    rndm = np.random.RandomState(1234)
+    dtyp = dtype_map[variant]
+
+    x = np.array(rndm.uniform(size=(n,)), dtype=dtyp)
+    y = np.empty(n, dtype=dtyp)
+
+    a = np.array(rndm.uniform(size=(n,n)), dtype=dtyp)
+    x = np.array(rndm.uniform(size=(n,)), dtype=dtyp)
+    y = np.zeros(n, dtype=dtyp)
+
+    gemv = ow.get_func('gemv', variant)
+    result = benchmark(run_gemv, a, x, y, gemv)
+
+    assert result is y
+
+
+# dgbmv
+
+dgbmv_sizes = [100, 1000]
+
+def run_gbmv(m, n, kl, ku, a, x, y, func):
+    res = func(m, n, kl, ku, 1.0, a, x, y=y, overwrite_y=True)
+    return res
+
+
+
+@pytest.mark.parametrize('variant', ['s', 'd', 'c', 'z'])
+@pytest.mark.parametrize('n', dgbmv_sizes)
+@pytest.mark.parametrize('kl', [1])
+def test_dgbmv(benchmark, n, kl, variant):
+    rndm = np.random.RandomState(1234)
+    dtyp = dtype_map[variant]
+
+    x = np.array(rndm.uniform(size=(n,)), dtype=dtyp)
+    y = np.empty(n, dtype=dtyp)
+
+    m = n
+
+    a = rndm.uniform(size=(2*kl + 1, n))
+    a = np.array(a, dtype=dtyp, order='F')
+
+    gbmv = ow.get_func('gbmv', variant)
+    result = benchmark(run_gbmv, m, n, kl, kl, a, x, y, gbmv)
+    assert result is y
+
+
+# ### BLAS level 3 ###
+
+# dgemm
+
+gemm_sizes = [100, 1000]
+
+def run_gemm(a, b, c, func):
+    alpha = 1.0
+    res = func(alpha, a, b, c=c, overwrite_c=True)
+    return res
+
+
+@pytest.mark.parametrize('variant', ['s', 'd', 'c', 'z'])
+@pytest.mark.parametrize('n', gemm_sizes)
+def test_gemm(benchmark, n, variant):
+    rndm = np.random.RandomState(1234)
+    dtyp = dtype_map[variant]
+    a = np.array(rndm.uniform(size=(n, n)), dtype=dtyp, order='F')
+    b = np.array(rndm.uniform(size=(n, n)), dtype=dtyp, order='F')
+    c = np.empty((n, n), dtype=dtyp, order='F')
+    gemm = ow.get_func('gemm', variant)
+    result = benchmark(run_gemm, a, b, c, gemm)
+    assert result is c
+
+
+# dsyrk
+
+syrk_sizes = [100, 1000]
+
+
+def run_syrk(a, c, func):
+    res = func(1.0, a, c=c, overwrite_c=True)
+    return res
+
+
+@pytest.mark.parametrize('variant', ['s', 'd', 'c', 'z'])
+@pytest.mark.parametrize('n', syrk_sizes)
+def test_syrk(benchmark, n, variant):
+    rndm = np.random.RandomState(1234)
+    dtyp = dtype_map[variant]
+    a = np.array(rndm.uniform(size=(n, n)), dtype=dtyp, order='F')
+    c = np.empty((n, n), dtype=dtyp, order='F')
+    syrk = ow.get_func('syrk', variant)
+    result = benchmark(run_syrk, a, c, syrk)
+    assert result is c
+
+
+# ### LAPACK ###
+
+# linalg.solve
+
+gesv_sizes = [100, 1000]
+
+
+def run_gesv(a, b, func):
+    res = func(a, b, overwrite_a=True, overwrite_b=True)
+    return res
+
+
+@pytest.mark.parametrize('variant', ['s', 'd', 'c', 'z'])
+@pytest.mark.parametrize('n', gesv_sizes)
+def test_gesv(benchmark, n, variant):
+    rndm = np.random.RandomState(1234)
+    dtyp = dtype_map[variant]
+
+    a = (np.array(rndm.uniform(size=(n, n)), dtype=dtyp, order='F') +
+         np.eye(n, dtype=dtyp, order='F'))
+    b = np.array(rndm.uniform(size=(n, 1)), dtype=dtyp, order='F')
+    gesv = ow.get_func('gesv', variant)
+    lu, piv, x, info = benchmark(run_gesv, a, b, gesv)
+    assert lu is a
+    assert x is b
+    assert info == 0
+
+
+# linalg.svd
+
+gesdd_sizes = [(100, 5), (1000, 222)]
+
+
+def run_gesdd(a, lwork, func):
+    res = func(a, lwork=lwork, full_matrices=False, overwrite_a=False)
+    return res
+
+
+@pytest.mark.parametrize('variant', ['s', 'd'])
+@pytest.mark.parametrize('mn', gesdd_sizes)
+def test_gesdd(benchmark, mn, variant):
+    m, n = mn
+    rndm = np.random.RandomState(1234)
+    dtyp = dtype_map[variant]
+
+    a = np.array(rndm.uniform(size=(m, n)), dtype=dtyp, order='F')
+
+    gesdd_lwork = ow.get_func('gesdd_lwork', variant)
+
+    lwork, info = gesdd_lwork(m, n)
+    lwork = int(lwork)
+    assert info == 0
+
+    gesdd = ow.get_func('gesdd', variant)
+    u, s, vt, info = benchmark(run_gesdd, a, lwork, gesdd)
+
+    assert info == 0
+
+    atol = {'s': 1e-5, 'd': 1e-13}
+    np.testing.assert_allclose(u @ np.diag(s) @ vt, a, atol=atol[variant])
+
+
+# linalg.eigh
+
+syev_sizes = [50, 200]
+
+
+def run_syev(a, lwork, func):
+    res = func(a, lwork=lwork, overwrite_a=True)
+    return res
+
+
+@pytest.mark.parametrize('variant', ['s', 'd'])
+@pytest.mark.parametrize('n', syev_sizes)
+def test_syev(benchmark, n, variant):
+    rndm = np.random.RandomState(1234)
+    dtyp = dtype_map[variant]
+
+    a = rndm.uniform(size=(n, n))
+    a = np.asarray(a + a.T, dtype=dtyp, order='F')
+    a_ = a.copy()
+
+    dsyev_lwork = ow.get_func('syev_lwork', variant)
+    lwork, info = dsyev_lwork(n)
+    lwork = int(lwork)
+    assert info == 0
+
+    syev = ow.get_func('syev', variant)
+    w, v, info = benchmark(run_syev, a, lwork, syev)
+
+    assert info == 0
+    assert a is v  # overwrite_a=True
+
+
--- a/benchmark/pybench/meson.build
+++ b/benchmark/pybench/meson.build
@ -0,0 +1,48 @@
+#
+# Taken from SciPy (of course)
+#
+# Declare the project, the languages it is compiled with (C and Fortran) and
+# the default build options.
+project(
+  'openblas-wrap',
+  'c', 'fortran',
+  version: '0.1',
+  license: 'BSD-3',
+  meson_version: '>= 1.1.0',
+  default_options: [
+    'buildtype=debugoptimized',
+    'b_ndebug=if-release',
+    'c_std=c17',
+    'fortran_std=legacy',
+  ],
+)
+
+# Python installation (and its build dependency) the extension is built against.
+py3 = import('python').find_installation(pure: false)
+py3_dep = py3.dependency()
+
+cc = meson.get_compiler('c')
+
+# Of the warning-suppression flags listed here, keep only those the C compiler
+# accepts, and apply them to all C sources of the project.
+_global_c_args = cc.get_supported_arguments(
+  '-Wno-unused-but-set-variable',
+  '-Wno-unused-function',
+  '-Wno-conversion',
+  '-Wno-misleading-indentation',
+)
+add_project_arguments(_global_c_args, language : 'c')
+
+# We need -lm for all C code (assuming it uses math functions, which is safe to
+# assume for SciPy). For C++ it isn't needed, because libstdc++/libc++ is
+# guaranteed to depend on it. For Fortran code, Meson already adds `-lm`.
+m_dep = cc.find_library('m', required : false)
+if m_dep.found()
+  add_project_link_arguments('-lm', language : 'c')
+endif
+
+# Script that processes the f2py template files (`*.pyf.src` -> `*.pyf`).
+generate_f2pymod = find_program('openblas_wrap/generate_f2pymod.py')
+
+# OpenBLAS itself is mandatory and is located through pkg-config.
+openblas = dependency('openblas', method: 'pkg-config', required: true)
+openblas_dep = declare_dependency(
+  dependencies: openblas,
+  compile_args: []
+)
+
+
+# The extension module is defined in the openblas_wrap subdirectory.
+subdir('openblas_wrap')
--- a/benchmark/pybench/openblas_wrap/init.py
+++ b/benchmark/pybench/openblas_wrap/init.py
@ -0,0 +1,17 @@
+"""
+Trampoline to hide the LAPACK details (scipy.linalg.lapack or scipy_openblas32 or...)
+from benchmarking.
+"""
+
+__version__ = "0.1"  
+
+
+from . import _flapack
+
+PREFIX = ''
+
+
+def get_func(name, variant):
+    """get_func('gesv', 'c') -> cgesv etc."""
+    return getattr(_flapack, PREFIX + variant + name)
+
--- a/benchmark/pybench/openblas_wrap/blas_lapack.pyf.src
+++ b/benchmark/pybench/openblas_wrap/blas_lapack.pyf.src
@ -0,0 +1,417 @@
+!
+! Taken from scipy/linalg
+!
+! Shorthand notations
+!
+! <tchar=s,d,cs,zd>
+! <tchar2c=cs,zd>
+!
+! <prefix2=s,d>
+! <prefix2c=c,z>
+! <prefix3=s,sc>
+! <prefix4=d,dz>
+! <prefix6=s,d,c,z,c,z>
+!
+! <ftype2=real,double precision>
+! <ftype2c=complex,double complex>
+! <ftype3=real,complex>
+! <ftype4=double precision,double complex>
+! <ftypereal3=real,real>
+! <ftypereal4=double precision,double precision>
+! <ftype6=real,double precision,complex,double complex,\2,\3>
+! <ftype6creal=real,double precision,complex,double complex,\0,\1>
+!
+! <ctype2=float,double>
+! <ctype2c=complex_float,complex_double>
+! <ctype3=float,complex_float>
+! <ctype4=double,complex_double>
+! <ctypereal3=float,float>
+! <ctypereal4=double,double>
+! <ctype6=float,double,complex_float,complex_double,\2,\3>
+! <ctype6creal=float,double,complex_float,complex_double,\0,\1>
+!
+!
+! Level 1 BLAS
+!
+
+
+python module _flapack
+    usercode '''
+#define F_INT int
+'''
+
+interface
+
+
+subroutine <prefix>axpy(n,a,x,offx,incx,y,offy,incy)
+  ! Calculate z = a*x+y, where a is scalar.
+  !
+  ! y is updated and handed back to Python under the name z; a defaults to 1.
+
+  callstatement (*f2py_func)(&n,&a,x+offx,&incx,y+offy,&incy)
+  callprotoargument F_INT*,<ctype>*,<ctype>*,F_INT*,<ctype>*,F_INT*
+
+  <ftype> dimension(*), intent(in) :: x
+  <ftype> dimension(*), intent(in,out,out=z) :: y
+  <ftype> optional, intent(in):: a=<1.0,\0,(1.0\,0.0),\2>
+  ! Strides may be negative but not zero.
+  integer optional, intent(in),check(incx>0||incx<0) :: incx = 1
+  integer optional, intent(in),check(incy>0||incy<0) :: incy = 1
+  ! offx/offy are element offsets added to the array pointers in the call above.
+  integer optional, intent(in),depend(x) :: offx=0
+  integer optional, intent(in),depend(y) :: offy=0
+  check(offx>=0 && offx<len(x)) :: offx
+  check(offy>=0 && offy<len(y)) :: offy
+  ! n defaults to the number of strided elements of x after offx; the checks
+  ! make sure the last element accessed in x and y lies inside the arrays.
+  integer optional, intent(in),depend(x,incx,offx,y,incy,offy) :: &
+       n = (len(x)-offx)/abs(incx)
+  check(len(x)-offx>(n-1)*abs(incx)) :: n
+  check(len(y)-offy>(n-1)*abs(incy)) :: n
+
+end subroutine <prefix>axpy
+
+function ddot(n,x,offx,incx,y,offy,incy) result (xy)
+  ! Computes a vector-vector dot product.
+  !
+  ! Only the double-precision real variant is wrapped; the result is returned
+  ! through the function value (see the callstatement below).
+
+  callstatement ddot_return_value = (*f2py_func)(&n,x+offx,&incx,y+offy,&incy)
+  callprotoargument F_INT*,double*,F_INT*,double*,F_INT*
+  intent(c) ddot
+  fortranname F_FUNC(ddot,DDOT)
+
+  double precision dimension(*), intent(in) :: x
+  double precision dimension(*), intent(in) :: y
+  double precision ddot,xy
+  ! Strides may be negative but not zero.
+  integer optional, intent(in),check(incx>0||incx<0) :: incx = 1
+  integer optional, intent(in),check(incy>0||incy<0) :: incy = 1
+  ! offx/offy are element offsets added to the array pointers in the call above.
+  integer optional, intent(in),depend(x) :: offx=0
+  integer optional, intent(in),depend(y) :: offy=0
+  check(offx>=0 && offx<len(x)) :: offx
+  check(offy>=0 && offy<len(y)) :: offy
+  ! n defaults to the number of strided elements of x after offx; the checks
+  ! make sure the last element accessed in x and y lies inside the arrays.
+  integer optional, intent(in),depend(x,incx,offx,y,incy,offy) :: &
+       n = (len(x)-offx)/abs(incx)
+  check(len(x)-offx>(n-1)*abs(incx)) :: n
+  check(len(y)-offy>(n-1)*abs(incy)) :: n
+
+end function ddot
+
+
+function <prefix4>nrm2(n,x,offx,incx) result(n2)
+  ! nrm2 wrapper, expanded for <prefix4> = d, dz: the input x is double
+  ! precision or double complex, the result n2 is double precision in both cases.
+
+  <ftypereal4> <prefix4>nrm2, n2
+
+  callstatement <prefix4>nrm2_return_value = (*f2py_func)(&n,x+offx,&incx)
+  callprotoargument F_INT*,<ctype4>*,F_INT*
+  intent(c) <prefix4>nrm2
+  fortranname F_FUNC(<prefix4>nrm2,<D,DZ>NRM2)
+
+  <ftype4> dimension(*),intent(in) :: x
+
+  ! Unlike axpy/ddot above, only positive strides are accepted here.
+  integer optional, intent(in),check(incx>0) :: incx = 1
+
+  ! offx is an element offset added to the array pointer in the call above.
+  integer optional,intent(in),depend(x) :: offx=0
+  check(offx>=0 && offx<len(x)) :: offx
+
+  ! n defaults to the number of strided elements of x after offx.
+  integer optional,intent(in),depend(x,incx,offx) :: n = (len(x)-offx)/abs(incx)
+  check(len(x)-offx>(n-1)*abs(incx)) :: n
+
+end function <prefix4>nrm2
+
+
+!
+! Level 2 BLAS
+!
+
+
+subroutine <prefix>gemv(m,n,alpha,a,x,beta,y,offx,incx,offy,incy,trans,rows,cols,ly)
+  ! Computes a matrix-vector product using a general matrix
+  !
+  ! y = gemv(alpha,a,x,beta=0,y=0,offx=0,incx=1,offy=0,incy=1,trans=0)
+  ! Calculate y <- alpha * op(A) * x + beta * y
+
+  ! trans = 0, 1, 2 is translated to "N", "T", "C"; the leading dimension of a
+  ! is passed as m.
+  callstatement (*f2py_func)((trans?(trans==2?"C":"T"):"N"),&m,&n,&alpha,a,&m, &
+       x+offx,&incx,&beta,y+offy,&incy)
+  callprotoargument char*,F_INT*,F_INT*,<ctype>*,<ctype>*,F_INT*,<ctype>*,F_INT*,<ctype>*, &
+       <ctype>*,F_INT*
+
+  integer optional, intent(in), check(trans>=0 && trans <=2) :: trans = 0
+  integer optional, intent(in), check(incx>0||incx<0) :: incx = 1
+  integer optional, intent(in), check(incy>0||incy<0) :: incy = 1
+  <ftype> intent(in) :: alpha
+  <ftype> intent(in), optional :: beta = <0.0,\0,(0.0\,0.0),\2>
+
+  <ftype> dimension(*), intent(in) :: x
+  <ftype> dimension(ly), intent(in,copy,out), depend(ly),optional :: y
+  ! When y is not supplied, ly is the length needed to hold `rows` strided
+  ! elements after offy; otherwise -1 is used.
+  integer intent(hide), depend(incy,rows,offy) :: ly = &
+       (y_capi==Py_None?1+offy+(rows-1)*abs(incy):-1)
+  <ftype> dimension(m,n), intent(in) :: a
+  integer depend(a), intent(hide):: m = shape(a,0)
+  integer depend(a), intent(hide):: n = shape(a,1)
+
+  integer optional, intent(in) :: offx=0
+  integer optional, intent(in) :: offy=0
+  check(offx>=0 && offx<len(x)) :: x
+  check(len(x)>offx+(cols-1)*abs(incx)) :: x
+  depend(offx,cols,incx) :: x
+
+  check(offy>=0 && offy<len(y)) :: y
+  check(len(y)>offy+(rows-1)*abs(incy)) :: y
+  depend(offy,rows,incy) :: y
+
+  ! Lengths of the result (rows) and of the input vector (cols) of op(A).
+  integer depend(m,n,trans), intent(hide) :: rows = (trans?n:m)
+  integer depend(m,n,trans), intent(hide) :: cols = (trans?m:n)
+
+end subroutine <prefix>gemv
+
+
+subroutine <prefix>gbmv(m,n,kl,ku,alpha,a,lda,x,incx,offx,beta,y,incy,offy,trans,ly)
+  ! Performs one of the matrix-vector operations
+  !
+  !    y := alpha*A*x + beta*y,   or   y := alpha*A**T*x + beta*y,
+  !                               or   y := alpha*A**H*x + beta*y,
+  !
+  ! where alpha and beta are scalars, x and y are vectors and A is an
+  ! m by n band matrix, with kl sub-diagonals and ku super-diagonals.
+  !
+  ! The updated y is handed back to Python under the name yout.
+
+  ! trans = 0, 1, 2 is translated to "N", "T", "C".
+  callstatement (*f2py_func)((trans?(trans==2?"C":"T"):"N"),&m,&n,&kl,&ku,&alpha,a,&lda,x+offx,&incx,&beta,y+offy,&incy)
+  callprotoargument char*,F_INT*,F_INT*,F_INT*,F_INT*,<ctype>*,<ctype>*,F_INT*,<ctype>*,F_INT*,<ctype>*,<ctype>*,F_INT*
+
+  integer optional,intent(in),check(trans>=0 && trans <=2) :: trans = 0
+  integer intent(in), depend(ku,kl),check(m>=ku+kl+1) :: m
+  integer intent(in),check(n>=0&&n==shape(a,1)),depend(a) :: n
+  integer intent(in),check(kl>=0) :: kl
+  integer intent(in),check(ku>=0) :: ku
+  ! The leading dimension is taken from the band array itself.
+  integer intent(hide),depend(a) :: lda = MAX(shape(a,0),1)
+  integer optional, intent(in),check(incx>0||incx<0) :: incx = 1
+  integer optional, intent(in),check(incy>0||incy<0) :: incy = 1
+  ! When y is not supplied, ly is the length needed for the result vector
+  ! (m elements for trans=0, n otherwise) after offy; otherwise -1 is used.
+  integer intent(hide),depend(m,n,incy,offy,trans) :: ly = &
+      (y_capi==Py_None?1+offy+(trans==0?m-1:n-1)*abs(incy):-1)
+  integer optional, intent(in) :: offx=0
+  integer optional, intent(in) :: offy=0
+
+  <ftype> intent(in) :: alpha
+  <ftype> intent(in),optional :: beta = <0.0,\0,(0.0\,0.0),\2>
+
+  <ftype> dimension(lda,n),intent(in) :: a
+
+  <ftype> dimension(ly), intent(in,out,copy,out=yout),depend(ly),optional :: y
+  check(offy>=0 && offy<len(y)) :: y
+  check(len(y)>offy+(trans==0?m-1:n-1)*abs(incy)) :: y
+  depend(offy,n,incy) :: y
+
+  <ftype> dimension(*), intent(in) :: x
+  check(offx>=0 && offx<len(x)) :: x
+  check(len(x)>offx+(trans==0?n-1:m-1)*abs(incx)) :: x
+  depend(offx,n,incx) :: x
+
+end subroutine <prefix>gbmv
+
+
+
+!
+! Level 3 BLAS
+!
+
+
+subroutine <prefix>gemm(m,n,k,alpha,a,b,beta,c,trans_a,trans_b,lda,ka,ldb,kb)
+  ! Computes a scalar-matrix-matrix product and adds the result to a
+  ! scalar-matrix product.
+  !
+  ! c = gemm(alpha,a,b,beta=0,c=0,trans_a=0,trans_b=0,overwrite_c=0)
+  ! Calculate C <- alpha * op(A) * op(B) + beta * C
+
+  ! trans_a/trans_b = 0, 1, 2 are translated to "N", "T", "C"; the leading
+  ! dimension of c is passed as m.
+  callstatement (*f2py_func)((trans_a?(trans_a==2?"C":"T"):"N"), &
+       (trans_b?(trans_b==2?"C":"T"):"N"),&m,&n,&k,&alpha,a,&lda,b,&ldb,&beta,c,&m)
+  callprotoargument char*,char*,F_INT*,F_INT*,F_INT*,<ctype>*,<ctype>*,F_INT*,<ctype>*, &
+       F_INT*,<ctype>*,<ctype>*,F_INT*
+
+  integer optional,intent(in),check(trans_a>=0 && trans_a <=2) :: trans_a = 0
+  integer optional,intent(in),check(trans_b>=0 && trans_b <=2) :: trans_b = 0
+  <ftype> intent(in) :: alpha
+  <ftype> intent(in),optional :: beta = <0.0,\0,(0.0\,0.0),\2>
+
+  <ftype> dimension(lda,ka),intent(in) :: a
+  <ftype> dimension(ldb,kb),intent(in) :: b
+  <ftype> dimension(m,n),intent(in,out,copy),depend(m,n),optional :: c
+  check(shape(c,0)==m && shape(c,1)==n) :: c
+
+  ! Array extents are read from the arguments and hidden from the Python signature.
+  integer depend(a),intent(hide) :: lda = shape(a,0)
+  integer depend(a),intent(hide) :: ka = shape(a,1)
+  integer depend(b),intent(hide) :: ldb = shape(b,0)
+  integer depend(b),intent(hide) :: kb = shape(b,1)
+
+  ! Problem sizes follow from the shapes of op(A) and op(B); the check on n
+  ! enforces that the inner dimensions agree.
+  integer depend(a,trans_a,ka,lda),intent(hide):: m = (trans_a?ka:lda)
+  integer depend(a,trans_a,ka,lda),intent(hide):: k = (trans_a?lda:ka)
+  integer depend(b,trans_b,kb,ldb,k),intent(hide),check(trans_b?kb==k:ldb==k) :: &
+       n = (trans_b?ldb:kb)
+
+end subroutine <prefix>gemm
+
+
+subroutine <prefix6><sy,\0,\0,\0,he,he>rk(n,k,alpha,a,beta,c,trans,lower,lda,ka)
+  !  performs one of the symmetric rank k operations
+  !     C := alpha*A*A**T + beta*C,  or   C := alpha*A**T*A + beta*C,
+  !
+  ! c = syrk(alpha,a,beta=0,c=0,trans=0,lower=0,overwrite_c=0)
+  !
+  ! The template expands to ssyrk, dsyrk, csyrk, zsyrk, cherk and zherk.
+  !
+  ! lower selects "L"/"U"; trans = 0, 1, 2 is translated to "N", "T", "C"; the
+  ! leading dimension of c is passed as n.
+  callstatement (*f2py_func)((lower?"L":"U"), &
+        (trans?(trans==2?"C":"T"):"N"), &n,&k,&alpha,a,&lda,&beta,c,&n)
+  callprotoargument char*,char*,F_INT*,F_INT*,<ctype6>*,<ctype6>*,F_INT*,<ctype6>*, &
+        <ctype6>*,F_INT*
+
+  integer optional, intent(in),check(lower==0||lower==1) :: lower = 0
+  integer optional,intent(in),check(trans>=0 && trans <=2) :: trans = 0
+
+  <ftype6> intent(in) :: alpha
+  <ftype6> intent(in),optional :: beta = <0.0,\0,(0.0\,0.0),\2,\2,\2>
+
+  <ftype6> dimension(lda,ka),intent(in) :: a
+  <ftype6> dimension(n,n),intent(in,out,copy),depend(n),optional :: c
+  check(shape(c,0)==n && shape(c,1)==n) :: c
+
+  ! Array extents are read from a and hidden from the Python signature.
+  integer depend(a),intent(hide) :: lda = shape(a,0)
+  integer depend(a),intent(hide) :: ka = shape(a,1)
+
+  ! Order of C (n) and rank of the update (k) follow from the shape of op(A).
+  integer depend(a, trans, ka, lda), intent(hide) :: n = (trans ? ka : lda)
+  integer depend(a, trans, ka, lda), intent(hide) :: k = (trans ? lda : ka)
+
+end subroutine <prefix6><sy,\0,\0,\0,he,he>rk
+
+
+!
+! LAPACK
+!
+
+subroutine <prefix>gesv(n,nrhs,a,piv,b,info)
+    ! lu,piv,x,info = gesv(a,b,overwrite_a=0,overwrite_b=0)
+    ! Solve A * X = B.
+    ! A = P * L * U
+    ! U is upper diagonal triangular, L is unit lower triangular,
+    ! piv pivots columns.
+
+    ! After the call every entry of piv is decremented by one (presumably to
+    ! turn the routine's 1-based pivot indices into 0-based ones for Python).
+    callstatement {F_INT i;(*f2py_func)(&n,&nrhs,a,&n,piv,b,&n,&info);for(i=0;i\<n;--piv[i++]);}
+    callprotoargument F_INT*,F_INT*,<ctype>*,F_INT*,F_INT*,<ctype>*,F_INT*,F_INT*
+
+    ! Sizes are read from the arrays and hidden from the Python signature.
+    integer depend(a),intent(hide):: n = shape(a,0)
+    integer depend(b),intent(hide):: nrhs = shape(b,1)
+    <ftype> dimension(n,n),check(shape(a,0)==shape(a,1)) :: a
+    integer dimension(n),depend(n),intent(out) :: piv
+    <ftype> dimension(n,nrhs),check(shape(a,0)==shape(b,0)),depend(n) :: b
+    integer intent(out)::info
+    ! b is returned as x and a as lu.
+    intent(in,out,copy,out=x) b
+    intent(in,out,copy,out=lu) a
+end subroutine <prefix>gesv
+
+
+subroutine <prefix2>gesdd(m,n,minmn,u0,u1,vt0,vt1,a,compute_uv,full_matrices,u,s,vt,work,lwork,iwork,info)
+    ! u,s,vt,info = gesdd(a,compute_uv=1,lwork=..,overwrite_a=0)
+    ! Compute the singular value decomposition (SVD) using divide and conquer:
+    !   A = U * SIGMA * transpose(V)
+    ! A - M x N matrix
+    ! U - M x M matrix or M x min(M,N) if full_matrices=False
+    ! SIGMA - M x N zero matrix with a main diagonal filled with min(M,N)
+    !               singular values
+    ! transpose(V) - N x N matrix or min(M,N) x N if full_matrices=False
+
+    ! The job character is "A" (full), "S" (reduced) or "N" (no vectors),
+    ! selected from compute_uv and full_matrices.
+    callstatement (*f2py_func)((compute_uv?(full_matrices?"A":"S"):"N"),&m,&n,a,&m,s,u,&u0,vt,&vt0,work,&lwork,iwork,&info)
+    callprotoargument char*,F_INT*,F_INT*,<ctype2>*,F_INT*,<ctype2>*,<ctype2>*,F_INT*,<ctype2>*,F_INT*,<ctype2>*,F_INT*,F_INT*,F_INT*
+
+    integer intent(in),optional,check(compute_uv==0||compute_uv==1):: compute_uv = 1
+    integer intent(in),optional,check(full_matrices==0||full_matrices==1):: full_matrices = 1
+    integer intent(hide),depend(a):: m = shape(a,0)
+    integer intent(hide),depend(a):: n = shape(a,1)
+    integer intent(hide),depend(m,n):: minmn = MIN(m,n)
+    ! (u0,u1) and (vt0,vt1) are the shapes of the returned u and vt; they
+    ! collapse to 1 x 1 when compute_uv is 0.
+    integer intent(hide),depend(compute_uv,minmn) :: u0 = (compute_uv?m:1)
+    integer intent(hide),depend(compute_uv,minmn, full_matrices) :: u1 = (compute_uv?(full_matrices?m:minmn):1)
+    integer intent(hide),depend(compute_uv,minmn, full_matrices) :: vt0 = (compute_uv?(full_matrices?n:minmn):1)
+    integer intent(hide),depend(compute_uv,minmn) :: vt1 = (compute_uv?n:1)
+    <ftype2> dimension(m,n),intent(in,copy,aligned8) :: a
+    <ftype2> dimension(minmn),intent(out),depend(minmn) :: s
+    <ftype2> dimension(u0,u1),intent(out),depend(u0, u1) :: u
+    <ftype2> dimension(vt0,vt1),intent(out),depend(vt0, vt1) :: vt
+    <ftype2> dimension(lwork),intent(hide,cache),depend(lwork) :: work
+    ! Default workspace size used when the caller does not pass lwork.
+    integer optional,intent(in),depend(minmn,compute_uv) &
+        :: lwork = max((compute_uv?4*minmn*minmn+MAX(m,n)+9*minmn:MAX(14*minmn+4,10*minmn+2+25*(25+8))+MAX(m,n)),1)
+    integer intent(hide,cache),dimension(8*minmn),depend(minmn) :: iwork
+    integer intent(out)::info
+
+end subroutine <prefix2>gesdd
+
+subroutine <prefix2>gesdd_lwork(m,n,minmn,u0,vt0,a,compute_uv,full_matrices,u,s,vt,work,lwork,iwork,info)
+    ! LWORK computation for (S/D)GESDD
+    !
+    ! Calls the same gesdd routine with lwork fixed to -1 and scalar
+    ! placeholders for the arrays; only work and info are returned to Python.
+
+    fortranname <prefix2>gesdd
+    callstatement (*f2py_func)((compute_uv?(full_matrices?"A":"S"):"N"),&m,&n,&a,&m,&s,&u,&u0,&vt,&vt0,&work,&lwork,&iwork,&info)
+    callprotoargument char*,F_INT*,F_INT*,<ctype2>*,F_INT*,<ctype2>*,<ctype2>*,F_INT*,<ctype2>*,F_INT*,<ctype2>*,F_INT*,F_INT*,F_INT*
+
+    integer intent(in),optional,check(compute_uv==0||compute_uv==1):: compute_uv = 1
+    integer intent(in),optional,check(full_matrices==0||full_matrices==1):: full_matrices = 1
+    ! Here m and n are given explicitly instead of being read from an array.
+    integer intent(in) :: m
+    integer intent(in) :: n
+    integer intent(hide),depend(m,n):: minmn = MIN(m,n)
+    integer intent(hide),depend(compute_uv,minmn) :: u0 = (compute_uv?m:1)
+    integer intent(hide),depend(compute_uv,minmn, full_matrices) :: vt0 = (compute_uv?(full_matrices?n:minmn):1)
+    ! Scalar stand-ins for the matrix arguments; hidden from Python.
+    <ftype2> intent(hide) :: a
+    <ftype2> intent(hide) :: s
+    <ftype2> intent(hide) :: u
+    <ftype2> intent(hide) :: vt
+    <ftype2> intent(out) :: work
+    integer intent(hide) :: lwork = -1
+    integer intent(hide) :: iwork
+    integer intent(out) :: info
+
+end subroutine <prefix2>gesdd_lwork
+
+
+subroutine <prefix2>syev(compute_v,lower,n,w,a,lda,work,lwork,info)
+    ! w,v,info = syev(a,compute_v=1,lower=0,lwork=3*n-1,overwrite_a=0)
+    ! Compute all eigenvalues and, optionally, eigenvectors of a
+    ! real symmetric matrix A.
+    !
+    ! Performance tip:
+    !   If compute_v=0 then set also overwrite_a=1.
+
+    ! compute_v selects "V"/"N" and lower selects "L"/"U" in the call.
+    callstatement (*f2py_func)((compute_v?"V":"N"),(lower?"L":"U"),&n,a,&lda,w,work,&lwork,&info)
+    callprotoargument char*,char*,F_INT*,<ctype2>*,F_INT*,<ctype2>*,<ctype2>*,F_INT*,F_INT*
+
+    integer optional,intent(in):: compute_v = 1
+    check(compute_v==1||compute_v==0) compute_v
+    integer optional,intent(in),check(lower==0||lower==1) :: lower = 0
+
+    integer intent(hide),depend(a):: n = shape(a,0)
+    integer intent(hide),depend(a):: lda = MAX(1,shape(a,0))
+    <ftype2> dimension(n,n),check(shape(a,0)==shape(a,1)) :: a
+    ! a is handed back to Python under the name v.
+    intent(in,copy,out,out=v) :: a
+
+    <ftype2> dimension(n),intent(out),depend(n) :: w
+
+    ! lwork defaults to max(3*n-1,1); smaller values are rejected by the check.
+    integer optional,intent(in),depend(n) :: lwork=max(3*n-1,1)
+    check(lwork>=3*n-1) :: lwork
+    <ftype2> dimension(lwork),intent(hide),depend(lwork) :: work
+
+    integer intent(out) :: info
+
+end subroutine <prefix2>syev
+
+
+subroutine <prefix2>syev_lwork(lower,n,w,a,lda,work,lwork,info)
+    ! LWORK routines for syev
+    !
+    ! Calls the same syev routine with job "N", lwork fixed to -1 and scalar
+    ! placeholders for a and w; only work and info are returned to Python.
+
+    fortranname <prefix2>syev
+
+    callstatement (*f2py_func)("N",(lower?"L":"U"),&n,&a,&lda,&w,&work,&lwork,&info)
+    callprotoargument char*,char*,F_INT*,<ctype2>*,F_INT*,<ctype2>*,<ctype2>*,F_INT*,F_INT*
+    
+     integer intent(in):: n
+     integer optional,intent(in),check(lower==0||lower==1) :: lower = 0
+     
+     ! Scalar stand-ins for the array arguments; hidden from Python.
+     integer intent(hide),depend(n):: lda = MAX(1, n)
+     <ftype2> intent(hide):: a
+     <ftype2> intent(hide):: w
+     integer intent(hide):: lwork = -1
+    
+     <ftype2> intent(out):: work
+     integer intent(out):: info
+     
+end subroutine <prefix2>syev_lwork
+
+end interface
+
+end python module _flapack
+
+
+
--- a/benchmark/pybench/openblas_wrap/generate_f2pymod.py
+++ b/benchmark/pybench/openblas_wrap/generate_f2pymod.py
@ -0,0 +1,299 @@
+#!/usr/bin/env python3
+"""
+Process f2py template files (`filename.pyf.src` -> `filename.pyf`)
+
+Usage: python generate_f2pymod.py filename.pyf.src -o outdir
+"""
+
+import os
+import sys
+import re
+import subprocess
+import argparse
+
+
+# START OF CODE VENDORED FROM `numpy.distutils.from_template`
+#############################################################
+"""
+process_file(filename)
+
+  takes templated file .xxx.src and produces .xxx file where .xxx
+  is .pyf .f90 or .f using the following template rules:
+
+  '<..>' denotes a template.
+
+  All function and subroutine blocks in a source file with names that
+  contain '<..>' will be replicated according to the rules in '<..>'.
+
+  The number of comma-separated words in '<..>' will determine the number of
+  replicates.
+
+  '<..>' may have two different forms, named and short. For example,
+
+  named:
+   <p=d,s,z,c> where anywhere inside a block '<p>' will be replaced with
+   'd', 's', 'z', and 'c' for each replicate of the block.
+
+   <_c>  is already defined: <_c=s,d,c,z>
+   <_t>  is already defined: <_t=real,double precision,complex,double complex>
+
+  short:
+   <s,d,c,z>, a short form of the named, useful when no <p> appears inside
+   a block.
+
+  In general, '<..>' contains a comma separated list of arbitrary
+  expressions. If these expressions must contain a comma|leftarrow|rightarrow,
+  then prepend the comma|leftarrow|rightarrow with a backslash.
+
+  If an expression matches '\\<index>' then it will be replaced
+  by <index>-th expression.
+
+  Note that all '<..>' forms in a block must have the same number of
+  comma-separated entries.
+
+ Predefined named template rules:
+  <prefix=s,d,c,z>
+  <ftype=real,double precision,complex,double complex>
+  <ftypereal=real,double precision,\\0,\\1>
+  <ctype=float,double,complex_float,complex_double>
+  <ctypereal=float,double,\\0,\\1>
+"""
+
+routine_start_re = re.compile(
+    r'(\n|\A)((     (\$|\*))|)\s*(subroutine|function)\b',
+    re.I
+)
+routine_end_re = re.compile(r'\n\s*end\s*(subroutine|function)\b.*(\n|\Z)', re.I)
+function_start_re = re.compile(r'\n     (\$|\*)\s*function\b', re.I)
+
+def parse_structure(astr):
+    """Locate every subroutine/function block in a template source string.
+
+    Parameters
+    ----------
+    astr : str
+        Full text of the template source.
+
+    Returns
+    -------
+    list of (int, int)
+        One ``(start, end)`` pair of offsets into `astr` per routine, in the
+        order the routines appear; ``astr[start:end]`` is the routine text
+        to be expanded.
+    """
+
+    spanlist = []
+    ind = 0  # offset from which the next routine start is searched
+    while True:
+        m = routine_start_re.search(astr, ind)
+        if m is None:
+            break
+        start = m.start()
+        if function_start_re.match(astr, start, m.end()):
+            # The `function` keyword sits on a line introduced by five blanks
+            # and `$` or `*`. Walk backwards one newline at a time, stopping
+            # at the first line that does not begin with five blanks and `$`
+            # (or when no earlier newline exists after `ind`).
+            while True:
+                i = astr.rfind('\n', ind, start)
+                if i==-1:
+                    break
+                start = i
+                if astr[i:i+7]!='\n     $':
+                    break
+        # Step past the character `start` points at (normally a newline).
+        # NOTE(review): when the start pattern matched via `\A` at offset 0
+        # there is no newline to skip - confirm this case is intended.
+        start += 1
+        m = routine_end_re.search(astr, m.end())
+        # The span stops one character before the end of the end-statement
+        # match, or at the end of the text when no end statement is found;
+        # the next search resumes from that same offset.
+        ind = end = m and m.end()-1 or len(astr)
+        spanlist.append((start, end))
+    return spanlist
+
+template_re = re.compile(r"<\s*(\w[\w\d]*)\s*>")
+named_re = re.compile(r"<\s*(\w[\w\d]*)\s*=\s*(.*?)\s*>")
+list_re = re.compile(r"<\s*((.*?))\s*>")
+
+def find_repl_patterns(astr):
+    reps = named_re.findall(astr)
+    names = {}
+    for rep in reps:
+        name = rep[0].strip() or unique_key(names)
+        repl = rep[1].replace(r'\,', '@comma@')
+        thelist = conv(repl)
+        names[name] = thelist
+    return names
+
+def find_and_remove_repl_patterns(astr):
+    names = find_repl_patterns(astr)
+    astr = re.subn(named_re, '', astr)[0]
+    return astr, names
+
+item_re = re.compile(r"\A\\(?P<index>\d+)\Z")
+def conv(astr):
+    b = astr.split(',')
+    l = [x.strip() for x in b]
+    for i in range(len(l)):
+        m = item_re.match(l[i])
+        if m:
+            j = int(m.group('index'))
+            l[i] = l[j]
+    return ','.join(l)
+
+def unique_key(adict):
+    """ Obtain a unique key given a dictionary."""
+    allkeys = list(adict.keys())
+    done = False
+    n = 1
+    while not done:
+        newkey = '__l%s' % (n)
+        if newkey in allkeys:
+            n += 1
+        else:
+            done = True
+    return newkey
+
+
+template_name_re = re.compile(r'\A\s*(\w[\w\d]*)\s*\Z')
+def expand_sub(substr, names):
+    """Replicate one routine block once per template replacement.
+
+    Parameters
+    ----------
+    substr : str
+        Text of a single subroutine/function block, possibly containing
+        ``<...>`` templates.
+    names : dict
+        Maps template names to comma-joined replacement lists.  Used as a
+        fallback for names not defined inside the block and updated in
+        place with names first resolved here (see below).
+
+    Returns
+    -------
+    str
+        The block repeated once per replacement, each copy followed by a
+        blank line.  When the block contains no template names, the
+        pre-processed block text is returned directly.
+
+    Raises
+    ------
+    ValueError
+        If a template name has a replacement list neither in the block nor
+        in `names`.
+    """
+    # Protect escaped angle brackets so the template regexes ignore them.
+    substr = substr.replace(r'\>', '@rightarrow@')
+    substr = substr.replace(r'\<', '@leftarrow@')
+    # Definitions local to this block take precedence over `names`.
+    lnames = find_repl_patterns(substr)
+    substr = named_re.sub(r"<\1>", substr)  # get rid of definition templates
+
+    def listrepl(mobj):
+        # Turn an anonymous list such as <s,d,c,z> into a named reference.
+        thelist = conv(mobj.group(1).replace(r'\,', '@comma@'))
+        if template_name_re.match(thelist):
+            # A single bare word is already a name reference; keep it.
+            return "<%s>" % (thelist)
+        name = None
+        for key in lnames.keys():    # see if list is already in dictionary
+            if lnames[key] == thelist:
+                name = key
+        if name is None:      # this list is not in the dictionary yet
+            name = unique_key(lnames)
+            lnames[name] = thelist
+        return "<%s>" % name
+
+    substr = list_re.sub(listrepl, substr) # convert all lists to named templates
+                                           # newnames are constructed as needed
+
+    numsubs = None    # number of replicates, fixed by the first rule found
+    base_rule = None  # name of the rule that fixed `numsubs`
+    rules = {}        # template name -> list of replacement strings
+    for r in template_re.findall(substr):
+        if r not in rules:
+            thelist = lnames.get(r, names.get(r, None))
+            if thelist is None:
+                raise ValueError('No replicates found for <%s>' % (r))
+            # Publish the rule to the caller's dict unless the name is
+            # already there or the list text starts with an underscore.
+            if r not in names and not thelist.startswith('_'):
+                names[r] = thelist
+            rule = [i.replace('@comma@', ',') for i in thelist.split(',')]
+            num = len(rule)
+
+            if numsubs is None:
+                numsubs = num
+                rules[r] = rule
+                base_rule = r
+            elif num == numsubs:
+                rules[r] = rule
+            else:
+                # A rule whose length disagrees with the base rule is
+                # reported and left out of `rules`.
+                print("Mismatch in number of replacements (base <{}={}>) "
+                      "for <{}={}>. Ignoring."
+                      .format(base_rule, ','.join(rules[base_rule]), r, thelist))
+    if not rules:
+        return substr
+
+    def namerepl(mobj):
+        # `k` is the replicate index of the loop below.  A name without a
+        # rule falls back to a list of itself, i.e. it is replaced by the
+        # bare name with the angle brackets dropped.
+        name = mobj.group(1)
+        return rules.get(name, (k+1)*[name])[k]
+
+    newstr = ''
+    for k in range(numsubs):
+        newstr += template_re.sub(namerepl, substr) + '\n\n'
+
+    # Restore the escaped angle brackets as literal characters.
+    newstr = newstr.replace('@rightarrow@', '>')
+    newstr = newstr.replace('@leftarrow@', '<')
+    return newstr
+
+def process_str(allstr):
+    newstr = allstr
+    writestr = ''
+
+    struct = parse_structure(newstr)
+
+    oldend = 0
+    names = {}
+    names.update(_special_names)
+    for sub in struct:
+        cleanedstr, defs = find_and_remove_repl_patterns(newstr[oldend:sub[0]])
+        writestr += cleanedstr
+        names.update(defs)
+        writestr += expand_sub(newstr[sub[0]:sub[1]], names)
+        oldend =  sub[1]
+    writestr += newstr[oldend:]
+
+    return writestr
+
+include_src_re = re.compile(
+    r"(\n|\A)\s*include\s*['\"](?P<name>[\w\d./\\]+\.src)['\"]",
+    re.I
+)
+
+def resolve_includes(source):
+    d = os.path.dirname(source)
+    with open(source) as fid:
+        lines = []
+        for line in fid:
+            m = include_src_re.match(line)
+            if m:
+                fn = m.group('name')
+                if not os.path.isabs(fn):
+                    fn = os.path.join(d, fn)
+                if os.path.isfile(fn):
+                    lines.extend(resolve_includes(fn))
+                else:
+                    lines.append(line)
+            else:
+                lines.append(line)
+    return lines
+
+def process_file(source):
+    lines = resolve_includes(source)
+    return process_str(''.join(lines))
+
+# Predefined template rules: process_str seeds its name table with these
+# before looking at any definition in the input.  The string is not raw, so
+# each `\\0` / `\\1` below is a single backslash followed by an index, which
+# conv() resolves to the entry at that position of the same list.
+_special_names = find_repl_patterns('''
+<_c=s,d,c,z>
+<_t=real,double precision,complex,double complex>
+<prefix=s,d,c,z>
+<ftype=real,double precision,complex,double complex>
+<ctype=float,double,complex_float,complex_double>
+<ftypereal=real,double precision,\\0,\\1>
+<ctypereal=float,double,\\0,\\1>
+''')
+
+# END OF CODE VENDORED FROM `numpy.distutils.from_template`
+###########################################################
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("infile", type=str,
+                        help="Path to the input file")
+    parser.add_argument("-o", "--outdir", type=str,
+                        help="Path to the output directory")
+    args = parser.parse_args()
+
+    if not args.infile.endswith(('.pyf', '.pyf.src', '.f.src')):
+        raise ValueError(f"Input file has unknown extension: {args.infile}")
+
+    outdir_abs = os.path.join(os.getcwd(), args.outdir)
+
+    # Write out the .pyf/.f file
+    if args.infile.endswith(('.pyf.src', '.f.src')):
+        code = process_file(args.infile)
+        fname_pyf = os.path.join(args.outdir,
+                                 os.path.splitext(os.path.split(args.infile)[1])[0])
+
+        with open(fname_pyf, 'w') as f:
+            f.write(code)
+    else:
+        fname_pyf = args.infile
+
+    # Now invoke f2py to generate the C API module file
+    if args.infile.endswith(('.pyf.src', '.pyf')):
+        p = subprocess.Popen([sys.executable, '-m', 'numpy.f2py', fname_pyf,
+                            '--build-dir', outdir_abs], #'--quiet'],
+                            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                            cwd=os.getcwd())
+        out, err = p.communicate()
+        if not (p.returncode == 0):
+            raise RuntimeError(f"Writing {args.outfile} with f2py failed!\n"
+                            f"{out}\n"
+                            r"{err}")
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmark/pybench/openblas_wrap/meson.build
+++ b/benchmark/pybench/openblas_wrap/meson.build
@ -0,0 +1,50 @@
+# find numpy & f2py includes
+# Both paths are obtained by asking the Python interpreter at configure time.
+# NOTE(review): the os.chdir("..") before the import presumably keeps the
+# current directory from shadowing the installed numpy - confirm.
+inc_numpy = run_command(py3,
+  ['-c', 'import os; os.chdir(".."); import numpy; print(numpy.get_include())'],
+  check : true
+).stdout().strip()
+
+inc_f2py = run_command(py3,
+    ['-c', 'import os; os.chdir(".."); import numpy.f2py; print(numpy.f2py.get_include())'],
+    check : true
+).stdout().strip()
+
+
+inc_np = include_directories(inc_numpy, inc_f2py)
+# fortranobject.c is taken from the f2py include directory found above.
+fortranobject_c = inc_f2py / 'fortranobject.c'
+
+
+# Compile fortranobject.c once into a static library and expose it as a
+# dependency, so the extension module below links it in.
+fortranobject_lib = static_library('_fortranobject',
+  fortranobject_c,
+#  c_args: numpy_nodepr_api,
+  dependencies: py3_dep,
+  include_directories: [inc_np, inc_f2py],
+  gnu_symbol_visibility: 'hidden',
+)
+fortranobject_dep = declare_dependency(
+  link_with: fortranobject_lib,
+  include_directories: [inc_np, inc_f2py],
+)
+
+
+# f2py generated wrappers
+
+# Run the generate_f2pymod program on the .pyf.src template; the declared
+# output is the C source of the wrapper module, written to this target's
+# output directory.
+flapack_module = custom_target('flapack_module',
+  output: ['_flapackmodule.c'],
+  input: 'blas_lapack.pyf.src',
+  command: [generate_f2pymod, '@INPUT@', '-o', '@OUTDIR@'],
+)
+
+# Build the generated C source into the _flapack extension module, linked
+# against OpenBLAS and the fortranobject helper library.
+py3.extension_module('_flapack',
+  flapack_module,
+  link_args: [],  # version_link_args,
+  dependencies: [openblas_dep, fortranobject_dep],
+  install: true,
+  subdir: 'openblas_wrap'
+)
+
+
+# Install the package's Python source next to the extension module.
+py3.install_sources(
+  ['__init__.py'],
+  subdir: 'openblas_wrap'
+)
--- a/benchmark/pybench/scipy_openblas.pc
+++ b/benchmark/pybench/scipy_openblas.pc
@ -0,0 +1,12 @@
+libdir=/home/br/repos/OpenBLAS/
+includedir=/home/br/repos/OpenBLAS/
+openblas_config= OpenBLAS 0.3.27 DYNAMIC_ARCH NO_AFFINITY Haswell MAX_THREADS=64
+version=0.3.27
+extralib=-lm -lpthread -lgfortran -lquadmath -L${libdir} -lopenblas
+Name: openblas
+Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
+Version: ${version}
+URL: https://github.com/xianyi/OpenBLAS
+Libs: -L${libdir} -lopenblas
+Libs.private: ${extralib}
+Cflags: -I${includedir}
--- a/18
+++ b/18
@ -197,10 +197,22 @@ fi
 no_lsx=0
 no_lasx=0
 if [ "$architecture" = "loongarch64" ]; then
+    lasx_flags='-march=loongarch64'
+    lsx_flags='-march=loongarch64'
+
    tmpd="$(mktemp -d)"
+    tmparch="$tmpd/arch.c"
+    printf "void main(void){ }\n" >> "$tmparch"
+    args="-march=loongarch64 -o $tmparch.o $tmparch"
+    {
+        $compiler_name $flags $args >/dev/null 2>&1
+    } || {
+        lasx_flags=''
+        lsx_flags=''
+    }
+
    tmplsx="$tmpd/lsx.c"
    codelsx='"vadd.b $vr0, $vr0, $vr0"'
-    lsx_flags='-march=loongarch64'
    printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx"
    args="$lsx_flags -o $tmplsx.o $tmplsx"
    {
@ -211,7 +223,6 @@ if [ "$architecture" = "loongarch64" ]; then

    tmplasx="$tmpd/lasx.c"
    codelasx='"xvadd.b $xr0, $xr0, $xr0"'
-    lasx_flags='-march=loongarch64'
    printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx"
    args="$lasx_flags -o $tmplasx.o $tmplasx"
    {
@ -345,6 +356,9 @@ if [ "$compiler" = "GCC" ]; then
        no_avx2=0
        oldgcc=0
        data=`$compiler_name -dumpversion`
+        case "$data" in *-*)
+            data="${data%-*}"
+        esac
        case "$data" in *.*.*)
            data="${data%.*}"
        esac
--- a/cblas.h
+++ b/cblas.h
@ -26,6 +26,11 @@ char* openblas_get_config(void);
 /*Get the CPU corename on runtime.*/
 char* openblas_get_corename(void);

+/*Set the threading backend to a custom callback.*/
+typedef void (*openblas_dojob_callback)(int thread_num, void *jobdata, int dojob_data);
+typedef void (*openblas_threads_callback)(int sync, openblas_dojob_callback dojob, int numjobs, size_t jobdata_elsize, void *jobdata, int dojob_data);
+void openblas_set_threads_callback_function(openblas_threads_callback callback);
+
 #ifdef OPENBLAS_OS_LINUX
 /* Sets thread affinity for OpenBLAS threads. `thread_idx` is in [0, openblas_get_num_threads()-1]. */
 int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set);
@ -402,15 +407,27 @@ void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum
 void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a, 
 		     OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); 

-void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta, 
+void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, OPENBLAS_CONST float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta, 
 		  float *c, OPENBLAS_CONST blasint cldc); 
-void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta, 
+void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta, 
 		  double *c, OPENBLAS_CONST blasint cldc); 
-void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta, 
+void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, OPENBLAS_CONST float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta, 
 		  float *c, OPENBLAS_CONST blasint cldc); 
-void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta, 
+void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta, 
 		  double *c, OPENBLAS_CONST blasint cldc); 

+void cblas_sgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
+		       OPENBLAS_CONST float * alpha_array, OPENBLAS_CONST float ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST float ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST float * beta_array, float ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);
+
+void cblas_dgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
+		       OPENBLAS_CONST double * alpha_array, OPENBLAS_CONST double ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST double ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST double * beta_array, double ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);
+
+void cblas_cgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
+		       OPENBLAS_CONST void * alpha_array, OPENBLAS_CONST void ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST void ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST void * beta_array, void ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);
+
+void cblas_zgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
+		       OPENBLAS_CONST void * alpha_array, OPENBLAS_CONST void ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST void ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST void * beta_array, void ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);
+
 /*** BFLOAT16 and INT8 extensions ***/
 /* convert float array to BFLOAT16 array by rounding */
 void   cblas_sbstobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout);
@ -426,6 +443,9 @@ void   cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order,  OPENBLAS_CONST enum

 void   cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		    OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
+void cblas_sbgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
+		       OPENBLAS_CONST float * alpha_array, OPENBLAS_CONST bfloat16 ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST bfloat16 ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST float * beta_array, float ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);
+
 #ifdef __cplusplus
 }
 #endif  /* __cplusplus */
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@ -46,7 +46,7 @@ if (DYNAMIC_ARCH)
  if (ARM64)
 	  set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
    if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
-          set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE)
+	    set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
    endif ()
    if (DYNAMIC_LIST)
 	  set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
@ -57,7 +57,11 @@ if (DYNAMIC_ARCH)
 	  set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10)
 	  set(CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_P10_SUPPORT")
  endif ()
-  
+ 
+  if (RISCV64)
+	  set(DYNAMIC_CORE RISCV64_GENERIC RISCV64_ZVL128B RISCV64_ZVL256B) 
+  endif ()
+
  if (X86)
    set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO)
  endif ()
@ -90,6 +94,10 @@ if (DYNAMIC_ARCH)
    endif ()
  endif ()

+  if (LOONGARCH64)
+    set(DYNAMIC_CORE LA64_GENERIC LA264 LA464)
+  endif ()
+
  if (EXISTS ${PROJECT_SOURCE_DIR}/config_kernel.h)
 	  message (FATAL_ERROR "Your build directory contains a file config_kernel.h, probably from a previous compilation with make. This will conflict with the cmake compilation and cause strange compiler errors - please remove the file before trying again")
  endif ()
--- a/cmake/cc.cmake
+++ b/cmake/cc.cmake
@ -2,12 +2,18 @@
 ## Author: Hank Anderson <hank@statease.com>
 ## Description: Ported from portion of OpenBLAS/Makefile.system
 ##              Sets C related variables.
+include(CheckCCompilerFlag)
+
+if (${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM")
+  set(CCOMMON_OPT "${CCOMMON_OPT} -fp-model=consistent")
+  set(GCC_VERSION 100)
+endif ()

 if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB" OR ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
-
  set(CCOMMON_OPT "${CCOMMON_OPT} -Wall")
  set(COMMON_PROF "${COMMON_PROF} -fno-inline")
  set(NO_UNINITIALIZED_WARN "-Wno-uninitialized")
+  set(GCC_VERSION ${CMAKE_C_COMPILER_VERSION})

  if (QUIET_MAKE)
    set(CCOMMON_OPT "${CCOMMON_OPT} ${NO_UNINITIALIZED_WARN} -Wno-unused")
@ -36,14 +42,14 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS

    if (LOONGARCH64)
      if (BINARY64)
-	CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
+	CHECK_C_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
        if(COMPILER_SUPPORT_LP64D_ABI)
 	  set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64d")
 	else()
 	  set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64")
 	endif ()
      else ()
-	CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
+	CHECK_C_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
 	if(COMPILER_SUPPORT_ILP32D_ABI)
 	  set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=ilp32d")
 	else()
@ -139,7 +145,6 @@ endif ()
 if (${CORE} STREQUAL COOPERLAKE)
  if (NOT DYNAMIC_ARCH)
    if (NOT NO_AVX512)
-      execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
      if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1)
        set (CCOMMON_OPT  "${CCOMMON_OPT} -march=cooperlake")
      else ()
@ -152,7 +157,6 @@ endif ()
 if (${CORE} STREQUAL SAPPHIRERAPIDS)
  if (NOT DYNAMIC_ARCH)
    if (NOT NO_AVX512)
-      execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
      if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0)
        set (CCOMMON_OPT  "${CCOMMON_OPT} -march=sapphirerapids")
      else ()
@ -166,7 +170,6 @@ if (${CORE} STREQUAL ZEN)
  if (HAVE_AVX512VL)
    if (NOT DYNAMIC_ARCH)
      if (NOT NO_AVX512)
-        execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
        if (${GCC_VERSION} VERSION_GREATER 13.0 OR ${GCC_VERSION} VERSION_EQUAL 13.0)
          set (CCOMMON_OPT  "${CCOMMON_OPT} -march=znver4")
        else ()
@ -179,7 +182,6 @@ endif ()

 if (${CORE} STREQUAL A64FX)
  if (NOT DYNAMIC_ARCH)
-    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
    if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0)
      set (CCOMMON_OPT  "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx")
    else ()
@ -193,7 +195,6 @@ if (${CORE} STREQUAL NEOVERSEN2)
    if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
 	set (CCOMMON_OPT  "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
    else ()
-      execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
      if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
 	set (CCOMMON_OPT  "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
      else ()
@ -208,7 +209,6 @@ if (${CORE} STREQUAL NEOVERSEV1)
    if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
 	set (CCOMMON_OPT  "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
    else ()
-      execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
      if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
        set (CCOMMON_OPT  "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1")
      else ()
@ -220,7 +220,6 @@ endif ()

 if (${CORE} STREQUAL NEOVERSEN1)
  if (NOT DYNAMIC_ARCH)
-    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
    if (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4)
      set (CCOMMON_OPT  "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=neoverse-n1")
    else ()
@ -265,23 +264,21 @@ endif ()

 if (${CORE} STREQUAL POWER10)
  if (NOT DYNAMIC_ARCH)
-    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
    if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2)
      set (CCOMMON_OPT  "${CCOMMON_OPT} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math")
    else ()
-      message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10." )
+      message(FATAL_ERROR "Compiler GCC ${GCC_VERSION} does not support Power10." )
    endif()
  endif ()
 endif ()

 if (${CORE} STREQUAL POWER9)
  if (NOT DYNAMIC_ARCH)
-    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
    if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0)
      set (CCOMMON_OPT  "${CCOMMON_OPT} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math")
    else ()
      set (CCOMMON_OPT  "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
-      message(WARNING "Compiler GCC.${GCC_VERSION} does not fully support Power9.")
+      message(WARNING "Compiler GCC ${GCC_VERSION} does not fully support Power9.")
    endif ()
  endif ()
 endif ()
--- a/cmake/fc.cmake
+++ b/cmake/fc.cmake
@ -61,18 +61,25 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
    endif ()
    if (LOONGARCH64)
      if (BINARY64)
-	CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
-        if(COMPILER_SUPPORT_LP64D_ABI)
-	  set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d")
-	else()
-	  set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
-	endif ()
+        if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
+	  CHECK_C_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
+          if(COMPILER_SUPPORT_LP64D_ABI)
+	    set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d")
+	  else()
+	    set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
+	  endif ()
+        endif ()
+        if (INTERFACE64) 
+          set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") 
+        endif () 
      else ()
-	CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
-	if(COMPILER_SUPPORT_ILP32D_ABI)
-	  set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d")
-	else()
-	  set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
+        if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
+	  CHECK_C_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
+	  if(COMPILER_SUPPORT_ILP32D_ABI)
+	    set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d")
+	  else()
+	    set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
+          endif ()
 	endif ()
      endif ()
    endif ()
@ -114,12 +121,12 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
  endif ()
 endif ()

-if (${F_COMPILER} STREQUAL "INTEL")
+if (${F_COMPILER} STREQUAL "INTEL" OR CMAKE_Fortran_COMPILER_ID MATCHES "Intel")
  set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_INTEL")
  if (INTERFACE64)
    set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
  endif ()
-  set(FCOMMON_OPT "${FCOMMON_OPT} -recursive")
+  set(FCOMMON_OPT "${FCOMMON_OPT} -recursive -fp-model=consistent")
  if (USE_OPENMP)
    set(FCOMMON_OPT "${FCOMMON_OPT} -openmp")
  endif ()
--- a/cmake/openblas.pc.in
+++ b/cmake/openblas.pc.in
@ -9,5 +9,5 @@ Name: OpenBLAS
 Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
 Version: @OpenBLAS_VERSION@
 URL: https://github.com/OpenMathLib/OpenBLAS
-Libs: @OpenMP_C_FLAGS@ -L${libdir} -l${libnameprefix}openblas${libnamesuffix}${libsuffix} 
-Cflags: -I${includedir}
+Libs: -L${libdir} -l${libnameprefix}openblas${libnamesuffix}${libsuffix} 
+Cflags: -I${includedir} @OpenMP_C_FLAGS@ 
--- a/cmake/os.cmake
+++ b/cmake/os.cmake
@ -38,7 +38,7 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")

    # Test for supporting MS_ABI
    # removed string parsing in favor of CMake's version comparison -hpa
-    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+    set(GCC_VERSION ${CMAKE_C_COMPILER_VERSION})
    if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7)
      # GCC Version >=4.7
      # It is compatible with MSVC ABI.
--- a/cmake/prebuild.cmake
+++ b/cmake/prebuild.cmake
@ -1218,6 +1218,37 @@ endif ()
    set(ZGEMM_UNROLL_M 4)
    set(ZGEMM_UNROLL_N 4)
    set(SYMV_P 16)
+  elseif ("${TCORE}" STREQUAL "A64FX")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define L1_CODE_SIZE\t65536\n"
+      "#define L1_CODE_LINESIZE\t256\n"
+      "#define L1_CODE_ASSOCIATIVE\t8\n"
+      "#define L1_DATA_SIZE\t32768\n"
+      "#define L1_DATA_LINESIZE\t256\n"
+      "#define L1_DATA_ASSOCIATIVE\t8\n"
+      "#define L2_SIZE\t8388608\n\n"
+      "#define L2_LINESIZE\t256\n"
+      "#define L2_ASSOCIATIVE\t8\n"
+      "#define L3_SIZE\t0\n\n"
+      "#define L3_LINESIZE\t0\n\n"
+      "#define L3_ASSOCIATIVE\t0\n\n"
+      "#define DTB_DEFAULT_ENTRIES\t64\n"
+      "#define DTB_SIZE\t4096\n"
+      "#define HAVE_VFPV4\n"
+      "#define HAVE_VFPV3\n"
+      "#define HAVE_VFP\n"
+      "#define HAVE_NEON\n"
+      "#define HAVE_SVE\n"
+      "#define ARMV8\n")
+    set(SGEMM_UNROLL_M 4)
+    set(SGEMM_UNROLL_N 8)
+    set(DGEMM_UNROLL_M 2)
+    set(DGEMM_UNROLL_N 8)
+    set(CGEMM_UNROLL_M 2)
+    set(CGEMM_UNROLL_N 4)
+    set(ZGEMM_UNROLL_M 2)
+    set(ZGEMM_UNROLL_N 4)
+    set(SYMV_P 16)
  elseif ("${TCORE}" STREQUAL "P5600")
    file(APPEND ${TARGET_CONF_TEMP}
      "#define L2_SIZE 1048576\n"
@ -1309,6 +1340,63 @@ endif ()
      "#define DTB_DEFAULT_ENTRIES 128\n"
      "#define DTB_SIZE 4096\n"
      "#define L2_ASSOCIATIVE 8\n")
+  elseif ("${TCORE}" STREQUAL "RISCV64_GENERIC")
+    file(APPEND ${TARGET_CONF_TEMP}
+        "#define L1_DATA_SIZE 32768\n"
+      "#define L1_DATA_LINESIZE 32\n"
+      "#define L2_SIZE 1048576\n"
+      "#define L2_LINESIZE 32 \n"
+      "#define DTB_DEFAULT_ENTRIES 128\n"
+      "#define DTB_SIZE 4096\n"
+      "#define L2_ASSOCIATIVE 4\n")
+  elseif ("${TCORE}" STREQUAL "LA64_GENERIC")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define DTB_DEFAULT_ENTRIES 64\n")
+      set(SGEMM_UNROLL_M 2)
+      set(SGEMM_UNROLL_N 8)
+      set(DGEMM_UNROLL_M 2)
+      set(DGEMM_UNROLL_N 8)
+      set(CGEMM_UNROLL_M 1)
+      set(CGEMM_UNROLL_N 4)
+      set(ZGEMM_UNROLL_M 1)
+      set(ZGEMM_UNROLL_N 4)
+      set(CGEMM3M_UNROLL_M 2)
+      set(CGEMM3M_UNROLL_N 8)
+      set(ZGEMM3M_UNROLL_M 2)
+      set(ZGEMM3M_UNROLL_N 8)
+  elseif ("${TCORE}" STREQUAL "LA264")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define DTB_DEFAULT_ENTRIES 64\n")
+      set(HAVE_LSX  1)
+      set(SGEMM_UNROLL_M 2)
+      set(SGEMM_UNROLL_N 8)
+      set(DGEMM_UNROLL_M 8)
+      set(DGEMM_UNROLL_N 4)
+      set(CGEMM_UNROLL_M 8)
+      set(CGEMM_UNROLL_N 4)
+      set(ZGEMM_UNROLL_M 4)
+      set(ZGEMM_UNROLL_N 4)
+      set(CGEMM3M_UNROLL_M 2)
+      set(CGEMM3M_UNROLL_N 8)
+      set(ZGEMM3M_UNROLL_M 8)
+      set(ZGEMM3M_UNROLL_N 4)
+  elseif ("${TCORE}" STREQUAL "LA464")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define DTB_DEFAULT_ENTRIES 64\n")
+      set(HAVE_LASX 1)
+      set(HAVE_LSX  1)
+      set(SGEMM_UNROLL_M 16)
+      set(SGEMM_UNROLL_N 8)
+      set(DGEMM_UNROLL_M 16)
+      set(DGEMM_UNROLL_N 6)
+      set(CGEMM_UNROLL_M 16)
+      set(CGEMM_UNROLL_N 4)
+      set(ZGEMM_UNROLL_M 8)
+      set(ZGEMM_UNROLL_N 4)
+      set(CGEMM3M_UNROLL_M 16)
+      set(CGEMM3M_UNROLL_N 8)
+      set(ZGEMM3M_UNROLL_M 16)
+      set(ZGEMM3M_UNROLL_N 6)
  endif()
  set(SBGEMM_UNROLL_M 8)
  set(SBGEMM_UNROLL_N 4)
@ -1342,7 +1430,7 @@ else(NOT CMAKE_CROSSCOMPILING)

  if ("${CMAKE_C_COMPILER_ID}" STREQUAL "MSVC")
    #Use generic for MSVC now
-    message("MSVC")
+    message(STATUS "MSVC")
    set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
  else()
    list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S)
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@ -160,11 +160,16 @@ else()
  endif ()
 endif ()

+if (C_LAPACK)
+  if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
+    set(CCOMMON_OPT "${CCOMMON_OPT} -Wno-error=incompatible-pointer-types")
+  endif ()
+endif ()
+
 include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
 if (DEFINED TARGET)
  if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512)
    if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
-      execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
        if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 10.09)
          set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
        else()
@ -180,7 +185,6 @@ if (DEFINED TARGET)
  endif()
  if (${TARGET} STREQUAL SAPPHIRERAPIDS AND NOT NO_AVX512)
    if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
-      execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
        if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 11.0)
          set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids")
        else()
@ -203,7 +207,6 @@ if (DEFINED TARGET)
  
  if (((${TARGET} STREQUAL ZEN) AND HAVE_AVX512VL) AND NOT NO_AVX512)
    if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
-      execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
        if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 12.99)
          set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=znver4")
 	else()
@ -221,8 +224,7 @@ if (DEFINED TARGET)
  
  if ((${TARGET} STREQUAL HASWELL OR (${TARGET} STREQUAL ZEN AND NOT HAVE_AVX512VL)) AND NOT NO_AVX2)
    if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
-      execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
-      if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7)
+	    if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 4.7 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 4.7)
        set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
      endif()
    elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
@ -261,20 +263,18 @@ if (DEFINED TARGET)
  endif()

  if (${TARGET} STREQUAL POWER10)
-    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
-    if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2)
+    if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 10.2 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.2)
      set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math")
    else ()
-      message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10.")
+	    message(FATAL_ERROR "Compiler GCC ${CMAKE_C_COMPILER_VERSION} does not support Power10.")
    endif()
  endif()
  if (${TARGET} STREQUAL POWER9)
-    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
-    if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0)
+    if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 5.0 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 5.0)
      set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math")
    else ()
      set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
-      message(WARNING "Compiler GCC.${GCC_VERSION} does not support fully Power9.")
+      message(WARNING "Compiler GCC ${CMAKE_C_COMPILER_VERSION} does not support fully Power9.")
    endif()
  endif()
  if (${TARGET} STREQUAL POWER8)
@ -285,11 +285,10 @@ if (${TARGET} STREQUAL NEOVERSEV1)
    if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
 	set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
    else ()
-    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
-    if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
+    if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 10.4 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.4)
      set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1")
    else ()
-	    message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.")
+	    message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_VERSION} does not support Neoverse V1.")
    endif()
    endif()
  endif()
@ -297,11 +296,10 @@ if (${TARGET} STREQUAL NEOVERSEV1)
    if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
 	set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
    else ()
-    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
-    if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
+    # Any non-PGI compiler must be at least version 10.4 for the flags below.
+    if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 10.4 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.4)
      set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
    else ()
-	    message(FATAL_ERROR "Compiler $${CMAKE_C_COMPILER} {GCC_VERSION} does not support Neoverse N2.")
+	    message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_VERSION} does not support Neoverse N2.")
    endif()
    endif()
  endif()
@ -312,6 +310,18 @@ if (${TARGET} STREQUAL NEOVERSEV1)
      set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve")
    endif()
  endif()
+  if (${TARGET} STREQUAL A64FX)
+    # PGI gets its own SVE intrinsics switch; every other compiler must be at
+    # least version 10.4, checked via CMake's own compiler version detection.
+    if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
+      set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx")
+    else ()
+      if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 10.4 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.4)
+        set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve -mtune=a64fx")
+      else ()
+        message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_VERSION} does not support A64FX.")
+      endif()
+    endif()
+  endif()

 endif()

@ -378,15 +388,25 @@ if (NEED_PIC)
  endif()
 endif ()

-if (X86_64 OR ${CORE} STREQUAL POWER10)
+if (X86_64 OR ${CORE} STREQUAL POWER10 OR ARM64 OR LOONGARCH64)
  set(SMALL_MATRIX_OPT TRUE)
 endif ()
+if (ARM64)
+  set(GEMM_GEMV_FORWARD TRUE)
+endif ()
+
+if (GEMM_GEMV_FORWARD AND NOT ONLY_CBLAS)
+  set(CCOMMON_OPT "${CCOMMON_OPT} -DGEMM_GEMV_FORWARD")
+endif ()
+if (GEMM_GEMV_FORWARD_BF16 AND NOT ONLY_CBLAS)
+    set(CCOMMON_OPT "${CCOMMON_OPT} -DGEMM_GEMV_FORWARD_BF16")
+endif ()
 if (SMALL_MATRIX_OPT)
  set(CCOMMON_OPT "${CCOMMON_OPT} -DSMALL_MATRIX_OPT")
 endif ()

 if (DYNAMIC_ARCH)
-  if (X86 OR X86_64 OR ARM64 OR POWER)
+  if (X86 OR X86_64 OR ARM64 OR POWER OR RISCV64 OR LOONGARCH64)
    set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
    if (DYNAMIC_OLDER)
      set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
@ -604,7 +624,10 @@ set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} ${FCOMMON_OPT}")
 set(FPFLAGS "${FPFLAGS} ${FCOMMON_OPT} ${COMMON_PROF}")

 #For LAPACK Fortran codes.
-set(LAPACK_FFLAGS "${LAPACK_FFLAGS} ${CMAKE_Fortran_FLAGS}")
+set(LAPACK_FFLAGS "${LAPACK_FFLAGS} ${CMAKE_Fortran_FLAGS}" )
+if (LAPACK_STRLEN)
+	set (LAPACK_FFLAGS "${LAPACK_FFLAGS} -DLAPACK_STRLEN=${LAPACK_STRLEN}")
+endif()
 set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}")

 #Disable -fopenmp for LAPACK Fortran codes on Windows.
@ -617,7 +640,7 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
 endif ()

 if (CMAKE_Fortran_COMPILER)
-if (${F_COMPILER} STREQUAL "NAG" OR ${F_COMPILER} STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
+if ("${F_COMPILER}" STREQUAL "NAG" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
  set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512")
  if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
 message(STATUS "removing fortran flags")
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@ -104,6 +104,8 @@ elseif(ARM)
  set(ARCH "arm")
 elseif(ARM64)
  set(ARCH "arm64")
+elseif(LOONGARCH64)
+  set(ARCH "loongarch64")
 else()
  set(ARCH ${CMAKE_SYSTEM_PROCESSOR} CACHE STRING "Target Architecture")
 endif ()
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@ -187,8 +187,8 @@ macro(ParseMakefileVars MAKEFILE_IN)
        set (HasValidGroup 1)
        set (STR ${CMAKE_MATCH_4})
      endif ()
-      if (DEFINED ${CMAKE_MATCH_1} AND ${HasValidGroup} EQUAL 1)
-        if (NOT (${${CMAKE_MATCH_1}} STREQUAL ${STR}))
+      if (DEFINED CMAKE_MATCH_1 AND ${HasValidGroup} EQUAL 1)
+        if (NOT (CMAKE_MATCH_1 STREQUAL ${STR}))
          #message (STATUS "condition is true")
          set (IfElse 1)
          continue ()
--- a/common_arm.h
+++ b/common_arm.h
@ -47,8 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #endif

-#define INLINE inline
-
 #define RETURN_BY_COMPLEX

 #ifndef ASSEMBLER
--- a/common_arm64.h
+++ b/common_arm64.h
@ -44,8 +44,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define RMB  __asm__ __volatile__ ("dmb  ishld" : : : "memory")
 #endif

-#define INLINE inline
-
 #if defined( F_INTERFACE_FLANG) || defined(F_INTERFACE_PGI)
 #define RETURN_BY_STACK
 #else
@ -55,6 +53,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef ASSEMBLER


+#ifndef NO_AFFINITY
+/*
+ * Returns the low 8 bits of MPIDR_EL1 plus one.
+ * The asm works on the output operand (%0) so the value actually lands in
+ * `ret` and no register is modified behind the compiler's back.
+ */
+static __inline int WhereAmI(void){
+  uint64_t ret;
+  __asm__ volatile (
+       "         mrs %0, mpidr_el1 \n"
+       "         and %0, %0, 0xff  \n"
+                 :"=r" (ret)
+                 :: "memory"
+               );
+  ret +=1;
+  if ((int)ret <0) ret = 0;
+  return (int)ret;
+}
+#endif
+
 static __inline void blas_lock(volatile BLASULONG *address){

  BLASULONG ret;
--- a/common_e2k.h
+++ b/common_e2k.h
@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
 #define RMB

-#define INLINE __attribute__((__always_inline__)) inline
-
 static inline int blas_quickdivide(blasint x, blasint y) {
  return x / y;
 }
--- a/common_interface.h
+++ b/common_interface.h
@ -47,6 +47,11 @@ int    BLASFUNC(xerbla)(char *, blasint *info, blasint);

 void    openblas_set_num_threads_(int *);

+/*Set the threading backend to a custom callback.*/
+typedef void (*openblas_dojob_callback)(int thread_num, void *jobdata, int dojob_data);
+typedef void (*openblas_threads_callback)(int sync, openblas_dojob_callback dojob, int numjobs, size_t jobdata_elsize, void *jobdata, int dojob_data);
+extern openblas_threads_callback openblas_threads_callback_;
+
 FLOATRET  BLASFUNC(sdot)  (blasint *, float  *, blasint *, float  *, blasint *);
 FLOATRET  BLASFUNC(sdsdot)(blasint *, float  *,        float  *, blasint *, float  *, blasint *);

--- a/common_level3.h
+++ b/common_level3.h
@ -1937,8 +1937,13 @@ int zimatcopy_k_rtc(BLASLONG, BLASLONG,  double, double, double *, BLASLONG);
 int sgeadd_k(BLASLONG, BLASLONG, float, float*, BLASLONG, float, float *, BLASLONG); 
 int dgeadd_k(BLASLONG, BLASLONG, double, double*, BLASLONG, double, double *, BLASLONG); 
 int cgeadd_k(BLASLONG, BLASLONG, float, float, float*, BLASLONG, float, float, float *, BLASLONG); 
-int zgeadd_k(BLASLONG, BLASLONG, double,double, double*, BLASLONG, double, double, double *, BLASLONG); 
+int zgeadd_k(BLASLONG, BLASLONG, double,double, double*, BLASLONG, double, double, double *, BLASLONG);

+int sgemm_batch_thread(blas_arg_t * queue, BLASLONG nums);
+int dgemm_batch_thread(blas_arg_t * queue, BLASLONG nums);
+int cgemm_batch_thread(blas_arg_t * queue, BLASLONG nums);
+int zgemm_batch_thread(blas_arg_t * queue, BLASLONG nums);
+int sbgemm_batch_thread(blas_arg_t * queue, BLASLONG nums);

 #ifdef __CUDACC__
 }
--- a/common_loongarch64.h
+++ b/common_loongarch64.h
@ -75,8 +75,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define WMB __sync_synchronize()
 #define RMB __sync_synchronize()

-#define INLINE inline
-
 #ifndef ASSEMBLER

 static inline int blas_quickdivide(blasint x, blasint y){
@ -281,9 +279,13 @@ REALNAME: ;\
 #define GNUSTACK
 #endif /* defined(__linux__) && defined(__ELF__) */

+#ifdef __clang__
+#define EPILOGUE .end
+#else
 #define EPILOGUE      \
    .end    REALNAME ;\
    GNUSTACK
+#endif

 #define PROFCODE

--- a/common_macro.h
+++ b/common_macro.h
@ -2655,9 +2655,20 @@ typedef struct {
  BLASLONG prea, preb, prec, pred;
 #endif

+
+  //for gemm_batch
+  void * routine;
+  int routine_mode;
+
 } blas_arg_t;
 #endif

+#ifdef SMALL_MATRIX_OPT
+#define BLAS_SMALL_OPT  0x10000U
+#define BLAS_SMALL_B0_OPT  0x30000U
+#endif
+
+
 #ifdef XDOUBLE

 #define TRSV_NUU qtrsv_NUU
--- a/common_mips.h
+++ b/common_mips.h
@ -37,8 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define WMB __sync_synchronize()
 #define RMB __sync_synchronize()

-#define INLINE inline
-
 #define RETURN_BY_COMPLEX

 #ifndef ASSEMBLER
--- a/common_mips64.h
+++ b/common_mips64.h
@ -75,8 +75,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define WMB __sync_synchronize()
 #define RMB __sync_synchronize()

-#define INLINE inline
-
 #ifndef ASSEMBLER

 static inline unsigned int rpcc(void){
--- a/common_power.h
+++ b/common_power.h
@ -78,8 +78,6 @@
 #define RMB		__asm__ __volatile__ ("sync")
 #endif

-#define INLINE inline
-
 #ifdef PPC440
 #define STDERR stdout
 #define QNONCACHE 0x1
@ -91,7 +89,7 @@

 void *qalloc(int flags, size_t bytes);

-static INLINE void blas_lock(volatile unsigned long *address){
+static inline void blas_lock(volatile unsigned long *address){

  long int ret, val = 1;

@ -841,17 +839,17 @@ Lmcount$lazy_ptr:
 #endif

 #if defined(PPC440)
-#define BUFFER_SIZE     (  2 << 20)
+#define BUFFER_SIZE     (  2UL << 20)
 #elif defined(PPC440FP2)
-#define BUFFER_SIZE     ( 16 << 20)
+#define BUFFER_SIZE     ( 16UL << 20)
 #elif defined(POWER6) || defined(POWER8) || defined(POWER9) || defined(POWER10)
-#define BUFFER_SIZE     ( 64 << 22)
+#define BUFFER_SIZE     ( 64UL << 22)
 #else
-#define BUFFER_SIZE     ( 16 << 20)
+#define BUFFER_SIZE     ( 16UL << 20)
 #endif
 #ifdef DYNAMIC_ARCH
 #undef BUFFER_SIZE
-#define BUFFER_SIZE (64 << 22)
+#define BUFFER_SIZE (64UL << 22)
 #endif

 #ifndef PAGESIZE
--- a/common_riscv64.h
+++ b/common_riscv64.h
@ -75,8 +75,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define WMB __sync_synchronize()
 #define RMB __sync_synchronize()

-#define INLINE inline
-
 #ifndef ASSEMBLER


--- a/common_thread.h
+++ b/common_thread.h
@ -111,8 +111,8 @@ typedef struct blas_queue {
  struct blas_queue *next;

 #if defined( __WIN32__) || defined(__CYGWIN32__) || defined(_WIN32) || defined(__CYGWIN__)
-  // CRITICAL_SECTION lock;
-  // HANDLE finish;
+   CRITICAL_SECTION lock;
+   HANDLE finish;
  volatile int finished;
 #else
  pthread_mutex_t	 lock;
--- a/common_x86_64.h
+++ b/common_x86_64.h
@ -253,7 +253,7 @@ static __inline unsigned int blas_quickdivide(unsigned int x, unsigned int y){
 #ifndef BUFFERSIZE
 #define BUFFER_SIZE	(32 << 22)
 #else
-#define BUFFER_SIZE	(32 << BUFFERSIZE)
+#define BUFFER_SIZE	(32UL << BUFFERSIZE)
 #endif

 #define SEEK_ADDRESS
--- a/common_zarch.h
+++ b/common_zarch.h
@ -37,9 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define WMB  
 #define RMB

-
-#define INLINE inline
-
 #define RETURN_BY_COMPLEX

 #ifndef ASSEMBLER
--- a/cpuid_arm64.c
+++ b/cpuid_arm64.c
@ -46,6 +46,7 @@ size_t length64=sizeof(value64);
 #define CPU_NEOVERSEN1    11
 #define CPU_NEOVERSEV1    16
 #define CPU_NEOVERSEN2    17
+#define CPU_NEOVERSEV2   24
 #define CPU_CORTEXX1      18
 #define CPU_CORTEXX2	  19
 #define CPU_CORTEXA510	  20
@ -91,7 +92,8 @@ static char *cpuname[] = {
  "CORTEXA510",
  "CORTEXA710",
  "FT2000",
-  "CORTEXA76"
+  "CORTEXA76",
+	"NEOVERSEV2"
 };

 static char *cpuname_lower[] = {
@ -118,7 +120,8 @@ static char *cpuname_lower[] = {
  "cortexa510",
  "cortexa710",
  "ft2000",
-  "cortexa76"
+  "cortexa76",
+	"neoversev2"
 };

 int get_feature(char *search)
@ -213,6 +216,8 @@ int detect(void)
 	return CPU_CORTEXX2;
      else if (strstr(cpu_part, "0xd4e")) //X3
 	return CPU_CORTEXX2;
+      else if (strstr(cpu_part, "0xd4f")) //NVIDIA Grace et al.
+        return CPU_NEOVERSEV2;
      else if (strstr(cpu_part, "0xd0b")) 
 	return CPU_CORTEXA76;
    }
@ -396,6 +401,7 @@ void get_cpuconfig(void)
 		break;

 	    case CPU_NEOVERSEV1:
+		printf("#define HAVE_SVE 1\n");
 	    case CPU_CORTEXA76:
                printf("#define %s\n", cpuname[d]);
                printf("#define L1_CODE_SIZE 65536\n");
@ -424,12 +430,32 @@ void get_cpuconfig(void)
                printf("#define L2_ASSOCIATIVE 8\n");
                printf("#define DTB_DEFAULT_ENTRIES 48\n");
                printf("#define DTB_SIZE 4096\n");
+		printf("#define HAVE_SVE 1\n");
+                break;
+      case CPU_NEOVERSEV2:
+                printf("#define ARMV9\n");
+	        printf("#define HAVE_SVE 1\n");
+                printf("#define %s\n", cpuname[d]);
+                printf("#define L1_CODE_SIZE 65536\n");
+                printf("#define L1_CODE_LINESIZE 64\n");
+                printf("#define L1_CODE_ASSOCIATIVE 4\n");
+                printf("#define L1_DATA_SIZE 65536\n");
+                printf("#define L1_DATA_LINESIZE 64\n");
+                printf("#define L1_DATA_ASSOCIATIVE 4\n");
+                printf("#define L2_SIZE 1048576\n");
+                printf("#define L2_LINESIZE 64\n");
+                printf("#define L2_ASSOCIATIVE 8\n");
+								// L1 Data TLB = 48 entries
+								// L2 Data TLB = 2048 entries
+                printf("#define DTB_DEFAULT_ENTRIES 48\n");
+                printf("#define DTB_SIZE 4096\n");  // Set to 4096 for symmetry with other configs.
                break;
 	    case CPU_CORTEXA510:
 	    case CPU_CORTEXA710:
 	    case CPU_CORTEXX1:
 	    case CPU_CORTEXX2:
 		printf("#define ARMV9\n");
+		printf("#define HAVE_SVE 1\n");
                printf("#define %s\n", cpuname[d]);
                printf("#define L1_CODE_SIZE 65536\n");
                printf("#define L1_CODE_LINESIZE 64\n");
@ -546,6 +572,7 @@ void get_cpuconfig(void)
 		break;
 	    case CPU_A64FX:
 		printf("#define A64FX\n");
+		printf("#define HAVE_SVE 1\n");
    		printf("#define L1_CODE_SIZE 65535\n");
    		printf("#define L1_DATA_SIZE 65535\n");
    		printf("#define L1_DATA_LINESIZE 256\n");
--- a/cpuid_loongarch64.c
+++ b/cpuid_loongarch64.c
@ -1,5 +1,5 @@
 /*****************************************************************************
-Copyright (c) 2011-2020, The OpenBLAS Project
+Copyright (c) 2011-2024, The OpenBLAS Project
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
@ -32,52 +32,299 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **********************************************************************************/

 #include <stdint.h>
+#include <stdio.h>
+#include <math.h>
+#include <string.h>
 #include <sys/auxv.h>

-/*  If LASX extension instructions supported,
- *  using core LOONGSON3R5
- *  If only LSX extension instructions supported,
- *  using core LOONGSON2K1000
- *  If neither LASX nor LSX extension instructions supported,
- *  using core LOONGSONGENERIC (As far as I know, there is no such
- *  CPU yet)
- */
+#define CPU_LA64_GENERIC     0
+#define CPU_LA264            1
+#define CPU_LA364            2
+#define CPU_LA464            3
+#define CPU_LA664            4

-#define CPU_GENERIC        0
-#define CPU_LOONGSON3R5    1
-#define CPU_LOONGSON2K1000 2
+#define CORE_LA64_GENERIC    0
+#define CORE_LA264           1
+#define CORE_LA464           2

 #define LA_HWCAP_LSX    (1U << 4)
 #define LA_HWCAP_LASX   (1U << 5)

+#define LOONGARCH_CFG0      0x00
+#define LOONGARCH_CFG2      0x02
+#define LOONGARCH_CFG10     0x10
+#define LOONGARCH_CFG11     0x11
+#define LOONGARCH_CFG12     0x12
+#define LOONGARCH_CFG13     0x13
+#define LOONGARCH_CFG14     0x14
+/* Parenthesised so the shift is evaluated before any surrounding operator. */
+#define LASX_MASK           (1<<7)
+#define LSX_MASK            (1<<6)
+#define PRID_SERIES_MASK    0xf000
+#define PRID_SERIES_LA264   0xa000
+#define PRID_SERIES_LA364   0xb000
+#define PRID_SERIES_LA464   0xc000
+#define PRID_SERIES_LA664   0xd000
+
+#define CACHE_INFO_L1_IU    0
+#define CACHE_INFO_L1_D     1
+#define CACHE_INFO_L2_IU    2
+#define CACHE_INFO_L2_D     3
+#define CACHE_INFO_L3_IU    4
+#define CACHE_INFO_L3_D     5
+#define L1_IU_PRESENT_MASK  0x0001
+#define L1_IU_UNITY_MASK    0x0002
+#define L1_D_PRESENT_MASK   0x0004
+#define L2_IU_PRESENT_MASK  0x0008
+#define L2_IU_UNITY_MASK    0x0010
+#define L2_D_PRESENT_MASK   0x0080
+#define L3_IU_PRESENT_MASK  0x0400
+#define L3_IU_UNITY_MASK    0x0800
+#define L3_D_PRESENT_MASK   0x4000
+#define CACHE_WAY_MINUS_1_MASK      0x0000ffff
+#define CACHE_INDEX_LOG2_MASK       0x00ff0000
+#define CACHE_LINESIZE_LOG2_MASK    0x7f000000
+
+typedef struct {
+  int size;
+  int associative;
+  int linesize;
+  int unify;
+  int present;
+} cache_info_t;
+
+/* Using microarchitecture representation */
 static char *cpuname[] = {
-  "LOONGSONGENERIC",
-  "LOONGSON3R5",
-  "LOONGSON2K1000"
+  "LA64_GENERIC",
+  "LA264", /* Loongson 64bit, 2-issue, Like 2K1000LA */
+  "LA364", /* Loongson 64bit, 3-issue, Like 2K2000 */
+  "LA464", /* Loongson 64bit, 4-issue, Like 3A5000, 3C5000L, 3C5000 and 3D5000 */
+  "LA664"  /* Loongson 64bit, 6-issue, Like 3A6000, 3C6000 and 3D6000 */
 };

 static char *cpuname_lower[] = {
-  "loongsongeneric",
-  "loongson3r5",
-  "loongson2k1000"
+  "la64_generic",
+  "la264",
+  "la364",
+  "la464",
+  "la664"
 };

-int detect(void) {
-#ifdef __linux
+static char *corename[] = {
+  "LA64_GENERIC", /* Implies using scalar instructions for optimization */
+  "LA264", /* Implies using LSX  instructions for optimization */
+  "LA464", /* Implies using LASX instructions for optimization */
+};
+
+static char *corename_lower[] = {
+  "la64_generic",
+  "la264",
+  "la464",
+};
+
+/*
+ * Query one cache level through the cpucfg instruction.
+ *
+ * type      - one of the CACHE_INFO_* selectors.
+ * cacheinfo - receives the result. It is all zeros unless the matching
+ *             *_PRESENT_MASK bit is set in configuration word
+ *             LOONGARCH_CFG10; an unknown selector also yields all zeros.
+ *
+ * For the L1 I/U, L1 D, L2 I/U and L3 I/U selectors the geometry is decoded
+ * from configuration words LOONGARCH_CFG11 .. LOONGARCH_CFG14 respectively:
+ * associativity is the (way - 1) field plus one, line size and index count
+ * are two to the power of their log2 fields, and size is the product of the
+ * three. The L2 D and L3 D selectors only report presence.
+ * "present" and "unify" hold the raw masked bit, so a set flag is non-zero
+ * but not necessarily 1.
+ */
+static void get_cacheinfo(int type, cache_info_t *cacheinfo) {
+  cache_info_t cache_info;
+  memset(&cache_info, 0, sizeof(cache_info));
+  uint32_t reg_10 = 0;
+  /* Word LOONGARCH_CFG10 carries the present/unify bits tested below. */
+  __asm__ volatile (
+    "cpucfg %0, %1 \n\t"
+    : "+&r"(reg_10)
+    : "r"(LOONGARCH_CFG10)
+  );
+
+  switch (type) {
+    case CACHE_INFO_L1_IU:
+      if (reg_10 & L1_IU_PRESENT_MASK) {
+        uint32_t reg_11 = 0;
+        cache_info.present = reg_10 & L1_IU_PRESENT_MASK;
+        cache_info.unify   = reg_10 & L1_IU_UNITY_MASK;
+        __asm__ volatile (
+          "cpucfg %0, %1 \n\t"
+          : "+&r"(reg_11)
+          : "r"(LOONGARCH_CFG11)
+        );
+        cache_info.associative  = (reg_11 & CACHE_WAY_MINUS_1_MASK) + 1;
+        cache_info.linesize = 1 << ((reg_11 & CACHE_LINESIZE_LOG2_MASK) >> 24);
+        cache_info.size = cache_info.associative * cache_info.linesize *
+                          (1 << ((reg_11 & CACHE_INDEX_LOG2_MASK) >> 16));
+      }
+    break;
+
+    case CACHE_INFO_L1_D:
+      if (reg_10 & L1_D_PRESENT_MASK) {
+        uint32_t reg_12 = 0;
+        cache_info.present = reg_10 & L1_D_PRESENT_MASK;
+        __asm__ volatile (
+          "cpucfg %0, %1 \n\t"
+          : "+&r"(reg_12)
+          : "r"(LOONGARCH_CFG12)
+        );
+        cache_info.associative  = (reg_12 & CACHE_WAY_MINUS_1_MASK) + 1;
+        cache_info.linesize = 1 << ((reg_12 & CACHE_LINESIZE_LOG2_MASK) >> 24);
+        cache_info.size = cache_info.associative * cache_info.linesize *
+                          (1 << ((reg_12 & CACHE_INDEX_LOG2_MASK) >> 16));
+      }
+    break;
+
+    case CACHE_INFO_L2_IU:
+      if (reg_10 & L2_IU_PRESENT_MASK) {
+        uint32_t reg_13 = 0;
+        cache_info.present = reg_10 & L2_IU_PRESENT_MASK;
+        cache_info.unify   = reg_10 & L2_IU_UNITY_MASK;
+        __asm__ volatile (
+          "cpucfg %0, %1 \n\t"
+          : "+&r"(reg_13)
+          : "r"(LOONGARCH_CFG13)
+        );
+        cache_info.associative  = (reg_13 & CACHE_WAY_MINUS_1_MASK) + 1;
+        cache_info.linesize = 1 << ((reg_13 & CACHE_LINESIZE_LOG2_MASK) >> 24);
+        cache_info.size = cache_info.associative * cache_info.linesize *
+                          (1 << ((reg_13 & CACHE_INDEX_LOG2_MASK) >> 16));
+      }
+    break;
+
+    case CACHE_INFO_L2_D:
+      if (reg_10 & L2_D_PRESENT_MASK) {
+        cache_info.present = reg_10 & L2_D_PRESENT_MASK;
+        // No data fetch
+      }
+    break;
+
+    case CACHE_INFO_L3_IU:
+      if (reg_10 & L3_IU_PRESENT_MASK) {
+        uint32_t reg_14 = 0;
+        cache_info.present = reg_10 & L3_IU_PRESENT_MASK;
+        cache_info.unify   = reg_10 & L3_IU_UNITY_MASK;
+        __asm__ volatile (
+          "cpucfg %0, %1 \n\t"
+          : "+&r"(reg_14)
+          : "r"(LOONGARCH_CFG14)
+        );
+        cache_info.associative  = (reg_14 & CACHE_WAY_MINUS_1_MASK) + 1;
+        cache_info.linesize = 1 << ((reg_14 & CACHE_LINESIZE_LOG2_MASK) >> 24);
+        cache_info.size = cache_info.associative * cache_info.linesize *
+                          (1 << ((reg_14 & CACHE_INDEX_LOG2_MASK) >> 16));
+      }
+    break;
+
+    case CACHE_INFO_L3_D:
+      if (reg_10 & L3_D_PRESENT_MASK) {
+        cache_info.present = reg_10 & L3_D_PRESENT_MASK;
+        // No data fetch
+      }
+    break;
+
+    default:
+    break;
+  }
+  *cacheinfo = cache_info;
+}
+
+/*
+ * Return configuration word LOONGARCH_CFG0 as read by the cpucfg instruction.
+ * Callers in this file mask it with PRID_SERIES_MASK to pick the core series.
+ */
+static uint32_t get_prid() {
+  uint32_t reg = 0;
+  __asm__ volatile (
+    "cpucfg %0, %1 \n\t"
+    : "+&r"(reg)
+    : "r"(LOONGARCH_CFG0)
+  );
+  return reg;
+}
+
+/*
+ * Count the fgets() reads of /proc/cpuinfo (normally one per line) whose
+ * first nine characters are "processor" and store the total in *count.
+ * *count is left untouched when the file cannot be opened.
+ */
+static void get_cpucount(uint32_t *count) {
+  char line[200];
+  uint32_t found = 0;
+  FILE *cpuinfo = fopen("/proc/cpuinfo", "r");
+
+  if (cpuinfo == NULL)
+    return;
+
+  while (fgets(line, sizeof(line), cpuinfo) != NULL) {
+    if (strncmp("processor", line, 9) == 0)
+      found++;
+  }
+
+  fclose(cpuinfo);
+  *count = found;
+}
+
+/* Detect whether the OS supports the LASX instruction set */
+static int os_support_lasx() {
  int hwcap  = (int)getauxval(AT_HWCAP);

  if (hwcap & LA_HWCAP_LASX)
-    return CPU_LOONGSON3R5;
-  else if (hwcap & LA_HWCAP_LSX)
-    return CPU_LOONGSON2K1000;
+    return 1;
  else
-    return CPU_GENERIC;
-#endif
-  return CPU_GENERIC;
+    return 0;
+}
+
+/* Returns 1 when the AT_HWCAP auxiliary-vector entry has LA_HWCAP_LSX set, else 0. */
+static int os_support_lsx() {
+  const int caps = (int)getauxval(AT_HWCAP);
+
+  return (caps & LA_HWCAP_LSX) ? 1 : 0;
+}
+
+/*
+ * Choose the CORE_* kernel set from the processor series reported by
+ * get_prid() combined with the vector extensions the OS advertises.
+ */
+int get_coretype(void) {
+  const uint32_t series = get_prid() & PRID_SERIES_MASK;
+
+  /* LA464 / LA664: LASX is tried first, then LSX, then the generic core. */
+  if (series == PRID_SERIES_LA464 || series == PRID_SERIES_LA664) {
+    if (os_support_lasx())
+      return CORE_LA464;
+    return os_support_lsx() ? CORE_LA264 : CORE_LA64_GENERIC;
+  }
+
+  /* LA264 / LA364: only LSX is considered. */
+  if (series == PRID_SERIES_LA264 || series == PRID_SERIES_LA364)
+    return os_support_lsx() ? CORE_LA264 : CORE_LA64_GENERIC;
+
+  /* Any other series falls back to the generic core. */
+  return CORE_LA64_GENERIC;
+}
+
+/* Map the series field of get_prid() onto the matching CPU_* identifier. */
+int get_cputype(void) {
+  const uint32_t series = get_prid() & PRID_SERIES_MASK;
+
+  if (series == PRID_SERIES_LA264) return CPU_LA264;
+  if (series == PRID_SERIES_LA364) return CPU_LA364;
+  if (series == PRID_SERIES_LA464) return CPU_LA464;
+  if (series == PRID_SERIES_LA664) return CPU_LA664;
+
+  /* Any other series is reported as the generic LA64 CPU. */
+  return CPU_LA64_GENERIC;
 }

 char *get_corename(void) {
-  return cpuname[detect()];
+  return corename[get_coretype()];
+}
+
+void get_libname(void){
+  printf("%s", corename_lower[get_coretype()]);
 }

 void get_architecture(void) {
@ -85,8 +332,7 @@ void get_architecture(void) {
 }

 void get_subarchitecture(void) {
-  int d = detect();
-  printf("%s", cpuname[d]);
+  printf("%s", cpuname[get_cputype()]);
 }

 void get_subdirname(void) {
@ -94,50 +340,69 @@ void get_subdirname(void) {
 }

 void get_cpuconfig(void) {
-  uint32_t hwcaps = 0;
-  int d = detect();
+  cache_info_t info;
+  uint32_t num_cores = 0;

-  switch (d) {
-    case CPU_LOONGSON3R5:
-      printf("#define LOONGSON3R5\n");
-      printf("#define L1_DATA_SIZE 65536\n");
-      printf("#define L1_DATA_LINESIZE 64\n");
-      printf("#define L2_SIZE 1048576\n");
-      printf("#define L2_LINESIZE 64\n");
-      printf("#define DTB_DEFAULT_ENTRIES 64\n");
-      printf("#define DTB_SIZE 4096\n");
-      printf("#define L2_ASSOCIATIVE 16\n");
-    break;
+  printf("#define %s\n", corename[get_coretype()]); // Core name

-    case CPU_LOONGSON2K1000:
-      printf("#define LOONGSON2K1000\n");
-      printf("#define L1_DATA_SIZE 65536\n");
-      printf("#define L1_DATA_LINESIZE 64\n");
-      printf("#define L2_SIZE 262144\n");
-      printf("#define L2_LINESIZE 64\n");
-      printf("#define DTB_DEFAULT_ENTRIES 64\n");
-      printf("#define DTB_SIZE 4096\n");
-      printf("#define L2_ASSOCIATIVE 16\n");
-    break;
+  printf("#define CPU_NAME %s\n", cpuname[get_cputype()]); // Cpu microarchitecture name

-    default:
-      printf("#define LOONGSONGENERIC\n");
-      printf("#define L1_DATA_SIZE 65536\n");
-      printf("#define L1_DATA_LINESIZE 64\n");
-      printf("#define L2_SIZE 262144\n");
-      printf("#define L2_LINESIZE 64\n");
-      printf("#define DTB_DEFAULT_ENTRIES 64\n");
-      printf("#define DTB_SIZE 4096\n");
-      printf("#define L2_ASSOCIATIVE 16\n");
-    break;
+  get_cacheinfo(CACHE_INFO_L1_IU, &info);
+  if (info.present) {
+    if (info.unify) { // Unified cache, without distinguishing between instructions and data
+      printf("#define L1_SIZE %d\n", info.size);
+      printf("#define L1_ASSOCIATIVE %d\n", info.associative);
+      printf("#define L1_LINESIZE %d\n", info.linesize);
+    } else {
+      printf("#define L1_CODE_SIZE %d\n", info.size);
+      printf("#define L1_CODE_ASSOCIATIVE %d\n", info.associative);
+      printf("#define L1_CODE_LINESIZE %d\n", info.linesize);
+    }
  }

-  hwcaps = (uint32_t)getauxval( AT_HWCAP );
-  if (hwcaps & LA_HWCAP_LSX)      printf("#define HAVE_LSX\n");
-  if (hwcaps & LA_HWCAP_LASX)     printf("#define HAVE_LASX\n");
-}
+  if (!info.unify) {
+    get_cacheinfo(CACHE_INFO_L1_D, &info);
+    if (info.present) {
+      printf("#define L1_DATA_SIZE %d\n", info.size);
+      printf("#define L1_DATA_ASSOCIATIVE %d\n", info.associative);
+      printf("#define L1_DATA_LINESIZE %d\n", info.linesize);
+    }
+  }

-void get_libname(void){
-  int d = detect();
-  printf("%s", cpuname_lower[d]);
+  get_cacheinfo(CACHE_INFO_L2_IU, &info);
+  if (info.present > 0) {
+    if (info.unify) {
+      printf("#define L2_SIZE %d\n", info.size);
+      printf("#define L2_ASSOCIATIVE %d\n", info.associative);
+      printf("#define L2_LINESIZE %d\n", info.linesize);
+    } else {
+      printf("#define L2_CODE_SIZE %d\n", info.size);
+      printf("#define L2_CODE_ASSOCIATIVE %d\n", info.associative);
+      printf("#define L2_CODE_LINESIZE %d\n", info.linesize);
+    }
+  }
+
+  get_cacheinfo(CACHE_INFO_L3_IU, &info);
+  if (info.present > 0) {
+    if (info.unify) {
+      printf("#define L3_SIZE %d\n", info.size);
+      printf("#define L3_ASSOCIATIVE %d\n", info.associative);
+      printf("#define L3_LINESIZE %d\n", info.linesize);
+    } else {
+      printf("#define L3_CODE_SIZE %d\n", info.size);
+      printf("#define L3_CODE_ASSOCIATIVE %d\n", info.associative);
+      printf("#define L3_CODE_LINESIZE %d\n", info.linesize);
+    }
+  }
+
+  if(os_support_lsx)  printf("#define HAVE_LSX\n");
+  if(os_support_lasx) printf("#define HAVE_LASX\n");
+
+  get_cpucount(&num_cores);
+  if (num_cores)
+    printf("#define NUM_CORES %d\n", num_cores);
+
+  //TODO: It’s unclear what this entry represents, but it is indeed necessary.
+  //It has been set based on reference to other platforms.
+  printf("#define DTB_DEFAULT_ENTRIES 64\n");
 }
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@ -1527,14 +1527,29 @@ int get_cpuname(void){
      break;
      case 10: //family 6 exmodel 10
        switch (model) {
+	  case 13: // Granite Rapids
+	  if(support_amx_bf16())
+	    return CPUTYPE_SAPPHIRERAPIDS;
+	  if(support_avx512_bf16())
+            return CPUTYPE_COOPERLAKE;	
+          if(support_avx512())
+            return CPUTYPE_SKYLAKEX;
+          if(support_avx2())
+            return CPUTYPE_HASWELL;
+          if(support_avx())
+            return CPUTYPE_SANDYBRIDGE;
+          else
+          return CPUTYPE_NEHALEM;
          case 5: // Comet Lake H and S
          case 6: // Comet Lake U
+	  case 10: // Meteor Lake
            if(support_avx2())
              return CPUTYPE_HASWELL;
            if(support_avx())
              return CPUTYPE_SANDYBRIDGE;
            else
              return CPUTYPE_NEHALEM;
+	  case 0: // Meteor Lake
          case 7: // Rocket Lake           
 	    if(support_avx512())
              return CPUTYPE_SKYLAKEX;
@ -1560,6 +1575,19 @@ int get_cpuname(void){
 	      return CPUTYPE_NEHALEM;
        }
        break;
+      case 12: //family 6 exmodel 12
+	switch (model) {
+	  case 15:
+	    if(support_avx512())
+              return CPUTYPE_SAPPHIRERAPIDS;
+            if(support_avx2())
+              return CPUTYPE_HASWELL;
+            if(support_avx())
+	      return CPUTYPE_SANDYBRIDGE;
+	    else
+	    return CPUTYPE_NEHALEM;
+	  }
+	break;
      }
      break;    
    case 0x7:
@ -1661,6 +1689,7 @@ int get_cpuname(void){
 	    return CPUTYPE_BARCELONA;
        }
      case 10: // Zen3/4
+      case 11: // Zen5
 #ifndef NO_AVX512
          if(support_avx512_bf16())
            return CPUTYPE_COOPERLAKE;
@ -2337,8 +2366,22 @@ int get_coretype(void){

      case 10:
        switch (model) {
+	  case 13: // Granite Rapids
+	  if(support_amx_bf16())
+	    return CORE_SAPPHIRERAPIDS;
+	  if(support_avx512_bf16())
+            return CORE_COOPERLAKE;	
+          if(support_avx512())
+            return CORE_SKYLAKEX;
+          if(support_avx2())
+            return CORE_HASWELL;
+          if(support_avx())
+	    return CORE_SANDYBRIDGE;
+	  else
+	    return CORE_NEHALEM;
 	  case 5: // Comet Lake H and S
    	  case 6: // Comet Lake U
+	  case 10: // Meteor Lake
            if(support_avx())
  #ifndef NO_AVX2
              return CORE_HASWELL;
@ -2347,6 +2390,7 @@ int get_coretype(void){
  #endif
            else
              return CORE_NEHALEM;
+	  case 0: // Meteor Lake
 	  case 7:// Rocket Lake
 #ifndef NO_AVX512
 	  if(support_avx512())
@ -2377,10 +2421,10 @@ int get_coretype(void){
 	  else
 	  return CORE_NEHALEM;
 	}
-      case 15:
-	if (model <= 0x2) return CORE_NORTHWOOD;
-	else return CORE_PRESCOTT;
      }
+    case 15:
+      if (model <= 0x2) return CORE_NORTHWOOD;
+      else return CORE_PRESCOTT;
    }
  }

@ -2436,7 +2480,7 @@ int get_coretype(void){
 	  }
 	  break;
 	}
-      } else if (exfamily == 8 || exfamily == 10) {
+      } else if (exfamily == 8 || exfamily == 10 || exfamily == 11) {
 	switch (model) {
 	case 1:
 	  // AMD Ryzen
@ -2511,6 +2555,7 @@ int get_coretype(void){
      case 0x7:
        switch (exmodel) {
        case 5:
+	case 6:
          if (support_avx2())
            return CORE_ZEN;
          else
--- a/ctest/CMakeLists.txt
+++ b/ctest/CMakeLists.txt
@ -6,6 +6,10 @@ enable_language(Fortran)
 endif()

 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS")
+# For 32-bit MinGW builds with Fortran compiler version 14.2, compile the
+# Fortran test drivers without optimization flags.
+# NOTE(review): presumably a workaround for an optimizer problem - confirm.
+if (BINARY32 AND CMAKE_C_PLATFORM_ID MATCHES "MinGW" AND CMAKE_Fortran_COMPILER_VERSION VERSION_EQUAL 14.2)
+       # list() operates on a variable *name*, and CMAKE_Fortran_FLAGS is a
+       # space-separated string, so convert it to a list, drop the -O flags,
+       # and convert it back.
+       string(REPLACE " " ";" _ctest_fortran_flags "${CMAKE_Fortran_FLAGS}")
+       list(REMOVE_ITEM _ctest_fortran_flags -O3 -O2 -O1 -Os)
+       string(REPLACE ";" " " CMAKE_Fortran_FLAGS "${_ctest_fortran_flags}")
+       set (CMAKE_Fortran_FLAGS_RELEASE "" CACHE STRING "" FORCE)
+endif()
 if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU)
 	set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize")
 endif()
--- a/ctest/Makefile
+++ b/ctest/Makefile
@ -25,6 +25,9 @@ endif

 override CFLAGS += -DADD$(BU) -DCBLAS
 ifeq ($(F_COMPILER),GFORTRAN)
+# On LOONGSON3R3/LOONGSON3R4 replace -O2/-O3 with -O0 while keeping every
+# other flag. The make function is spelled "filter-out" and takes its
+# arguments after a space, not in parentheses.
+ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4))
+	override FFLAGS := $(filter-out -O2 -O3,$(FFLAGS)) -O0
+endif
 	override FFLAGS += -fno-tree-vectorize
 endif
 override TARGET_ARCH=
--- a/ctest/c_cblat1.f
+++ b/ctest/c_cblat1.f
@ -38,9 +38,12 @@
            CALL CHECK1(SFAC)
         END IF
 *        -- Print
-         IF (PASS) WRITE (NOUT,99998)
+         IF (PASS) THEN
+            WRITE (NOUT,99998)
+         ELSE
+            CALL ABORT
+        END IF
   20 CONTINUE
-      STOP
 *
 99999 FORMAT (' Complex CBLAS Test Program Results',/1X)
 99998 FORMAT ('                                    ----- PASS -----')
@ -228,7 +231,7 @@
               CALL ITEST1(ICAMAXTEST(N,CX,INCX),ITRUE3(NP1))
            ELSE
               WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
-               STOP
+               CALL ABORT
            END IF
 *
   40    CONTINUE
@ -512,7 +515,7 @@
               CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0)
            ELSE
               WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
-               STOP
+               CALL ABORT
            END IF
 *
   40    CONTINUE
--- a/ctest/c_cblat2.f
+++ b/ctest/c_cblat2.f
@ -10,7 +10,7 @@
 *  'CBLAT2.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
 *  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
 *  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-*  F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*  F        LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
 *  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 *  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 *  16.0     THRESHOLD VALUE OF TEST RATIO
@ -243,7 +243,7 @@
     $      GO TO 70
   60 CONTINUE
      WRITE( NOUT, FMT = 9986 )SNAMET
-      STOP
+      CALL ABORT
   70 LTEST( I ) = LTESTT
      GO TO 50
 *
@ -283,7 +283,7 @@
      SAME = LCE( YY, YT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
      TRANS = 'T'
      CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
@ -291,7 +291,7 @@
      SAME = LCE( YY, YT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
 *
 *     Test each subroutine in turn.
@ -418,7 +418,9 @@
      IF( TRACE )
     $   CLOSE ( NTRA )
      CLOSE ( NOUT )
-      STOP
+      IF( FATAL ) THEN
+         CALL ABORT
+      END IF
 *
 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )
--- a/ctest/c_cblat3.f
+++ b/ctest/c_cblat3.f
@ -10,7 +10,7 @@
 *  'CBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
 *  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
 *  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-*  F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*  F        LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
 *  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 *  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 *  16.0     THRESHOLD VALUE OF TEST RATIO
@ -194,7 +194,7 @@
     $      GO TO 50
   40 CONTINUE
      WRITE( NOUT, FMT = 9990 )SNAMET
-      STOP
+      CALL ABORT
   50 LTEST( I ) = LTESTT
      GO TO 30
 *
@ -237,7 +237,7 @@
      SAME = LCE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
      TRANSB = 'C'
      CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -246,7 +246,7 @@
      SAME = LCE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
      DO 120 J = 1, N
         AB( J, NMAX + 1 ) = N - J + 1
@ -264,7 +264,7 @@
      SAME = LCE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
      TRANSB = 'C'
      CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -273,7 +273,7 @@
      SAME = LCE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
 *
 *     Test each subroutine in turn.
@ -385,7 +385,9 @@
      IF( TRACE )
     $   CLOSE ( NTRA )
      CLOSE ( NOUT )
-      STOP
+      IF( FATAL ) THEN
+         CALL ABORT
+      END IF
 *
 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' )
--- a/ctest/c_cblat3_3m.f
+++ b/ctest/c_cblat3_3m.f
@ -10,7 +10,7 @@
 *  'CBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
 *  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
 *  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-*  F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*  F        LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
 *  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 *  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 *  16.0     THRESHOLD VALUE OF TEST RATIO
@ -194,7 +194,7 @@
     $      GO TO 50
   40 CONTINUE
      WRITE( NOUT, FMT = 9990 )SNAMET
-      STOP
+      CALL ABORT
   50 LTEST( I ) = LTESTT
      GO TO 30
 *
@ -237,7 +237,7 @@
      SAME = LCE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
      TRANSB = 'C'
      CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -246,7 +246,7 @@
      SAME = LCE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
      DO 120 J = 1, N
         AB( J, NMAX + 1 ) = N - J + 1
@ -264,7 +264,7 @@
      SAME = LCE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
      TRANSB = 'C'
      CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -273,7 +273,7 @@
      SAME = LCE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
 *
 *     Test each subroutine in turn.
@ -385,7 +385,9 @@
      IF( TRACE )
     $   CLOSE ( NTRA )
      CLOSE ( NOUT )
-      STOP
+      IF( FATAL ) THEN
+         CALL ABORT
+      END IF
 *
 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' )
--- a/ctest/c_dblat1.f
+++ b/ctest/c_dblat1.f
@ -44,9 +44,12 @@
            CALL CHECK3(SFAC)
         END IF
 *        -- Print
-         IF (PASS) WRITE (NOUT,99998)
+         IF (PASS) THEN
+            WRITE (NOUT,99998)
+         ELSE
+            CALL ABORT
+        END IF
   20 CONTINUE
-      STOP
 *
 99999 FORMAT (' Real CBLAS Test Program Results',/1X)
 99998 FORMAT ('                                    ----- PASS -----')
@ -136,7 +139,7 @@
            CALL STEST1(SS,DS1(K),DS1(K),SFAC)
         ELSE
            WRITE (NOUT,*) ' Shouldn''t be here in CHECK0'
-            STOP
+            CALL ABORT
         END IF
   20 CONTINUE
   40 RETURN
@ -229,7 +232,7 @@
               CALL ITEST1(IDAMAXTEST(N,SX,INCX),ITRUE2(NP1))
            ELSE
               WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
-               STOP
+               CALL ABORT
            END IF
   60    CONTINUE
   80 CONTINUE
@ -384,7 +387,7 @@
               CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0)
            ELSE
               WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
-               STOP
+               CALL ABORT
            END IF
  100    CONTINUE
  120 CONTINUE
@ -472,7 +475,7 @@
   70          CONTINUE      
               ELSE
               WRITE (NOUT,*) ' Shouldn''t be here in CHECK3'
-               STOP
+               CALL ABORT
            END IF
   40    CONTINUE
   60 CONTINUE
--- a/ctest/c_dblat2.f
+++ b/ctest/c_dblat2.f
@ -10,7 +10,7 @@
 *  'DBLAT2.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
 *  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
 *  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-*  F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*  F        LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
 *  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 *  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 *  16.0     THRESHOLD VALUE OF TEST RATIO
@ -239,7 +239,7 @@
     $      GO TO 70
   60 CONTINUE
      WRITE( NOUT, FMT = 9986 )SNAMET
-      STOP
+      CALL ABORT
   70 LTEST( I ) = LTESTT
      GO TO 50
 *
@ -279,7 +279,7 @@
      SAME = LDE( YY, YT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
      TRANS = 'T'
      CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
@ -287,7 +287,7 @@
      SAME = LDE( YY, YT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
 *
 *     Test each subroutine in turn.
@ -414,7 +414,9 @@
      IF( TRACE )
     $   CLOSE ( NTRA )
      CLOSE ( NOUT )
-      STOP
+      IF( FATAL ) THEN
+         CALL ABORT
+      END IF
 *
 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )
--- a/ctest/c_dblat3.f
+++ b/ctest/c_dblat3.f
@ -10,7 +10,7 @@
 *  'DBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
 *  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
 *  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-*  F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*  F        LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
 *  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 *  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 *  16.0     THRESHOLD VALUE OF TEST RATIO
@ -189,7 +189,7 @@
     $      GO TO 50
   40 CONTINUE
      WRITE( NOUT, FMT = 9990 )SNAMET
-      STOP
+      CALL ABORT
   50 LTEST( I ) = LTESTT
      GO TO 30
 *
@ -232,7 +232,7 @@
      SAME = LDE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
      TRANSB = 'T'
      CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -241,7 +241,7 @@
      SAME = LDE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
      DO 120 J = 1, N
         AB( J, NMAX + 1 ) = N - J + 1
@ -259,7 +259,7 @@
      SAME = LDE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
      TRANSB = 'T'
      CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -268,7 +268,7 @@
      SAME = LDE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
 *
 *     Test each subroutine in turn.
@ -379,7 +379,9 @@
      IF( TRACE )
     $   CLOSE ( NTRA )
      CLOSE ( NOUT )
-      STOP
+      IF( FATAL ) THEN
+         CALL ABORT
+      END IF
 *
 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )
--- a/ctest/c_sblat1.f
+++ b/ctest/c_sblat1.f
@ -44,9 +44,12 @@
            CALL CHECK3(SFAC)
         END IF
 *        -- Print
-         IF (PASS) WRITE (NOUT,99998)
+         IF (PASS) THEN
+            WRITE (NOUT,99998)
+         ELSE
+            CALL ABORT
+        END IF
   20 CONTINUE
-      STOP
 *
 99999 FORMAT (' Real CBLAS Test Program Results',/1X)
 99998 FORMAT ('                                    ----- PASS -----')
@ -136,7 +139,7 @@
            CALL STEST1(SS,DS1(K),DS1(K),SFAC)
         ELSE
            WRITE (NOUT,*) ' Shouldn''t be here in CHECK0'
-            STOP
+            CALL ABORT
         END IF
   20 CONTINUE
   40 RETURN
@ -229,7 +232,7 @@
               CALL ITEST1(ISAMAXTEST(N,SX,INCX),ITRUE2(NP1))
            ELSE
               WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
-               STOP
+               CALL ABORT
            END IF
   60    CONTINUE
   80 CONTINUE
@ -384,7 +387,7 @@
               CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0)
            ELSE
               WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
-               STOP
+               CALL ABORT
            END IF
  100    CONTINUE
  120 CONTINUE
@ -479,7 +482,7 @@
   70          CONTINUE
            ELSE
               WRITE (NOUT,*) ' Shouldn''t be here in CHECK3'
-               STOP
+               CALL ABORT
            END IF
   40    CONTINUE
   60 CONTINUE
@ -759,4 +762,4 @@
         END IF
      END IF
      RETURN
-      END
+      END
--- a/ctest/c_sblat2.f
+++ b/ctest/c_sblat2.f
@ -10,7 +10,7 @@
 *  'SBLAT2.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
 *  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
 *  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-*  F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*  F        LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
 *  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 *  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 *  16.0     THRESHOLD VALUE OF TEST RATIO
@ -239,7 +239,7 @@
     $      GO TO 70
   60 CONTINUE
      WRITE( NOUT, FMT = 9986 )SNAMET
-      STOP
+      CALL ABORT
   70 LTEST( I ) = LTESTT
      GO TO 50
 *
@ -279,7 +279,7 @@
      SAME = LSE( YY, YT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
      TRANS = 'T'
      CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
@ -287,7 +287,7 @@
      SAME = LSE( YY, YT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
 *
 *     Test each subroutine in turn.
@ -414,7 +414,9 @@
      IF( TRACE )
     $   CLOSE ( NTRA )
      CLOSE ( NOUT )
-      STOP
+      IF( FATAL ) THEN
+         CALL ABORT
+      END IF
 *
 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )
--- a/ctest/c_sblat3.f
+++ b/ctest/c_sblat3.f
@ -10,7 +10,7 @@
 *  'SBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
 *  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
 *  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-*  F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*  F        LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
 *  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 *  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 *  16.0     THRESHOLD VALUE OF TEST RATIO
@ -188,7 +188,7 @@
     $      GO TO 50
   40 CONTINUE
      WRITE( NOUT, FMT = 9990 )SNAMET
-      STOP
+      CALL ABORT
   50 LTEST( I ) = LTESTT
      GO TO 30
 *
@ -231,7 +231,7 @@
      SAME = LSE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
      TRANSB = 'T'
      CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -240,7 +240,7 @@
      SAME = LSE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
      DO 120 J = 1, N
         AB( J, NMAX + 1 ) = N - J + 1
@ -258,7 +258,7 @@
      SAME = LSE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
      TRANSB = 'T'
      CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -267,7 +267,7 @@
      SAME = LSE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
 *
 *     Test each subroutine in turn.
@ -378,7 +378,9 @@
      IF( TRACE )
     $   CLOSE ( NTRA )
      CLOSE ( NOUT )
-      STOP
+      IF( FATAL ) THEN
+         CALL ABORT
+      END IF
 *
 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )
--- a/ctest/c_zblat1.f
+++ b/ctest/c_zblat1.f
@ -38,9 +38,12 @@
            CALL CHECK1(SFAC)
         END IF
 *        -- Print
-         IF (PASS) WRITE (NOUT,99998)
+         IF (PASS) THEN
+            WRITE (NOUT,99998)
+         ELSE
+            CALL ABORT
+        END IF
   20 CONTINUE
-      STOP
 *
 99999 FORMAT (' Complex CBLAS Test Program Results',/1X)
 99998 FORMAT ('                                    ----- PASS -----')
@ -228,7 +231,7 @@
               CALL ITEST1(IZAMAXTEST(N,CX,INCX),ITRUE3(NP1))
            ELSE
               WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
-               STOP
+               CALL ABORT
            END IF
 *
   40    CONTINUE
@ -512,7 +515,7 @@
               CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0)
            ELSE
               WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
-               STOP
+               CALL ABORT
            END IF
 *
   40    CONTINUE
--- a/ctest/c_zblat1c.c
+++ b/ctest/c_zblat1c.c
@ -380,7 +380,7 @@ static doublereal c_b43 = 1.;
    static integer i__;
    extern /* Subroutine */ int ctest_(integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*);
    static doublecomplex mwpcs[5], mwpct[5];
-    extern /* Subroutine */ int zscaltest_(integer*, doublereal*, doublecomplex*, integer*), itest1_(integer*, integer*), stest1_(doublereal*, doublereal*, doublereal*, doublereal*);
+    extern /* Subroutine */ int zscaltest_(integer*, doublecomplex*, doublecomplex*, integer*), itest1_(integer*, integer*), stest1_(doublereal*, doublereal*, doublereal*, doublereal*);
    static doublecomplex cx[8];
    extern doublereal dznrm2test_(integer*, doublecomplex*, integer*);
    static integer np1;
@ -595,7 +595,7 @@ static doublereal c_b43 = 1.;
    static integer ki;
    extern /* Subroutine */ int zdotutest_(integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*), zswaptest_(integer*, doublecomplex*, integer*, doublecomplex*, integer*);
    static integer kn;
-    extern /* Subroutine */ int zaxpytest_(integer*, doublereal*, doublecomplex*, integer*, doublecomplex*, integer*);
+    extern /* Subroutine */ int zaxpytest_(integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*);
    static doublecomplex cx[7], cy[7];
    static integer mx, my;

--- a/ctest/c_zblat2.f
+++ b/ctest/c_zblat2.f
@ -10,7 +10,7 @@
 *  'CBLAT2.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
 *  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
 *  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-*  F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*  F        LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
 *  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 *  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 *  16.0     THRESHOLD VALUE OF TEST RATIO
@ -243,7 +243,7 @@
     $      GO TO 70
   60 CONTINUE
      WRITE( NOUT, FMT = 9986 )SNAMET
-      STOP
+      CALL ABORT
   70 LTEST( I ) = LTESTT
      GO TO 50
 *
@ -283,7 +283,7 @@
      SAME = LZE( YY, YT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
      TRANS = 'T'
      CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
@ -291,7 +291,7 @@
      SAME = LZE( YY, YT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
 *
 *     Test each subroutine in turn.
@ -418,7 +418,9 @@
      IF( TRACE )
     $   CLOSE ( NTRA )
      CLOSE ( NOUT )
-      STOP
+      IF( FATAL ) THEN
+         CALL ABORT
+      END IF
 *
 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )
--- a/ctest/c_zblat3.f
+++ b/ctest/c_zblat3.f
@ -10,7 +10,7 @@
 *  'CBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
 *  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
 *  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-*  F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*  F        LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
 *  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 *  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 *  16.0     THRESHOLD VALUE OF TEST RATIO
@ -195,7 +195,7 @@
     $      GO TO 50
   40 CONTINUE
      WRITE( NOUT, FMT = 9990 )SNAMET
-      STOP
+      CALL ABORT
   50 LTEST( I ) = LTESTT
      GO TO 30
 *
@ -238,7 +238,7 @@
      SAME = LZE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
      TRANSB = 'C'
      CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -247,7 +247,7 @@
      SAME = LZE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
      DO 120 J = 1, N
         AB( J, NMAX + 1 ) = N - J + 1
@ -265,7 +265,7 @@
      SAME = LZE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
      TRANSB = 'C'
      CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -274,7 +274,7 @@
      SAME = LZE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
 *
 *     Test each subroutine in turn.
@ -386,7 +386,9 @@
      IF( TRACE )
     $   CLOSE ( NTRA )
      CLOSE ( NOUT )
-      STOP
+      IF( FATAL ) THEN
+         CALL ABORT
+      END IF
 *
 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' )
--- a/ctest/c_zblat3_3m.f
+++ b/ctest/c_zblat3_3m.f
@ -10,7 +10,7 @@
 *  'CBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
 *  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
 *  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-*  F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*  F        LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
 *  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 *  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 *  16.0     THRESHOLD VALUE OF TEST RATIO
@ -195,7 +195,7 @@
     $      GO TO 50
   40 CONTINUE
      WRITE( NOUT, FMT = 9990 )SNAMET
-      STOP
+      CALL ABORT
   50 LTEST( I ) = LTESTT
      GO TO 30
 *
@ -238,7 +238,7 @@
      SAME = LZE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
      TRANSB = 'C'
      CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -247,7 +247,7 @@
      SAME = LZE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
      DO 120 J = 1, N
         AB( J, NMAX + 1 ) = N - J + 1
@ -265,7 +265,7 @@
      SAME = LZE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
      TRANSB = 'C'
      CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -274,7 +274,7 @@
      SAME = LZE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
-         STOP
+         CALL ABORT
      END IF
 *
 *     Test each subroutine in turn.
@ -386,7 +386,9 @@
      IF( TRACE )
     $   CLOSE ( NTRA )
      CLOSE ( NOUT )
-      STOP
+      IF( FATAL ) THEN
+         CALL ABORT
+      END IF
 *
 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' )
--- a/docs/about.md
+++ b/docs/about.md
@ -2,25 +2,45 @@

 We have a [GitHub discussions](https://github.com/OpenMathLib/OpenBLAS/discussions/) forum to discuss usage and development of OpenBLAS. We also have a [Google group for *users*](https://groups.google.com/forum/#!forum/openblas-users) and a [Google group for *development of*](https://groups.google.com/forum/#!forum/openblas-dev) OpenBLAS. 

-## Donations
-
-You can read OpenBLAS statement of receipts and disbursement and cash balance on [google doc](https://docs.google.com/spreadsheet/ccc?key=0AghkTjXe2lDndE1UZml0dGpaUzJmZGhvenBZd1F2R1E&usp=sharing). A backer list is available [on GitHub](https://github.com/OpenMathLib/OpenBLAS/blob/develop/BACKERS.md).
-
-We welcome the hardware donation, including the latest CPU and boards.
-
 ## Acknowledgements

-This work is partially supported by
+This work was or is partially supported by the following grants, contracts and institutions:
+
 * Research and Development of Compiler System and Toolchain for Domestic CPU, National S&T Major Projects: Core Electronic Devices, High-end General Chips and Fundamental Software (No.2009ZX01036-001-002)
 * National High-tech R&D Program of China (Grant No.2012AA010903)
+* [PerfXLab](http://www.perfxlab.com/)
+* Chan Zuckerberg Initiative's Essential Open Source Software for Science program:
+    * Cycle 1 grant: [Strengthening NumPy's foundations - growing beyond code](https://figshare.com/articles/journal_contribution/Proposal_NumPy_OpenBLAS_for_Chan_Zuckerberg_Initiative_EOSS_2019_round_1/10302167) (2019-2020)
+    * Cycle 3 grant: [Improving usability and sustainability for NumPy and OpenBLAS](https://chanzuckerberg.com/eoss/proposals/improving-usability-and-sustainability-for-numpy-and-openblas/) (2020-2021)
+* Sovereign Tech Fund funding: [Keeping high performance linear algebra computation accessible and open for all](https://www.sovereigntechfund.de/tech/openblas) (2023-2024)

-## Users of OpenBLAS
+Over the course of OpenBLAS development, a number of donations were received.
+You can read OpenBLAS's statement of receipts and disbursement and cash balance in
+[this Google doc](https://docs.google.com/spreadsheet/ccc?key=0AghkTjXe2lDndE1UZml0dGpaUzJmZGhvenBZd1F2R1E&usp=sharing) (covers 2013-2016).
+A list of backers is available [in BACKERS.md](https://github.com/OpenMathLib/OpenBLAS/blob/develop/BACKERS.md) in the main repo.

-* <a href='http://julialang.org/'>Julia - a high-level, high-performance dynamic programming language for technical computing</a><br />
-* Ceemple v1.0.3 (C++ technical computing environment), including OpenBLAS, Qt, Boost, OpenCV and others. The only solution with immediate-recompilation of C++ code. Available from <a href='http://www.ceemple.com'>Ceemple C++ Technical Computing</a>.
-* [netlib-java](https://github.com/fommil/netlib-java) and various upstream libraries, allowing OpenBLAS to be used from languages on the Java Virtual Machine.
+### Donations
+
+We welcome hardware donations, including the latest CPUs and motherboards.
+
+
+## Open source users of OpenBLAS
+
+Prominent open source users of OpenBLAS include:
+
+* [Julia](https://julialang.org) - a high-level, high-performance dynamic programming language for technical computing
+* [NumPy](https://numpy.org) - the fundamental package for scientific computing with Python
+* [SciPy](https://scipy.org) - fundamental algorithms for scientific computing in Python
+* [R](https://www.r-project.org/) - a free software environment for statistical computing and graphics
+* [OpenCV](https://opencv.org/) - the world's biggest computer vision library
+
+OpenBLAS is packaged in most major Linux distros, as well as general and
+numerical computing-focused packaging ecosystems like Nix, Homebrew, Spack and
+conda-forge.
+
+OpenBLAS is used directly by libraries written in C, C++ and Fortran (and
+probably other languages), and directly by end users in those languages.

-<!-- TODO: academia users, industry users, hpc centers deployed openblas, etc. -->

 ## Publications

--- a/docs/build_system.md
+++ b/docs/build_system.md
@ -1,3 +1,8 @@
+This page describes the Make-based build, which is the default/authoritative
+build method. Note that the OpenBLAS repository also supports building with
+CMake (not described here) - that generally works and is tested; however, there
+may be small differences between the Make and CMake builds.
+
 !!! warning
    This page is made by someone who is not the developer and should not be considered as an official documentation of the build system. For getting the full picture, it is best to read the Makefiles and understand them yourself.

@ -95,10 +100,21 @@ NUM_PARALLEL    - define this to the number of OpenMP instances that your code m
 ```


-OpenBLAS uses a fixed set of memory buffers internally, used for communicating and compiling partial results from individual threads.
-For efficiency, the management array structure for these buffers is sized at build time - this makes it necessary to know in advance how
-many threads need to be supported on the target system(s). 
-With OpenMP, there is an additional level of complexity as there may be calls originating from a parallel region in the calling program. If OpenBLAS gets called from a single parallel region, it runs single-threaded automatically to avoid overloading the system by fanning out its own set of threads. 
-In the case that an OpenMP program makes multiple calls from independent regions or instances in parallel, this default serialization is not
-sufficient as the additional caller(s) would compete for the original set of buffers already in use by the first call.
-So if multiple OpenMP runtimes call into OpenBLAS at the same time, then only one of them will be able to make progress while all the rest of them spin-wait for the one available buffer. Setting NUM_PARALLEL to the upper bound on the number of OpenMP runtimes that you can have in a process ensures that there are a sufficient number of buffer sets available
+OpenBLAS uses a fixed set of memory buffers internally, used for communicating
+and compiling partial results from individual threads. For efficiency, the
+management array structure for these buffers is sized at build time - this
+makes it necessary to know in advance how many threads need to be supported on
+the target system(s).
+
+With OpenMP, there is an additional level of complexity as there may be calls
+originating from a parallel region in the calling program. If OpenBLAS gets
+called from a single parallel region, it runs single-threaded automatically to
+avoid overloading the system by fanning out its own set of threads. In the case
+that an OpenMP program makes multiple calls from independent regions or
+instances in parallel, this default serialization is not sufficient as the
+additional caller(s) would compete for the original set of buffers already in
+use by the first call. So if multiple OpenMP runtimes call into OpenBLAS at the
+same time, then only one of them will be able to make progress while all the
+rest of them spin-wait for the one available buffer. Setting `NUM_PARALLEL` to
+the upper bound on the number of OpenMP runtimes that you can have in a process
+ensures that there are a sufficient number of buffer sets available.
--- a/docs/developers.md
+++ b/docs/developers.md
@ -1,6 +1,6 @@
 # Developer manual

-## Source codes Layout
+## Source code layout

 ```
 OpenBLAS/  
@ -51,8 +51,7 @@ OpenBLAS/

 ```

-A call tree for `dgemm` is as following.
-
+A call tree for `dgemm` looks as follows:
 ```
 interface/gemm.c
        │
@ -61,10 +60,9 @@ driver/level3/level3.c
 gemm assembly kernels at kernel/
 ```

-To find the kernel currently used for a particular supported cpu, please check the corresponding `kernel/$(ARCH)/KERNEL.$(CPU)` file.
-
-Here is an example for `kernel/x86_64/KERNEL.HASWELL`
+To find the kernel currently used for a particular supported CPU, please check the corresponding `kernel/$(ARCH)/KERNEL.$(CPU)` file.

+Here is an example for `kernel/x86_64/KERNEL.HASWELL`:
 ```
 ...
 DTRMMKERNEL    =  dtrmm_kernel_4x8_haswell.c
@ -73,71 +71,122 @@ DGEMMKERNEL    =  dgemm_kernel_4x8_haswell.S
 ```
 According to the above `KERNEL.HASWELL`, OpenBLAS Haswell dgemm kernel file is `dgemm_kernel_4x8_haswell.S`.

+
 ## Optimizing GEMM for a given hardware

-Read the Goto paper to understand the algorithm.
+!!! abstract "Read the Goto paper to understand the algorithm"

-Goto, Kazushige; van de Geijn, Robert A. (2008). ["Anatomy of High-Performance Matrix Multiplication"](http://delivery.acm.org/10.1145/1360000/1356053/a12-goto.pdf?ip=155.68.162.54&id=1356053&acc=ACTIVE%20SERVICE&key=A79D83B43E50B5B8%2EF070BBE7E45C3F17%2E4D4702B0C3E38B35%2E4D4702B0C3E38B35&__acm__=1517932837_edfe766f1e295d9a7830812371e1d173). ACM Transactions on Mathematical Software 34 (3): Article 12
-(The above link is available only to ACM members, but this and many related papers is also available on the pages
-of van de Geijn's FLAME project, http://www.cs.utexas.edu/~flame/web/FLAMEPublications.html )
+    Goto, Kazushige; van de Geijn, Robert A. (2008).
+    ["Anatomy of High-Performance Matrix Multiplication"](http://delivery.acm.org/10.1145/1360000/1356053/a12-goto.pdf?ip=155.68.162.54&id=1356053&acc=ACTIVE%20SERVICE&key=A79D83B43E50B5B8%2EF070BBE7E45C3F17%2E4D4702B0C3E38B35%2E4D4702B0C3E38B35&__acm__=1517932837_edfe766f1e295d9a7830812371e1d173).
+    ACM Transactions on Mathematical Software 34 (3): Article 12

-The `driver/level3/level3.c` is the implementation of Goto's algorithm. Meanwhile, you can look at `kernel/generic/gemmkernel_2x2.c`, which is a naive `2x2` register blocking gemm kernel in C.
+    (The above link is available only to ACM members, but this and many related
+    papers are also available on [the pages of van de Geijn's FLAME project](http://www.cs.utexas.edu/~flame/web/FLAMEPublications.html).)

-Then,
-* Write optimized assembly kernels. consider instruction pipeline, available registers, memory/cache accessing
-* Tuning cache block size, `Mc`, `Kc`, and `Nc` 
+The `driver/level3/level3.c` is the implementation of Goto's algorithm.
+Meanwhile, you can look at `kernel/generic/gemmkernel_2x2.c`, which is a naive
+`2x2` register blocking `gemm` kernel in C. Then:

-Note that not all of the cpu-specific parameters in param.h are actively used in algorithms. DNUMOPT only appears as a scale factor in profiling output of the level3 syrk interface code, while its counterpart SNUMOPT (aliased as NUMOPT in common.h) is not used anywhere at all. 
-SYMV_P is only used in the generic kernels for the symv and chemv/zhemv functions - at least some of those are usually overridden by cpu-specific implementations, so if you start by cloning the existing implementation for a related cpu you need to check its KERNEL file to see if tuning SYMV_P would have any effect at all. 
-GEMV_UNROLL is only used by some older x86_64 kernels, so not all sections in param.h define it.
-Similarly, not all of the cpu parameters like L2 or L3 cache sizes are necessarily used in current kernels for a given model - by all indications the cpu identification code was imported from some other project originally.
+* Write optimized assembly kernels. Consider instruction pipeline, available registers, memory/cache access.
+* Tune cache block sizes (`Mc`, `Kc`, and `Nc`)

-## Run OpenBLAS Test
+Note that not all of the CPU-specific parameters in `param.h` are actively used in algorithms.
+`DNUMOPT` only appears as a scale factor in profiling output of the level3 `syrk` interface code,
+while its counterpart `SNUMOPT` (aliased as `NUMOPT` in `common.h`) is not used anywhere at all. 

-We use netlib blas test, cblas test, and LAPACK test. Meanwhile, we use [BLAS-Tester](https://github.com/xianyi/BLAS-Tester), a modified test tool from ATLAS.
+`SYMV_P` is only used in the generic kernels for the `symv` and `chemv`/`zhemv` functions -
+at least some of those are usually overridden by CPU-specific implementations, so if you start
+by cloning the existing implementation for a related CPU you need to check its `KERNEL` file
+to see if tuning `SYMV_P` would have any effect at all.

-* Run `test` and `ctest` at OpenBLAS. e.g. `make test` or `make ctest`.
-* Run regression test `utest` at OpenBLAS.
-* Run LAPACK test. e.g. `make lapack-test`.
-* Clone [BLAS-Tester](https://github.com/xianyi/BLAS-Tester), which can compare the OpenBLAS result with netlib reference BLAS.
+`GEMV_UNROLL` is only used by some older x86-64 kernels, so not all sections in `param.h` define it.
+Similarly, not all of the CPU parameters like L2 or L3 cache sizes are necessarily used in current
+kernels for a given model - by all indications the CPU identification code was imported from some
+other project originally.
+
+
+## Running OpenBLAS tests
+
+We use tests for Netlib BLAS, CBLAS, and LAPACK. In addition, we use
+OpenBLAS-specific regression tests. They can be run with Make:
+
+* `make -C test` for BLAS tests
+* `make -C ctest` for CBLAS tests
+* `make -C utest` for OpenBLAS regression tests
+* `make lapack-test` for LAPACK tests
+
+We also use the [BLAS-Tester](https://github.com/xianyi/BLAS-Tester) tests for regression testing.
+It is basically the ATLAS test suite adapted for building with OpenBLAS.
+
+The project makes use of several Continuous Integration (CI) services
+conveniently interfaced with GitHub to automatically run tests on a number of
+platforms and build configurations.
+
+Also note that the test suites included with "numerically heavy" projects like
+Julia, NumPy, SciPy, Octave or QuantumEspresso can be used for regression
+testing, when those projects are built such that they use OpenBLAS.

-The project makes use of several Continuous Integration (CI) services conveniently interfaced with github to automatically check compilability on a number of platforms.
-Lastly, the testsuites included with "numerically heavy" projects like Julia, NumPy, Octave or QuantumEspresso can be used for regression testing.

 ## Benchmarking

-Several simple C benchmarks for performance testing individual BLAS functions are available in the `benchmark` folder, and its `scripts` subdirectory contains corresponding versions for Python, Octave and R.
-Other options include
+A number of benchmarking methods are used by OpenBLAS:

-* https://github.com/RoyiAvital/MatlabJuliaMatrixOperationsBenchmark (various matrix operations in Julia and Matlab)
-* https://github.com/mmperf/mmperf/ (single-core matrix multiplication)
+- Several simple C benchmarks for performance testing individual BLAS functions
+  are available in the `benchmark` folder. They can be run locally through the
+  `Makefile` in that directory. And the `benchmark/scripts` subdirectory
+  contains similar benchmarks that use OpenBLAS via NumPy, SciPy, Octave and R.
+- On pull requests, a representative set of functions is tested for performance
+  regressions with Codspeed; results can be viewed at
+  [https://codspeed.io/OpenMathLib/OpenBLAS](https://codspeed.io/OpenMathLib/OpenBLAS).
+- The [OpenMathLib/BLAS-Benchmarks](https://github.com/OpenMathLib/BLAS-Benchmarks) repository
+  contains an [Airspeed Velocity](https://github.com/airspeed-velocity/asv/)-based benchmark
+  suite which is run on several CPU architectures in cron jobs. Results are published
+  to a dashboard: [http://www.openmathlib.org/BLAS-Benchmarks/](http://www.openmathlib.org/BLAS-Benchmarks/).

-## Adding autodetection support for a new revision or variant of a supported cpu 
+Benchmarking code for BLAS libraries, and specific performance analysis results, can be found
+in a number of places. For example:

-Especially relevant for x86_64, a new cpu model may be a "refresh" (die shrink and/or different number of cores) within an existing
-model family without significant changes to its instruction set. (e.g. Intel Skylake, Kaby Lake etc. still are fundamentally Haswell,
-low end Goldmont etc. are Nehalem). In this case, compilation with the appropriate older TARGET will already lead to a satisfactory build.
+* [MatlabJuliaMatrixOperationsBenchmark](https://github.com/RoyiAvital/MatlabJuliaMatrixOperationsBenchmark)
+  (various matrix operations in Julia and Matlab)
+* [mmperf/mmperf](https://github.com/mmperf/mmperf/) (single-core matrix multiplication)
+
+
+## Adding autodetection support for a new revision or variant of a supported CPU 
+
+Especially relevant for x86-64, a new CPU model may be a "refresh" (die shrink and/or different number of cores) within an existing
+model family without significant changes to its instruction set (e.g., Intel Skylake and Kaby Lake are still fundamentally the same architecture as Haswell,
+and low-end Goldmont etc. are Nehalem). In this case, compilation with the appropriate older `TARGET` will already lead to a satisfactory build.

 To achieve autodetection of the new model, its CPUID (or an equivalent identifier) needs to be added in the `cpuid_<architecture>.c`
-relevant for its general architecture, with the returned name for the new type set appropriately. For x86 which has the most complex
-cpuid file, there are two functions that need to be edited - get_cpuname() to return e.g. CPUTYPE_HASWELL and get_corename() for the (broader)
-core family returning e.g. CORE_HASWELL. (This information ends up in the Makefile.conf and config.h files generated by `getarch`. Failure to
-set either will typically lead to a missing definition of the GEMM_UNROLL parameters later in the build, as `getarch_2nd` will be unable to
-find a matching parameter section in param.h.)
+relevant for its general architecture, with the returned name for the new type set appropriately. For x86, which has the most complex
+`cpuid` file, there are two functions that need to be edited: `get_cpuname()` to return, e.g., `CPUTYPE_HASWELL` and `get_corename()` for the (broader)
+core family returning, e.g., `CORE_HASWELL`.[^1]

-For architectures where "DYNAMIC_ARCH" builds are supported, a similar but simpler code section for the corresponding runtime detection of the cpu exists in `driver/others/dynamic.c` (for x86) and `driver/others/dynamic_<arch>.c` for other architectures.  
+[^1]:
+    This information ends up in the `Makefile.conf` and `config.h` files generated by `getarch`. Failure to
+    set either will typically lead to a missing definition of the `GEMM_UNROLL` parameters later in the build,
+    as `getarch_2nd` will be unable to find a matching parameter section in `param.h`.
+
+For architectures where `DYNAMIC_ARCH` builds are supported, a similar but simpler code section for the corresponding
+runtime detection of the CPU exists in `driver/others/dynamic.c` (for x86), and `driver/others/dynamic_<arch>.c` for other architectures.
 Note that for x86 the CPUID is compared after splitting it into its family, extended family, model and extended model parts, so the single decimal
-number returned by Linux in /proc/cpuinfo for the model has to be converted back to hexadecimal before splitting into its constituent
-digits, e.g. 142 = 8E , translates to extended model 8, model 14.
+number returned by Linux in `/proc/cpuinfo` for the model has to be converted back to hexadecimal before splitting into its constituent
+digits. For example, decimal `142` is hexadecimal `8E`, which translates to extended model 8, model 14.
 
-## Adding dedicated support for a new cpu model

-Usually it will be possible to start from an existing model, clone its KERNEL configuration file to the new name to use for this TARGET and eventually replace individual kernels with versions better suited for peculiarities of the new cpu model. In addition, it is necessary to add
-(or clone at first) the corresponding section of GEMM_UNROLL parameters in the toplevel param.h, and possibly to add definitions such as USE_TRMM
-(governing whether TRMM functions use the respective GEMM kernel or a separate source file) to the Makefiles (and CMakeLists.txt) in the kernel
-directory. The new cpu name needs to be added to TargetLists.txt and the cpu autodetection code used by the `getarch` helper program - contained in
+## Adding dedicated support for a new CPU model
+
+Usually it will be possible to start from an existing model, clone its `KERNEL` configuration file to the new name to use for this
+`TARGET` and eventually replace individual kernels with versions better suited for peculiarities of the new CPU model.
+In addition, it is necessary to add (or clone at first) the corresponding section of `GEMM_UNROLL` parameters in the top-level `param.h`,
+and possibly to add definitions such as `USE_TRMM` (governing whether `TRMM` functions use the respective `GEMM` kernel or a separate source file)
+to the `Makefile`s (and `CMakeLists.txt`) in the kernel directory. The new CPU name needs to be added to `TargetList.txt`,
+and the CPU auto-detection code used by the `getarch` helper program - contained in
 the `cpuid_<architecture>.c` file amended to include the CPUID (or equivalent) information processing required (see preceding section).

+
 ## Adding support for an entirely new architecture

-This endeavour is best started by cloning the entire support structure for 32bit ARM, and within that the ARMV5 cpu in particular as this is implemented through plain C kernels only. An example providing a convenient "shopping list" can be seen in pull request #1526.
+This endeavour is best started by cloning the entire support structure for 32-bit ARM, and within that the ARMv5 CPU in particular,
+as this is implemented through plain C kernels only. An example providing a convenient "shopping list" can be seen in pull request
+[#1526](https://github.com/OpenMathLib/OpenBLAS/pull/1526).
--- a/docs/distributing.md
+++ b/docs/distributing.md
@ -49,7 +49,7 @@ settings):
   to provide an ILP64 interface build as well, use a symbol suffix to avoid
   symbol name clashes (see the next section).

-[^1] All major distributions do include LAPACK as of mid 2023 as far as we
+[^1]: All major distributions do include LAPACK as of mid 2023 as far as we
 know. Older versions of Arch Linux did not, and that was known to cause
 problems.

--- a/docs/extensions.md
+++ b/docs/extensions.md
@ -1,4 +1,9 @@
-* BLAS-like extensions
+OpenBLAS for the most part contains implementations of the reference (Netlib)
+BLAS, CBLAS, LAPACK and LAPACKE interfaces. A few OpenBLAS-specific functions
+are also provided, however, which can mostly be seen as "BLAS extensions".
+This page documents those non-standard APIs.
+
+## BLAS-like extensions

 | Routine       | Data Types    | Description     |
 | ------------- |:------------- | :---------------|
@ -9,20 +14,26 @@
 | ?geadd        | s,d,c,z       | matrix add   |
 | ?gemmt        | s,d,c,z       | gemm but only a triangular part updated|

-* BLAS-like and Conversion functions for bfloat16 (available when OpenBLAS was compiled with BUILD_BFLOAT16=1)
-  * `void cblas_sbstobf16` converts a float array to an array of bfloat16 values by rounding
-  * `void cblas_sbdtobf16` converts a double array to an array of bfloat16 values by rounding
-  * `void cblas_sbf16tos` converts a bfloat16 array to an array of floats
-  * `void cblas_dbf16tod` converts a bfloat16 array to an array of doubles
-  * `float cblas_sbdot` computes the dot product of two bfloat16 arrays
-  * `void cblas_sbgemv` performs the matrix-vector operations of GEMV with the input matrix and X vector as bfloat16  
-  * `void cblas_sbgemm` performs the matrix-matrix operations of GEMM with both input arrays containing bfloat16

-* Utility functions
-  * openblas_get_num_threads
-  * openblas_set_num_threads
-  * `int openblas_get_num_procs(void)` returns the number of processors available on the system (may include "hyperthreading cores")
-  * `int openblas_get_parallel(void)` returns 0 for sequential use, 1 for platform-based threading and 2 for OpenMP-based threading
-  * `char * openblas_get_config()` returns the options OpenBLAS was built with, something like `NO_LAPACKE DYNAMIC_ARCH NO_AFFINITY Haswell`
-  * `int openblas_set_affinity(int thread_index, size_t cpusetsize, cpu_set_t *cpuset)` sets the cpu affinity mask of the given thread to the provided cpuset. (Only available under Linux, with semantics identical to pthread_setaffinity_np)
+## bfloat16 functionality
+
+BLAS-like and conversion functions for `bfloat16` (available when OpenBLAS was compiled with `BUILD_BFLOAT16=1`):
+
+* `void cblas_sbstobf16` converts a float array to an array of bfloat16 values by rounding
+* `void cblas_sbdtobf16` converts a double array to an array of bfloat16 values by rounding
+* `void cblas_sbf16tos` converts a bfloat16 array to an array of floats
+* `void cblas_dbf16tod` converts a bfloat16 array to an array of doubles
+* `float cblas_sbdot` computes the dot product of two bfloat16 arrays
+* `void cblas_sbgemv` performs the matrix-vector operations of GEMV with the input matrix and X vector as bfloat16
+* `void cblas_sbgemm` performs the matrix-matrix operations of GEMM with both input arrays containing bfloat16
+
+## Utility functions
+
+* `openblas_get_num_threads`
+* `openblas_set_num_threads`
+* `int openblas_get_num_procs(void)` returns the number of processors available on the system (may include "hyperthreading cores")
+* `int openblas_get_parallel(void)` returns 0 for sequential use, 1 for platform-based threading and 2 for OpenMP-based threading
+* `char * openblas_get_config()` returns the options OpenBLAS was built with, something like `NO_LAPACKE DYNAMIC_ARCH NO_AFFINITY Haswell`
+* `int openblas_set_affinity(int thread_index, size_t cpusetsize, cpu_set_t *cpuset)` sets the CPU affinity mask of the given thread
+  to the provided cpuset. Only available on Linux, with semantics identical to `pthread_setaffinity_np`.

--- a/docs/install.md
+++ b/docs/install.md
--- a/docs/user_manual.md
+++ b/docs/user_manual.md
@ -1,70 +1,174 @@
-## Compile the library
+
+This user manual covers compiling OpenBLAS itself, linking your code to OpenBLAS,
+example code to use the C (CBLAS) and Fortran (BLAS) APIs, and some troubleshooting
+tips. Compiling OpenBLAS is optional, since you may be able to install with a
+package manager.
+
+!!! note "BLAS API reference documentation"
+
+    The OpenBLAS documentation does not contain API reference documentation for
+    BLAS or LAPACK, since these are standardized APIs, the documentation for
+    which can be found in other places. If you want to understand every BLAS
+    and LAPACK function and definition, we recommend reading the
+    [Netlib BLAS](http://netlib.org/blas/) and [Netlib LAPACK](http://netlib.org/lapack/)
+    documentation.
+
+    OpenBLAS does contain a limited number of functions that are non-standard;
+    these are documented at [OpenBLAS extension functions](extensions.md).
+
+
+## Compiling OpenBLAS
+
 ### Normal compile
-  * type `make` to detect the CPU automatically.
-  or
-  * type `make TARGET=xxx` to set target CPU, e.g. `make TARGET=NEHALEM`. The full target list is in file TargetList.txt.
+
+The default way to build and install OpenBLAS from source is with Make:
+```
+make  # add `-j4` to compile in parallel with 4 processes
+make install
+```
+
+By default, the CPU architecture is detected automatically when invoking
+`make`, and the build is optimized for the detected CPU. To override the
+autodetection, use the `TARGET` flag:
+
+```
+# `make TARGET=xxx` sets target CPU: e.g. for an Intel Nehalem CPU:
+make TARGET=NEHALEM
+```
+The full list of known target CPU architectures can be found in
+`TargetList.txt` in the root of the repository.

 ### Cross compile
-Please set `CC` and `FC` with the cross toolchains. Then, set `HOSTCC` with your host C compiler. At last, set `TARGET` explicitly.

-Examples:
+For a basic cross-compilation with Make, three steps need to be taken:

-* On x86 box, compile the library for ARM Cortex-A9 linux.
+- Set the `CC` and `FC` environment variables to select the cross toolchains
+  for C and Fortran.
+- Set the `HOSTCC` environment variable to select the host C compiler (i.e. the
+  regular C compiler for the machine on which you are invoking the build).
+- Set `TARGET` explicitly to the CPU architecture on which the produced
+  OpenBLAS binaries will be used.

-Install only gnueabihf versions. Please check https://github.com/xianyi/OpenBLAS/issues/936#issuecomment-237596847
+#### Cross-compilation examples

-    make CC=arm-linux-gnueabihf-gcc FC=arm-linux-gnueabihf-gfortran HOSTCC=gcc TARGET=CORTEXA9
-
-* On X86 box, compile this library for loongson3a CPU.
+Compile the library for ARM Cortex-A9 Linux on an x86-64 machine
+_(note: install only `gnueabihf` versions of the cross toolchain - see
+[this issue comment](https://github.com/OpenMathLib/OpenBLAS/issues/936#issuecomment-237596847)
+for why_):
+```
+make CC=arm-linux-gnueabihf-gcc FC=arm-linux-gnueabihf-gfortran HOSTCC=gcc TARGET=CORTEXA9
+```

+Compile OpenBLAS for a loongson3a CPU on an x86-64 machine:
 ```
 make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
 ```

-* On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler.
-
+Compile OpenBLAS for loongson3a CPU with the `loongcc` (based on Open64) compiler on an x86-64 machine:
 ```
 make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu-   NO_LAPACKE=1 NO_SHARED=1 BINARY=32
 ```

-### Debug version
+### Building a debug version

-    make DEBUG=1
+Add `DEBUG=1` to your build command, e.g.:
+```
+make DEBUG=1
+```

-### Install to the directory (optional)
+### Install to a specific directory

-Example:
+!!! note

-    make install PREFIX=your_installation_directory
+    Installing to a directory is optional; it is also possible to use the shared or static
+    libraries directly from the build directory.

-The default directory is /opt/OpenBLAS. Note that any flags passed to `make` during build should also be passed to `make install` to circumvent any install errors, i.e. some headers not being copied over correctly.
+Use `make install` with the `PREFIX` flag to install to a specific directory:

-For more information, please read [Installation Guide](install.md).
+```
+make install PREFIX=/path/to/installation/directory
+```

-## Link the library
+The default directory is `/opt/OpenBLAS`.

-* Link shared library
+!!! important

+    Note that any flags passed to `make` during build should also be passed to
+    `make install` to circumvent any install errors, e.g. some headers not
+    being copied over correctly.
+
+For more detailed information on building/installing from source, please read
+the [Installation Guide](install.md).
+
+
+## Linking to OpenBLAS
+
+OpenBLAS can be used as a shared or a static library.
+
+### Link a shared library
+
+The shared library is normally called `libopenblas.so`, but note that the name
+may be different as a result of build flags used or naming choices by a distro
+packager (see [distributing.md](distributing.md) for details). To link a shared library named
+`libopenblas.so`, the flag `-lopenblas` is needed. To find the OpenBLAS headers,
+a `-I/path/to/includedir` is needed. And unless the library is installed in a
+directory that the linker searches by default, also `-L` and `-Wl,-rpath` flags
+are needed. For a source file `test.c` (e.g., the example code under _Call
+CBLAS interface_ further down), the shared library can then be linked with:
 ```
 gcc -o test test.c -I/your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -Wl,-rpath,/your_path/OpenBLAS/lib -lopenblas
 ```

-The `-Wl,-rpath,/your_path/OpenBLAS/lib` option to linker can be omitted if you ran `ldconfig` to update linker cache, put `/your_path/OpenBLAS/lib` in `/etc/ld.so.conf` or a file in `/etc/ld.so.conf.d`, or installed OpenBLAS in a location part of `ld.so` default search path. Otherwise, linking at runtime will fail.
+The `-Wl,-rpath,/your_path/OpenBLAS/lib` linker flag can be omitted if you
+ran `ldconfig` to update linker cache, put `/your_path/OpenBLAS/lib` in
+`/etc/ld.so.conf` or a file in `/etc/ld.so.conf.d`, or installed OpenBLAS in a
+location that is part of the `ld.so` default search path (usually `/lib`,
+`/usr/lib` and `/usr/local/lib`). Alternatively, you can set the environment
+variable `LD_LIBRARY_PATH` to point to the folder that contains `libopenblas.so`.
+Otherwise, the build may succeed but at runtime loading the library will fail
+with a message like:
+```
+cannot open shared object file: no such file or directory
+```

-If the library is multithreaded, please add `-lpthread`. If the library contains LAPACK functions, please add `-lgfortran` or other Fortran libs, although if you only make calls to LAPACKE routines, i.e. your code has `#include "lapacke.h"` and makes calls to methods like `LAPACKE_dgeqrf`, `-lgfortran` is not needed.
+More flags may be needed, depending on how OpenBLAS was built:

-* Link static library
+- If `libopenblas` is multi-threaded, please add `-lpthread`.
+- If the library contains LAPACK functions (usually also true), please add
+  `-lgfortran` (other Fortran libraries may also be needed, e.g. `-lquadmath`).
+  Note that if you only make calls to LAPACKE routines, i.e. your code has
+  `#include "lapacke.h"` and makes calls to methods like `LAPACKE_dgeqrf`,
+  then `-lgfortran` is not needed.

+!!! tip "Use pkg-config"
+
+    Usually a pkg-config file (e.g., `openblas.pc`) is installed together
+    with a `libopenblas` shared library. pkg-config is a tool that will
+    tell you the exact flags needed for linking. For example:
+
+    ```
+    $ pkg-config --cflags openblas
+    -I/usr/local/include
+    $ pkg-config --libs openblas
+    -L/usr/local/lib -lopenblas
+    ```
+
+### Link a static library
+
+Linking a static library is simpler - add the path to the static OpenBLAS
+library to the compile command:
 ```
 gcc -o test test.c /your/path/libopenblas.a
 ```

-You can download `test.c` from https://gist.github.com/xianyi/5780018 

 ## Code examples

 ### Call CBLAS interface
-This example shows calling cblas_dgemm in C. https://gist.github.com/xianyi/6930656
+
+This example shows calling `cblas_dgemm` in C:
+
+<!-- Source: https://gist.github.com/xianyi/6930656 -->
 ```c
 #include <cblas.h>
 #include <stdio.h>
@ -83,14 +187,17 @@ void main()
 }
 ```

+To compile this file, save it as `test_cblas_dgemm.c` and then run:
 ```
-gcc -o test_cblas_open test_cblas_dgemm.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas -lpthread -lgfortran
+gcc -o test_cblas_open test_cblas_dgemm.c -I/your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas -lpthread -lgfortran
 ```
+This will result in a `test_cblas_open` executable.

 ### Call BLAS Fortran interface

-This example shows calling dgemm Fortran interface in C. https://gist.github.com/xianyi/5780018
+This example shows calling the `dgemm` Fortran interface in C:

+<!-- Source: https://gist.github.com/xianyi/5780018 -->
 ```c
 #include "stdio.h"
 #include "stdlib.h"
@ -158,22 +265,41 @@ int main(int argc, char* argv[])
 }
 ```

+To compile this file, save it as `time_dgemm.c` and then run:
 ```
-gcc -o time_dgemm time_dgemm.c /your/path/libopenblas.a
-./time_dgemm <m> <n> <k>
+gcc -o time_dgemm time_dgemm.c /your/path/libopenblas.a -lpthread
 ```
+You can then run it as: `./time_dgemm <m> <n> <k>`, with `m`, `n`, and `k` input
+parameters to the `time_dgemm` executable.
+
+!!! note
+
+    When calling the Fortran interface from C, you have to deal with symbol name
+    differences caused by compiler conventions. That is why the `dgemm_` function
+    call in the example above has a trailing underscore. This is what it looks like
+    when using `gcc`/`gfortran`; however, such details may differ between
+    compilers, so portable code requires extra support code to handle them. The
+    CBLAS interface may be more portable when writing C code.
+
+    When writing code that needs to be portable and work across different
+    platforms and compilers, the above code example is not recommended for
+    usage. Instead, we advise looking at how OpenBLAS (or BLAS in general, since
+    this problem isn't specific to OpenBLAS) functions are called in widely
+    used projects like Julia, SciPy, or R.
+

 ## Troubleshooting

-* Please read [Faq](faq.md) at first.
-* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
-* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code.
-* The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1.
-* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
-* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell.
-
-## BLAS reference manual
-
-If you want to understand every BLAS function and definition, please read [Intel MKL reference manual](https://software.intel.com/en-us/intel-mkl/documentation) or [netlib.org](http://netlib.org/blas/)
-
-Here are [OpenBLAS extension functions](extensions.md)
+* Please read the [FAQ](faq.md) first; your problem may be described there.
+* Please ensure you are using a recent enough compiler that supports the
+  features your CPU provides (example: GCC versions before 4.6 were known to
+  not support AVX kernels, and versions before 6.1 did not support AVX512CD
+  kernels).
+* The number of CPU cores supported by default is <=256. On Linux x86-64, there
+  is experimental support for up to 1024 cores and 128 NUMA nodes if you build
+  the library with `BIGNUMA=1`.
+* OpenBLAS does not set processor affinity by default. On Linux, you can enable
+  processor affinity by commenting out the line `NO_AFFINITY=1` in
+  `Makefile.rule`.
+* On Loongson 3A, `make test` is known to fail with a `pthread_create` error
+  and an `EAGAIN` error code. However, it will be OK when you run the same
+  testcase in a shell.
--- a/driver/level3/CMakeLists.txt
+++ b/driver/level3/CMakeLists.txt
@ -68,6 +68,8 @@ if (USE_THREAD)
 endif ()

 foreach (float_type ${FLOAT_TYPES})
+  GenerateNamedObjects("gemm_batch_thread.c" "" "gemm_batch_thread" 0 "" "" false ${float_type})
+
  if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
    GenerateCombinationObjects("zherk_kernel.c" "LOWER;CONJ" "U;N" "HERK" 2 "herk_kernel" false ${float_type})
    # TRANS needs to be set/unset when CONJ is set/unset, so can't use it as a combination
--- a/driver/level3/Makefile
+++ b/driver/level3/Makefile
@ -37,7 +37,7 @@ SBLASOBJS	+= \
 	ssyrk_UN.$(SUFFIX) ssyrk_UT.$(SUFFIX) ssyrk_LN.$(SUFFIX) ssyrk_LT.$(SUFFIX) \
 	ssyr2k_UN.$(SUFFIX) ssyr2k_UT.$(SUFFIX) ssyr2k_LN.$(SUFFIX) ssyr2k_LT.$(SUFFIX) \
 	ssyrk_kernel_U.$(SUFFIX)  ssyrk_kernel_L.$(SUFFIX) \
-	ssyr2k_kernel_U.$(SUFFIX) ssyr2k_kernel_L.$(SUFFIX)
+	ssyr2k_kernel_U.$(SUFFIX) ssyr2k_kernel_L.$(SUFFIX) sgemm_batch_thread.$(SUFFIX)

 DBLASOBJS	+= \
 	dgemm_nn.$(SUFFIX) dgemm_nt.$(SUFFIX) dgemm_tn.$(SUFFIX) dgemm_tt.$(SUFFIX) \
@ -53,7 +53,7 @@ DBLASOBJS	+= \
 	dsyrk_UN.$(SUFFIX) dsyrk_UT.$(SUFFIX) dsyrk_LN.$(SUFFIX) dsyrk_LT.$(SUFFIX) \
 	dsyr2k_UN.$(SUFFIX) dsyr2k_UT.$(SUFFIX) dsyr2k_LN.$(SUFFIX) dsyr2k_LT.$(SUFFIX) \
 	dsyrk_kernel_U.$(SUFFIX)  dsyrk_kernel_L.$(SUFFIX) \
-	dsyr2k_kernel_U.$(SUFFIX) dsyr2k_kernel_L.$(SUFFIX)
+	dsyr2k_kernel_U.$(SUFFIX) dsyr2k_kernel_L.$(SUFFIX) dgemm_batch_thread.$(SUFFIX)

 QBLASOBJS	+= \
 	qgemm_nn.$(SUFFIX) qgemm_nt.$(SUFFIX) qgemm_tn.$(SUFFIX) qgemm_tt.$(SUFFIX) \
@ -103,7 +103,7 @@ CBLASOBJS	+= \
 	cherk_kernel_LN.$(SUFFIX)  cherk_kernel_LC.$(SUFFIX) \
 	csyr2k_kernel_U.$(SUFFIX)  csyr2k_kernel_L.$(SUFFIX) \
 	cher2k_kernel_UN.$(SUFFIX) cher2k_kernel_UC.$(SUFFIX) \
-	cher2k_kernel_LN.$(SUFFIX) cher2k_kernel_LC.$(SUFFIX)
+	cher2k_kernel_LN.$(SUFFIX) cher2k_kernel_LC.$(SUFFIX) cgemm_batch_thread.$(SUFFIX)

 ZBLASOBJS	+= \
 	zgemm_nn.$(SUFFIX) zgemm_cn.$(SUFFIX) zgemm_tn.$(SUFFIX) zgemm_nc.$(SUFFIX) \
@ -137,7 +137,7 @@ ZBLASOBJS	+= \
 	zherk_kernel_LN.$(SUFFIX)  zherk_kernel_LC.$(SUFFIX) \
 	zsyr2k_kernel_U.$(SUFFIX)  zsyr2k_kernel_L.$(SUFFIX) \
 	zher2k_kernel_UN.$(SUFFIX) zher2k_kernel_UC.$(SUFFIX) \
-	zher2k_kernel_LN.$(SUFFIX) zher2k_kernel_LC.$(SUFFIX)
+	zher2k_kernel_LN.$(SUFFIX) zher2k_kernel_LC.$(SUFFIX) zgemm_batch_thread.$(SUFFIX)


 XBLASOBJS	+= \
@ -2942,6 +2942,21 @@ gemm_thread_variable.$(PSUFFIX) : gemm_thread_variable.c ../../common.h
 beta_thread.$(PSUFFIX) : beta_thread.c ../../common.h
 	$(CC) -c $(PFLAGS) $< -o $(@F)

+# gemm_batch_thread.c is precision dependent (it uses FLOAT, COMPLEX, GEMM_P
+# and friends), so every variant must select its precision explicitly, in the
+# same way as the other per-precision rules in this Makefile.
+sbgemm_batch_thread.$(SUFFIX) : gemm_batch_thread.c ../../common.h
+	$(CC) -c $(CFLAGS) -DHALF -UDOUBLE -UCOMPLEX $< -o $(@F)
+
+sgemm_batch_thread.$(SUFFIX) : gemm_batch_thread.c ../../common.h
+	$(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX $< -o $(@F)
+
+dgemm_batch_thread.$(SUFFIX) : gemm_batch_thread.c ../../common.h
+	$(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX $< -o $(@F)
+
+cgemm_batch_thread.$(SUFFIX) : gemm_batch_thread.c ../../common.h
+	$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX $< -o $(@F)
+
+zgemm_batch_thread.$(SUFFIX) : gemm_batch_thread.c ../../common.h
+	$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX $< -o $(@F)
+

 sbgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
--- a/driver/level3/gemm_batch_thread.c
+++ b/driver/level3/gemm_batch_thread.c
@ -0,0 +1,156 @@
+/*****************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of 
+      its contributors may be used to endorse or promote products 
+      derived from this software without specific prior written 
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "common.h"
+
+void openblas_warning(int verbose, const char * msg);
+
+#ifdef SMALL_MATRIX_OPT
+/*
+ * Worker entry point for the small-matrix GEMM fast path.
+ *
+ * The kernel to run is taken from args->routine and is called directly with
+ * the dimensions, matrices and scalars stored in args. Which kernel signature
+ * is used depends on args->routine_mode:
+ *   - BLAS_SMALL_B0_OPT: kernel variant that takes no beta argument;
+ *   - BLAS_SMALL_OPT:    kernel variant that takes both alpha and beta.
+ * In COMPLEX builds alpha/beta are passed as separate real and imaginary parts.
+ *
+ * range_m, range_n, sa, sb and mypos are not used by this function; they are
+ * presumably present only so the signature matches other queue routines.
+ *
+ * Returns 0 after a kernel has been run, 1 if neither mode flag is set.
+ */
+static int inner_small_matrix_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG mypos){
+  int routine_mode;
+#ifndef COMPLEX
+  int (*gemm_small_kernel)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG);
+  int (*gemm_small_kernel_b0)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG);
+#else
+  int (*zgemm_small_kernel)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG);
+  int (*zgemm_small_kernel_b0)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG);
+  FLOAT alpha[2], beta[2];
+#endif
+  routine_mode=args->routine_mode;
+  /* The B0 test must come first and compare all of its bits, since the
+     plain BLAS_SMALL_OPT test below is only a non-zero bit test. */
+  if((routine_mode & BLAS_SMALL_B0_OPT) == BLAS_SMALL_B0_OPT){
+#ifndef COMPLEX
+    gemm_small_kernel_b0=args->routine;
+    gemm_small_kernel_b0(args->m, args->n, args->k, args->a, args->lda, *(FLOAT *)(args->alpha), args->b, args->ldb, args->c, args->ldc);
+#else
+    zgemm_small_kernel_b0=args->routine;
+    alpha[0] = *((FLOAT *)args -> alpha + 0);
+    alpha[1] = *((FLOAT *)args -> alpha + 1);
+    zgemm_small_kernel_b0(args->m, args->n, args->k, args->a, args->lda, alpha[0], alpha[1], args->b, args->ldb, args->c, args->ldc);
+#endif
+    return(0);
+  }else if(routine_mode & BLAS_SMALL_OPT){
+#ifndef COMPLEX
+    gemm_small_kernel=args->routine;
+    gemm_small_kernel(args->m, args->n, args->k, args->a, args->lda, *(FLOAT *)(args->alpha), args->b, args->ldb, *(FLOAT *)(args->beta), args->c, args->ldc);
+#else
+    zgemm_small_kernel=args->routine;
+    alpha[0] = *((FLOAT *)args -> alpha + 0);
+    alpha[1] = *((FLOAT *)args -> alpha + 1);
+    beta[0] = *((FLOAT *)args -> beta + 0);
+    beta[1] = *((FLOAT *)args -> beta + 1);
+    zgemm_small_kernel(args->m, args->n, args->k, args->a, args->lda, alpha[0], alpha[1], args->b, args->ldb, beta[0], beta[1], args->c, args->ldc);
+#endif    
+    return(0);
+  }
+  /* Not a small-matrix job: nothing was executed. */
+  return(1);
+}
+#endif
+
+/*
+ * Batched GEMM driver: runs the `nums` independent GEMM jobs described by
+ * args_array[0 .. nums-1]. Each entry carries its own kernel in .routine and
+ * its mode flags in .routine_mode.
+ *
+ * A single work buffer is obtained from blas_memory_alloc() and split into the
+ * sa/sb packing areas. With one thread (or without SMP) the jobs are run in
+ * order on the calling thread; otherwise they are turned into a queue and
+ * handed to exec_blas() in chunks of at most `nthreads` entries.
+ *
+ * Returns 0 on success (including nums <= 0) and 1 if the job queue for the
+ * multi-threaded path cannot be allocated.
+ */
+int CNAME(blas_arg_t * args_array, BLASLONG nums){
+  XFLOAT *buffer;
+  XFLOAT *sa, *sb;
+  int nthreads=1;
+  int (*routine)(blas_arg_t *, void *, void *, XFLOAT *, XFLOAT *, BLASLONG);
+  /* Indices have the same width as nums so the loops below cannot wrap. */
+  BLASLONG i=0, current_nums;
+
+#ifdef SMP
+  blas_queue_t * queue=NULL;
+#endif
+  
+  if(nums <=0 ) return 0;
+
+  buffer = (XFLOAT *)blas_memory_alloc(0);
+  sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A);
+  sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
+  
+#ifdef SMP
+  nthreads=num_cpu_avail(3);
+
+  if(nthreads==1){
+
+#endif
+    //single thread
+    for(i=0; i<nums; i++){
+      routine=args_array[i].routine;
+#ifdef SMALL_MATRIX_OPT
+      if(args_array[i].routine_mode & BLAS_SMALL_OPT){
+        inner_small_matrix_thread(&args_array[i], NULL, NULL, NULL, NULL, 0);
+      }else{
+#endif      
+        routine(&args_array[i], NULL, NULL, sa, sb, 0);
+#ifdef SMALL_MATRIX_OPT
+      }
+#endif
+    }
+#ifdef SMP
+  } else {
+    //multi thread
+
+    queue=(blas_queue_t *)malloc((nums+1) * sizeof(blas_queue_t));
+    if(queue == NULL){
+      openblas_warning(0, "memory alloc failed!\n");
+      /* Release the work buffer acquired above before bailing out. */
+      blas_memory_free(buffer);
+      return(1);
+    }
+    for(i=0; i<nums; i++){
+      queue[i].args=&args_array[i];
+      queue[i].range_m=NULL;
+      queue[i].range_n=NULL;
+      queue[i].sa=NULL;
+      queue[i].sb=NULL;
+      queue[i].next=&queue[i+1];
+
+      queue[i].mode=args_array[i].routine_mode;
+      queue[i].routine=args_array[i].routine;
+      
+#ifdef SMALL_MATRIX_OPT
+      /* Small-matrix jobs go through the wrapper that calls the kernel directly. */
+      if((args_array[i].routine_mode & BLAS_SMALL_B0_OPT) || (args_array[i].routine_mode & BLAS_SMALL_OPT)){
+        queue[i].routine=inner_small_matrix_thread;
+      }
+#endif
+    }
+    
+    /* Dispatch the queue in chunks of at most nthreads entries. */
+    for(i=0; i<nums; i+=nthreads){
+      current_nums=((nums-i)>nthreads)? nthreads: (nums-i);
+
+      /* Only the first entry of each chunk is given the buffer allocated here;
+         the remaining entries keep sa/sb == NULL. */
+      queue[i].sa=sa;
+      queue[i].sb=sb;
+      /* Terminate the linked list at the end of the current chunk. */
+      queue[i+current_nums-1].next=NULL;
+      
+      exec_blas(current_nums, &queue[i]);
+    }
+    free(queue);
+  }
+#endif
+  blas_memory_free(buffer);
+  return 0;
+}
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c
@ -570,6 +570,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
  InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
 #else
  static pthread_mutex_t  level3_lock    = PTHREAD_MUTEX_INITIALIZER;
+  static pthread_cond_t  level3_wakeup    = PTHREAD_COND_INITIALIZER;
+  volatile static BLASLONG CPU_AVAILABLE = MAX_CPU_NUMBER;
 #endif

  blas_arg_t newarg;
@ -639,6 +641,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
  EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
 #else
  pthread_mutex_lock(&level3_lock);
+  while(CPU_AVAILABLE < nthreads) { 
+    pthread_cond_wait(&level3_wakeup, &level3_lock);
+  } 
+  CPU_AVAILABLE -= nthreads;
+  WMB;
+  pthread_mutex_unlock(&level3_lock);
 #endif

 #ifdef USE_ALLOC_HEAP
@ -734,7 +742,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
    num_parts  = 0;
    while (n > 0){
      width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts);
-      if (width < switch_ratio) {
+      if (width < switch_ratio && width > 1) {
        width = switch_ratio;
      }
      width = round_up(n, width, GEMM_PREFERED_SIZE);
@ -783,6 +791,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
 #elif defined(OS_WINDOWS)
  LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
 #else
+  pthread_mutex_lock(&level3_lock);
+  CPU_AVAILABLE += nthreads;
+  WMB;
+  pthread_cond_signal(&level3_wakeup);
  pthread_mutex_unlock(&level3_lock);
 #endif

@ -826,6 +838,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF
    if (nthreads_m * nthreads_n > args -> nthreads) {
      nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m);
    }
+    /* The nthreads_m and nthreads_n are adjusted so that the submatrix       */
+    /* to be handled by each thread preferably becomes a square matrix        */
+    /* by minimizing an objective function 'n * nthreads_m + m * nthreads_n'. */
+    /* Objective function come from sum of partitions in m and n.             */
+    /* (n / nthreads_n) + (m / nthreads_m)                                    */
+    /* = (n * nthreads_m + m * nthreads_n) / (nthreads_n * nthreads_m)        */
+    while (nthreads_m % 2 == 0 && n * nthreads_m + m * nthreads_n > n * (nthreads_m / 2) + m * (nthreads_n * 2)) {
+      nthreads_m /= 2;
+      nthreads_n *= 2;
+    }
  }

  /* Execute serial or parallel computation */
--- a/driver/others/CMakeLists.txt
+++ b/driver/others/CMakeLists.txt
@ -25,6 +25,7 @@ if (USE_THREAD)
    ${BLAS_SERVER}
    divtable.c # TODO: Makefile has -UDOUBLE
    blas_l1_thread.c
+    blas_server_callback.c
  )

  if (NOT NO_AFFINITY)
@ -51,6 +52,10 @@ if (DYNAMIC_ARCH)
    list(APPEND COMMON_SOURCES dynamic_arm64.c)
  elseif (POWER)
    list(APPEND COMMON_SOURCES dynamic_power.c)
+  elseif (RISCV64)
+    list(APPEND COMMON_SOURCES dynamic_riscv64.c detect_riscv64.c)
+  elseif (LOONGARCH64)
+    list(APPEND COMMON_SOURCES dynamic_loongarch64.c)
  else ()  
    list(APPEND COMMON_SOURCES dynamic.c)
  endif ()  
--- a/driver/others/Makefile
+++ b/driver/others/Makefile
@ -6,7 +6,7 @@ COMMONOBJS	 = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX)
 #COMMONOBJS	+= slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX)  dlamc3.$(SUFFIX)

 ifdef SMP
-COMMONOBJS	+= blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX)
+COMMONOBJS	+= blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) blas_server_callback.$(SUFFIX)
 ifneq ($(NO_AFFINITY), 1)
 COMMONOBJS	+= init.$(SUFFIX)
 endif
@ -30,12 +30,16 @@ else
 ifeq ($(ARCH),loongarch64)
 COMMONOBJS += dynamic_loongarch64.$(SUFFIX)
 else
+ifeq ($(ARCH),riscv64)
+COMMONOBJS += dynamic_riscv64.$(SUFFIX) detect_riscv64.$(SUFFIX)
+else
 COMMONOBJS	+=  dynamic.$(SUFFIX)
 endif
 endif
 endif
 endif
 endif
+endif
 else
 COMMONOBJS	+=  parameter.$(SUFFIX)
 endif
@ -106,12 +110,16 @@ else
 ifeq ($(ARCH),loongarch64)
 HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_loongarch64.$(SUFFIX)
 else
+ifeq ($(ARCH),riscv64)
+HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_riscv64.$(SUFFIX) detect_riscv64.$(SUFFIX)
+else
 HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
 endif
 endif
 endif
 endif
 endif
+endif
 else
 HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
 endif
@ -140,6 +148,9 @@ memory.$(SUFFIX) : $(MEMORY) ../../common.h ../../param.h
 blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../../param.h
 	$(CC) $(CFLAGS) -c $< -o $(@F)

+blas_server_callback.$(SUFFIX) : blas_server_callback.c ../../common.h
+	$(CC) $(CFLAGS) -c $< -o $(@F)
+	
 openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c
 	$(CC) $(CFLAGS) -c $< -o $(@F)

@ -206,6 +217,9 @@ addx.$(SUFFIX) : $(ARCH)/addx.c
 mulx.$(SUFFIX) : $(ARCH)/mulx.c
 	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $(@F)

+detect_riscv64.$(SUFFIX): detect_riscv64.c
+	$(CC) $(CFLAGS) -c -march=rv64imafdcv $< -o $(@F)
+
 xerbla.$(PSUFFIX) : xerbla.c
 	$(CC) $(PFLAGS) -c $< -o $(@F)

--- a/driver/others/blas_server.c
+++ b/driver/others/blas_server.c
@ -115,6 +115,8 @@ int blas_server_avail   __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;

 int blas_omp_threads_local = 1;

+static void * blas_thread_buffer[MAX_CPU_NUMBER];
+
 /* Local Variables */
 #if   defined(USE_PTHREAD_LOCK)
 static pthread_mutex_t  server_lock    = PTHREAD_MUTEX_INITIALIZER;
@ -190,6 +192,10 @@ static int main_status[MAX_CPU_NUMBER];
 BLASLONG	exit_time[MAX_CPU_NUMBER];
 #endif

+//Prototypes
+static void exec_threads(int , blas_queue_t *, int);
+static void adjust_thread_buffers();
+
 static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){

      if (!(mode & BLAS_COMPLEX)){
@ -375,7 +381,6 @@ static void* blas_thread_server(void *arg){
  /* Thread identifier */
  BLASLONG  cpu = (BLASLONG)arg;
  unsigned int last_tick;
-  void *buffer, *sa, *sb;
  blas_queue_t	*queue;

 blas_queue_t *tscq;
@ -395,8 +400,6 @@ blas_queue_t *tscq;
  main_status[cpu] = MAIN_ENTER;
 #endif

-  buffer = blas_memory_alloc(2);
-
 #ifdef SMP_DEBUG
  fprintf(STDERR, "Server[%2ld] Thread has just been spawned!\n", cpu);
 #endif
@ -415,7 +418,7 @@ blas_queue_t *tscq;

      tscq = atomic_load_queue(&thread_status[cpu].queue);

-	while(!tscq) {
+	while(!tscq || tscq == 0x1) {
 	YIELDING;

 	if ((unsigned int)rpcc() - last_tick > thread_timeout) {
@ -456,117 +459,9 @@ blas_queue_t *tscq;
    start = rpcc();
 #endif

-    if (queue) {
-      int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = (int (*)(blas_arg_t *, void *, void *, void *, void *, BLASLONG))queue -> routine;
-
-      atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1);
-
-      sa = queue -> sa;
-      sb = queue -> sb;
-
-#ifdef SMP_DEBUG
-      if (queue -> args) {
-	fprintf(STDERR, "Server[%2ld] Calculation started.  Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
-		cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
-      }
-#endif
-
-#ifdef CONSISTENT_FPCSR
-#ifdef __aarch64__
-      __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode));
-#else
-      __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
-      __asm__ __volatile__ ("fldcw %0"   : : "m" (queue -> x87_mode));
-#endif
-#endif
-
-#ifdef MONITOR
-      main_status[cpu] = MAIN_RUNNING1;
-#endif
-
-//For Loongson servers, like the 3C5000 (featuring 16 cores), applying an
-//offset to the buffer is essential for minimizing cache conflicts and optimizing performance.
-#if defined(LOONGSON3R5) && !defined(NO_AFFINITY)
-      char model_name[128];
-      get_cpu_model(model_name);
-      if ((strstr(model_name, "3C5000") != NULL) || (strstr(model_name, "3D5000") != NULL))
-        if (sa == NULL) sa = (void *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A);
-#endif
-      if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
-
-      if (sb == NULL) {
-	if (!(queue -> mode & BLAS_COMPLEX)){
-#ifdef EXPRECISION
-	  if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
-	    sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble)
-					+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
-	  } else
-#endif
-	  if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) {
-#ifdef BUILD_DOUBLE
-	    sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
-					+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
-#endif
-	  } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) {
-#ifdef BUILD_SINGLE
-	    sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float)
-					+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
-#endif
-    } else {
-          /* Other types in future */
-      }
-	} else {
-#ifdef EXPRECISION
-	  if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
-	    sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble)
-					+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
-	  } else
-#endif
-	  if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){
-#ifdef BUILD_COMPLEX16
-	    sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double)
-					+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
-#endif
-	  } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) {
-#ifdef BUILD_COMPLEX
-	    sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float)
-					+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
-#endif
-      } else {
-          /* Other types in future */
-      }
-	}
-	queue->sb=sb;
-      }
-
-#ifdef MONITOR
-	main_status[cpu] = MAIN_RUNNING2;
-#endif
-
-      if (queue -> mode & BLAS_LEGACY) {
-	legacy_exec(routine, queue -> mode, queue -> args, sb);
-      } else
-	if (queue -> mode & BLAS_PTHREAD) {
-	  void (*pthreadcompat)(void *) = (void(*)(void*))queue -> routine;
-	  (pthreadcompat)(queue -> args);
-	} else
-	  (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
-
-#ifdef SMP_DEBUG
-      fprintf(STDERR, "Server[%2ld] Calculation finished!\n", cpu);
-#endif
-
-#ifdef MONITOR
-      main_status[cpu] = MAIN_FINISH;
-#endif
-
-      // arm: make sure all results are written out _before_
-      // thread is marked as done and other threads use them
-      MB;
-      atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)0);
-
-
-    }
+  if(queue) {
+    exec_threads(cpu, queue, 0);
+  }

 #ifdef MONITOR
      main_status[cpu] = MAIN_DONE;
@ -588,8 +483,6 @@ blas_queue_t *tscq;
      fprintf(STDERR, "Server[%2ld] Shutdown!\n",  cpu);
 #endif

-  blas_memory_free(buffer);
-
  //pthread_exit(NULL);

  return NULL;
@ -671,6 +564,9 @@ int blas_thread_init(void){

  LOCK_COMMAND(&server_lock);

+  // Adjust thread buffers
+  adjust_thread_buffers();
+
  if (!blas_server_avail){

    thread_timeout_env=openblas_thread_timeout();
@ -699,6 +595,8 @@ int blas_thread_init(void){
 	struct rlimit rlim;
        const char *msg = strerror(ret);
        fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %d: %s\n", i+1,blas_num_threads,msg);
+	fprintf(STDERR, "OpenBLAS blas_thread_init: ensure that your address space and process count limits are big enough (ulimit -a)\n");
+	fprintf(STDERR, "OpenBLAS blas_thread_init: or set a smaller OPENBLAS_NUM_THREADS to fit into what you have available\n");
 #ifdef RLIMIT_NPROC
        if(0 == getrlimit(RLIMIT_NPROC, &rlim)) {
          fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC "
@ -901,6 +799,18 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
  fprintf(STDERR, "Exec_blas is called. Number of executing threads : %ld\n", num);
 #endif

+//Redirect to caller's callback routine
+if (openblas_threads_callback_) {
+  int buf_index = 0, i = 0;
+#ifndef USE_SIMPLE_THREADED_LEVEL3
+    for (i = 0; i < num; i ++)
+      queue[i].position = i;
+#endif
+    openblas_threads_callback_(1, (openblas_dojob_callback) exec_threads, num, sizeof(blas_queue_t), (void*) queue, buf_index);
+    return 0;
+  }
+
+
 #ifdef __ELF__
  if (omp_in_parallel && (num > 1)) {
    if (omp_in_parallel() > 0) {
@ -1074,6 +984,14 @@ int BLASFUNC(blas_thread_shutdown)(void){

  LOCK_COMMAND(&server_lock);

+  //Free buffers allocated for threads
+  for(i=0; i<MAX_CPU_NUMBER; i++){
+    if(blas_thread_buffer[i]!=NULL){
+      blas_memory_free(blas_thread_buffer[i]);
+      blas_thread_buffer[i]=NULL;
+    }
+  }
+
  if (blas_server_avail) {

    for (i = 0; i < blas_num_threads - 1; i++) {
@ -1110,5 +1028,138 @@ int BLASFUNC(blas_thread_shutdown)(void){
  return 0;
 }

+/*
+ * Bring the per-thread buffer table in line with the current thread count:
+ * every slot below blas_cpu_number that is still NULL gets a buffer from
+ * blas_memory_alloc(2), and every slot from blas_cpu_number up to
+ * MAX_CPU_NUMBER that still holds a buffer is freed and reset to NULL.
+ */
+static void adjust_thread_buffers() {
+
+  int i=0;
+
+  //adjust buffer for each thread
+  for(i=0; i < blas_cpu_number; i++){
+    if(blas_thread_buffer[i] == NULL){
+      blas_thread_buffer[i] = blas_memory_alloc(2);
+    }
+  }
+  // i continues from blas_cpu_number: release buffers of slots no longer in use
+  for(; i < MAX_CPU_NUMBER; i++){
+    if(blas_thread_buffer[i] != NULL){
+      blas_memory_free(blas_thread_buffer[i]);
+      blas_thread_buffer[i] = NULL;
+    }
+  }
+}
+
+/*
+ * Execute one queue entry on behalf of worker slot `cpu`.
+ *
+ *   cpu       - index into thread_status[] and blas_thread_buffer[]
+ *   queue     - the job to run (routine, mode, args, ranges, optional sa/sb)
+ *   buf_index - not used by this implementation
+ *
+ * thread_status[cpu].queue is set to the sentinel value 1 before the job
+ * starts and to 0 once it has finished. When the job does not supply its own
+ * sa/sb areas they are carved out of the slot's buffer, which is allocated on
+ * first use. The computed sb is written back to queue->sb.
+ */
+static void exec_threads(int cpu, blas_queue_t *queue, int buf_index) {
+
+  int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = (int (*)(blas_arg_t *, void *, void *, void *, void *, BLASLONG))queue -> routine;
+
+  atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1);
+
+  void *buffer = blas_thread_buffer[cpu];
+  void *sa = queue -> sa;
+  void *sb = queue -> sb;
+
+#ifdef SMP_DEBUG
+  if (queue -> args) {
+    /* cpu is an int here; cast it so that it matches the %ld conversion. */
+    fprintf(STDERR, "Server[%2ld] Calculation started.  Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
+            (long)cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
+  }
 #endif
 
+#ifdef CONSISTENT_FPCSR
+#ifdef __aarch64__
+  __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode));
+#else
+  __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
+  __asm__ __volatile__ ("fldcw %0"   : : "m" (queue -> x87_mode));
+#endif
+#endif
+
+#ifdef MONITOR
+  main_status[cpu] = MAIN_RUNNING1;
+#endif
+
+  /* Allocate this slot's buffer on first use. */
+  if (buffer == NULL) {
+    blas_thread_buffer[cpu] = blas_memory_alloc(2);
+    buffer = blas_thread_buffer[cpu];
+  }
+
+//For LOONGARCH64, applying an offset to the buffer is essential
+//for minimizing cache conflicts and optimizing performance.
+#if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY)
+  if (sa == NULL) sa = (void *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A);
+#endif
+  if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
+
+  /* Place sb after an sa area sized for the job's precision and type. */
+  if (sb == NULL) {
+    if (!(queue -> mode & BLAS_COMPLEX)){
+#ifdef EXPRECISION
+      if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
+        sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble)
+            + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
+      } else
+#endif
+      if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) {
+#ifdef BUILD_DOUBLE
+        sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
+            + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
+#endif
+      } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) {
+#ifdef BUILD_SINGLE
+        sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float)
+            + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
+#endif
+      } else {
+        /* Other types in future */
+      }
+    } else {
+#ifdef EXPRECISION
+      if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
+        sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble)
+            + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
+      } else
+#endif
+      if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){
+#ifdef BUILD_COMPLEX16
+        sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double)
+            + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
+#endif
+      } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) {
+#ifdef BUILD_COMPLEX
+        sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float)
+            + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
+#endif
+      } else {
+        /* Other types in future */
+      }
+    }
+    queue->sb=sb;
+  }
+
+#ifdef MONITOR
+  main_status[cpu] = MAIN_RUNNING2;
+#endif
+
+  /* Dispatch according to the calling convention selected by the mode flags. */
+  if (queue -> mode & BLAS_LEGACY) {
+    legacy_exec(routine, queue -> mode, queue -> args, sb);
+  } else
+    if (queue -> mode & BLAS_PTHREAD) {
+      void (*pthreadcompat)(void *) = (void(*)(void*))queue -> routine;
+      (pthreadcompat)(queue -> args);
+    } else
+      (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
+
+#ifdef SMP_DEBUG
+  fprintf(STDERR, "Server[%2ld] Calculation finished!\n", (long)cpu);
+#endif
+
+#ifdef MONITOR
+  main_status[cpu] = MAIN_FINISH;
+#endif
+
+  // arm: make sure all results are written out _before_
+  // thread is marked as done and other threads use them
+  MB;
+  atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)0);
+
+}
+
+#endif
--- a/driver/others/blas_server_callback.c
+++ b/driver/others/blas_server_callback.c
@ -0,0 +1,12 @@
+#include "common.h"
+
+/* global variable to change threading backend from openblas-managed to caller-managed */
+openblas_threads_callback openblas_threads_callback_ = 0;
+
+/* Register `callback` as the caller-managed threading backend by storing it in
+   the global openblas_threads_callback_.
+   This function is not thread-safe: it should be called before any other
+   OpenBLAS function to change how threads are managed. */
+
+void openblas_set_threads_callback_function(openblas_threads_callback callback)
+{
+  openblas_threads_callback_ = callback;
+}
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@ -114,9 +114,11 @@ void goto_set_num_threads(int num_threads) {

  adjust_thread_buffers();
 #if defined(ARCH_MIPS64) || defined(ARCH_LOONGARCH64)
+#ifndef DYNAMIC_ARCH
  //set parameters for different number of threads.
  blas_set_parameter();
 #endif
+#endif

 }
 void openblas_set_num_threads(int num_threads) {
@ -285,7 +287,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
   }
 }

-static void exec_threads(blas_queue_t *queue, int buf_index){
+static void exec_threads(int thread_num, blas_queue_t *queue, int buf_index){

  void *buffer, *sa, *sb;
  int pos=0, release_flag=0;
@ -305,7 +307,7 @@ static void exec_threads(blas_queue_t *queue, int buf_index){

  if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {

-    pos = omp_get_thread_num();
+    pos= thread_num;
    buffer = blas_thread_buffer[buf_index][pos];

    //fallback
@ -420,18 +422,25 @@ while (true) {
        break;
      }
    }
-    if (i != MAX_PARALLEL_NUMBER)
-	    break;
-}
-if (openblas_omp_adaptive_env() != 0) {
-#pragma omp parallel for num_threads(num) schedule(OMP_SCHED)
-  for (i = 0; i < num; i ++) {
+    if(i != MAX_PARALLEL_NUMBER)
+      break;
+  }
+  /*For caller-managed threading, if caller has registered the callback, pass exec_thread as callback function*/
+  if (openblas_threads_callback_) {
+#ifndef USE_SIMPLE_THREADED_LEVEL3
+    for (i = 0; i < num; i ++)
+      queue[i].position = i;
+#endif
+    openblas_threads_callback_(1, (openblas_dojob_callback) exec_threads, num, sizeof(blas_queue_t), (void*) queue, buf_index);
+  } else {

+ if (openblas_omp_adaptive_env() != 0) {
+ #pragma omp parallel for num_threads(num) schedule(OMP_SCHED)
+  for (i = 0; i < num; i ++) {
 #ifndef USE_SIMPLE_THREADED_LEVEL3
    queue[i].position = i;
 #endif
-
-    exec_threads(&queue[i], buf_index);
+  exec_threads(omp_get_thread_num(), &queue[i], buf_index);
  }
 } else {
 #pragma omp parallel for schedule(OMP_SCHED)
@ -441,9 +450,10 @@ if (openblas_omp_adaptive_env() != 0) {
    queue[i].position = i;
 #endif

-    exec_threads(&queue[i], buf_index);
+  exec_threads(omp_get_thread_num(), &queue[i], buf_index);
  }
 }
+}

 #ifdef HAVE_C11
  atomic_store(&blas_buffer_inuse[buf_index], false);
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
--- a/driver/others/detect_riscv64.c
+++ b/driver/others/detect_riscv64.c
@ -0,0 +1,75 @@
+/*****************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+#include <stdint.h>
+
+#ifdef __riscv_v_intrinsic
+#include <riscv_vector.h>
+#endif
+
+/*
+ * Report the vector register length in bytes as given by __riscv_vlenb().
+ * Yields 0 when the compiler provides no RVV intrinsics, i.e. when
+ * __riscv_v_intrinsic is not defined at build time.
+ */
+unsigned detect_riscv64_get_vlenb(void)
+{
+	unsigned vlenb = 0;
+
+#ifdef __riscv_v_intrinsic
+	vlenb = __riscv_vlenb();
+#endif
+
+	return vlenb;
+}
+
+/*
+ * Probe for RVV 1.0 by issuing a vsetvli that requests the tail-agnostic
+ * (ta) and mask-agnostic (ma) policies and then inspecting vtype.
+ *
+ * Based on the approach taken here:
+ * https://code.videolan.org/videolan/dav1d/-/merge_requests/1629
+ *
+ * Only to be called after we've determined we have some sort of
+ * RVV support.
+ *
+ * Returns 1 when vtype reads back as a positive value, 0 otherwise.
+ */
+
+uint64_t detect_riscv64_rvv100(void)
+{
+	uint64_t rvv10_supported;
+
+	/*
+	 * After the vsetvli statement vtype will either be a value > 0 if the
+	 * vsetvli succeeded or less than 0 if it failed.  If 0 < vtype
+	 * we're good and the function will return 1, otherwise there's no
+	 * RVV 1.0 and we return 0.
+	 *
+	 * csrr copies vtype into the output register and slt then replaces
+	 * it with the result of the signed comparison (0 < vtype).
+	 */
+
+	asm volatile("vsetvli x0, x0, e8, m1, ta, ma\n\t"
+		     "csrr %0, vtype\n\t"
+		     "slt %0, x0, %0\n"
+		     : "=r" (rvv10_supported)
+		     :
+		     :);
+
+	return rvv10_supported;
+}
+
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@ -927,6 +927,7 @@ static gotoblas_t *get_coretype(void){
 	case 0x7:
      switch (exmodel) {
      case 5:
+      case 6:
        if (support_avx2())
          return &gotoblas_ZEN;
        else
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@ -120,6 +120,11 @@ extern gotoblas_t  gotoblas_CORTEXA55;
 #else
 #define gotoblas_CORTEXA55 gotoblas_ARMV8
 #endif
+#ifdef DYN_A64FX
+extern gotoblas_t gotoblas_A64FX;
+#else
+#define gotoblas_A64FX gotoblas_ARMV8
+#endif
 #else
 extern gotoblas_t  gotoblas_CORTEXA53;
 #define gotoblas_CORTEXA55 gotoblas_CORTEXA53
@ -136,10 +141,12 @@ extern gotoblas_t  gotoblas_NEOVERSEN1;
 extern gotoblas_t  gotoblas_NEOVERSEV1;
 extern gotoblas_t  gotoblas_NEOVERSEN2;
 extern gotoblas_t  gotoblas_ARMV8SVE;
+extern gotoblas_t  gotoblas_A64FX;
 #else
 #define gotoblas_NEOVERSEV1 gotoblas_ARMV8
 #define gotoblas_NEOVERSEN2 gotoblas_ARMV8
 #define gotoblas_ARMV8SVE   gotoblas_ARMV8
+#define gotoblas_A64FX      gotoblas_ARMV8
 #endif
 extern gotoblas_t  gotoblas_THUNDERX3T110;
 #endif
@ -149,7 +156,7 @@ extern void openblas_warning(int verbose, const char * msg);
 #define FALLBACK_VERBOSE 1
 #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"

-#define NUM_CORETYPES   17
+#define NUM_CORETYPES   18

 /*
 * In case asm/hwcap.h is outdated on the build system, make sure
@ -184,6 +191,7 @@ static char *corename[] = {
  "thunderx3t110",
  "cortexa55",
  "armv8sve",
+  "a64fx",
  "unknown"
 };

@ -205,6 +213,7 @@ char *gotoblas_corename(void) {
  if (gotoblas == &gotoblas_THUNDERX3T110) return corename[14];
  if (gotoblas == &gotoblas_CORTEXA55)    return corename[15];
  if (gotoblas == &gotoblas_ARMV8SVE)     return corename[16];
+  if (gotoblas == &gotoblas_A64FX)        return corename[17];
  return corename[NUM_CORETYPES];
 }

@ -241,6 +250,7 @@ static gotoblas_t *force_coretype(char *coretype) {
    case 14: return (&gotoblas_THUNDERX3T110);
    case 15: return (&gotoblas_CORTEXA55);
    case 16: return (&gotoblas_ARMV8SVE);
+    case 17: return (&gotoblas_A64FX);
  }
  snprintf(message, 128, "Core not found: %s\n", coretype);
  openblas_warning(1, message);
@ -346,6 +356,15 @@ static gotoblas_t *get_coretype(void) {
          return &gotoblas_THUNDERX3T110;
      }
      break;
+    case 0x46: // Fujitsu
+      switch (part)
+      {
+#ifndef NO_SVE
+        case 0x001: // A64FX
+          return &gotoblas_A64FX;
+#endif
+      }
+      break;
    case 0x48: // HiSilicon
      switch (part)
      {
--- a/driver/others/dynamic_loongarch64.c
+++ b/driver/others/dynamic_loongarch64.c
@ -28,25 +28,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <sys/auxv.h>
 #include "common.h"

-extern gotoblas_t  gotoblas_LOONGSON3R5;
-extern gotoblas_t  gotoblas_LOONGSON2K1000;
-extern gotoblas_t  gotoblas_LOONGSONGENERIC;
+#define NUM_CORETYPES       6
+#define LOONGARCH_CFG0      0x00
+#define LA_HWCAP_LSX        (1U << 4)
+#define LA_HWCAP_LASX       (1U << 5)
+#define PRID_SERIES_MASK    0xf000
+#define PRID_SERIES_LA264   0xa000
+#define PRID_SERIES_LA364   0xb000
+#define PRID_SERIES_LA464   0xc000
+#define PRID_SERIES_LA664   0xd000
+
+extern gotoblas_t  gotoblas_LA64_GENERIC;
+extern gotoblas_t  gotoblas_LA264;
+extern gotoblas_t  gotoblas_LA464;

 extern void openblas_warning(int verbose, const char * msg);

-#define NUM_CORETYPES    3
-
 static char *corename[] = {
-  "loongson3r5",
-  "loongson2k1000",
+  "la64_generic",
+  "la264",
+  "la464",
  "loongsongeneric",
+  "loongson2k1000",
+  "loongson3r5",
  "unknown"
 };

 char *gotoblas_corename(void) {
-  if (gotoblas == &gotoblas_LOONGSON3R5)     return corename[0];
-  if (gotoblas == &gotoblas_LOONGSON2K1000)  return corename[1];
-  if (gotoblas == &gotoblas_LOONGSONGENERIC) return corename[2];
+  if (gotoblas == &gotoblas_LA64_GENERIC) return corename[0];
+  if (gotoblas == &gotoblas_LA264)        return corename[1];
+  if (gotoblas == &gotoblas_LA464)        return corename[2];
  return corename[NUM_CORETYPES];
 }

@ -66,27 +77,78 @@ static gotoblas_t *force_coretype(char *coretype) {

  switch (found)
  {
-    case  0: return (&gotoblas_LOONGSON3R5);
-    case  1: return (&gotoblas_LOONGSON2K1000);
-    case  2: return (&gotoblas_LOONGSONGENERIC);
+    case  0: return (&gotoblas_LA64_GENERIC);
+    case  1: return (&gotoblas_LA264);
+    case  2: return (&gotoblas_LA464);
+    case  3: return (&gotoblas_LA64_GENERIC);
+    case  4: return (&gotoblas_LA264);
+    case  5: return (&gotoblas_LA464);
  }
  snprintf(message, 128, "Core not found: %s\n", coretype);
  openblas_warning(1, message);
  return NULL;
 }

-#define LA_HWCAP_LSX    (1U << 4)
-#define LA_HWCAP_LASX   (1U << 5)

-static gotoblas_t *get_coretype(void) {
-  int hwcap = (int)getauxval(AT_HWCAP);
+/* Detect whether the OS supports the LASX instruction set */
+static int os_support_lasx() {
+  int hwcap  = (int)getauxval(AT_HWCAP);

  if (hwcap & LA_HWCAP_LASX)
-    return &gotoblas_LOONGSON3R5;
-  else if (hwcap & LA_HWCAP_LSX)
-    return &gotoblas_LOONGSON2K1000;
+    return 1;
  else
-    return &gotoblas_LOONGSONGENERIC;
+    return 0;
+}
+
+/* Report (1 or 0) whether AT_HWCAP advertises the LSX instruction set */
+static int os_support_lsx() {
+  const unsigned long caps = getauxval(AT_HWCAP);
+
+  return (caps & LA_HWCAP_LSX) != 0;
+}
+
+/*
+ * Read CPU configuration word LOONGARCH_CFG0 with the cpucfg instruction
+ * and return it unmodified; get_coretype() masks the result with
+ * PRID_SERIES_MASK to obtain the processor series.
+ */
+static uint32_t get_prid() {
+  uint32_t reg = 0;
+  __asm__ volatile (
+    "cpucfg %0, %1 \n\t"
+    : "+&r"(reg)
+    : "r"(LOONGARCH_CFG0)
+  );
+  return reg;
+}
+
+/* Choose the kernel table at runtime from the processor series (PRID)
+ * and the SIMD extensions the OS reports:
+ *   LA464 / LA664 : LASX kernels, else LSX kernels, else generic
+ *   LA264 / LA364 : LSX kernels, else generic
+ *   anything else : generic
+ */
+static gotoblas_t *get_coretype(void) {
+  const uint32_t series = get_prid() & PRID_SERIES_MASK;
+  const int lasx_series = (series == PRID_SERIES_LA464) ||
+                          (series == PRID_SERIES_LA664);
+  const int lsx_series  = lasx_series ||
+                          (series == PRID_SERIES_LA264) ||
+                          (series == PRID_SERIES_LA364);
+
+  /* The OS is only queried for extensions the series can offer. */
+  if (lasx_series && os_support_lasx())
+    return &gotoblas_LA464;
+
+  if (lsx_series && os_support_lsx())
+    return &gotoblas_LA264;
+
+  return &gotoblas_LA64_GENERIC;
 }

 void gotoblas_dynamic_init(void) {
--- a/driver/others/dynamic_riscv64.c
+++ b/driver/others/dynamic_riscv64.c
@ -0,0 +1,269 @@
+/*****************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+#include <stdbool.h>
+
+#include "common.h"
+
+/*
+ * OpenBLAS contains some kernels that are optimised for RVV 1.0.  Before we
+ * can use these kernels we need to determine whether the device supports
+ * RVV 1.0 and what the device's VLEN is.  Our strategy will be as follows.
+ *
+ * First we'll invoke the hwprobe syscall to detect RVV 1.0.  In an ideal world,
+ * this is all we should need to do.  If the syscall is not implemented we
+ * should be able to deduce that RVV 1.0 is not supported (as it was added to
+ * Linux after hwprobe) and if the syscall is implemented we can use it to
+ * determine whether RVV 1.0 is supported.  However, there are some riscv64
+ * boards out there that implement RVV 1.0 but ship with a Linux kernel that
+ * predates RVV vector support and hwprobe support.  These kernels contain
+ * the backported RVV patches but not the hwprobe patches and so they
+ * advertise support for RVV via hwcap.  To cater for these boards we need
+ * to fall back to hwcap if hwprobe is not supported. Unfortunately, some
+ * boards indicate support for RVV via hwcap even though they only support
+ * RVV 0.7.1, which is incompatible with RVV 1.0.  So an additional check is
+ * required to test if the devices advertising support for RVV via hwcap really
+ * support RVV 1.0.  This test works by executing a vsetvli instruction that
+ * sets the tail agnostic and mask agnostic bits in the vtype register.
+ * These bits are not supported prior to RVV 0.9 so will cause the VIL bit to
+ * be set on the VTYPE register in CPUs supporting 0.7.1.  If this bit is set
+ * we can determine that RVV 1.0 is not supported.
+ *
+ * This approach is borrowed from
+ * VideoLan dav1d:
+ *   (https://code.videolan.org/videolan/dav1d/-/merge_requests/1629).
+ *
+ * We assume that if a kernel reports the presence of RVV via hwcap that
+ * the device supports the vsetvli instruction.
+ *
+ * For now we're just going to invoke the hwprobe syscall directly, rather than
+ * invoking it through glibc.  Support for hwprobe has been added to glibc but
+ * at the time of writing this support has not yet been included in a glibc
+ * release.  Once it has, it will be better to invoke hwprobe via glibc as doing
+ * so should take advantage of the vdso entry and be more efficient.
+ */
+
+/*
+ * This should work on Android as well but I have no way of testing.
+ */
+
+#if defined(OS_LINUX)
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <stdint.h>
+#include <sys/auxv.h>
+
+#define DETECT_RISCV64_HWCAP_ISA_V (1 << ('V' - 'A'))
+
+struct riscv_hwprobe {
+	int64_t key;
+	uint64_t value;
+};
+
+/* The constants below are copied from
+ * /usr/include/riscv64-linux-gnu/asm/hwprobe.h. We duplicate the
+ *  constants as the header file from which they are copied will only
+ *  be present if we're building on a device with Linux 6.5 or greater.
+ */
+
+#define RISCV_HWPROBE_KEY_IMA_EXT_0	4
+#define		RISCV_HWPROBE_IMA_V		(1 << 2)
+
+#ifndef NR_riscv_hwprobe
+#ifndef NR_arch_specific_syscall
+#define NR_arch_specific_syscall 244
+#endif
+#define NR_riscv_hwprobe (NR_arch_specific_syscall + 14)
+#endif
+#endif // defined(OS_LINUX)
+
+unsigned detect_riscv64_get_vlenb(void);
+uint64_t detect_riscv64_rvv100(void);
+
+extern gotoblas_t gotoblas_RISCV64_GENERIC;
+#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL256B)
+extern gotoblas_t gotoblas_RISCV64_ZVL256B;
+#endif
+#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL128B)
+extern gotoblas_t gotoblas_RISCV64_ZVL128B;
+#endif
+
+#define CPU_GENERIC         0
+#define CPU_RISCV64_ZVL256B 1
+#define CPU_RISCV64_ZVL128B 2
+
+static char *cpuname[] = {
+	"riscv64_generic",
+	"riscv64_zvl256b",
+	"riscv64_zvl128b"
+};
+#define NUM_CORETYPES (sizeof(cpuname)/sizeof(char*))
+
+extern int openblas_verbose(void);
+extern void openblas_warning(int verbose, const char* msg);
+
+/*
+ * Return the name of the kernel table currently selected in `gotoblas`,
+ * or "unknown" when it matches none of the tables built into the library.
+ */
+char* gotoblas_corename(void) {
+	char* name = "unknown";
+
+	if (gotoblas == &gotoblas_RISCV64_GENERIC)
+		name = cpuname[CPU_GENERIC];
+#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL128B)
+	if (gotoblas == &gotoblas_RISCV64_ZVL128B)
+		name = cpuname[CPU_RISCV64_ZVL128B];
+#endif
+#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL256B)
+	if (gotoblas == &gotoblas_RISCV64_ZVL256B)
+		name = cpuname[CPU_RISCV64_ZVL256B];
+#endif
+
+	return name;
+}
+
+/*
+ * Pick the kernel table for the running CPU.
+ *
+ * Returns NULL when no RVV 1.0 table applies: non-Linux builds, RVV 1.0 not
+ * detected, vector registers shorter than 16 bytes, or the matching table
+ * not compiled in.
+ */
+static gotoblas_t* get_coretype(void) {
+#if defined(OS_LINUX)
+	/*
+	 * See the hwprobe documentation
+	 * ( https://docs.kernel.org/arch/riscv/hwprobe.html )
+	 * for more details.
+	 */
+	struct riscv_hwprobe probe = { .key = RISCV_HWPROBE_KEY_IMA_EXT_0, };
+	unsigned vector_bytes;
+	int have_rvv10;
+
+	if (syscall(NR_riscv_hwprobe, &probe, 1, 0, NULL, 0) == 0) {
+		/* hwprobe answered: its V bit decides. */
+		have_rvv10 = (probe.value & RISCV_HWPROBE_IMA_V) != 0;
+	} else {
+		/*
+		 * hwprobe failed: require the hwcap V bit and then confirm
+		 * with the vsetvli-based RVV 1.0 probe.
+		 */
+		have_rvv10 = (getauxval(AT_HWCAP) & DETECT_RISCV64_HWCAP_ISA_V) != 0 &&
+			     detect_riscv64_rvv100() != 0;
+	}
+
+	if (!have_rvv10)
+		return NULL;
+
+	/*
+	 * RVV 1.0 is supported.  The coretype now only depends on the
+	 * vector register length.
+	 */
+	vector_bytes = detect_riscv64_get_vlenb();
+	if (vector_bytes < 16)
+		return NULL;
+
+#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL256B)
+	if (vector_bytes >= 32)
+		return &gotoblas_RISCV64_ZVL256B;
+#endif
+
+#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL128B)
+	return &gotoblas_RISCV64_ZVL128B;
+#else
+	return NULL;
+#endif
+
+#else   // !defined(OS_LINUX)
+	return NULL;
+#endif  // defined(OS_LINUX)
+}
+
+/*
+ * Map a core name (matched case-insensitively against cpuname[]) to its
+ * kernel table.  Emits a warning and returns NULL when the name is unknown
+ * or the requested table was not compiled in.
+ */
+static gotoblas_t* force_coretype(char* coretype) {
+	char message[128];
+	size_t idx = 0;
+
+	while (idx < NUM_CORETYPES && strcasecmp(coretype, cpuname[idx]) != 0)
+		idx++;
+
+	switch (idx) {
+	case CPU_GENERIC:
+		return &gotoblas_RISCV64_GENERIC;
+
+	case CPU_RISCV64_ZVL256B:
+#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL256B)
+		return &gotoblas_RISCV64_ZVL256B;
+#else
+		openblas_warning(1,
+				 "riscv64_zvl256b support not compiled in\n");
+		return NULL;
+#endif
+
+	case CPU_RISCV64_ZVL128B:
+#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL128B)
+		return &gotoblas_RISCV64_ZVL128B;
+#else
+		openblas_warning(1,
+				 "riscv64_zvl128b support not compiled in\n");
+		return NULL;
+#endif
+
+	default:
+		/* idx == NUM_CORETYPES: no entry of cpuname[] matched. */
+		break;
+	}
+
+	snprintf(message, sizeof(message), "Core not found: %s\n", coretype);
+	openblas_warning(1, message);
+
+	return NULL;
+}
+
+/*
+ * Select and initialise the kernel table once.
+ *
+ * The OPENBLAS_CORETYPE environment variable, when set, overrides runtime
+ * detection.  If neither yields a table the generic riscv64 kernels are
+ * used.  The process exits when the chosen table has no init function.
+ */
+void gotoblas_dynamic_init(void) {
+
+	char coremsg[128];
+	char* forced;
+
+	/* Already initialised: nothing to do. */
+	if (gotoblas != NULL)
+		return;
+
+	forced = getenv("OPENBLAS_CORETYPE");
+	gotoblas = forced ? force_coretype(forced) : get_coretype();
+
+	if (gotoblas == NULL) {
+		openblas_warning(1, "Falling back to generic riscv64 core\n");
+		gotoblas = &gotoblas_RISCV64_GENERIC;
+	}
+
+	if (!gotoblas->init) {
+		openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
+		exit(1);
+	}
+
+	snprintf(coremsg, sizeof(coremsg), "Core: %s\n",
+		 gotoblas_corename());
+	openblas_warning(2, coremsg);
+	gotoblas->init();
+}
+
+/*
+ * Forget the selected kernel table so that a later
+ * gotoblas_dynamic_init() performs the selection again.
+ */
+void gotoblas_dynamic_quit(void)
+{
+	gotoblas = NULL;
+}
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@ -964,7 +964,9 @@ static void *alloc_shm(void *address){
  return map_address;
 }

-#if defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS
+#endif
+
+#if ((defined ALLOC_HUGETLB) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS))

 static void alloc_hugetlb_free(struct alloc_t *alloc_info){

@ -1066,7 +1068,8 @@ static void *alloc_hugetlb(void *address){
 }
 #endif

-#endif
+
+

 #ifdef  ALLOC_HUGETLBFILE

@ -1165,11 +1168,10 @@ void *blas_memory_alloc(int procpos){
 #ifdef ALLOC_DEVICEDRIVER
    alloc_devicedirver,
 #endif
-/* Hugetlb implicitly assumes ALLOC_SHM */
-#ifdef ALLOC_SHM
+/* Plain shared memory is only a candidate when hugetlb is not selected.
+   (#ifdef accepts a single identifier, so the combined test needs #if.) */
+#if defined(ALLOC_SHM) && !defined(ALLOC_HUGETLB)
     alloc_shm,
 #endif
-#if ((defined ALLOC_SHM) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS))
+#if ((defined ALLOC_HUGETLB) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS))
    alloc_hugetlb,
 #endif
 #ifdef ALLOC_MMAP
@ -1190,7 +1192,6 @@ void *blas_memory_alloc(int procpos){
  struct alloc_t * alloc_info;
  struct alloc_t ** alloc_table;

-
 #if defined(SMP) && !defined(USE_OPENMP)
 int mi;
 LOCK_COMMAND(&alloc_lock);
@ -1282,7 +1283,7 @@ UNLOCK_COMMAND(&alloc_lock);
        }
 #endif

-#if (defined ALLOC_SHM) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS)
+#if (defined ALLOC_HUGETLB) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS)
        if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
 #endif

@ -2494,7 +2495,7 @@ static void *alloc_devicedirver(void *address){

 #endif

-#ifdef ALLOC_SHM
+#if defined(ALLOC_SHM) && !defined(ALLOC_HUGETLB)

 static void alloc_shm_free(struct release_t *release){

@ -2506,7 +2507,9 @@ static void alloc_shm_free(struct release_t *release){
 static void *alloc_shm(void *address){
  void *map_address;
  int shmid;
-
+#ifdef DEBUG
+ fprintf(stderr,"alloc_shm got called\n");
+#endif
  shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600);

  map_address = (void *)shmat(shmid, address, 0);
@ -2533,6 +2536,7 @@ static void *alloc_shm(void *address){

  return map_address;
 }
+#endif

 #if defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS

@ -2562,6 +2566,10 @@ static void *alloc_hugetlb(void *address){

  void *map_address = (void *)-1;

+#ifdef DEBUG
+fprintf(stderr,"alloc_hugetlb got called\n");
+#endif
+
 #if defined(OS_LINUX) || defined(OS_AIX)
  int shmid;

@ -2583,7 +2591,7 @@ static void *alloc_hugetlb(void *address){

    if (map_address != (void *)-1){
      shmctl(shmid, IPC_RMID, 0);
-    }
+    }else printf("alloc_hugetlb failed\n");
  }
 #endif

@ -2645,7 +2653,6 @@ static void *alloc_hugetlb(void *address){
 }
 #endif

-#endif

 #ifdef  ALLOC_HUGETLBFILE

@ -2739,7 +2746,7 @@ struct newmemstruct
 };
 static volatile struct newmemstruct *newmemory;

-static int memory_initialized = 0;
+static volatile int memory_initialized = 0;
 static int memory_overflowed = 0;
 /*       Memory allocation routine           */
 /* procpos ... indicates where it comes from */
@ -2762,11 +2769,10 @@ void *blas_memory_alloc(int procpos){
 #ifdef ALLOC_DEVICEDRIVER
    alloc_devicedirver,
 #endif
-/* Hugetlb implicitly assumes ALLOC_SHM */
-#ifdef ALLOC_SHM
+#if defined(ALLOC_SHM) && !defined(ALLOC_HUGETLB)
    alloc_shm,
 #endif
-#if ((defined ALLOC_SHM) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS))
+#if ((defined ALLOC_HUGETLB) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS))
    alloc_hugetlb,
 #endif
 #ifdef ALLOC_MMAP
@ -2785,14 +2791,12 @@ void *blas_memory_alloc(int procpos){
  };
  void *(**func)(void *address);

-#if defined(USE_OPENMP)
  if (!memory_initialized) {
+#if defined(SMP) && !defined(USE_OPENMP)
+    LOCK_COMMAND(&alloc_lock);
+    if (!memory_initialized) {
 #endif

-  LOCK_COMMAND(&alloc_lock);
-
-  if (!memory_initialized) {
-
 #if defined(WHEREAMI) && !defined(USE_OPENMP)
    for (position = 0; position < NUM_BUFFERS; position ++){
      memory[position].addr   = (void *)0;
@ -2821,12 +2825,12 @@ void *blas_memory_alloc(int procpos){
 #endif

    memory_initialized = 1;
-
+    WMB;
+#if defined(SMP) && !defined(USE_OPENMP)
  }
  UNLOCK_COMMAND(&alloc_lock);
-#if defined(USE_OPENMP)
-  }
 #endif
+}

 #ifdef DEBUG
  printf("Alloc Start ...\n");
@ -2945,8 +2949,22 @@ void *blas_memory_alloc(int procpos){
        }
 #endif

-#if (defined ALLOC_SHM) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS)
+#if (defined ALLOC_HUGETLB) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS)
        if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
+#ifdef DEBUG
+	if (hugetlb_allocated) printf("allocating via shared memory with large page support (hugetlb)\n");
+#endif
+#endif
+
+/* alloc_shm only exists when ALLOC_HUGETLB is not defined, so this check
+   must use the same condition or the build fails with both macros set. */
+#if (defined ALLOC_SHM) && !(defined ALLOC_HUGETLB) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS)
+#ifdef DEBUG
+	printf("allocating via shared memory\n");
+#endif
+        /* Warn when the plain shared-memory allocator could not map a buffer. */
+        if ((*func == alloc_shm) && (map_address == (void *)-1)) {
+#ifndef OS_WINDOWS
+            fprintf(stderr, "OpenBLAS Warning ... shared memory allocation was failed.\n");
+#endif
+	}
 #endif

        func ++;
@ -3061,10 +3079,23 @@ allocation2:
        }
 #endif

-#if (defined ALLOC_SHM) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS)
+#if (defined ALLOC_HUGETLB) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS)
+#ifdef DEBUG
+	fprintf(stderr,"OpenBLAS: allocating via shared memory with large page support (hugetlb)\n");
+#endif
        if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
 #endif

+/* alloc_shm only exists when ALLOC_HUGETLB is not defined, so this check
+   must use the same condition or the build fails with both macros set. */
+#if (defined ALLOC_SHM) && !(defined ALLOC_HUGETLB) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS)
+#ifdef DEBUG
+	fprintf(stderr,"allocating via shared memory\n");
+#endif
+        /* Warn when the plain shared-memory allocator could not map a buffer. */
+        if ((*func == alloc_shm) && (map_address == (void *)-1)) {
+#ifndef OS_WINDOWS
+            fprintf(stderr, "OpenBLAS Warning ... shared memory allocation was failed.\n");
+#endif
+	}
+#endif
        func ++;
      }

--- a/Show More
+++ b/Show More