Merge pull request #3488 from xianyi/develop

Update from develop branch for 0.3.19 release
Commit 488911486a by Martin Kroeker, 2021-12-19 20:54:49 +01:00 (committed by GitHub)
163 changed files with 24400 additions and 998 deletions

View File

@ -3,10 +3,13 @@
##
cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 19)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions
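The lines that follow this comment are not shown in the hunk; the standard CMake way to adhere to the GNU layout is the GNUInstallDirs module, sketched below as an assumption about that omitted context rather than as part of the diff:

include(GNUInstallDirs)  # defines CMAKE_INSTALL_BINDIR, CMAKE_INSTALL_LIBDIR, CMAKE_INSTALL_INCLUDEDIR
message(STATUS "Libraries will be installed to ${CMAKE_INSTALL_FULL_LIBDIR}")

The install() rules near the end of this file rely on exactly these CMAKE_INSTALL_* variables.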
@ -20,51 +23,68 @@ endif()
#######
if(MSVC)
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
endif()
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF)
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
else()
set(NO_AFFINITY 1)
endif()
option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF)
option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF)
option(BUILD_STATIC_LIBS "Build static library" OFF)
if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS)
set(BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE)
endif()
if((BUILD_STATIC_LIBS AND BUILD_SHARED_LIBS) AND MSVC)
message(WARNING "Could not enable both BUILD_STATIC_LIBS and BUILD_SHARED_LIBS with MSVC, Disable BUILD_SHARED_LIBS")
set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build static library" FORCE)
endif()
# Add a prefix or suffix to all exported symbol names in the shared library.
# Avoids conflicts with other BLAS libraries, especially when using
# 64 bit integer interfaces in OpenBLAS.
set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in the shared library to avoid conflicts with other BLAS libraries" )
set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" )
#######
if(BUILD_WITHOUT_LAPACK)
set(NO_LAPACK 1)
set(NO_LAPACKE 1)
endif()
if(BUILD_WITHOUT_CBLAS)
set(NO_CBLAS 1)
endif()
#######
if(MSVC AND MSVC_STATIC_CRT)
set(CompilerFlags
CMAKE_CXX_FLAGS
CMAKE_CXX_FLAGS_DEBUG
CMAKE_CXX_FLAGS_RELEASE
CMAKE_C_FLAGS
CMAKE_C_FLAGS_DEBUG
CMAKE_C_FLAGS_RELEASE
)
foreach(CompilerFlag ${CompilerFlags})
string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
endforeach()
endif()
message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")
@ -98,7 +118,7 @@ endif ()
# set which float types we want to build for
if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16)
# if none are defined, build for all
# set(BUILD_BFLOAT16 true)
set(BUILD_SINGLE true)
set(BUILD_DOUBLE true)
set(BUILD_COMPLEX true)
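A short illustration of how a user would restrict the build to selected precisions at configure time (the particular combination is only an example):

#   cmake -DBUILD_DOUBLE=ON -DBUILD_COMPLEX16=ON ..
# Because at least one type is defined explicitly, the all-types default above is
# skipped and the single-precision variants are simply not built.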
@ -143,9 +163,10 @@ endif ()
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
if(MSVC)
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug)
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release)
endif ()
# get obj vars into format that add_library likes: $<TARGET_OBJS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html)
set(TARGET_OBJS "")
foreach (SUBDIR ${SUBDIRS})
@ -183,12 +204,61 @@ if (${DYNAMIC_ARCH})
endif ()
# add objects to the openblas lib
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)
if(NOT NO_LAPACK)
add_library(LAPACK OBJECT ${LA_SOURCES})
list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACK>")
endif()
if(NOT NO_LAPACKE)
add_library(LAPACKE OBJECT ${LAPACKE_SOURCES})
list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACKE>")
endif()
if(BUILD_RELAPACK)
add_library(RELAPACK OBJECT ${RELA_SOURCES})
list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:RELAPACK>")
endif()
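For readers less familiar with this pattern, a minimal self-contained sketch (generic names, not the OpenBLAS targets) of how OBJECT libraries and $<TARGET_OBJECTS:...> let one set of compiled objects feed both a static and a shared library:

cmake_minimum_required(VERSION 3.5)
project(objlib_demo C)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)      # objects must be PIC to be reused in a shared lib
add_library(core_objs OBJECT core.c)         # compiled once
add_library(extra_objs OBJECT extra.c)       # compiled once
set(ALL_OBJS $<TARGET_OBJECTS:core_objs> $<TARGET_OBJECTS:extra_objs>)
add_library(demo_static STATIC ${ALL_OBJS})  # reuses the object files
add_library(demo_shared SHARED ${ALL_OBJS})  # reuses the same object files
set_target_properties(demo_static demo_shared PROPERTIES OUTPUT_NAME demo)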
set(OpenBLAS_LIBS "")
if(BUILD_STATIC_LIBS)
add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
target_include_directories(${OpenBLAS_LIBNAME}_static INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)
list(APPEND OpenBLAS_LIBS ${OpenBLAS_LIBNAME}_static)
endif()
if(BUILD_SHARED_LIBS)
add_library(${OpenBLAS_LIBNAME}_shared SHARED ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
target_include_directories(${OpenBLAS_LIBNAME}_shared INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)
list(APPEND OpenBLAS_LIBS ${OpenBLAS_LIBNAME}_shared)
endif()
if(BUILD_STATIC_LIBS)
add_library(${OpenBLAS_LIBNAME} ALIAS ${OpenBLAS_LIBNAME}_static)
else()
add_library(${OpenBLAS_LIBNAME} ALIAS ${OpenBLAS_LIBNAME}_shared)
endif()
set_target_properties(${OpenBLAS_LIBS} PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME})
# Android needs to explicitly link against libm
if(ANDROID)
target_link_libraries(${OpenBLAS_LIBNAME} m)
if(BUILD_STATIC_LIBS)
target_link_libraries(${OpenBLAS_LIBNAME}_static m)
endif()
if(BUILD_SHARED_LIBS)
target_link_libraries(${OpenBLAS_LIBNAME}_shared m)
endif()
endif()
if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS)
set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
if (NOT NOFORTRAN)
set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
set (CMAKE_Fortran_CREATE_SHARED_LIBRARY
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
"sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
"sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
"sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'")
else ()
set (CMAKE_C_CREATE_SHARED_LIBRARY
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
"sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
endif ()
endif()
# Handle MSVC exports
@ -197,21 +267,21 @@ if(MSVC AND BUILD_SHARED_LIBS)
include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
else()
# Creates verbose .def file (51KB vs 18KB)
set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true)
set_target_properties(${OpenBLAS_LIBNAME}_shared PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true)
endif()
endif()
# Set output for libopenblas
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS")
set_target_properties( ${OpenBLAS_LIBS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBS} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
set_target_properties( ${OpenBLAS_LIBS} PROPERTIES EXPORT_NAME "OpenBLAS")
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
set_target_properties( ${OpenBLAS_LIBS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
set_target_properties( ${OpenBLAS_LIBS} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
set_target_properties( ${OpenBLAS_LIBS} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
endforeach()
enable_testing()
@ -220,10 +290,17 @@ if (USE_THREAD)
# Add threading library to linker
find_package(Threads)
if (THREADS_HAVE_PTHREAD_ARG)
set_property(TARGET ${OpenBLAS_LIBNAME} PROPERTY COMPILE_OPTIONS "-pthread")
set_property(TARGET ${OpenBLAS_LIBNAME} PROPERTY INTERFACE_COMPILE_OPTIONS "-pthread")
set_target_properties(${OpenBLAS_LIBS} PROPERTIES
COMPILE_OPTIONS "-pthread"
INTERFACE_COMPILE_OPTIONS "-pthread"
)
endif()
if(BUILD_STATIC_LIBS)
target_link_libraries(${OpenBLAS_LIBNAME}_static ${CMAKE_THREAD_LIBS_INIT})
endif()
if(BUILD_SHARED_LIBS)
target_link_libraries(${OpenBLAS_LIBNAME}_shared ${CMAKE_THREAD_LIBS_INIT})
endif()
target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})
endif()
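The hunk above wires the legacy FindThreads result variables into each target; purely as an aside, a minimal sketch of the equivalent modern idiom (an alternative pattern, not what this file does, with mylib as a placeholder target):

find_package(Threads REQUIRED)
# The imported target carries -pthread (or the platform equivalent) for both
# compiling and linking, covering what THREADS_HAVE_PTHREAD_ARG and
# CMAKE_THREAD_LIBS_INIT handle explicitly above.
target_link_libraries(mylib PUBLIC Threads::Threads)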
#if (MSVC OR NOT NOFORTRAN)
@ -239,97 +316,109 @@ if (NOT NOFORTRAN)
add_subdirectory(ctest)
endif()
add_subdirectory(lapack-netlib/TESTING)
if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
add_subdirectory(cpp_thread_test)
endif()
endif()
set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
set_target_properties(${OpenBLAS_LIBS} PROPERTIES
VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}
SOVERSION ${OpenBLAS_MAJOR_VERSION}
)
if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
if (NOT MSVC)
target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition")
target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition")
else()
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE")
endif()
endif()
if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
if (NOT DEFINED ARCH)
set(ARCH_IN "x86_64")
else()
set(ARCH_IN ${ARCH})
endif()
if (${CORE} STREQUAL "generic")
set(ARCH_IN "GENERIC")
endif ()
if (${CORE} STREQUAL "generic")
set(ARCH_IN "GENERIC")
endif ()
if (NOT DEFINED EXPRECISION)
set(EXPRECISION_IN 0)
else()
set(EXPRECISION_IN ${EXPRECISION})
endif()
if (NOT DEFINED NO_CBLAS)
set(NO_CBLAS_IN 0)
else()
set(NO_CBLAS_IN ${NO_CBLAS})
endif()
if (NOT DEFINED NO_LAPACK)
set(NO_LAPACK_IN 0)
else()
set(NO_LAPACK_IN ${NO_LAPACK})
endif()
if (NOT DEFINED NO_LAPACKE)
set(NO_LAPACKE_IN 0)
else()
set(NO_LAPACKE_IN ${NO_LAPACKE})
endif()
if (NOT DEFINED NEED2UNDERSCORES)
set(NEED2UNDERSCORES_IN 0)
else()
set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES})
endif()
if (NOT DEFINED ONLY_CBLAS)
set(ONLY_CBLAS_IN 0)
else()
set(ONLY_CBLAS_IN ${ONLY_CBLAS})
endif()
if (NOT DEFINED BU)
set(BU _)
endif()
if (NOT ${SYMBOLPREFIX} STREQUAL "")
message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
endif()
if (NOT ${SYMBOLSUFFIX} STREQUAL "")
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
endif()
add_custom_command(TARGET ${OpenBLAS_LIBNAME} POST_BUILD
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
COMMENT "renaming symbols"
)
if (NOT ${SYMBOLPREFIX} STREQUAL "")
message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
endif()
if (NOT ${SYMBOLSUFFIX} STREQUAL "")
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
endif()
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
COMMENT "renaming symbols"
)
endif()
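For context on the post-build command above: objcopy --redefine-syms reads a plain text file of old-name/new-name pairs, one pair per line, and the gensymbol script generates that list into objcopy.def. An illustrative, hypothetical excerpt (assuming SYMBOLSUFFIX=64_):

#   dgemm_        dgemm_64_
#   cblas_dgemm   cblas_dgemm64_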
# Install project
# Install libraries
install(TARGETS ${OpenBLAS_LIBNAME}
EXPORT "OpenBLAS${SUFFIX64}Targets"
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
if(BUILD_SHARED_LIBS AND BUILD_STATIC_LIBS)
install(TARGETS ${OpenBLAS_LIBNAME}_shared
EXPORT "OpenBLAS${SUFFIX64}Targets"
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
install(TARGETS ${OpenBLAS_LIBNAME}_static
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
else()
install(TARGETS ${OpenBLAS_LIBS}
EXPORT "OpenBLAS${SUFFIX64}Targets"
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
endif()
# Install headers
set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
@ -365,36 +454,41 @@ if(NOT NOFORTRAN)
endif()
if(NOT NO_CBLAS)
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h)
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
if (NOT ${SYMBOLPREFIX} STREQUAL "")
string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
endif()
if (NOT ${SYMBOLSUFFIX} STREQUAL "")
string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
endif()
file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}")
install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h)
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
if (NOT ${SYMBOLPREFIX} STREQUAL "")
string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
endif()
if (NOT ${SYMBOLSUFFIX} STREQUAL "")
string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
endif()
file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}")
install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
endif()
if(NOT NO_LAPACKE)
message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}")
add_dependencies( ${OpenBLAS_LIBNAME} genlapacke)
FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h")
install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}")
if(BUILD_STATIC_LIBS)
add_dependencies( ${OpenBLAS_LIBNAME}_static genlapacke)
endif()
if(BUILD_SHARED_LIBS)
add_dependencies( ${OpenBLAS_LIBNAME}_shared genlapacke)
endif()
FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h")
install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
ADD_CUSTOM_TARGET(genlapacke
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
)
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
endif()
# Install pkg-config files
@ -419,4 +513,3 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
install(EXPORT "${PN}${SUFFIX64}Targets"
NAMESPACE "${PN}${SUFFIX64}::"
DESTINATION ${CMAKECONFIG_INSTALL_DIR})

View File

@ -197,3 +197,7 @@ In chronological order:
* River Dillon <oss@outerpassage.net>
* [2021-07-10] fix compilation with musl libc
* Bine Brank <https://github.com/binebrank>
* [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE
* [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM

View File

@ -1,4 +1,51 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.19
19-Dec-2021
general:
- reverted unsafe TRSV/ZTRSV optimizations introduced in 0.3.16
- fixed a potential thread race in the thread buffer reallocation routines
that were introduced in 0.3.18
- fixed miscounting of thread pool size on Linux with OMP_PROC_BIND=TRUE
- fixed CBLAS interfaces for CSROT/ZDROT and CROTG/ZROTG
- made automatic library suffix for CMAKE builds with INTERFACE64 available
to CBLAS-only builds
x86_64:
- DYNAMIC_ARCH builds now fall back to the cpu with most similar capabilities
when an unknown CPUID is encountered, instead of defaulting to Prescott
- added cpu detection for Intel Alder Lake
- added cpu detection for Intel Sapphire Rapids
- added an optimized SBGEMM kernel for Sapphire Rapids
- fixed DYNAMIC_ARCH builds on OSX with CMAKE
- worked around DYNAMIC_ARCH builds made on Sandybridge failing on SkylakeX
- fixed missing thread initialization for static builds on Windows/MSVC
- fixed an excessive read in ZSYMV
POWER:
- added support for POWER10 in big-endian mode
- added support for building with CMAKE
- added optimized SGEMM and DGEMM kernels for small matrix sizes
ARMV8:
- added basic support and cputype detection for Fujitsu A64FX
- added a generic ARMV8SVE target
- added SVE-enabled SGEMM and DGEMM kernels for ARMV8SVE and A64FX
- added optimized CGEMM and ZGEMM kernels for Cortex A53 and A55 cpus
- fixed cpuid detection for Apple M1 and improved performance
- improved compiler flag setting in CMAKE builds
RISCV64:
- fixed improper initialization in CSCAL/ZSCAL for strided access patterns
MIPS:
- added a GENERIC target for MIPS32
- added support for cross-compiling to MIPS32 on x86_64 using CMAKE
MIPS64:
- fixed misdetection of MSA capability
====================================================================
Version 0.3.18
02-Oct-2021

View File

@ -32,7 +32,7 @@ export NOFORTRAN
export NO_LAPACK
endif
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test

View File

@ -1,6 +1,9 @@
ifneq ($(C_COMPILER), PGI)
ifneq ($(GCCVERSIONGT4), 1)
ifeq ($(C_COMPILER), CLANG)
ISCLANG=1
endif
ifneq (1, $(filter 1,$(GCCVERSIONGT4) $(ISCLANG)))
CCOMMON_OPT += -march=armv8-a
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a
@ -17,6 +20,13 @@ FCOMMON_OPT += -march=armv8-a
endif
endif
ifeq ($(CORE), ARMV8SVE)
CCOMMON_OPT += -march=armv8-a+sve
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a+sve
endif
endif
ifeq ($(CORE), CORTEXA53)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
ifneq ($(F_COMPILER), NAG)
@ -48,7 +58,7 @@ endif
# Use a72 tunings because Neoverse-N1 is only available
# in GCC>=9
ifeq ($(CORE), NEOVERSEN1)
ifeq ($(GCCVERSIONGTEQ7), 1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
ifeq ($(GCCVERSIONGTEQ9), 1)
CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
ifneq ($(F_COMPILER), NAG)
@ -70,7 +80,7 @@ endif
# Use a53 tunings because a55 is only available in GCC>=8.1
ifeq ($(CORE), CORTEXA55)
ifeq ($(GCCVERSIONGTEQ7), 1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
ifeq ($(GCCVERSIONGTEQ8), 1)
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
ifneq ($(F_COMPILER), NAG)
@ -132,7 +142,7 @@ FCOMMON_OPT += -march=armv8.3-a
endif
endif
ifeq ($(GCCVERSIONGTEQ9), 1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
ifeq ($(CORE), TSV110)
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
ifneq ($(F_COMPILER), NAG)
@ -150,6 +160,15 @@ endif
endif
endif
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
ifeq ($(CORE), A64FX)
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx
endif
endif
endif
endif
endif

View File

@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.3.18
VERSION = 0.3.18.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

View File

@ -9,11 +9,10 @@ ifndef TOPDIR
TOPDIR = .
endif
# If ARCH is not set, we use the host system's architecture for getarch compile options.
ifndef ARCH
# we need to use the host system's architecture for getarch compile options, especially when cross-compiling
HOSTARCH := $(shell uname -m)
else
HOSTARCH = $(ARCH)
ifeq ($(HOSTARCH), amd64)
HOSTARCH=x86_64
endif
# Catch conflicting usage of ARCH in some BSD environments
@ -102,7 +101,7 @@ GETARCH_FLAGS += -DUSER_TARGET
ifeq ($(TARGET), GENERIC)
ifeq ($(DYNAMIC_ARCH), 1)
override NO_EXPRECISION=1
export NO_EXPRECiSION
export NO_EXPRECISION
endif
endif
endif
@ -119,6 +118,9 @@ endif
ifeq ($(TARGET), COOPERLAKE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET), SAPPHIRERAPIDS)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
@ -143,8 +145,13 @@ endif
ifeq ($(TARGET), POWER8)
GETARCH_FLAGS := -DFORCE_POWER6
endif
ifeq ($(TARGET), POWER9)
GETARCH_FLAGS := -DFORCE_POWER6
endif
ifeq ($(TARGET), POWER10)
GETARCH_FLAGS := -DFORCE_POWER6
endif
endif
#TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
#
@ -164,6 +171,9 @@ endif
ifeq ($(TARGET_CORE), COOPERLAKE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET_CORE), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
@ -251,6 +261,8 @@ endif
#For small matrix optimization
ifeq ($(ARCH), x86_64)
SMALL_MATRIX_OPT = 1
else ifeq ($(CORE), POWER10)
SMALL_MATRIX_OPT = 1
endif
ifeq ($(SMALL_MATRIX_OPT), 1)
CCOMMON_OPT += -DSMALL_MATRIX_OPT
@ -260,6 +272,10 @@ endif
ifndef GOTOBLAS_MAKEFILE
export GOTOBLAS_MAKEFILE = 1
# Determine if the assembler is GNU Assembler
HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?)
GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS)
# Generating Makefile.conf and config.h
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
@ -307,7 +323,7 @@ else
SMP = 1
endif
else
ifeq ($(NUM_THREAD), 1)
ifeq ($(NUM_THREADS), 1)
SMP =
else
SMP = 1
@ -892,15 +908,25 @@ endif
ifeq ($(C_COMPILER), PGI)
PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20)
PGCVERSIONGTEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \>= 20)
PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -c 4-5` == 11)
PGCVERSIONEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 20)
PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |cut -d "-" -f 1 |sed -e "s/[^0-9.]//g" |cut -c 4-5` \>= 11)
PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONEQ20)$(PGCMINORVERSIONGE11)
ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 110 111 011))
ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 100 101 011))
NEWPGI := 1
PGCVERSIONGT21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 21)
PGCVERSIONEQ21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 21)
PGCVERSIONCHECK2 := $(PGCVERSIONGT21)$(PGCVERSIONEQ21)$(PGCMINORVERSIONGE11)
ifeq ($(PGCVERSIONCHECK2), $(filter $(PGCVERSIONCHECK2), 100 101 011))
NEWPGI2 := 1
endif
endif
ifdef BINARY64
ifeq ($(ARCH), x86_64)
ifneq ($(NEWPGI2),1)
CCOMMON_OPT += -tp p7-64
else
CCOMMON_OPT += -tp px
endif
ifneq ($(NEWPGI),1)
CCOMMON_OPT += -D__MMX__ -Mnollvm
endif
@ -915,7 +941,11 @@ endif
endif
endif
else
ifneq ($(NEWPGI2),1)
CCOMMON_OPT += -tp p7
else
CCOMMON_OPT += -tp px
endif
endif
endif
@ -1092,8 +1122,12 @@ FCOMMON_OPT += -i8
endif
endif
ifeq ($(ARCH), x86_64)
ifneq ($(NEWPGI2),1)
FCOMMON_OPT += -tp p7-64
else
FCOMMON_OPT += -tp px
endif
else
ifeq ($(ARCH), power)
ifeq ($(CORE), POWER6)
$(warning NVIDIA HPC compilers do not support POWER6.)
@ -1643,8 +1677,10 @@ export HAVE_VFP
export HAVE_VFPV3
export HAVE_VFPV4
export HAVE_NEON
export HAVE_MSA
export MSA_FLAGS
ifndef NO_MSA
export HAVE_MSA
export MSA_FLAGS
endif
export KERNELDIR
export FUNCTION_PROFILE
export TARGET_CORE

View File

@ -81,6 +81,40 @@ CCOMMON_OPT += -march=cooperlake
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=cooperlake
endif
else # gcc does not support it, fall back to avx512
CCOMMON_OPT += -march=skylake-avx512
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=skylake-avx512
endif
endif
endif
ifeq ($(OSNAME), CYGWIN_NT)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
FCOMMON_OPT += -fno-asynchronous-unwind-tables
endif
ifeq ($(OSNAME), WINNT)
ifeq ($(C_COMPILER), GCC)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
FCOMMON_OPT += -fno-asynchronous-unwind-tables
endif
endif
endif
endif
ifeq ($(CORE), SAPPHIRERAPIDS)
ifndef NO_AVX512
ifeq ($(C_COMPILER), GCC)
# Sapphire Rapids support was added in GCC 11
ifeq ($(GCCVERSIONGTEQ11), 1)
CCOMMON_OPT += -march=sapphirerapids
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=sapphirerapids
endif
else # gcc does not support it, fall back to avx512
CCOMMON_OPT += -march=skylake-avx512
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=skylake-avx512
endif
endif
endif
ifeq ($(OSNAME), CYGWIN_NT)

View File

@ -23,6 +23,7 @@ HASWELL
SKYLAKEX
ATOM
COOPERLAKE
SAPPHIRERAPIDS
b)AMD CPU:
ATHLON

View File

@ -29,15 +29,15 @@ environment:
global:
CONDA_INSTALL_LOCN: C:\\Miniconda36-x64
matrix:
- COMPILER: clang-cl
WITH_FORTRAN: ON
- COMPILER: clang-cl
DYNAMIC_ARCH: ON
WITH_FORTRAN: OFF
- COMPILER: cl
- COMPILER: MinGW64-gcc-7.2.0-mingw
DYNAMIC_ARCH: OFF
WITH_FORTRAN: ignore
# - COMPILER: clang-cl
# WITH_FORTRAN: ON
# - COMPILER: clang-cl
# DYNAMIC_ARCH: ON
# WITH_FORTRAN: OFF
# - COMPILER: cl
# - COMPILER: MinGW64-gcc-7.2.0-mingw
# DYNAMIC_ARCH: OFF
# WITH_FORTRAN: ignore
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
COMPILER: MinGW-gcc-6.3.0-32
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
@ -46,6 +46,7 @@ environment:
install:
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
- if [%COMPILER%]==[clang-cl] conda update --yes -n base conda
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
- if [%COMPILER%]==[clang-cl] conda config --set auto_update_conda false
- if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1
@ -64,8 +65,8 @@ before_build:
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
- if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
- if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
- if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DMSVC_STATIC_CRT=ON ..
- if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
- if [%USE_OPENMP%]==[ON] cmake -DUSE_OPENMP=ON ..
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..

View File

@ -76,6 +76,49 @@ jobs:
dir
openblas_utest.exe
- job: Windows_mingw_gmake
pool:
vmImage: 'windows-latest'
steps:
- script: |
mingw32-make CC=gcc FC=gfortran DYNAMIC_ARCH=1 DYNAMIC_LIST="NEHALEM SANDYBRIDGE HASWELL"
- job: Windows_clang_cmake
pool:
vmImage: 'windows-latest'
steps:
- script: |
set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%"
set "LIB=C:\Miniconda\Library\lib;%LIB%"
set "CPATH=C:\Miniconda\Library\include;%CPATH%
conda config --add channels conda-forge --force
conda config --set auto_update_conda false
conda install --yes ninja
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
mkdir build
cd build
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DNOFORTRAN=1 -DMSVC_STATIC_CRT=ON ..
cmake --build . --config Release
ctest
- job: Windows_flang_clang
pool:
vmImage: 'windows-latest'
steps:
- script: |
set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%"
set "LIB=C:\Miniconda\Library\lib;%LIB%"
set "CPATH=C:\Miniconda\Library\include;%CPATH%"
conda config --add channels conda-forge --force
conda config --set auto_update_conda false
conda install --yes --quiet ninja flang
mkdir build
cd build
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
cmake --build . --config Release
ctest
- job: OSX_OpenMP
pool:
vmImage: 'macOS-10.15'
@ -122,7 +165,7 @@ jobs:
make
ctest
- job: OSX_OpenMP_Clang_gf_cmake
- job: OSX_dynarch_cmake
pool:
vmImage: 'macOS-10.15'
variables:
@ -130,12 +173,10 @@ jobs:
LIBRARY_PATH: /usr/local/opt/llvm/lib
steps:
- script: |
brew update
brew install llvm libomp
mkdir build
cd build
cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNO_AVX512=1 ..
make
cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON ..
cmake --build .
ctest
- job: OSX_Ifort_Clang
@ -179,7 +220,7 @@ jobs:
brew update
brew install --cask android-ndk
export ANDROID_NDK_HOME=/usr/local/share/android-ndk
make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4
make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/llvm-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4
- job: OSX_IOS_ARMV8
pool:
@ -206,9 +247,9 @@ jobs:
vmImage: 'ubuntu-latest'
steps:
- script: |
wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.1/alpine-chroot-install \
&& echo '7c7e3fa378e69aecc7f5f01bbc759e5f0a9d9b74 alpine-chroot-install' | sha1sum -c \
|| exit 1
wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \
&& echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc alpine-chroot-install' | sha1sum -c \
|| exit 1
alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo'
alpine make DYNAMIC_ARCH=1 BINARY=64

View File

@ -125,7 +125,7 @@ int main(int argc, char *argv[]){
fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
for(j = 0; j < m; j++){
for(i = 0; i < n * COMPSIZE; i++){
a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
@ -162,7 +162,7 @@ int main(int argc, char *argv[]){
fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
for(j = 0; j < m; j++){
for(i = 0; i < n * COMPSIZE; i++){
a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}

View File

@ -109,7 +109,7 @@ if (${ARCH} STREQUAL "ia64")
endif ()
endif ()
if (MIPS64)
if (MIPS32 OR MIPS64)
set(NO_BINARY_MODE 1)
endif ()

View File

@ -15,6 +15,11 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS
if (NO_BINARY_MODE)
if (MIPS32)
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=32")
set(BINARY_DEFINED 1)
endif ()
if (MIPS64)
if (BINARY64)
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=64")
@ -126,6 +131,65 @@ if (${CORE} STREQUAL COOPERLAKE)
endif ()
endif ()
if (${CORE} STREQUAL SAPPHIRERAPIDS)
if (NOT DYNAMIC_ARCH)
if (NOT NO_AVX512)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=sapphirerapids")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512")
endif()
endif ()
endif ()
endif ()
if (${CORE} STREQUAL A64FX)
if (NOT DYNAMIC_ARCH)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
endif()
endif ()
endif ()
if (${CORE} STREQUAL ARMV8SVE)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
endif ()
endif ()
if (${CORE} STREQUAL POWER10)
if (NOT DYNAMIC_ARCH)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2)
set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math")
else ()
message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10." )
endif()
endif ()
endif ()
if (${CORE} STREQUAL POWER9)
if (NOT DYNAMIC_ARCH)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0)
set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
message(WARNING "Compiler GCC.${GCC_VERSION} does not fully support Power9.")
endif ()
endif ()
endif ()
if (${CORE} STREQUAL POWER8)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
endif ()
endif ()
if (NOT DYNAMIC_ARCH)
if (HAVE_AVX2)
set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2")

View File

@ -3,11 +3,6 @@
## Description: Ported from portion of OpenBLAS/Makefile.system
## Sets Fortran related variables.
if (INTERFACE64)
set(SUFFIX64 64)
set(SUFFIX64_UNDERSCORE _64)
endif()
if (${F_COMPILER} STREQUAL "FLANG")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
if (BINARY64 AND INTERFACE64)

View File

@ -1,214 +1,218 @@
# helper functions for the kernel CMakeLists.txt
function(SetFallback KERNEL SOURCE_PATH)
if (NOT (DEFINED ${KERNEL}))
set(${KERNEL} ${SOURCE_PATH} PARENT_SCOPE)
endif ()
endfunction()
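In other words, SetFallback assigns the default source only when the architecture's KERNEL file has not already chosen one. A tiny illustrative example (the optimized file name is hypothetical):

# Suppose the per-target KERNEL file already did: set(DDOTKERNEL ddot_optimized.c)
SetFallback(DDOTKERNEL dot.S)    # keeps ddot_optimized.c
SetFallback(DAXPYKERNEL axpy.S)  # DAXPYKERNEL was unset, so it becomes axpy.S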
# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file.
macro(SetDefaultL1)
set(SAMAXKERNEL amax.S)
set(DAMAXKERNEL amax.S)
set(QAMAXKERNEL amax.S)
set(CAMAXKERNEL zamax.S)
set(ZAMAXKERNEL zamax.S)
set(XAMAXKERNEL zamax.S)
set(SAMINKERNEL amin.S)
set(DAMINKERNEL amin.S)
set(QAMINKERNEL amin.S)
set(CAMINKERNEL zamin.S)
set(ZAMINKERNEL zamin.S)
set(XAMINKERNEL zamin.S)
set(SMAXKERNEL max.S)
set(DMAXKERNEL max.S)
set(QMAXKERNEL max.S)
set(SMINKERNEL min.S)
set(DMINKERNEL min.S)
set(QMINKERNEL min.S)
set(ISAMAXKERNEL iamax.S)
set(IDAMAXKERNEL iamax.S)
set(IQAMAXKERNEL iamax.S)
set(ICAMAXKERNEL izamax.S)
set(IZAMAXKERNEL izamax.S)
set(IXAMAXKERNEL izamax.S)
set(ISAMINKERNEL iamin.S)
set(IDAMINKERNEL iamin.S)
set(IQAMINKERNEL iamin.S)
set(ICAMINKERNEL izamin.S)
set(IZAMINKERNEL izamin.S)
set(IXAMINKERNEL izamin.S)
set(ISMAXKERNEL iamax.S)
set(IDMAXKERNEL iamax.S)
set(IQMAXKERNEL iamax.S)
set(ISMINKERNEL iamin.S)
set(IDMINKERNEL iamin.S)
set(IQMINKERNEL iamin.S)
set(SASUMKERNEL asum.S)
set(DASUMKERNEL asum.S)
set(CASUMKERNEL zasum.S)
set(ZASUMKERNEL zasum.S)
set(QASUMKERNEL asum.S)
set(XASUMKERNEL zasum.S)
set(SAXPYKERNEL axpy.S)
set(DAXPYKERNEL axpy.S)
set(CAXPYKERNEL zaxpy.S)
set(ZAXPYKERNEL zaxpy.S)
set(QAXPYKERNEL axpy.S)
set(XAXPYKERNEL zaxpy.S)
set(SCOPYKERNEL copy.S)
set(DCOPYKERNEL copy.S)
set(CCOPYKERNEL zcopy.S)
set(ZCOPYKERNEL zcopy.S)
set(QCOPYKERNEL copy.S)
set(XCOPYKERNEL zcopy.S)
set(SDOTKERNEL dot.S)
set(DDOTKERNEL dot.S)
set(CDOTKERNEL zdot.S)
set(ZDOTKERNEL zdot.S)
set(QDOTKERNEL dot.S)
set(XDOTKERNEL zdot.S)
set(SNRM2KERNEL nrm2.S)
set(DNRM2KERNEL nrm2.S)
set(QNRM2KERNEL nrm2.S)
set(CNRM2KERNEL znrm2.S)
set(ZNRM2KERNEL znrm2.S)
set(XNRM2KERNEL znrm2.S)
set(SROTKERNEL rot.S)
set(DROTKERNEL rot.S)
set(QROTKERNEL rot.S)
set(CROTKERNEL zrot.S)
set(ZROTKERNEL zrot.S)
set(XROTKERNEL zrot.S)
set(SSCALKERNEL scal.S)
set(DSCALKERNEL scal.S)
set(CSCALKERNEL zscal.S)
set(ZSCALKERNEL zscal.S)
set(QSCALKERNEL scal.S)
set(XSCALKERNEL zscal.S)
set(SSWAPKERNEL swap.S)
set(DSWAPKERNEL swap.S)
set(CSWAPKERNEL zswap.S)
set(ZSWAPKERNEL zswap.S)
set(QSWAPKERNEL swap.S)
set(XSWAPKERNEL zswap.S)
set(SGEMVNKERNEL gemv_n.S)
set(SGEMVTKERNEL gemv_t.S)
set(DGEMVNKERNEL gemv_n.S)
set(DGEMVTKERNEL gemv_t.S)
set(CGEMVNKERNEL zgemv_n.S)
set(CGEMVTKERNEL zgemv_t.S)
set(ZGEMVNKERNEL zgemv_n.S)
set(ZGEMVTKERNEL zgemv_t.S)
set(QGEMVNKERNEL gemv_n.S)
set(QGEMVTKERNEL gemv_t.S)
set(XGEMVNKERNEL zgemv_n.S)
set(XGEMVTKERNEL zgemv_t.S)
set(SCABS_KERNEL ../generic/cabs.c)
set(DCABS_KERNEL ../generic/cabs.c)
set(QCABS_KERNEL ../generic/cabs.c)
set(LSAME_KERNEL ../generic/lsame.c)
set(SAXPBYKERNEL ../arm/axpby.c)
set(DAXPBYKERNEL ../arm/axpby.c)
set(CAXPBYKERNEL ../arm/zaxpby.c)
set(ZAXPBYKERNEL ../arm/zaxpby.c)
set(SSUMKERNEL sum.S)
set(DSUMKERNEL sum.S)
set(CSUMKERNEL zsum.S)
set(ZSUMKERNEL zsum.S)
set(QSUMKERNEL sum.S)
set(XSUMKERNEL zsum.S)
SetFallback(SAMAXKERNEL amax.S)
SetFallback(DAMAXKERNEL amax.S)
SetFallback(QAMAXKERNEL amax.S)
SetFallback(CAMAXKERNEL zamax.S)
SetFallback(ZAMAXKERNEL zamax.S)
SetFallback(XAMAXKERNEL zamax.S)
SetFallback(SAMINKERNEL amin.S)
SetFallback(DAMINKERNEL amin.S)
SetFallback(QAMINKERNEL amin.S)
SetFallback(CAMINKERNEL zamin.S)
SetFallback(ZAMINKERNEL zamin.S)
SetFallback(XAMINKERNEL zamin.S)
SetFallback(SMAXKERNEL max.S)
SetFallback(DMAXKERNEL max.S)
SetFallback(QMAXKERNEL max.S)
SetFallback(SMINKERNEL min.S)
SetFallback(DMINKERNEL min.S)
SetFallback(QMINKERNEL min.S)
SetFallback(ISAMAXKERNEL iamax.S)
SetFallback(IDAMAXKERNEL iamax.S)
SetFallback(IQAMAXKERNEL iamax.S)
SetFallback(ICAMAXKERNEL izamax.S)
SetFallback(IZAMAXKERNEL izamax.S)
SetFallback(IXAMAXKERNEL izamax.S)
SetFallback(ISAMINKERNEL iamin.S)
SetFallback(IDAMINKERNEL iamin.S)
SetFallback(IQAMINKERNEL iamin.S)
SetFallback(ICAMINKERNEL izamin.S)
SetFallback(IZAMINKERNEL izamin.S)
SetFallback(IXAMINKERNEL izamin.S)
SetFallback(ISMAXKERNEL iamax.S)
SetFallback(IDMAXKERNEL iamax.S)
SetFallback(IQMAXKERNEL iamax.S)
SetFallback(ISMINKERNEL iamin.S)
SetFallback(IDMINKERNEL iamin.S)
SetFallback(IQMINKERNEL iamin.S)
SetFallback(SASUMKERNEL asum.S)
SetFallback(DASUMKERNEL asum.S)
SetFallback(CASUMKERNEL zasum.S)
SetFallback(ZASUMKERNEL zasum.S)
SetFallback(QASUMKERNEL asum.S)
SetFallback(XASUMKERNEL zasum.S)
SetFallback(SAXPYKERNEL axpy.S)
SetFallback(DAXPYKERNEL axpy.S)
SetFallback(CAXPYKERNEL zaxpy.S)
SetFallback(ZAXPYKERNEL zaxpy.S)
SetFallback(QAXPYKERNEL axpy.S)
SetFallback(XAXPYKERNEL zaxpy.S)
SetFallback(SCOPYKERNEL copy.S)
SetFallback(DCOPYKERNEL copy.S)
SetFallback(CCOPYKERNEL zcopy.S)
SetFallback(ZCOPYKERNEL zcopy.S)
SetFallback(QCOPYKERNEL copy.S)
SetFallback(XCOPYKERNEL zcopy.S)
SetFallback(SDOTKERNEL dot.S)
SetFallback(DDOTKERNEL dot.S)
SetFallback(CDOTKERNEL zdot.S)
SetFallback(ZDOTKERNEL zdot.S)
SetFallback(QDOTKERNEL dot.S)
SetFallback(XDOTKERNEL zdot.S)
SetFallback(SNRM2KERNEL nrm2.S)
SetFallback(DNRM2KERNEL nrm2.S)
SetFallback(QNRM2KERNEL nrm2.S)
SetFallback(CNRM2KERNEL znrm2.S)
SetFallback(ZNRM2KERNEL znrm2.S)
SetFallback(XNRM2KERNEL znrm2.S)
SetFallback(SROTKERNEL rot.S)
SetFallback(DROTKERNEL rot.S)
SetFallback(QROTKERNEL rot.S)
SetFallback(CROTKERNEL zrot.S)
SetFallback(ZROTKERNEL zrot.S)
SetFallback(XROTKERNEL zrot.S)
SetFallback(SSCALKERNEL scal.S)
SetFallback(DSCALKERNEL scal.S)
SetFallback(CSCALKERNEL zscal.S)
SetFallback(ZSCALKERNEL zscal.S)
SetFallback(QSCALKERNEL scal.S)
SetFallback(XSCALKERNEL zscal.S)
SetFallback(SSWAPKERNEL swap.S)
SetFallback(DSWAPKERNEL swap.S)
SetFallback(CSWAPKERNEL zswap.S)
SetFallback(ZSWAPKERNEL zswap.S)
SetFallback(QSWAPKERNEL swap.S)
SetFallback(XSWAPKERNEL zswap.S)
SetFallback(SGEMVNKERNEL gemv_n.S)
SetFallback(SGEMVTKERNEL gemv_t.S)
SetFallback(DGEMVNKERNEL gemv_n.S)
SetFallback(DGEMVTKERNEL gemv_t.S)
SetFallback(CGEMVNKERNEL zgemv_n.S)
SetFallback(CGEMVTKERNEL zgemv_t.S)
SetFallback(ZGEMVNKERNEL zgemv_n.S)
SetFallback(ZGEMVTKERNEL zgemv_t.S)
SetFallback(QGEMVNKERNEL gemv_n.S)
SetFallback(QGEMVTKERNEL gemv_t.S)
SetFallback(XGEMVNKERNEL zgemv_n.S)
SetFallback(XGEMVTKERNEL zgemv_t.S)
SetFallback(SCABS_KERNEL ../generic/cabs.c)
SetFallback(DCABS_KERNEL ../generic/cabs.c)
SetFallback(QCABS_KERNEL ../generic/cabs.c)
SetFallback(LSAME_KERNEL ../generic/lsame.c)
SetFallback(SAXPBYKERNEL ../arm/axpby.c)
SetFallback(DAXPBYKERNEL ../arm/axpby.c)
SetFallback(CAXPBYKERNEL ../arm/zaxpby.c)
SetFallback(ZAXPBYKERNEL ../arm/zaxpby.c)
SetFallback(SSUMKERNEL sum.S)
SetFallback(DSUMKERNEL sum.S)
SetFallback(CSUMKERNEL zsum.S)
SetFallback(ZSUMKERNEL zsum.S)
SetFallback(QSUMKERNEL sum.S)
SetFallback(XSUMKERNEL zsum.S)
if (BUILD_BFLOAT16)
set(SHAMINKERNEL ../arm/amin.c)
set(SHAMAXKERNEL ../arm/amax.c)
set(SHMAXKERNEL ../arm/max.c)
set(SHMINKERNEL ../arm/min.c)
set(ISHAMAXKERNEL ../arm/iamax.c)
set(ISHAMINKERNEL ../arm/iamin.c)
set(ISHMAXKERNEL ../arm/imax.c)
set(ISHMINKERNEL ../arm/imin.c)
set(SHASUMKERNEL ../arm/asum.c)
set(SHAXPYKERNEL ../arm/axpy.c)
set(SHAXPBYKERNEL ../arm/axpby.c)
set(SHCOPYKERNEL ../arm/copy.c)
set(SBDOTKERNEL ../x86_64/sbdot.c)
set(SHROTKERNEL ../arm/rot.c)
set(SHSCALKERNEL ../arm/scal.c)
set(SHNRM2KERNEL ../arm/nrm2.c)
set(SHSUMKERNEL ../arm/sum.c)
set(SHSWAPKERNEL ../arm/swap.c)
set(TOBF16KERNEL ../x86_64/tobf16.c)
set(BF16TOKERNEL ../x86_64/bf16to.c)
set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c)
set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c)
SetFallback(SHAMINKERNEL ../arm/amin.c)
SetFallback(SHAMAXKERNEL ../arm/amax.c)
SetFallback(SHMAXKERNEL ../arm/max.c)
SetFallback(SHMINKERNEL ../arm/min.c)
SetFallback(ISHAMAXKERNEL ../arm/iamax.c)
SetFallback(ISHAMINKERNEL ../arm/iamin.c)
SetFallback(ISHMAXKERNEL ../arm/imax.c)
SetFallback(ISHMINKERNEL ../arm/imin.c)
SetFallback(SHASUMKERNEL ../arm/asum.c)
SetFallback(SHAXPYKERNEL ../arm/axpy.c)
SetFallback(SHAXPBYKERNEL ../arm/axpby.c)
SetFallback(SHCOPYKERNEL ../arm/copy.c)
SetFallback(SBDOTKERNEL ../x86_64/sbdot.c)
SetFallback(SHROTKERNEL ../arm/rot.c)
SetFallback(SHSCALKERNEL ../arm/scal.c)
SetFallback(SHNRM2KERNEL ../arm/nrm2.c)
SetFallback(SHSUMKERNEL ../arm/sum.c)
SetFallback(SHSWAPKERNEL ../arm/swap.c)
SetFallback(TOBF16KERNEL ../x86_64/tobf16.c)
SetFallback(BF16TOKERNEL ../x86_64/bf16to.c)
SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c)
SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c)
endif ()
endmacro ()
macro(SetDefaultL2)
set(SGEMVNKERNEL ../arm/gemv_n.c)
set(SGEMVTKERNEL ../arm/gemv_t.c)
set(DGEMVNKERNEL gemv_n.S)
set(DGEMVTKERNEL gemv_t.S)
set(CGEMVNKERNEL zgemv_n.S)
set(CGEMVTKERNEL zgemv_t.S)
set(ZGEMVNKERNEL zgemv_n.S)
set(ZGEMVTKERNEL zgemv_t.S)
set(QGEMVNKERNEL gemv_n.S)
set(QGEMVTKERNEL gemv_t.S)
set(XGEMVNKERNEL zgemv_n.S)
set(XGEMVTKERNEL zgemv_t.S)
set(SGERKERNEL ../generic/ger.c)
set(DGERKERNEL ../generic/ger.c)
set(QGERKERNEL ../generic/ger.c)
set(CGERUKERNEL ../generic/zger.c)
set(CGERCKERNEL ../generic/zger.c)
set(ZGERUKERNEL ../generic/zger.c)
set(ZGERCKERNEL ../generic/zger.c)
set(XGERUKERNEL ../generic/zger.c)
set(XGERCKERNEL ../generic/zger.c)
set(SSYMV_U_KERNEL ../generic/symv_k.c)
set(SSYMV_L_KERNEL ../generic/symv_k.c)
set(DSYMV_U_KERNEL ../generic/symv_k.c)
set(DSYMV_L_KERNEL ../generic/symv_k.c)
set(QSYMV_U_KERNEL ../generic/symv_k.c)
set(QSYMV_L_KERNEL ../generic/symv_k.c)
set(CSYMV_U_KERNEL ../generic/zsymv_k.c)
set(CSYMV_L_KERNEL ../generic/zsymv_k.c)
set(ZSYMV_U_KERNEL ../generic/zsymv_k.c)
set(ZSYMV_L_KERNEL ../generic/zsymv_k.c)
set(XSYMV_U_KERNEL ../generic/zsymv_k.c)
set(XSYMV_L_KERNEL ../generic/zsymv_k.c)
set(CHEMV_U_KERNEL ../generic/zhemv_k.c)
set(CHEMV_L_KERNEL ../generic/zhemv_k.c)
set(CHEMV_V_KERNEL ../generic/zhemv_k.c)
set(CHEMV_M_KERNEL ../generic/zhemv_k.c)
set(ZHEMV_U_KERNEL ../generic/zhemv_k.c)
set(ZHEMV_L_KERNEL ../generic/zhemv_k.c)
set(ZHEMV_V_KERNEL ../generic/zhemv_k.c)
set(ZHEMV_M_KERNEL ../generic/zhemv_k.c)
set(XHEMV_U_KERNEL ../generic/zhemv_k.c)
set(XHEMV_L_KERNEL ../generic/zhemv_k.c)
set(XHEMV_V_KERNEL ../generic/zhemv_k.c)
set(XHEMV_M_KERNEL ../generic/zhemv_k.c)
SetFallback(SGEMVNKERNEL ../arm/gemv_n.c)
SetFallback(SGEMVTKERNEL ../arm/gemv_t.c)
SetFallback(DGEMVNKERNEL gemv_n.S)
SetFallback(DGEMVTKERNEL gemv_t.S)
SetFallback(CGEMVNKERNEL zgemv_n.S)
SetFallback(CGEMVTKERNEL zgemv_t.S)
SetFallback(ZGEMVNKERNEL zgemv_n.S)
SetFallback(ZGEMVTKERNEL zgemv_t.S)
SetFallback(QGEMVNKERNEL gemv_n.S)
SetFallback(QGEMVTKERNEL gemv_t.S)
SetFallback(XGEMVNKERNEL zgemv_n.S)
SetFallback(XGEMVTKERNEL zgemv_t.S)
SetFallback(SGERKERNEL ../generic/ger.c)
SetFallback(DGERKERNEL ../generic/ger.c)
SetFallback(QGERKERNEL ../generic/ger.c)
SetFallback(CGERUKERNEL ../generic/zger.c)
SetFallback(CGERCKERNEL ../generic/zger.c)
SetFallback(ZGERUKERNEL ../generic/zger.c)
SetFallback(ZGERCKERNEL ../generic/zger.c)
SetFallback(XGERUKERNEL ../generic/zger.c)
SetFallback(XGERCKERNEL ../generic/zger.c)
SetFallback(SSYMV_U_KERNEL ../generic/symv_k.c)
SetFallback(SSYMV_L_KERNEL ../generic/symv_k.c)
SetFallback(DSYMV_U_KERNEL ../generic/symv_k.c)
SetFallback(DSYMV_L_KERNEL ../generic/symv_k.c)
SetFallback(QSYMV_U_KERNEL ../generic/symv_k.c)
SetFallback(QSYMV_L_KERNEL ../generic/symv_k.c)
SetFallback(CSYMV_U_KERNEL ../generic/zsymv_k.c)
SetFallback(CSYMV_L_KERNEL ../generic/zsymv_k.c)
SetFallback(ZSYMV_U_KERNEL ../generic/zsymv_k.c)
SetFallback(ZSYMV_L_KERNEL ../generic/zsymv_k.c)
SetFallback(XSYMV_U_KERNEL ../generic/zsymv_k.c)
SetFallback(XSYMV_L_KERNEL ../generic/zsymv_k.c)
SetFallback(CHEMV_U_KERNEL ../generic/zhemv_k.c)
SetFallback(CHEMV_L_KERNEL ../generic/zhemv_k.c)
SetFallback(CHEMV_V_KERNEL ../generic/zhemv_k.c)
SetFallback(CHEMV_M_KERNEL ../generic/zhemv_k.c)
SetFallback(ZHEMV_U_KERNEL ../generic/zhemv_k.c)
SetFallback(ZHEMV_L_KERNEL ../generic/zhemv_k.c)
SetFallback(ZHEMV_V_KERNEL ../generic/zhemv_k.c)
SetFallback(ZHEMV_M_KERNEL ../generic/zhemv_k.c)
SetFallback(XHEMV_U_KERNEL ../generic/zhemv_k.c)
SetFallback(XHEMV_L_KERNEL ../generic/zhemv_k.c)
SetFallback(XHEMV_V_KERNEL ../generic/zhemv_k.c)
SetFallback(XHEMV_M_KERNEL ../generic/zhemv_k.c)
if (BUILD_BFLOAT16)
set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c)
set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c)
set(SHGERKERNEL ../generic/ger.c)
SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c)
SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c)
SetFallback(SHGERKERNEL ../generic/ger.c)
endif ()
endmacro ()
macro(SetDefaultL3)
set(SGEADD_KERNEL ../generic/geadd.c)
set(DGEADD_KERNEL ../generic/geadd.c)
set(CGEADD_KERNEL ../generic/zgeadd.c)
set(ZGEADD_KERNEL ../generic/zgeadd.c)
SetFallback(SGEADD_KERNEL ../generic/geadd.c)
SetFallback(DGEADD_KERNEL ../generic/geadd.c)
SetFallback(CGEADD_KERNEL ../generic/zgeadd.c)
SetFallback(ZGEADD_KERNEL ../generic/zgeadd.c)
if (BUILD_BFLOAT16)
set(SHGEADD_KERNEL ../generic/geadd.c)
set(SBGEMMKERNEL ../generic/gemmkernel_2x2.c)
set(SBGEMM_BETA ../generic/gemm_beta.c)
set(SBGEMMINCOPY ../generic/gemm_ncopy_2.c)
set(SBGEMMITCOPY ../generic/gemm_tcopy_2.c)
set(SBGEMMONCOPY ../generic/gemm_ncopy_2.c)
set(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c)
set(SBGEMMINCOPYOBJ sbgemm_incopy.o)
set(SBGEMMITCOPYOBJ sbgemm_itcopy.o)
set(SBGEMMONCOPYOBJ sbgemm_oncopy.o)
set(SBGEMMOTCOPYOBJ sbgemm_otcopy.o)
SetFallback(SHGEADD_KERNEL ../generic/geadd.c)
SetFallback(SBGEMMKERNEL ../generic/gemmkernel_2x2.c)
SetFallback(SBGEMM_BETA ../generic/gemm_beta.c)
SetFallback(SBGEMMINCOPY ../generic/gemm_ncopy_2.c)
SetFallback(SBGEMMITCOPY ../generic/gemm_tcopy_2.c)
SetFallback(SBGEMMONCOPY ../generic/gemm_ncopy_2.c)
SetFallback(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c)
SetFallback(SBGEMMINCOPYOBJ sbgemm_incopy.o)
SetFallback(SBGEMMITCOPYOBJ sbgemm_itcopy.o)
SetFallback(SBGEMMONCOPYOBJ sbgemm_oncopy.o)
SetFallback(SBGEMMOTCOPYOBJ sbgemm_otcopy.o)
endif ()
endmacro ()

View File

@ -416,7 +416,7 @@ endif ()
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "VORTEX")
elseif ("${TCORE}" STREQUAL "VORTEX")
file(APPEND ${TARGET_CONF_TEMP}
"#define ARMV8\n"
"#define L1_CODE_SIZE\t32768\n"
@ -439,6 +439,34 @@ elseif ("${TCORE}" STREQUAL "VORTEX")
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "P5600")
file(APPEND ${TARGET_CONF_TEMP}
"#define L2_SIZE 1048576\n"
"#define DTB_SIZE 4096\n"
"#define DTB_DEFAULT_ENTRIES 64\n")
set(SGEMM_UNROLL_M 2)
set(SGEMM_UNROLL_N 2)
set(DGEMM_UNROLL_M 2)
set(DGEMM_UNROLL_N 2)
set(CGEMM_UNROLL_M 2)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 2)
set(SYMV_P 16)
elseif ("${TCORE}" MATCHES "MIPS")
file(APPEND ${TARGET_CONF_TEMP}
"#define L2_SIZE 262144\n"
"#define DTB_SIZE 4096\n"
"#define DTB_DEFAULT_ENTRIES 64\n")
set(SGEMM_UNROLL_M 2)
set(SGEMM_UNROLL_N 2)
set(DGEMM_UNROLL_M 2)
set(DGEMM_UNROLL_N 2)
set(CGEMM_UNROLL_M 2)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 2)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "POWER6")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE 32768\n"

View File

@ -33,7 +33,7 @@ endif ()
if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
message(STATUS "Compiling a ${BINARY}-bit binary.")
set(NO_AVX 1)
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE")
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE" OR ${TARGET} STREQUAL "SAPPHIRERAPIDS")
set(TARGET "NEHALEM")
endif ()
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
@ -42,6 +42,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53" OR ${TARGET} STREQUAL "CORTEXA55")
set(TARGET "ARMV7")
endif ()
if (${TARGET} STREQUAL "POWER8" OR ${TARGET} STREQUAL "POWER9" OR ${TARGET} STREQUAL "POWER10")
set(TARGET "POWER6")
endif ()
endif ()
@ -102,6 +105,18 @@ if (CMAKE_C_COMPILER STREQUAL loongcc)
set(GETARCH_FLAGS "${GETARCH_FLAGS} -static")
endif ()
if (POWER)
set(NO_WARMUP 1)
set(HAVE_GAS 1)
if (CMAKE_ASM_COMPILER_ID STREQUAL "GNU")
set(HAVE_GAS 0)
elseif (CMAKE_ASM_COMPILER_ID STREQUAL "Clang")
set(CCOMMON_OPT "${CCOMMON_OPT} -fno-integrated-as")
set(HAVE_GAS 0)
endif ()
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DHAVE_GAS=${HAVE_GAS}")
endif ()
#if don't use Fortran, it will only compile CBLAS.
if (ONLY_CBLAS)
set(NO_LAPACK 1)
@ -163,6 +178,22 @@ if (DEFINED TARGET)
endif()
endif()
endif()
if (${TARGET} STREQUAL SAPPHIRERAPIDS AND NOT NO_AVX512)
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 11.0)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids")
else()
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
endif()
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang")
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 12.0)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids")
else()
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
endif()
endif()
endif()
if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
endif()
@ -206,6 +237,27 @@ if (DEFINED TARGET)
if (DEFINED HAVE_SSE4_1)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1")
endif()
if (${TARGET} STREQUAL POWER10)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math")
else ()
message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10.")
endif()
endif()
if (${TARGET} STREQUAL POWER9)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math")
else ()
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
message(WARNING "Compiler GCC ${GCC_VERSION} does not fully support POWER9.")
endif()
endif()
if (${TARGET} STREQUAL POWER8)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
endif()
endif()
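The POWER blocks above gate -mcpu on the GCC version reported by -dumpversion. A hedged sketch of the same gate expressed with the compiler's own version macros (illustrative, not part of this patch):

/* power_gate.c -- illustrative only; assumes GCC's __GNUC__/__GNUC_MINOR__ macros. */
#include <stdio.h>
int main(void) {
#if defined(__GNUC__) && (__GNUC__ > 10 || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2))
    puts("GCC >= 10.2: ISA 3.1 (-mcpu=power10) kernels can be built");
#elif defined(__GNUC__) && __GNUC__ >= 5
    puts("GCC >= 5: ISA 3.0 (-mcpu=power9) kernels can be built");
#else
    puts("older compiler: only -mcpu=power8 kernels");
#endif
    return 0;
}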
if (DEFINED BINARY)
message(STATUS "Compiling a ${BINARY}-bit binary.")
@ -223,6 +275,11 @@ include("${PROJECT_SOURCE_DIR}/cmake/arch.cmake")
# C Compiler dependent settings
include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
if (INTERFACE64)
set(SUFFIX64 64)
set(SUFFIX64_UNDERSCORE _64)
endif()
if (NOT NOFORTRAN)
# Fortran Compiler dependent settings
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
@ -258,7 +315,7 @@ if (NEED_PIC)
endif()
endif ()
if (X86_64)
if (X86_64 OR ${CORE} STREQUAL POWER10)
set(SMALL_MATRIX_OPT TRUE)
endif ()
if (SMALL_MATRIX_OPT)
@ -266,7 +323,7 @@ if (SMALL_MATRIX_OPT)
endif ()
if (DYNAMIC_ARCH)
if (X86 OR X86_64 OR ARM64 OR PPC)
if (X86 OR X86_64 OR ARM64 OR POWER)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
if (DYNAMIC_OLDER)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")

View File

@ -20,11 +20,11 @@ endif()
if(CMAKE_COMPILER_IS_GNUCC AND WIN32)
if(MINGW)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine
OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE
OUTPUT_VARIABLE OPENBLAS_MINGW_TARGET_MACHINE
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(OPENBLAS_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64")
if(OPENBLAS_MINGW_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64")
set(MINGW64 1)
endif()
endif()
@ -35,7 +35,7 @@ if(CMAKE_CL_64 OR MINGW64)
elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING))
set(X86 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
set(PPC 1)
set(POWER 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
set(MIPS64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*")
@ -73,6 +73,8 @@ elseif (${CMAKE_CROSSCOMPILING})
else ()
set(X86 1)
endif()
elseif (${TARGET} STREQUAL "P5600" OR ${TARGET} MATCHES "MIPS.*")
set(MIPS32 1)
elseif (${TARGET} STREQUAL "ARMV7")
set(ARM 1)
else()
@ -86,8 +88,12 @@ if (X86_64)
set(ARCH "x86_64")
elseif(X86)
set(ARCH "x86")
elseif(PPC)
elseif(POWER)
set(ARCH "power")
elseif(MIPS32)
set(ARCH "mips")
elseif(MIPS64)
set(ARCH "mips64")
elseif(ARM)
set(ARCH "arm")
elseif(ARM64)
@ -97,7 +103,7 @@ else()
endif ()
if (NOT BINARY)
if (X86_64 OR ARM64 OR PPC OR MIPS64 OR LOONGARCH64)
if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64)
set(BINARY 64)
else ()
set(BINARY 32)

View File

@ -15,35 +15,83 @@ endfunction ()
# Reads a Makefile into CMake vars.
macro(ParseMakefileVars MAKEFILE_IN)
message(STATUS "Reading vars from ${MAKEFILE_IN}...")
set (IfElse 0)
set (ElseSeen 0)
set (C_COMPILER ${CMAKE_C_COMPILER_ID})
set (IfElse 0)
set (ElseSeen 0)
set (SkipIfs 0)
set (SkipElse 0)
file(STRINGS ${MAKEFILE_IN} makefile_contents)
foreach (makefile_line ${makefile_contents})
#message(STATUS "parsing ${makefile_line}")
if (${IfElse} GREATER 0)
#message(STATUS "parsing ${makefile_line}")
# Skip the entire body of an else branch whose preceding if condition was true.
# SkipIfs counts nested if statements inside the skipped body, so we know which endif closes it.
if (${SkipElse} EQUAL 1)
#message(STATUS "skipping ${makefile_line}")
string(REGEX MATCH "(ifeq|ifneq|ifdef|ifndef) .*$" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
MATH(EXPR SkipIfs "${SkipIfs}+1")
endif ()
string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
# message(STATUS "ENDIF ${makefile_line}")
set (IfElse 0)
set (ElseSeen 0)
if (${SkipIfs} EQUAL 0)
set (SkipElse 0)
else ()
MATH(EXPR SkipIfs "${SkipIfs}-1")
endif ()
endif ()
continue ()
endif ()
# IfElse is non-zero while we are inside an if/else block: 1 when the condition was true, 2 when it was false.
if (${IfElse} GREATER 0)
# When the current branch is being skipped, any nested if/else/endif statements
# inside it have to be skipped as well, up to the endif that closes that branch.
string(REGEX MATCH "(ifeq|ifneq|ifdef|ifndef) .*$" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
if ((${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR (${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1))
#message(STATUS "skipping ${makefile_line}")
MATH(EXPR SkipIfs "${SkipIfs}+1")
continue ()
endif ()
endif ()
string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
if (${SkipIfs} EQUAL 0)
#message(STATUS "ENDIF ${makefile_line}")
set (IfElse 0)
set (ElseSeen 0)
else ()
#message(STATUS "skipping ${makefile_line}")
MATH(EXPR SkipIfs "${SkipIfs}-1")
endif ()
continue ()
endif ()
string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
# message(STATUS "ELSE ${makefile_line}")
set (ElseSeen 1)
continue ()
endif()
if ( (${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR ( ${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1))
# message(STATUS "skipping ${makefile_line}")
continue ()
if (NOT "${line_match}" STREQUAL "")
if (${SkipIfs} EQUAL 0)
#message(STATUS "ELSE ${makefile_line}")
set (ElseSeen 1)
else ()
#message(STATUS "skipping ${makefile_line}")
endif ()
continue ()
endif()
# Skip lines that are not on the branch being taken.
if ((${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR (${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1) OR (${SkipIfs} GREATER 0))
#message(STATUS "skipping ${makefile_line}")
continue ()
endif ()
endif ()
# Skip commented lines (the ones that start with '#')
string(REGEX MATCH "[ \t]*\\#.*$" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
#message(STATUS "skipping ${makefile_line}")
continue ()
endif ()
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
#message(STATUS "match on ${line_match}")
#message(STATUS "match on ${line_match}")
set(var_name ${CMAKE_MATCH_1})
# set(var_value ${CMAKE_MATCH_2})
#set(var_value ${CMAKE_MATCH_2})
string(STRIP ${CMAKE_MATCH_2} var_value)
# check for Makefile variables in the string, e.g. $(TSUFFIX)
string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value})
@ -54,39 +102,93 @@ macro(ParseMakefileVars MAKEFILE_IN)
string(REPLACE "$(${make_var})" "${${make_var}}" var_value ${var_value})
endforeach ()
set(${var_name} ${var_value})
else ()
string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
#message(STATUS "match on include ${line_match}")
ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1})
else ()
# message(STATUS "unmatched line ${line_match}")
string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
# message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}")
if (DEFINED ${${CMAKE_MATCH_1}} AND ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})
# message (STATUS "condition is true")
set (IfElse 1)
else ()
set (IfElse 2)
endif ()
continue ()
endif ()
# Include a new file to be parsed
string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
#message(STATUS "match on include ${line_match}")
ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1})
continue ()
endif ()
# The if branch that precedes this else was taken,
# so the body of this else has to be skipped.
string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
#message(STATUS "skipping ${makefile_line}")
set (SkipElse 1)
continue()
endif()
# Example 1: ifdef HAVE_MSA
# Example 2: ifndef ZNRM2KERNEL
string(REGEX MATCH "(ifdef|ifndef) ([0-9_A-Z]+)" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
#message(STATUS "${CMAKE_MATCH_1} first: ${CMAKE_MATCH_2}")
set (ElseSeen 0)
if (DEFINED ${CMAKE_MATCH_2})
if (${CMAKE_MATCH_1} STREQUAL "ifdef")
#message (STATUS "condition is true")
set (IfElse 1)
else ()
string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
# message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}")
if ( ${CMAKE_MATCH_1} STREQUAL C_COMPILER)
set (CMAKE_MATCH_1 CMAKE_C_COMPILER)
endif ()
if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}))
# message (STATUS "condition is true")
set (IfElse 1)
else ()
set (IfElse 2)
endif ()
endif ()
set (IfElse 2)
endif ()
else ()
if (${CMAKE_MATCH_1} STREQUAL "ifdef")
set (IfElse 2)
else ()
#message (STATUS "condition is true")
set (IfElse 1)
endif ()
endif ()
continue ()
endif ()
# Example 1: ifeq ($(SGEMM_UNROLL_M), 16)
# Example 2: ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8)
# Example 3: ifeq ($(__BYTE_ORDER__)$(ELF_VERSION),__ORDER_BIG_ENDIAN__2)
# Ignore the second group, since non-capturing groups (?:...) are not supported by CMake regular expressions
string(REGEX MATCH "ifeq \\(\\$\\(([0-9_A-Z]+)\\)(([0-9_A-Za-z]*)\\$\\(([0-9_A-Z]+)\\))?,[ \t]*([0-9_A-Za-z]+)\\)" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
#message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_3} third: ${CMAKE_MATCH_4} fourth: ${CMAKE_MATCH_5}")
if (DEFINED ${CMAKE_MATCH_1})
if (DEFINED ${CMAKE_MATCH_4})
set (STR ${${CMAKE_MATCH_1}}${CMAKE_MATCH_3}${${CMAKE_MATCH_4}})
else ()
set (STR ${${CMAKE_MATCH_1}})
endif ()
if (${STR} STREQUAL ${CMAKE_MATCH_5})
#message (STATUS "condition is true")
set (IfElse 1)
continue ()
endif ()
endif ()
set (IfElse 2)
continue ()
endif ()
# Example 1 (Group 3): ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
# Example 2 (Group 4): ifneq ($(C_COMPILER), PGI)
string(REGEX MATCH "ifneq \\(\\$\\(([0-9_A-Z]+)\\),[ \t]*(\\$\\(([0-9_A-Z]+)\\)|([0-9_A-Z]+))\\)" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
#message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_3} third: ${CMAKE_MATCH_4}")
set (ElseSeen 0)
set (HasValidGroup 0)
if (DEFINED ${CMAKE_MATCH_3})
set (HasValidGroup 1)
set (STR ${${CMAKE_MATCH_3}})
elseif (NOT ${CMAKE_MATCH_4} STREQUAL "")
set (HasValidGroup 1)
set (STR ${CMAKE_MATCH_4})
endif ()
if (DEFINED ${CMAKE_MATCH_1} AND ${HasValidGroup} EQUAL 1)
if (NOT (${${CMAKE_MATCH_1}} STREQUAL ${STR}))
#message (STATUS "condition is true")
set (IfElse 1)
continue ()
endif ()
endif ()
set (IfElse 2)
continue ()
endif ()
#message(STATUS "unmatched line ${line_match}")
endforeach ()
endmacro ()
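The macro above walks a Makefile line by line, keeping variable assignments only on the applicable branch of each ifeq/ifneq/ifdef/ifndef and counting nested ifs so it knows which endif closes a skipped region. An illustrative C sketch of the same bookkeeping, on a toy input (not from this patch):

/* parse_skip.c -- illustrative only. branch: 0 = outside any if,
 * 1 = condition was true, 2 = condition was false (like IfElse);
 * else_seen marks that the else of the current block was reached;
 * skip_depth counts nested ifs inside a skipped region (like SkipIfs). */
#include <stdio.h>
#include <string.h>

int main(void) {
    /* toy input: only the first condition is treated as true */
    const char *lines[] = { "ifeq (A,A)", "VAR1 = yes", "else",
                            "ifdef MISSING", "VAR2 = no", "endif",
                            "endif", "VAR3 = always", NULL };
    int branch = 0, else_seen = 0, skip_depth = 0;
    for (int i = 0; lines[i] != NULL; i++) {
        const char *l = lines[i];
        int skipping = skip_depth > 0 ||
                       (branch == 2 && !else_seen) ||  /* false condition, before its else */
                       (branch == 1 && else_seen);     /* true condition, inside its else  */
        if (strncmp(l, "if", 2) == 0) {                /* ifeq/ifneq/ifdef/ifndef */
            if (skipping) { skip_depth++; continue; }
            branch = (i == 0) ? 1 : 2;                 /* pretend only the first test passes */
            else_seen = 0;
            continue;
        }
        if (strcmp(l, "endif") == 0) {
            if (skip_depth > 0) skip_depth--;
            else { branch = 0; else_seen = 0; }
            continue;
        }
        if (strcmp(l, "else") == 0) {
            if (skip_depth == 0) else_seen = 1;
            continue;
        }
        if (!skipping) printf("kept: %s\n", l);
    }
    return 0;
}

Running it keeps only "VAR1 = yes" and "VAR3 = always"; the block under the skipped else, including its nested ifdef, is ignored, just as ParseMakefileVars ignores it.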

View File

@ -1,13 +1,14 @@
include ../Makefile.rule
TOPDIR = ..
include $(TOPDIR)/Makefile.system
all :: dgemv_tester dgemm_tester
dgemv_tester :
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester
./dgemv_tester
dgemm_tester : dgemv_tester
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester
./dgemm_tester
clean ::

View File

@ -120,6 +120,7 @@
#define CORE_SKYLAKEX 28
#define CORE_DHYANA 29
#define CORE_COOPERLAKE 30
#define CORE_SAPPHIRERAPIDS 31
#define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1)
@ -145,6 +146,7 @@
#define HAVE_AVX512VL (1 << 21)
#define HAVE_AVX2 (1 << 22)
#define HAVE_AVX512BF16 (1 << 23)
#define HAVE_AMXBF16 (1 << 24)
#define CACHE_INFO_L1_I 1
#define CACHE_INFO_L1_D 2
@ -222,6 +224,7 @@ typedef struct {
#define CPUTYPE_SKYLAKEX 52
#define CPUTYPE_DHYANA 53
#define CPUTYPE_COOPERLAKE 54
#define CPUTYPE_SAPPHIRERAPIDS 55
#define CPUTYPE_HYGON_UNKNOWN 99

View File

@ -26,10 +26,12 @@
*****************************************************************************/
#include <string.h>
#ifdef OS_DARWIN
#ifdef __APPLE__
#include <sys/sysctl.h>
int32_t value;
size_t length=sizeof(value);
int64_t value64;
size_t length64=sizeof(value64);
#endif
#define CPU_UNKNOWN 0
@ -53,6 +55,8 @@ size_t length=sizeof(value);
#define CPU_EMAG8180 10
// Apple
#define CPU_VORTEX 13
// Fujitsu
#define CPU_A64FX 15
static char *cpuname[] = {
"UNKNOWN",
@ -69,7 +73,8 @@ static char *cpuname[] = {
"NEOVERSEN1",
"THUNDERX3T110",
"VORTEX",
"CORTEXA55"
"CORTEXA55",
"A64FX"
};
static char *cpuname_lower[] = {
@ -87,7 +92,8 @@ static char *cpuname_lower[] = {
"neoversen1",
"thunderx3t110",
"vortex",
"cortexa55"
"cortexa55",
"a64fx"
};
int get_feature(char *search)
@ -183,6 +189,9 @@ int detect(void)
// Ampere
else if (strstr(cpu_implementer, "0x50") && strstr(cpu_part, "0x000"))
return CPU_EMAG8180;
// Fujitsu
else if (strstr(cpu_implementer, "0x46") && strstr(cpu_part, "0x001"))
return CPU_A64FX;
}
p = (char *) NULL ;
@ -212,9 +221,9 @@ int detect(void)
}
#else
#ifdef DARWIN
#ifdef __APPLE__
sysctlbyname("hw.cpufamily",&value,&length,NULL,0);
if (value ==131287967) return CPU_VORTEX;
if (value ==131287967|| value == 458787763 ) return CPU_VORTEX;
#endif
return CPU_ARMV8;
#endif
@ -265,7 +274,7 @@ int n=0;
printf("#define NUM_CORES %d\n",n);
#endif
#ifdef DARWIN
#ifdef __APPLE__
sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0);
printf("#define NUM_CORES %d\n",value);
#endif
@ -285,154 +294,166 @@ void get_cpuconfig(void)
switch (d)
{
case CPU_CORTEXA53:
case CPU_CORTEXA55:
printf("#define %s\n", cpuname[d]);
// Fall-through
case CPU_ARMV8:
// Minimum parameters for ARMv8 (based on A53)
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 262144\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
case CPU_CORTEXA53:
case CPU_CORTEXA55:
printf("#define %s\n", cpuname[d]);
// Fall-through
case CPU_ARMV8:
// Minimum parameters for ARMv8 (based on A53)
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 262144\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
break;
case CPU_CORTEXA57:
case CPU_CORTEXA72:
case CPU_CORTEXA73:
case CPU_CORTEXA57:
case CPU_CORTEXA72:
case CPU_CORTEXA73:
// Common minimum settings for these Arm cores
// Can change a lot, but we need to be conservative
// TODO: detect info from /sys if possible
printf("#define %s\n", cpuname[d]);
printf("#define L1_CODE_SIZE 49152\n");
printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_CODE_ASSOCIATIVE 3\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L1_DATA_ASSOCIATIVE 2\n");
printf("#define L2_SIZE 524288\n");
printf("#define L2_LINESIZE 64\n");
printf("#define L2_ASSOCIATIVE 16\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
case CPU_NEOVERSEN1:
printf("#define %s\n", cpuname[d]);
printf("#define L1_CODE_SIZE 65536\n");
printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_CODE_ASSOCIATIVE 4\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L1_DATA_ASSOCIATIVE 4\n");
printf("#define L2_SIZE 1048576\n");
printf("#define L2_LINESIZE 64\n");
printf("#define L2_ASSOCIATIVE 16\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
printf("#define %s\n", cpuname[d]);
printf("#define L1_CODE_SIZE 49152\n");
printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_CODE_ASSOCIATIVE 3\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L1_DATA_ASSOCIATIVE 2\n");
printf("#define L2_SIZE 524288\n");
printf("#define L2_LINESIZE 64\n");
printf("#define L2_ASSOCIATIVE 16\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
case CPU_NEOVERSEN1:
printf("#define %s\n", cpuname[d]);
printf("#define L1_CODE_SIZE 65536\n");
printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_CODE_ASSOCIATIVE 4\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L1_DATA_ASSOCIATIVE 4\n");
printf("#define L2_SIZE 1048576\n");
printf("#define L2_LINESIZE 64\n");
printf("#define L2_ASSOCIATIVE 16\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
case CPU_FALKOR:
printf("#define FALKOR\n");
printf("#define L1_CODE_SIZE 65536\n");
printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 128\n");
printf("#define L2_SIZE 524288\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 16\n");
break;
case CPU_FALKOR:
printf("#define FALKOR\n");
printf("#define L1_CODE_SIZE 65536\n");
printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 128\n");
printf("#define L2_SIZE 524288\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 16\n");
break;
case CPU_THUNDERX:
printf("#define THUNDERX\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 128\n");
printf("#define L2_SIZE 16777216\n");
printf("#define L2_LINESIZE 128\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 16\n");
break;
case CPU_THUNDERX:
printf("#define THUNDERX\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 128\n");
printf("#define L2_SIZE 16777216\n");
printf("#define L2_LINESIZE 128\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 16\n");
break;
case CPU_THUNDERX2T99:
printf("#define THUNDERX2T99 \n");
printf("#define L1_CODE_SIZE 32768 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
printf("#define L1_DATA_SIZE 32768 \n");
printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
printf("#define L2_SIZE 262144 \n");
printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define L3_SIZE 33554432 \n");
printf("#define L3_LINESIZE 64 \n");
printf("#define L3_ASSOCIATIVE 32 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
case CPU_THUNDERX2T99:
printf("#define THUNDERX2T99 \n");
printf("#define L1_CODE_SIZE 32768 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
printf("#define L1_DATA_SIZE 32768 \n");
printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
printf("#define L2_SIZE 262144 \n");
printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define L3_SIZE 33554432 \n");
printf("#define L3_LINESIZE 64 \n");
printf("#define L3_ASSOCIATIVE 32 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
case CPU_TSV110:
printf("#define TSV110 \n");
printf("#define L1_CODE_SIZE 65536 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 4 \n");
printf("#define L1_DATA_SIZE 65536 \n");
printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 4 \n");
printf("#define L2_SIZE 524228 \n");
printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
case CPU_TSV110:
printf("#define TSV110 \n");
printf("#define L1_CODE_SIZE 65536 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 4 \n");
printf("#define L1_DATA_SIZE 65536 \n");
printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 4 \n");
printf("#define L2_SIZE 524228 \n");
printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
case CPU_EMAG8180:
// Minimum parameters for ARMv8 (based on A53)
printf("#define EMAG8180\n");
printf("#define L1_CODE_SIZE 32768\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 262144\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
case CPU_EMAG8180:
// Minimum parameters for ARMv8 (based on A53)
printf("#define EMAG8180\n");
printf("#define L1_CODE_SIZE 32768\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 262144\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
case CPU_THUNDERX3T110:
printf("#define THUNDERX3T110 \n");
printf("#define L1_CODE_SIZE 65536 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
printf("#define L1_DATA_SIZE 32768 \n");
printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
printf("#define L2_SIZE 524288 \n");
printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define L3_SIZE 94371840 \n");
printf("#define L3_LINESIZE 64 \n");
printf("#define L3_ASSOCIATIVE 32 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
#ifdef DARWIN
case CPU_VORTEX:
printf("#define VORTEX \n");
sysctlbyname("hw.l1icachesize",&value,&length,NULL,0);
printf("#define L1_CODE_SIZE %d \n",value);
sysctlbyname("hw.cachelinesize",&value,&length,NULL,0);
printf("#define L1_CODE_LINESIZE %d \n",value);
sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0);
printf("#define L1_DATA_SIZE %d \n",value);
sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0);
printf("#define L2_SIZE %d \n",value);
break;
case CPU_THUNDERX3T110:
printf("#define THUNDERX3T110 \n");
printf("#define L1_CODE_SIZE 65536 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
printf("#define L1_DATA_SIZE 32768 \n");
printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
printf("#define L2_SIZE 524288 \n");
printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define L3_SIZE 94371840 \n");
printf("#define L3_LINESIZE 64 \n");
printf("#define L3_ASSOCIATIVE 32 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
#ifdef __APPLE__
case CPU_VORTEX:
printf("#define VORTEX \n");
sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0);
printf("#define L1_CODE_SIZE %lld \n",value64);
sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0);
printf("#define L1_CODE_LINESIZE %lld \n",value64);
sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0);
printf("#define L1_DATA_SIZE %lld \n",value64);
sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0);
printf("#define L2_SIZE %lld \n",value64);
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
#endif
case CPU_A64FX:
printf("#define A64FX\n");
printf("#define L1_CODE_SIZE 65535\n");
printf("#define L1_DATA_SIZE 65535\n");
printf("#define L1_DATA_LINESIZE 256\n");
printf("#define L2_SIZE 8388608\n");
printf("#define L2_LINESIZE 256\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
}
get_cpucount();
}
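The VORTEX branch above switches the sysctl cache queries to 64-bit storage (value64/length64), since these macOS keys report 64-bit integers on Apple Silicon. A standalone sketch of the same query pattern (macOS only, illustrative, not part of this patch):

/* cachesize_sysctl.c -- illustrative, macOS only. */
#include <stdio.h>
#include <stdint.h>
#include <sys/sysctl.h>
int main(void) {
    int64_t l1d = 0, line = 0;
    size_t len = sizeof(l1d);
    if (sysctlbyname("hw.l1dcachesize", &l1d, &len, NULL, 0) != 0) return 1;
    len = sizeof(line);
    if (sysctlbyname("hw.cachelinesize", &line, &len, NULL, 0) != 0) return 1;
    printf("#define L1_DATA_SIZE %lld\n", (long long)l1d);
    printf("#define L1_DATA_LINESIZE %lld\n", (long long)line);
    return 0;
}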

View File

@ -165,6 +165,7 @@ void get_cpuconfig(void){
}else{
printf("#define UNKNOWN\n");
}
if (!get_feature(msa)) printf("#define NO_MSA\n");
}
void get_libname(void){
@ -178,3 +179,38 @@ void get_libname(void){
printf("mips\n");
}
}
int get_feature(char *search)
{
#ifdef __linux
FILE *infile;
char buffer[2048], *p,*t;
p = (char *) NULL ;
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile))
{
if (!strncmp("Features", buffer, 8))
{
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if( p == NULL ) return 0;
t = strtok(p," ");
while( t = strtok(NULL," "))
{
if (!strcmp(t, search)) { return(1); }
}
#endif
return(0);
}

View File

@ -104,17 +104,17 @@ int detect(void){
}
}
fclose(infile);
if(p != NULL){
if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){
return CPU_LOONGSON3R3;
}else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){
return CPU_LOONGSON3R4;
} else{
return CPU_SICORTEX;
if (p != NULL){
if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){
return CPU_LOONGSON3R3;
} else if (strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){
return CPU_LOONGSON3R4;
} else{
return CPU_SICORTEX;
}
}
#endif
return CPU_UNKNOWN;
}
}
char *get_corename(void){
@ -201,6 +201,7 @@ void get_cpuconfig(void){
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 8\n");
}
if (!get_feature(msa)) printf("#define NO_MSA\n");
}
void get_libname(void){
@ -218,3 +219,38 @@ void get_libname(void){
printf("mips64\n");
}
}
int get_feature(char *search)
{
#ifdef __linux
FILE *infile;
char buffer[2048], *p,*t;
p = (char *) NULL ;
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile))
{
if (!strncmp("Features", buffer, 8))
{
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if( p == NULL ) return 0;
t = strtok(p," ");
while( t = strtok(NULL," "))
{
if (!strcmp(t, search)) { return(1); }
}
#endif
return(0);
}

View File

@ -1,3 +1,4 @@
//{
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
@ -266,6 +267,31 @@ int support_avx512_bf16(){
#endif
}
#define BIT_AMX_TILE 0x01000000
#define BIT_AMX_BF16 0x00400000
#define BIT_AMX_ENBD 0x00060000
int support_amx_bf16() {
#if !defined(NO_AVX) && !defined(NO_AVX512)
int eax, ebx, ecx, edx;
int ret=0;
if (!support_avx512())
return 0;
// CPUID.7.0:EDX indicates AMX support
cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
if ((edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) {
// CPUID.(EAX=0xD,ECX=0):EAX bits 17-18 report XSAVE support for the AMX tile state (XTILECFG/XTILEDATA)
cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
if ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD)
ret = 1;
}
return ret;
#else
return 0;
#endif
}
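support_amx_bf16() relies on OpenBLAS's own cpuid_count() helper. For reference, a hedged sketch of the same two sub-leaf queries using GCC/Clang's <cpuid.h> (illustrative, not part of this patch):

/* amx_probe.c -- illustrative only; the sub-leaf queries mirror the
 * function above. Requires GCC/Clang <cpuid.h>. */
#include <stdio.h>
#include <cpuid.h>
int main(void) {
    unsigned int eax, ebx, ecx, edx;
    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) return 1;
    printf("AMX-TILE=%u AMX-BF16=%u\n",
           (edx >> 24) & 1u,    /* matches BIT_AMX_TILE 0x01000000 */
           (edx >> 22) & 1u);   /* matches BIT_AMX_BF16 0x00400000 */
    if (!__get_cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx)) return 1;
    printf("XSAVE tile state supported=%d\n", ((eax >> 17) & 3u) == 3u); /* BIT_AMX_ENBD */
    return 0;
}

Bit 24 (AMX-TILE) and bit 22 (AMX-BF16) of CPUID.(7,0):EDX correspond to BIT_AMX_TILE and BIT_AMX_BF16 above, and bits 17-18 of CPUID.(0xD,0):EAX correspond to BIT_AMX_ENBD.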
int get_vendor(void){
int eax, ebx, ecx, edx;
char vendor[13];
@ -353,6 +379,7 @@ int get_cputype(int gettype){
if (support_avx2()) feature |= HAVE_AVX2;
if (support_avx512()) feature |= HAVE_AVX512VL;
if (support_avx512_bf16()) feature |= HAVE_AVX512BF16;
if (support_amx_bf16()) feature |= HAVE_AMXBF16;
if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3;
#endif
@ -1429,10 +1456,10 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
}
break;
case 9:
case 8:
switch (model) {
case 12: // Tiger Lake
case 13: // Tiger Lake (11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz)
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
@ -1448,19 +1475,10 @@ int get_cpuname(void){
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
}
case 10: //family 6 exmodel 10
switch (model) {
case 5: // Comet Lake H and S
case 6: // Comet Lake U
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
case 7: // Rocket Lake
if(support_avx512())
case 15: // Sapphire Rapids
if(support_avx512_bf16())
return CPUTYPE_COOPERLAKE;
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
@ -1468,8 +1486,57 @@ int get_cpuname(void){
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
}
break;
}
break;
case 9:
switch (model) {
case 7: // Alder Lake desktop
case 10: // Alder Lake mobile
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
case 13: // Ice Lake NNPI
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
case 14: // Kaby Lake and refreshes
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
}
break;
case 10: //family 6 exmodel 10
switch (model) {
case 5: // Comet Lake H and S
case 6: // Comet Lake U
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
case 7: // Rocket Lake
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
}
break;
}
break;
case 0x7:
@ -2042,32 +2109,7 @@ int get_coretype(void){
return CORE_NEHALEM;
}
break;
case 10:
switch (model) {
case 5: // Comet Lake H and S
case 6: // Comet Lake U
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
case 7:// Rocket Lake
#ifndef NO_AVX512
if(support_avx512())
return CORE_SKYLAKEX;
#endif
#ifndef NO_AVX2
if(support_avx2())
return CORE_HASWELL;
#endif
if(support_avx())
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM;
}
case 5:
switch (model) {
case 6:
@ -2121,6 +2163,7 @@ int get_coretype(void){
return CORE_NEHALEM;
}
break;
case 6:
if (model == 6)
#ifndef NO_AVX512
@ -2135,7 +2178,7 @@ int get_coretype(void){
else
return CORE_NEHALEM;
#endif
if (model == 10)
if (model == 10 || model == 12)
#ifndef NO_AVX512
if(support_avx512_bf16())
return CORE_COOPERLAKE;
@ -2151,10 +2194,11 @@ int get_coretype(void){
return CORE_NEHALEM;
#endif
break;
case 7:
if (model == 10)
return CORE_NEHALEM;
if (model == 14)
if (model == 13 || model == 14) // Ice Lake
#ifndef NO_AVX512
return CORE_SKYLAKEX;
#else
@ -2168,9 +2212,9 @@ int get_coretype(void){
return CORE_NEHALEM;
#endif
break;
case 9:
case 8:
if (model == 12) { // Tiger Lake
if (model == 12 || model == 13) { // Tiger Lake
if(support_avx512())
return CORE_SKYLAKEX;
if(support_avx2())
@ -2180,7 +2224,7 @@ int get_coretype(void){
else
return CORE_NEHALEM;
}
if (model == 14) { // Kaby Lake
if (model == 14) { // Kaby Lake mobile
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
@ -2190,12 +2234,82 @@ int get_coretype(void){
else
return CORE_NEHALEM;
}
}
if (model == 15) { // Sapphire Rapids
if(support_avx512_bf16())
return CORE_COOPERLAKE;
if(support_avx512())
return CORE_SKYLAKEX;
if(support_avx2())
return CORE_HASWELL;
if(support_avx())
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM;
}
break;
case 9:
if (model == 7 || model == 10) { // Alder Lake
if(support_avx2())
return CORE_HASWELL;
if(support_avx())
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM;
}
if (model == 13) { // Ice Lake NNPI
if(support_avx512())
return CORE_SKYLAKEX;
if(support_avx2())
return CORE_HASWELL;
if(support_avx())
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM;
}
if (model == 14) { // Kaby Lake desktop
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
}
break;
case 10:
switch (model) {
case 5: // Comet Lake H and S
case 6: // Comet Lake U
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
case 7:// Rocket Lake
#ifndef NO_AVX512
if(support_avx512())
return CORE_SKYLAKEX;
#endif
#ifndef NO_AVX2
if(support_avx2())
return CORE_HASWELL;
#endif
if(support_avx())
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM;
}
case 15:
if (model <= 0x2) return CORE_NORTHWOOD;
else return CORE_PRESCOTT;
}
}
}
@ -2389,6 +2503,7 @@ void get_cpuconfig(void){
if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n");
if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n");
if (features & HAVE_AVX512BF16 ) printf("#define HAVE_AVX512BF16\n");
if (features & HAVE_AMXBF16 ) printf("#define HAVE_AMXBF16\n");
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
@ -2460,9 +2575,11 @@ void get_sse(void){
if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n");
if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n");
if (features & HAVE_AVX512BF16 ) printf("HAVE_AVX512BF16=1\n");
if (features & HAVE_AMXBF16 ) printf("HAVE_AMXBF16=1\n");
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n");
}
//}

View File

@ -27,57 +27,11 @@
#include <string.h>
#define CPU_GENERIC 0
#define CPU_Z13 1
#define CPU_Z14 2
#define CPU_Z15 3
#include "cpuid_zarch.h"
static char *cpuname[] = {
"ZARCH_GENERIC",
"Z13",
"Z14",
"Z15"
};
static char *cpuname_lower[] = {
"zarch_generic",
"z13",
"z14",
"z15"
};
int detect(void)
{
FILE *infile;
char buffer[512], *p;
p = (char *)NULL;
infile = fopen("/proc/sysinfo", "r");
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("Type", buffer, 4)){
p = strchr(buffer, ':') + 2;
#if 0
fprintf(stderr, "%s\n", p);
#endif
break;
}
}
fclose(infile);
if (strstr(p, "2964")) return CPU_Z13;
if (strstr(p, "2965")) return CPU_Z13;
if (strstr(p, "3906")) return CPU_Z14;
if (strstr(p, "3907")) return CPU_Z14;
if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14
if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14
return CPU_GENERIC;
}
void get_libname(void)
{
int d = detect();
printf("%s", cpuname_lower[d]);
}

101
cpuid_zarch.h Normal file
View File

@ -0,0 +1,101 @@
#include <stdlib.h>
#define CPU_GENERIC 0
#define CPU_Z13 1
#define CPU_Z14 2
#define CPU_Z15 3
static char *cpuname[] = {
"ZARCH_GENERIC",
"Z13",
"Z14",
"Z15"
};
static char *cpuname_lower[] = {
"zarch_generic",
"z13",
"z14",
"z15"
};
// Guard the use of getauxval() on glibc version >= 2.16
#ifdef __GLIBC__
#include <features.h>
#if __GLIBC_PREREQ(2, 16)
#include <sys/auxv.h>
#define HAVE_GETAUXVAL 1
static unsigned long get_hwcap(void)
{
unsigned long hwcap = getauxval(AT_HWCAP);
char *maskenv;
// honor requests for not using specific CPU features in LD_HWCAP_MASK
maskenv = getenv("LD_HWCAP_MASK");
if (maskenv)
hwcap &= strtoul(maskenv, NULL, 0);
return hwcap;
// note that a missing auxval is interpreted as no capabilities
// available, which is safe.
}
#else // __GLIBC_PREREQ(2, 16)
#warning "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16"
static unsigned long get_hwcap(void) {
// treat missing support for getauxval() as no capabilities available,
// which is safe.
return 0;
}
#endif // __GLIBC_PREREQ(2, 16)
#endif // __GLIBC
static int detect(void)
{
unsigned long hwcap = get_hwcap();
// Choose the architecture level for optimized kernels based on hardware
// capability bits (just like glibc chooses optimized implementations).
//
// The hardware capability bits that are used here indicate both
// hardware support for a particular ISA extension and the presence of
// software support to enable its use. For example, when HWCAP_S390_VX
// is set then both the CPU can execute SIMD instructions and the Linux
// kernel can manage applications using the vector registers and SIMD
// instructions.
//
// See glibc's sysdeps/s390/dl-procinfo.h for an overview (also in
// sysdeps/unix/sysv/linux/s390/bits/hwcap.h) of the defined hardware
// capability bits. They are derived from the information that the
// "store facility list (extended)" instructions provide.
// (https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/s390/dl-procinfo.h;hb=HEAD)
//
// currently used:
// HWCAP_S390_VX - vector facility for z/Architecture (introduced with
// IBM z13), enables level CPU_Z13 (SIMD)
// HWCAP_S390_VXE - vector enhancements facility 1 (introduced with IBM
// z14), together with VX enables level CPU_Z14
// (single-precision SIMD instructions)
//
// When you add optimized kernels that make use of other ISA extensions
// (e.g., for exploiting the vector-enhancements facility 2 that was introduced
// with IBM z15), then add a new architecture level (e.g., CPU_Z15) and gate
// it on the hwcap that represents it here (e.g., HWCAP_S390_VXRS_EXT2
// for the z15 vector enhancements).
//
// To learn the value of hwcaps on a given system, set the environment
// variable LD_SHOW_AUXV and let ld.so dump it (e.g., by running
// LD_SHOW_AUXV=1 /bin/true).
// Also, the init function for dynamic arch support will print hwcaps
// when OPENBLAS_VERBOSE is set to 2 or higher.
if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE))
return CPU_Z14;
if (hwcap & HWCAP_S390_VX)
return CPU_Z13;
return CPU_GENERIC;
}
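A standalone way to see the same hwcap bits detect() keys on (Linux on s390x, glibc 2.16 or newer assumed; the bit values mirror glibc's HWCAP_S390_VX and HWCAP_S390_VXE and are defined locally so the sketch does not depend on a particular header version):

/* zarch_hwcap.c -- illustrative only. */
#include <stdio.h>
#include <sys/auxv.h>
#define VX_BIT  2048UL   /* vector facility, z13 */
#define VXE_BIT 8192UL   /* vector enhancements 1, z14 */
int main(void) {
    unsigned long hwcap = getauxval(AT_HWCAP);
    printf("hwcap=0x%lx VX=%d VXE=%d\n", hwcap,
           !!(hwcap & VX_BIT), !!(hwcap & VXE_BIT));
    return 0;
}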

View File

@ -333,7 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#else
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
#if defined(SKYLAKEX) || defined(COOPERLAKE)
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else

View File

@ -367,7 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Split local region of B into parts */
for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){
min_jj = MIN(n_to, js + div_n) - jjs;
#if defined(SKYLAKEX) || defined(COOPERLAKE)
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else

View File

@ -138,7 +138,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
#if defined(SKYLAKEX) || defined(COOPERLAKE)
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
@ -215,7 +215,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
#if defined(SKYLAKEX) || defined(COOPERLAKE)
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
@ -320,7 +320,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
#if defined(SKYLAKEX) || defined(COOPERLAKE)
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
@ -399,7 +399,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
#if defined(SKYLAKEX) || defined(COOPERLAKE)
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else

View File

@ -122,7 +122,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = 0; jjs < ls - js; jjs += min_jj){
min_jj = ls - js - jjs;
#if defined(SKYLAKEX) || defined(COOPERLAKE)
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
@ -146,7 +146,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = 0; jjs < min_l; jjs += min_jj){
min_jj = min_l - jjs;
#if defined(SKYLAKEX) || defined(COOPERLAKE)
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
@ -203,7 +203,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
#if defined(SKYLAKEX) || defined(COOPERLAKE)
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
@ -258,7 +258,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = 0; jjs < min_l; jjs += min_jj){
min_jj = min_l - jjs;
#if defined(SKYLAKEX) || defined(COOPERLAKE)
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
@ -283,7 +283,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){
min_jj = js - ls - min_l - jjs;
#if defined(SKYLAKEX) || defined(COOPERLAKE)
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
@ -344,7 +344,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
#if defined(SKYLAKEX) || defined(COOPERLAKE)
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
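A worked example of the clamp that the SAPPHIRERAPIDS define is being added to in the level3 drivers above (the unroll value is assumed; the real one comes from the per-target build configuration):

/* min_jj_clamp.c -- illustrative only. */
#include <stdio.h>
int main(void) {
    const int GEMM_UNROLL_N = 8;   /* assumed; set by the build config for the target */
    int min_jj = 500;              /* columns remaining in the current block */
    if (min_jj >= 6 * GEMM_UNROLL_N) min_jj = 6 * GEMM_UNROLL_N;
    printf("packed panel width used: %d columns\n", min_jj);   /* 48 */
    return 0;
}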

View File

@ -49,6 +49,8 @@ GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 ""
if (DYNAMIC_ARCH)
if (ARM64)
list(APPEND COMMON_SOURCES dynamic_arm64.c)
elseif (POWER)
list(APPEND COMMON_SOURCES dynamic_power.c)
else ()
list(APPEND COMMON_SOURCES dynamic.c)
endif ()

View File

@ -40,7 +40,7 @@
#include <stdlib.h>
#include "common.h"
#if defined(OS_CYGWIN_NT) && !defined(unlikely)
#if !defined(unlikely)
#ifdef __GNUC__
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
@ -391,8 +391,9 @@ int blas_thread_init(void){
int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
#if defined(SMP_SERVER)
// Handle lazy re-init of the thread-pool after a POSIX fork
// on Cygwin or as delayed init when a static library is used
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif

View File

@ -624,7 +624,7 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM;
}
}
if (model == 10) {
if (model == 10 || model == 12){
// Ice Lake SP
if(support_avx512_bf16())
return &gotoblas_COOPERLAKE;
@ -644,7 +644,7 @@ static gotoblas_t *get_coretype(void){
case 7:
if (model == 10) // Goldmont Plus
return &gotoblas_NEHALEM;
if (model == 14) {
if (model == 13 || model == 14) {
// Ice Lake
if (support_avx512())
return &gotoblas_SKYLAKEX;
@ -661,9 +661,8 @@ static gotoblas_t *get_coretype(void){
}
}
return NULL;
case 9:
case 8:
if (model == 12) { // Tiger Lake
if (model == 12 || model == 13) { // Tiger Lake
if (support_avx512())
return &gotoblas_SKYLAKEX;
if(support_avx2()){
@ -689,6 +688,50 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
if (model == 15){ // Sapphire Rapids
if(support_avx512_bf16())
return &gotoblas_COOPERLAKE;
if (support_avx512())
return &gotoblas_SKYLAKEX;
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
return NULL;
case 9:
if (model == 7 || model == 10) { // Alder Lake
if(support_avx2()){
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
return &gotoblas_HASWELL;
}
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
if (model == 14 ) { // Kaby Lake, Coffee Lake
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
return NULL;
case 10:
if (model == 5 || model == 6) {
if(support_avx2())
@ -1018,7 +1061,13 @@ void gotoblas_dynamic_init(void) {
#ifdef ARCH_X86
if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI;
#else
if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT;
if (gotoblas == NULL) {
if (support_avx512_bf16()) gotoblas = &gotoblas_COOPERLAKE;
else if (support_avx512()) gotoblas = &gotoblas_SKYLAKEX;
else if (support_avx2()) gotoblas = &gotoblas_HASWELL;
else if (support_avx()) gotoblas = &gotoblas_SANDYBRIDGE;
else gotoblas = &gotoblas_PRESCOTT;
}
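A quick way to check which kernel set the dispatcher settled on at runtime, using existing OpenBLAS entry points (usage sketch, not part of this patch):

/* which_core.c -- usage sketch; openblas_get_config() and
 * openblas_get_corename() are existing OpenBLAS APIs. Link with -lopenblas. */
#include <stdio.h>
extern char* openblas_get_config(void);
extern char* openblas_get_corename(void);
int main(void) {
    printf("config: %s\n", openblas_get_config());
    printf("core:   %s\n", openblas_get_corename());
    return 0;
}

Setting OPENBLAS_VERBOSE=2 also makes the dispatcher report its choice at initialization, as in the "Core:" reporting shown for the zarch dispatcher further below.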
/* sanity check, if 64bit pointer we can't have a 32 bit cpu */
if (sizeof(void*) == 8) {
if (gotoblas == &gotoblas_KATMAI ||

View File

@ -1,38 +1,7 @@
#include "common.h"
#include "cpuid_zarch.h"
#include <stdbool.h>
// Guard the use of getauxval() on glibc version >= 2.16
#ifdef __GLIBC__
#include <features.h>
#if __GLIBC_PREREQ(2, 16)
#include <sys/auxv.h>
#define HAVE_GETAUXVAL 1
static unsigned long get_hwcap(void)
{
unsigned long hwcap = getauxval(AT_HWCAP);
char *maskenv;
// honor requests for not using specific CPU features in LD_HWCAP_MASK
maskenv = getenv("LD_HWCAP_MASK");
if (maskenv)
hwcap &= strtoul(maskenv, NULL, 0);
return hwcap;
// note that a missing auxval is interpreted as no capabilities
// available, which is safe.
}
#else // __GLIBC_PREREQ(2, 16)
#warn "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16"
static unsigned long get_hwcap(void) {
// treat missing support for getauxval() as no capabilities available,
// which is safe.
return 0;
}
#endif // __GLIBC_PREREQ(2, 16)
#endif // __GLIBC
extern gotoblas_t gotoblas_ZARCH_GENERIC;
#ifdef DYN_Z13
@ -44,25 +13,19 @@ extern gotoblas_t gotoblas_Z14;
#define NUM_CORETYPES 4
extern int openblas_verbose();
extern void openblas_warning(int verbose, const char* msg);
static char* corename[] = {
"unknown",
"Z13",
"Z14",
"ZARCH_GENERIC",
};
char* gotoblas_corename(void) {
#ifdef DYN_Z13
if (gotoblas == &gotoblas_Z13) return corename[1];
if (gotoblas == &gotoblas_Z13) return cpuname[CPU_Z13];
#endif
#ifdef DYN_Z14
if (gotoblas == &gotoblas_Z14) return corename[2];
if (gotoblas == &gotoblas_Z14) return cpuname[CPU_Z14];
#endif
if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3];
if (gotoblas == &gotoblas_ZARCH_GENERIC) return cpuname[CPU_GENERIC];
return corename[0];
return "unknown";
}
#ifndef HWCAP_S390_VXE
@ -79,25 +42,28 @@ char* gotoblas_corename(void) {
*/
static gotoblas_t* get_coretype(void) {
unsigned long hwcap __attribute__((unused)) = get_hwcap();
int cpu = detect();
#ifdef DYN_Z14
switch(cpu) {
// z14 and z15 systems: exploit Vector Facility (SIMD) and
// Vector-Enhancements Facility 1 (float SIMD instructions), if present.
if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE))
case CPU_Z14:
#ifdef DYN_Z14
return &gotoblas_Z14;
#endif
#ifdef DYN_Z13
// z13: Vector Facility (SIMD for double)
if (hwcap & HWCAP_S390_VX)
case CPU_Z13:
#ifdef DYN_Z13
return &gotoblas_Z13;
#endif
default:
// fallback in case of missing compiler support, systems before z13, or
// when the OS does not advertise support for the Vector Facility (e.g.,
// missing support in the OS kernel)
return &gotoblas_ZARCH_GENERIC;
return &gotoblas_ZARCH_GENERIC;
}
}
static gotoblas_t* force_coretype(char* coretype) {
@ -108,28 +74,28 @@ static gotoblas_t* force_coretype(char* coretype) {
for (i = 0; i < NUM_CORETYPES; i++)
{
if (!strncasecmp(coretype, corename[i], 20))
if (!strncasecmp(coretype, cpuname[i], 20))
{
found = i;
break;
}
}
if (found == 1) {
if (found == CPU_Z13) {
#ifdef DYN_Z13
return &gotoblas_Z13;
#else
openblas_warning(1, "Z13 support not compiled in");
return NULL;
#endif
} else if (found == 2) {
} else if (found == CPU_Z14) {
#ifdef DYN_Z14
return &gotoblas_Z14;
#else
openblas_warning(1, "Z14 support not compiled in");
return NULL;
#endif
} else if (found == 3) {
} else if (found == CPU_GENERIC) {
return &gotoblas_ZARCH_GENERIC;
}
@ -155,6 +121,11 @@ void gotoblas_dynamic_init(void) {
else
{
gotoblas = get_coretype();
if (openblas_verbose() >= 2) {
snprintf(coremsg, sizeof(coremsg), "Choosing kernels based on getauxval(AT_HWCAP)=0x%lx\n",
getauxval(AT_HWCAP));
openblas_warning(2, coremsg);
}
}
if (gotoblas == NULL)
@ -165,9 +136,11 @@ void gotoblas_dynamic_init(void) {
}
if (gotoblas && gotoblas->init) {
strncpy(coren, gotoblas_corename(), 20);
sprintf(coremsg, "Core: %s\n", coren);
openblas_warning(2, coremsg);
if (openblas_verbose() >= 2) {
strncpy(coren, gotoblas_corename(), 20);
sprintf(coremsg, "Core: %s\n", coren);
openblas_warning(2, coremsg);
}
gotoblas->init();
}
else {

View File

@ -246,6 +246,14 @@ int get_num_procs(void) {
#endif
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
#if defined(USE_OPENMP)
#if _OPENMP >= 201511
nums = omp_get_num_places();
#endif
return nums;
#endif
#if !defined(OS_LINUX)
return nums;
#endif
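The hunk above makes get_num_procs() consult the OpenMP place list when OpenMP 4.5 (_OPENMP >= 201511) is available. A small sketch comparing the two counts (illustrative; note that some runtimes report 0 places when no OMP_PLACES/OMP_PROC_BIND binding is requested):

/* nprocs_places.c -- illustrative only; build with -fopenmp. */
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
int main(void) {
    long conf = sysconf(_SC_NPROCESSORS_CONF);
#if _OPENMP >= 201511
    printf("sysconf: %ld, OpenMP places: %d\n", conf, omp_get_num_places());
#else
    printf("sysconf: %ld (omp_get_num_places needs OpenMP 4.5)\n", conf);
#endif
    return 0;
}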
@ -1806,6 +1814,15 @@ int get_num_procs(void) {
#endif
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
#if defined(USE_OPENMP)
/* if (omp_get_proc_bind() != omp_proc_bind_false) */
#if _OPENMP >= 201511
nums = omp_get_num_places();
#endif
return nums;
#endif
#if !defined(OS_LINUX)
return nums;
#endif
@ -2854,32 +2871,28 @@ void *blas_memory_alloc(int procpos){
position ++;
} while (position < NUM_BUFFERS);
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
if (memory_overflowed) {
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
do {
RMB;
do {
RMB;
#if defined(USE_OPENMP)
if (!newmemory[position-NUM_BUFFERS].used) {
blas_lock(&newmemory[position-NUM_BUFFERS].lock);
if (!newmemory[position-NUM_BUFFERS].used) {
blas_lock(&newmemory[position-NUM_BUFFERS].lock);
#endif
if (!newmemory[position-NUM_BUFFERS].used) goto allocation2;
if (!newmemory[position-NUM_BUFFERS].used) goto allocation2;
#if defined(USE_OPENMP)
blas_unlock(&newmemory[position-NUM_BUFFERS].lock);
}
blas_unlock(&newmemory[position-NUM_BUFFERS].lock);
}
#endif
position ++;
position ++;
} while (position < 512+NUM_BUFFERS);
} while (position < 512+NUM_BUFFERS);
}
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
}
goto error;
allocation :
@ -2904,7 +2917,7 @@ void *blas_memory_alloc(int procpos){
func = &memoryalloc[0];
while ((func != NULL) && (map_address == (void *) -1)) {
while ((*func != NULL) && (map_address == (void *) -1)) {
map_address = (*func)((void *)base_address);
@ -2984,6 +2997,9 @@ void *blas_memory_alloc(int procpos){
return (void *)memory[position].addr;
error:
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
if (memory_overflowed) goto terminate;
fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n");
memory_overflowed=1;
@ -2997,7 +3013,6 @@ void *blas_memory_alloc(int procpos){
newmemory[i].used = 0;
newmemory[i].lock = 0;
}
newmemory[position-NUM_BUFFERS].used = 1;
allocation2:
newmemory[position-NUM_BUFFERS].used = 1;
@ -3015,7 +3030,7 @@ allocation2:
func = &memoryalloc[0];
while ((func != NULL) && (map_address == (void *) -1)) {
while ((*func != NULL) && (map_address == (void *) -1)) {
map_address = (*func)((void *)base_address);
@ -3069,6 +3084,9 @@ allocation2:
return (void *)newmemory[position-NUM_BUFFERS].addr;
terminate:
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS);
printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n");

View File

@ -183,7 +183,7 @@ int get_L2_size(void){
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || \
defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE)
defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
@ -269,7 +269,7 @@ void blas_set_parameter(void){
int factor;
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || \
defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || \
defined(SKYLAKEX) || defined(COOPERLAKE)
defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
int size = 16;
#else
int size = get_L2_size();

View File

@ -469,6 +469,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#endif
#ifdef FORCE_SAPPHIRERAPIDS
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#ifdef NO_AVX512
#ifdef NO_AVX2
#ifdef NO_AVX
#define SUBARCHITECTURE "NEHALEM"
#define ARCHCONFIG "-DNEHALEM " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
#define LIBNAME "nehalem"
#define CORENAME "NEHALEM"
#else
#define SUBARCHITECTURE "SANDYBRIDGE"
#define ARCHCONFIG "-DSANDYBRIDGE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
#define LIBNAME "sandybridge"
#define CORENAME "SANDYBRIDGE"
#endif
#else
#define SUBARCHITECTURE "HASWELL"
#define ARCHCONFIG "-DHASWELL " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
"-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
#define LIBNAME "haswell"
#define CORENAME "HASWELL"
#endif
#else
#define SUBARCHITECTURE "SAPPHIRERAPIDS"
#define ARCHCONFIG "-DSAPPHIRERAPIDS " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
"-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=sapphirerapids"
#define LIBNAME "sapphirerapids"
#define CORENAME "SAPPHIRERAPIDS"
#endif
#endif
#ifdef FORCE_ATOM
#define FORCE
#define FORCE_INTEL
@ -964,7 +1013,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DP5600 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA"
#define LIBNAME "p5600"
#define CORENAME "P5600"
#else
@ -978,7 +1027,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DMIPS1004K " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA"
#define LIBNAME "mips1004K"
#define CORENAME "MIPS1004K"
#else
@ -992,7 +1041,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DMIPS24K " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=32768 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA"
#define LIBNAME "mips24K"
#define CORENAME "MIPS24K"
#else
@ -1149,6 +1198,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif
#ifdef FORCE_ARMV8SVE
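/* Generic ARMv8 target with SVE enabled in addition to the baseline NEON/VFP feature set. */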
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "ARMV8SVE"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DARMV8SVE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8"
#define LIBNAME "armv8sve"
#define CORENAME "ARMV8SVE"
#endif
#ifdef FORCE_ARMV8
#define FORCE
@ -1375,6 +1438,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "VORTEX"
#endif
#ifdef FORCE_A64FX
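/* Fujitsu A64FX target: SVE-capable ARMv8 with explicit L1/L2 cache geometry and no L3. */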
#define ARMV8
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "A64FX"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DA64FX " \
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=256 -DL1_CODE_ASSOCIATIVE=8 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=256 -DL1_DATA_ASSOCIATIVE=8 " \
"-DL2_SIZE=8388608 -DL2_LINESIZE=256 -DL2_ASSOCIATIVE=8 " \
"-DL3_SIZE=0 -DL3_LINESIZE=0 -DL3_ASSOCIATIVE=0 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8"
#define LIBNAME "a64fx"
#define CORENAME "A64FX"
#else
#endif
#ifdef FORCE_ZARCH_GENERIC
#define FORCE
#define ARCHITECTURE "ZARCH"

View File

@ -188,12 +188,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
if (n == 0) return;
if (incx == 1 && trans == 0 && n < 50) {
buffer = NULL;
(trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer);
return;
}
IDEBUG_START;
FUNCTION_PROFILE_START();

View File

@ -42,14 +42,20 @@
#include "functable.h"
#endif
#ifndef CBLAS
void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C, FLOAT *S){
BLASLONG n = *N;
BLASLONG incx = *INCX;
BLASLONG incy = *INCY;
FLOAT c = *C;
FLOAT s = *S;
#else
void CNAME(blasint n, void *VX, blasint incx, void *VY, blasint incy, FLOAT c, FLOAT s) {
FLOAT *x = (FLOAT*) VX;
FLOAT *y = (FLOAT*) VY;
#endif /* CBLAS */
PRINT_DEBUG_NAME;
if (n <= 0) return;
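
For context, a minimal caller-side sketch of the CBLAS entry point introduced by this hunk. The concrete routine name and header are assumptions (cblas_csrot is shown as one plausible instance of CNAME, with x and y holding interleaved single-precision complex values and real c/s, matching the signature above):

#include <cblas.h>   /* assumed header; adjust to the installed OpenBLAS headers */

int main(void) {
    /* two complex numbers per vector, stored as interleaved (re, im) pairs */
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float y[4] = {5.0f, 6.0f, 7.0f, 8.0f};
    /* apply the plane rotation with real cosine/sine, as in the CNAME signature */
    cblas_csrot(2, x, 1, y, 1, 0.8f, 0.6f);
    return 0;
}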

View File

@ -4,8 +4,16 @@
#include "functable.h"
#endif
#ifndef CBLAS
void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
#else
void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
FLOAT *DA = (FLOAT*) VDA;
FLOAT *DB = (FLOAT*) VDB;
FLOAT *S = (FLOAT*) VS;
#endif /* CBLAS */
#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86)
long double da_r = *(DA + 0);

View File

@ -199,12 +199,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
if (n == 0) return;
if (incx == 1 && trans == 0 && n < 50) {
buffer = NULL;
(trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer);
return;
}
IDEBUG_START;
FUNCTION_PROFILE_START();

View File

@ -9,11 +9,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
if (${DYNAMIC_ARCH})
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
endif ()
ParseMakefileVars("${KERNELDIR}/KERNEL")
ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}")
SetDefaultL1()
SetDefaultL2()
SetDefaultL3()
ParseMakefileVars("${KERNELDIR}/KERNEL")
ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}")
set(KERNEL_INTERFACE common_level1.h common_level2.h common_level3.h)
if(NOT NO_LAPACK)
@ -198,7 +198,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
# Makefile.L3
set(USE_TRMM false)
string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE)
if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE))
if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE) OR (UC_TARGET_CORE MATCHES SAPPHIRERAPIDS))
set(USE_TRMM true)
endif ()
if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10))
@ -418,32 +418,50 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type})
# symm for s and d
if (NOT DEFINED ${float_char}SYMMUCOPY_M)
set(SYMMUCOPY_M "generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c")
set(SYMMLCOPY_M "generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c")
else ()
set(SYMMUCOPY_M "${KERNELDIR}/${${float_char}SYMMUCOPY_M}")
set(SYMMLCOPY_M "${KERNELDIR}/${${float_char}SYMMLCOPY_M}")
endif()
GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type})
GenerateNamedObjects(${SYMMUCOPY_M} "" "symm_iutcopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type})
GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type})
# These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. Also TRANS is not passed in as a define.
# Could simplify it a bit by pairing up by -UUNIT/-DUNIT.
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type})
if (NOT DEFINED ${float_char}TRMMUNCOPY_M)
set(TRMMUNCOPY_M "generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c")
set(TRMMLNCOPY_M "generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c")
set(TRMMUTCOPY_M "generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c")
set(TRMMLTCOPY_M "generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c")
else ()
set(TRMMUNCOPY_M "${KERNELDIR}/${${float_char}TRMMUNCOPY_M}")
set(TRMMLNCOPY_M "${KERNELDIR}/${${float_char}TRMMLNCOPY_M}")
set(TRMMUTCOPY_M "${KERNELDIR}/${${float_char}TRMMUTCOPY_M}")
set(TRMMLTCOPY_M "${KERNELDIR}/${${float_char}TRMMLTCOPY_M}")
endif ()
GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type})
@ -578,11 +596,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type})
endif ()
if (BUILD_BFLOAT16)
if (NOT DEFINED SBGEMM_SMALL_M_PERMIT)
@ -616,11 +634,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16")
endif ()
endif ()

View File

@ -31,7 +31,22 @@ ifdef NO_AVX2
endif
ifdef TARGET_CORE
ifeq ($(TARGET_CORE), COOPERLAKE)
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
ifeq ($(GCCVERSIONGTEQ10), 1)
override CFLAGS += -march=sapphirerapids
else
override CFLAGS += -march=skylake-avx512 -mavx512f
endif
ifeq ($(OSNAME), CYGWIN_NT)
override CFLAGS += -fno-asynchronous-unwind-tables
endif
ifeq ($(OSNAME), WINNT)
ifeq ($(C_COMPILER), GCC)
override CFLAGS += -fno-asynchronous-unwind-tables
endif
endif
else ifeq ($(TARGET_CORE), COOPERLAKE)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
ifeq ($(GCCVERSIONGTEQ10), 1)
override CFLAGS += -march=cooperlake

View File

@ -47,6 +47,10 @@ ifeq ($(CORE), COOPERLAKE)
USE_TRMM = 1
endif
ifeq ($(CORE), SAPPHIRERAPIDS)
USE_TRMM = 1
endif
ifeq ($(CORE), ZEN)
USE_TRMM = 1
endif
@ -1479,29 +1483,61 @@ $(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XT
$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@
ifdef STRMMUNCOPY_M
$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
ifdef STRMMLNCOPY_M
$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
endif
ifdef STRMMUTCOPY_M
$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
ifdef STRMMLTCOPY_M
$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
endif
$(KDIR)strmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
@ -1527,29 +1563,61 @@ $(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N
$(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
ifdef DTRMMUNCOPY_M
$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
ifdef DTRMMLNCOPY_M
$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
endif
ifdef DTRMMUTCOPY_M
$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
ifdef DTRMMLTCOPY_M
$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
endif
$(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
@ -1773,11 +1841,21 @@ $(KDIR)ssymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N).
$(KDIR)ssymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@
ifdef SSYMMUCOPY_M
$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMUCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
else
$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
endif
ifdef SSYMMLCOPY_M
$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMLCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
else
$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
endif
$(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@
@ -1785,11 +1863,21 @@ $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).
$(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@
ifdef DSYMMUCOPY_M
$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMUCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
else
$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
endif
ifdef DSYMMLCOPY_M
$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMLCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
else
$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
endif
$(KDIR)qsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@

kernel/arm64/KERNEL.A64FX (new file, 183 lines)
View File

@ -0,0 +1,183 @@
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S
SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
SGEMMINCOPY = sgemm_ncopy_sve_v1.c
SGEMMITCOPY = sgemm_tcopy_sve_v1.c
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
SSYMMUCOPY_M = symm_ucopy_sve.c
SSYMMLCOPY_M = symm_lcopy_sve.c
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
DSYMMUCOPY_M = symm_ucopy_sve.c
DSYMMLCOPY_M = symm_lcopy_sve.c
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

View File

@ -0,0 +1,183 @@
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S
SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
SGEMMINCOPY = sgemm_ncopy_sve_v1.c
SGEMMITCOPY = sgemm_tcopy_sve_v1.c
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
SSYMMUCOPY_M = symm_ucopy_sve.c
SSYMMLCOPY_M = symm_lcopy_sve.c
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
DSYMMUCOPY_M = symm_ucopy_sve.c
DSYMMLCOPY_M = symm_lcopy_sve.c
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

View File

@ -141,7 +141,7 @@ SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
@ -169,7 +169,7 @@ endif
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
@ -182,7 +182,7 @@ CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c

View File

@ -141,7 +141,7 @@ SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
@ -169,7 +169,7 @@ endif
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
@ -182,7 +182,7 @@ CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c

View File

@ -1 +1 @@
include $(KERNELDIR)/KERNEL.ARMV8
include $(KERNELDIR)/KERNEL.NEOVERSEN1

View File

@ -0,0 +1,898 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <arm_neon.h>
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define FMLA_RI "fmla "
#define FMLA_IR "fmla "
#define FMLA_II "fmls "
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define FMLA_RI "fmls "
#define FMLA_IR "fmla "
#define FMLA_II "fmla "
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define FMLA_RI "fmla "
#define FMLA_IR "fmls "
#define FMLA_II "fmla "
#else
#define FMLA_RI "fmls "
#define FMLA_IR "fmls "
#define FMLA_II "fmls "
#endif
#define FMLA_RR "fmla "
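/* Merge eight accumulated complex results (real parts in c1r/c2r, imaginary parts in c1i/c2i)
 * with the complex alpha and add them to the interleaved C column, using vld2/vst2 to
 * de-interleave and re-interleave the real and imaginary components. */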
static inline void store_m8n1_contracted(float *C,
float32x4_t c1r, float32x4_t c1i, float32x4_t c2r, float32x4_t c2i,
float alphar, float alphai) {
float32x4x2_t ld1 = vld2q_f32(C), ld2 = vld2q_f32(C + 8);
ld1.val[0] = vfmaq_n_f32(ld1.val[0], c1r, alphar);
ld2.val[0] = vfmaq_n_f32(ld2.val[0], c2r, alphar);
ld1.val[1] = vfmaq_n_f32(ld1.val[1], c1r, alphai);
ld2.val[1] = vfmaq_n_f32(ld2.val[1], c2r, alphai);
ld1.val[0] = vfmsq_n_f32(ld1.val[0], c1i, alphai);
ld2.val[0] = vfmsq_n_f32(ld2.val[0], c2i, alphai);
ld1.val[1] = vfmaq_n_f32(ld1.val[1], c1i, alphar);
ld2.val[1] = vfmaq_n_f32(ld2.val[1], c2i, alphar);
vst2q_f32(C, ld1);
vst2q_f32(C + 8, ld2);
}
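/* 8x4 complex microkernel in inline assembly: each accumulator keeps its real and imaginary
 * parts in separate vector registers, B elements are staged through general-purpose registers
 * (ldp/bfi/bfxil + fmov) so that loads overlap the FMLA stream, and the K loop is unrolled by 2. */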
static inline void kernel_8x4(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
const float *c_pref = C;
float32x4_t c1r, c1i, c2r, c2i, c3r, c3i, c4r, c4i;
float32x4_t c5r, c5i, c6r, c6i, c7r, c7i, c8r, c8i;
/** x0 for filling A, x1-x6 for filling B (x5 and x6 for real, x2 and x4 for imag) */
/** v0-v1 and v10-v11 for B, v2-v9 for A */
__asm__ __volatile__(
"cmp %[K],#0; mov %[c_pref],%[C]\n\t"
"movi %[c1r].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t"
"movi %[c1i].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t"
"movi %[c2r].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
"movi %[c2i].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t"
"movi %[c3r].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t"
"movi %[c3i].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
"movi %[c4r].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t"
"movi %[c4i].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t"
"movi %[c5r].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
"movi %[c5i].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t"
"movi %[c6r].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t"
"movi %[c6i].16b,#0\n\t"
"movi %[c7r].16b,#0; movi %[c7i].16b,#0\n\t"
"movi %[c8r].16b,#0; movi %[c8i].16b,#0\n\t"
"beq 4f\n\t"
"cmp %[K],#2\n\t"
"ldp x1,x2,[%[sb]],#16; ldr q2,[%[sa]],#64\n\t"
"ldp x3,x4,[%[sb]],#16; ldr d3,[%[sa],#-48]\n\t"
"mov w5,w1; mov w6,w3; ldr x0,[%[sa],#-40]\n\t"
"bfi x5,x2,#32,#32; bfi x6,x4,#32,#32; fmov d0,x5\n\t"
"bfxil x2,x1,#32,#32; bfxil x4,x3,#32,#32; fmov v0.d[1],x6\n\t"
"blt 3f; beq 2f\n\t"
"1:\n\t"
"fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t"
FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t"
FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]; ldr x1,[%[sb]],#64\n\t"
FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t"
"fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t"
FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t"
FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]; mov w5,w1\n\t"
FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t"
"fmov v5.d[1],x0; fmov d1,x2\n\t"
FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]; ldr x2,[%[sb],#-56]\n\t"
FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]; ldr x3,[%[sb],#-48]\n\t"
FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t"
"fmov v1.d[1],x4; ldr d6,[%[sa]]\n\t"
FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; ldr x0,[%[sa],#8]\n\t"
FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]; ldr x4,[%[sb],#-40]\n\t"
FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]; bfi x5,x2,#32,#32\n\t"
"fmov v6.d[1],x0; ldr d7,[%[sa],#16]\n\t"
FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]; ldr x0,[%[sa],#24]\n\t"
FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]; mov w6,w3\n\t"
FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]; bfxil x2,x1,#32,#32\n\t"
"fmov v7.d[1],x0; fmov d10,x5\n\t"
FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]; bfi x6,x4,#32,#32\n\t"
FMLA_II "%[c1r].4s,v1.4s,v2.s[1]; ldr x1,[%[sb],#-32]\n\t"
FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]; bfxil x4,x3,#32,#32\n\t"
"fmov v10.d[1],x6; fmov d11,x2\n\t"
FMLA_II "%[c2r].4s,v1.4s,v2.s[3]; ldr x2,[%[sb],#-24]\n\t"
FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]; ldr x3,[%[sb],#-16]\n\t"
FMLA_II "%[c3r].4s,v1.4s,v3.s[1]; mov w5,w1\n\t"
"fmov v11.d[1],x4; ldr d8,[%[sa],#32]\n\t"
FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]; ldr x0,[%[sa],#40]\n\t"
FMLA_II "%[c4r].4s,v1.4s,v3.s[3]; ldr x4,[%[sb],#-8]\n\t"
FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]; bfi x5,x2,#32,#32\n\t"
"fmov v8.d[1],x0; ldr d9,[%[sa],#48]\n\t"
FMLA_II "%[c5r].4s,v1.4s,v4.s[1]; ldr x0,[%[sa],#56]\n\t"
FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]; mov w6,w3\n\t"
FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t"
"fmov v9.d[1],x0; fmov d0,x5\n\t"
FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]; bfi x6,x4,#32,#32\n\t"
FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t"
FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t"
"fmov v0.d[1],x6; ldr d2,[%[sa],#64]\n\t"
FMLA_II "%[c8r].4s,v1.4s,v5.s[3]; ldr x0,[%[sa],#72]\n\t"
FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t"
FMLA_RR "%[c1r].4s,v10.4s,v6.s[0]\n\t"
"fmov v2.d[1],x0; ldr d3,[%[sa],#80]\n\t"
FMLA_IR "%[c1i].4s,v10.4s,v6.s[1]\n\t"
FMLA_RR "%[c2r].4s,v10.4s,v6.s[2]; ldr x0,[%[sa],#88]\n\t"
FMLA_IR "%[c2i].4s,v10.4s,v6.s[3]; bfxil x2,x1,#32,#32\n\t"
FMLA_RR "%[c3r].4s,v10.4s,v7.s[0]; bfxil x4,x3,#32,#32\n\t"
FMLA_IR "%[c3i].4s,v10.4s,v7.s[1]; add %[sa],%[sa],#128\n\t"
FMLA_RR "%[c4r].4s,v10.4s,v7.s[2]; prfm pldl1keep,[%[sb],#128]\n\t"
FMLA_IR "%[c4i].4s,v10.4s,v7.s[3]; sub %[K],%[K],#2\n\t"
FMLA_RR "%[c5r].4s,v10.4s,v8.s[0]; prfm pldl1keep,[%[sa],#128]\n\t"
FMLA_IR "%[c5i].4s,v10.4s,v8.s[1]; prfm pldl1keep,[%[sa],#192]\n\t"
FMLA_RR "%[c6r].4s,v10.4s,v8.s[2]; cmp %[K],#2\n\t"
FMLA_IR "%[c6i].4s,v10.4s,v8.s[3]\n\t"
FMLA_RR "%[c7r].4s,v10.4s,v9.s[0]\n\t" FMLA_IR "%[c7i].4s,v10.4s,v9.s[1]\n\t"
FMLA_RR "%[c8r].4s,v10.4s,v9.s[2]\n\t" FMLA_IR "%[c8i].4s,v10.4s,v9.s[3]\n\t"
FMLA_II "%[c1r].4s,v11.4s,v6.s[1]\n\t" FMLA_RI "%[c1i].4s,v11.4s,v6.s[0]\n\t"
FMLA_II "%[c2r].4s,v11.4s,v6.s[3]\n\t" FMLA_RI "%[c2i].4s,v11.4s,v6.s[2]\n\t"
FMLA_II "%[c3r].4s,v11.4s,v7.s[1]\n\t" FMLA_RI "%[c3i].4s,v11.4s,v7.s[0]\n\t"
FMLA_II "%[c4r].4s,v11.4s,v7.s[3]\n\t" FMLA_RI "%[c4i].4s,v11.4s,v7.s[2]\n\t"
FMLA_II "%[c5r].4s,v11.4s,v8.s[1]\n\t" FMLA_RI "%[c5i].4s,v11.4s,v8.s[0]\n\t"
FMLA_II "%[c6r].4s,v11.4s,v8.s[3]\n\t" FMLA_RI "%[c6i].4s,v11.4s,v8.s[2]\n\t"
FMLA_II "%[c7r].4s,v11.4s,v9.s[1]\n\t" FMLA_RI "%[c7i].4s,v11.4s,v9.s[0]\n\t"
FMLA_II "%[c8r].4s,v11.4s,v9.s[3]\n\t" FMLA_RI "%[c8i].4s,v11.4s,v9.s[2]\n\t"
"bgt 1b; blt 3f\n\t"
"2:\n\t"
"fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t"
FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t"
FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]; ldr x1,[%[sb]],#32\n\t"
FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t"
"fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t"
FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t"
FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]; mov w5,w1\n\t"
FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t"
"fmov v5.d[1],x0; fmov d1,x2\n\t"
FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]; ldr x2,[%[sb],#-24]\n\t"
FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]; ldr x3,[%[sb],#-16]\n\t"
FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t"
"fmov v1.d[1],x4; ldr d6,[%[sa]]\n\t"
FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; ldr x0,[%[sa],#8]\n\t"
FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]; ldr x4,[%[sb],#-8]\n\t"
FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]; bfi x5,x2,#32,#32\n\t"
"fmov v6.d[1],x0; ldr d7,[%[sa],#16]\n\t"
FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]; ldr x0,[%[sa],#24]\n\t"
FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]; mov w6,w3\n\t"
FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]; bfxil x2,x1,#32,#32\n\t"
"fmov v7.d[1],x0; fmov d10,x5\n\t"
FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]; bfi x6,x4,#32,#32\n\t"
FMLA_II "%[c1r].4s,v1.4s,v2.s[1]\n\t"
FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]; bfxil x4,x3,#32,#32\n\t"
"fmov v10.d[1],x6; fmov d11,x2\n\t"
FMLA_II "%[c2r].4s,v1.4s,v2.s[3]\n\t"
FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]\n\t"
FMLA_II "%[c3r].4s,v1.4s,v3.s[1]\n\t"
"fmov v11.d[1],x4; ldr d8,[%[sa],#32]\n\t"
FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]; ldr x0,[%[sa],#40]\n\t"
FMLA_II "%[c4r].4s,v1.4s,v3.s[3]; sub %[K],%[K],#2\n\t"
FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]\n\t"
"fmov v8.d[1],x0; ldr d9,[%[sa],#48]\n\t"
FMLA_II "%[c5r].4s,v1.4s,v4.s[1]; ldr x0,[%[sa],#56]\n\t"
FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]; add %[sa],%[sa],#64\n\t"
FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t"
"fmov v9.d[1],x0\n\t"
FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]\n\t"
FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t"
FMLA_II "%[c8r].4s,v1.4s,v5.s[3]\n\t" FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t"
FMLA_RR "%[c1r].4s,v10.4s,v6.s[0]\n\t" FMLA_IR "%[c1i].4s,v10.4s,v6.s[1]\n\t"
FMLA_RR "%[c2r].4s,v10.4s,v6.s[2]\n\t" FMLA_IR "%[c2i].4s,v10.4s,v6.s[3]\n\t"
FMLA_RR "%[c3r].4s,v10.4s,v7.s[0]\n\t" FMLA_IR "%[c3i].4s,v10.4s,v7.s[1]\n\t"
FMLA_RR "%[c4r].4s,v10.4s,v7.s[2]\n\t" FMLA_IR "%[c4i].4s,v10.4s,v7.s[3]\n\t"
FMLA_RR "%[c5r].4s,v10.4s,v8.s[0]\n\t" FMLA_IR "%[c5i].4s,v10.4s,v8.s[1]\n\t"
FMLA_RR "%[c6r].4s,v10.4s,v8.s[2]\n\t" FMLA_IR "%[c6i].4s,v10.4s,v8.s[3]\n\t"
FMLA_RR "%[c7r].4s,v10.4s,v9.s[0]\n\t" FMLA_IR "%[c7i].4s,v10.4s,v9.s[1]\n\t"
FMLA_RR "%[c8r].4s,v10.4s,v9.s[2]\n\t" FMLA_IR "%[c8i].4s,v10.4s,v9.s[3]\n\t"
FMLA_II "%[c1r].4s,v11.4s,v6.s[1]\n\t" FMLA_RI "%[c1i].4s,v11.4s,v6.s[0]\n\t"
FMLA_II "%[c2r].4s,v11.4s,v6.s[3]\n\t" FMLA_RI "%[c2i].4s,v11.4s,v6.s[2]\n\t"
FMLA_II "%[c3r].4s,v11.4s,v7.s[1]\n\t" FMLA_RI "%[c3i].4s,v11.4s,v7.s[0]\n\t"
FMLA_II "%[c4r].4s,v11.4s,v7.s[3]\n\t" FMLA_RI "%[c4i].4s,v11.4s,v7.s[2]\n\t"
FMLA_II "%[c5r].4s,v11.4s,v8.s[1]\n\t" FMLA_RI "%[c5i].4s,v11.4s,v8.s[0]\n\t"
FMLA_II "%[c6r].4s,v11.4s,v8.s[3]\n\t" FMLA_RI "%[c6i].4s,v11.4s,v8.s[2]\n\t"
FMLA_II "%[c7r].4s,v11.4s,v9.s[1]\n\t" FMLA_RI "%[c7i].4s,v11.4s,v9.s[0]\n\t"
FMLA_II "%[c8r].4s,v11.4s,v9.s[3]\n\t" FMLA_RI "%[c8i].4s,v11.4s,v9.s[2]\n\t"
"b 4f\n\t"
"3:\n\t"
"fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t"
FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t"
FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]\n\t"
FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t"
"fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t"
FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t"
FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]\n\t"
FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t"
"fmov v5.d[1],x0; fmov d1,x2\n\t"
FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]\n\t"
FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]\n\t"
FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t"
"fmov v1.d[1],x4\n\t"
FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; sub %[K],%[K],#1\n\t"
FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]\n\t" FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]\n\t"
FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]\n\t" FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]\n\t"
FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]\n\t" FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]\n\t"
FMLA_II "%[c1r].4s,v1.4s,v2.s[1]\n\t" FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]\n\t"
FMLA_II "%[c2r].4s,v1.4s,v2.s[3]\n\t" FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]\n\t"
FMLA_II "%[c3r].4s,v1.4s,v3.s[1]\n\t" FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]\n\t"
FMLA_II "%[c4r].4s,v1.4s,v3.s[3]\n\t" FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]\n\t"
FMLA_II "%[c5r].4s,v1.4s,v4.s[1]\n\t" FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]\n\t"
FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t" FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]\n\t"
FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t"
FMLA_II "%[c8r].4s,v1.4s,v5.s[3]\n\t" FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t"
"4:\n\t"
"mov %[c_pref],%[C]\n\t"
"zip1 v0.4s,%[c1r].4s,%[c2r].4s; prfm pstl1keep,[%[c_pref]]\n\t"
"zip1 v4.4s,%[c1i].4s,%[c2i].4s; prfm pstl1keep,[%[c_pref],#64]\n\t"
"zip1 v1.4s,%[c3r].4s,%[c4r].4s; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
"zip1 v5.4s,%[c3i].4s,%[c4i].4s; prfm pstl1keep,[%[c_pref]]\n\t"
"zip2 v2.4s,%[c1r].4s,%[c2r].4s; prfm pstl1keep,[%[c_pref],#64]\n\t"
"zip2 v6.4s,%[c1i].4s,%[c2i].4s; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
"zip2 v3.4s,%[c3r].4s,%[c4r].4s; prfm pstl1keep,[%[c_pref]]\n\t"
"zip2 v7.4s,%[c3i].4s,%[c4i].4s; prfm pstl1keep,[%[c_pref],#64]\n\t"
"zip1 %[c1r].2d,v0.2d,v1.2d; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
"zip1 %[c1i].2d,v4.2d,v5.2d; prfm pstl1keep,[%[c_pref]]\n\t"
"zip2 %[c2r].2d,v0.2d,v1.2d; prfm pstl1keep,[%[c_pref],#64]\n\t"
"zip2 %[c2i].2d,v4.2d,v5.2d\n\t"
"zip1 %[c3r].2d,v2.2d,v3.2d; zip1 %[c3i].2d,v6.2d,v7.2d\n\t"
"zip2 %[c4r].2d,v2.2d,v3.2d; zip2 %[c4i].2d,v6.2d,v7.2d\n\t"
"zip1 v0.4s,%[c5r].4s,%[c6r].4s; zip1 v4.4s,%[c5i].4s,%[c6i].4s\n\t"
"zip1 v1.4s,%[c7r].4s,%[c8r].4s; zip1 v5.4s,%[c7i].4s,%[c8i].4s\n\t"
"zip2 v2.4s,%[c5r].4s,%[c6r].4s; zip2 v6.4s,%[c5i].4s,%[c6i].4s\n\t"
"zip2 v3.4s,%[c7r].4s,%[c8r].4s; zip2 v7.4s,%[c7i].4s,%[c8i].4s\n\t"
"zip1 %[c5r].2d,v0.2d,v1.2d; zip1 %[c5i].2d,v4.2d,v5.2d\n\t"
"zip2 %[c6r].2d,v0.2d,v1.2d; zip2 %[c6i].2d,v4.2d,v5.2d\n\t"
"zip1 %[c7r].2d,v2.2d,v3.2d; zip1 %[c7i].2d,v6.2d,v7.2d\n\t"
"zip2 %[c8r].2d,v2.2d,v3.2d; zip2 %[c8i].2d,v6.2d,v7.2d\n\t"
:[c1r]"=w"(c1r), [c1i]"=w"(c1i), [c2r]"=w"(c2r), [c2i]"=w"(c2i),
[c3r]"=w"(c3r), [c3i]"=w"(c3i), [c4r]"=w"(c4r), [c4i]"=w"(c4i),
[c5r]"=w"(c5r), [c5i]"=w"(c5i), [c6r]"=w"(c6r), [c6i]"=w"(c6i),
[c7r]"=w"(c7r), [c7i]"=w"(c7i), [c8r]"=w"(c8r), [c8i]"=w"(c8i),
[K]"+r"(K), [sa]"+r"(sa), [sb]"+r"(sb), [c_pref]"+r"(c_pref)
:[C]"r"(C), [LDC]"r"(LDC)
:"cc","memory","x0","x1","x2","x3","x4","x5","x6",
"v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11");
store_m8n1_contracted(C, c1r, c1i, c5r, c5i, alphar, alphai); C += LDC * 2;
store_m8n1_contracted(C, c2r, c2i, c6r, c6i, alphar, alphai); C += LDC * 2;
store_m8n1_contracted(C, c3r, c3i, c7r, c7i, alphar, alphai); C += LDC * 2;
store_m8n1_contracted(C, c4r, c4i, c8r, c8i, alphar, alphai);
}
static inline float32x4x4_t acc_expanded_m2n2(float32x4x4_t acc,
float32x4_t a, float32x4_t b) {
acc.val[0] = vfmaq_laneq_f32(acc.val[0], a, b, 0);
acc.val[1] = vfmaq_laneq_f32(acc.val[1], a, b, 1);
acc.val[2] = vfmaq_laneq_f32(acc.val[2], a, b, 2);
acc.val[3] = vfmaq_laneq_f32(acc.val[3], a, b, 3);
return acc;
}
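/* Build four broadcast vectors that encode alpha with the sign pattern required by the selected
 * conjugation variant (NN/NR/RN/RR families), so the store routines can fold the complex alpha
 * into the expanded accumulators with plain fmla/rev64 operations. */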
static inline float32x4x4_t expand_alpha(float alphar, float alphai) {
float32x4x4_t ret;
const float maskp[] = { -1, 1, -1, 1 };
const float maskn[] = { 1, -1, 1, -1 };
const float32x4_t vrevp = vld1q_f32(maskp);
const float32x4_t vrevn = vld1q_f32(maskn);
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
ret.val[0] = vdupq_n_f32(alphar);
ret.val[1] = vdupq_n_f32(-alphai);
ret.val[2] = vmulq_f32(ret.val[1], vrevn);
ret.val[3] = vmulq_f32(ret.val[0], vrevp);
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
ret.val[0] = vdupq_n_f32(alphar);
ret.val[1] = vdupq_n_f32(alphai);
ret.val[2] = vmulq_f32(ret.val[1], vrevp);
ret.val[3] = vmulq_f32(ret.val[0], vrevn);
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
ret.val[2] = vdupq_n_f32(alphai);
ret.val[3] = vdupq_n_f32(alphar);
ret.val[0] = vmulq_f32(ret.val[3], vrevn);
ret.val[1] = vmulq_f32(ret.val[2], vrevp);
#else
ret.val[2] = vdupq_n_f32(alphai);
ret.val[3] = vdupq_n_f32(-alphar);
ret.val[0] = vmulq_f32(ret.val[3], vrevp);
ret.val[1] = vmulq_f32(ret.val[2], vrevn);
#endif
return ret;
}
static inline void store_expanded_m2n2(float *C, BLASLONG LDC,
float32x4x4_t acc, float32x4x4_t expanded_alpha) {
float32x4_t ld1 = vld1q_f32(C), ld2 = vld1q_f32(C + LDC * 2);
ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[0]);
ld2 = vfmaq_f32(ld2, acc.val[2], expanded_alpha.val[0]);
acc.val[0] = vrev64q_f32(acc.val[0]);
acc.val[2] = vrev64q_f32(acc.val[2]);
ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[1]);
ld2 = vfmaq_f32(ld2, acc.val[3], expanded_alpha.val[1]);
acc.val[1] = vrev64q_f32(acc.val[1]);
acc.val[3] = vrev64q_f32(acc.val[3]);
ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[2]);
ld2 = vfmaq_f32(ld2, acc.val[2], expanded_alpha.val[2]);
ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[3]);
ld2 = vfmaq_f32(ld2, acc.val[3], expanded_alpha.val[3]);
vst1q_f32(C, ld1);
vst1q_f32(C + LDC * 2, ld2);
}
static inline float32x4x4_t init_expanded_m2n2() {
float32x4x4_t ret = {{ vdupq_n_f32(0), vdupq_n_f32(0),
vdupq_n_f32(0), vdupq_n_f32(0) }};
return ret;
}
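/* 4x4 block in NEON intrinsics: partial products are accumulated in "expanded" form and only
 * combined with the complex alpha at store time (expand_alpha / store_expanded_m2n2); the K
 * loop is unrolled by 2 with a single-iteration tail. */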
static inline void kernel_4x4(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
float32x4x4_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init_expanded_m2n2();
for (; K > 1; K -= 2) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4),
a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16;
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4),
b3 = vld1q_f32(sb + 8), b4 = vld1q_f32(sb + 12); sb += 16;
c1 = acc_expanded_m2n2(c1, a1, b1);
c2 = acc_expanded_m2n2(c2, a2, b1);
c3 = acc_expanded_m2n2(c3, a1, b2);
c4 = acc_expanded_m2n2(c4, a2, b2);
c1 = acc_expanded_m2n2(c1, a3, b3);
c2 = acc_expanded_m2n2(c2, a4, b3);
c3 = acc_expanded_m2n2(c3, a3, b4);
c4 = acc_expanded_m2n2(c4, a4, b4);
}
if (K) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4);
c1 = acc_expanded_m2n2(c1, a1, b1);
c2 = acc_expanded_m2n2(c2, a2, b1);
c3 = acc_expanded_m2n2(c3, a1, b2);
c4 = acc_expanded_m2n2(c4, a2, b2);
}
float32x4x4_t e_alpha = expand_alpha(alphar, alphai);
store_expanded_m2n2(C, LDC, c1, e_alpha);
store_expanded_m2n2(C + 4, LDC, c2, e_alpha);
C += LDC * 4;
store_expanded_m2n2(C, LDC, c3, e_alpha);
store_expanded_m2n2(C + 4, LDC, c4, e_alpha);
}
static inline void kernel_8x2(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
float32x4x4_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init_expanded_m2n2();
for (; K > 1; K -= 2) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12);
float32x4_t a5 = vld1q_f32(sa + 16), a6 = vld1q_f32(sa + 20);
float32x4_t a7 = vld1q_f32(sa + 24), a8 = vld1q_f32(sa + 28); sa += 32;
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8;
c1 = acc_expanded_m2n2(c1, a1, b1);
c2 = acc_expanded_m2n2(c2, a2, b1);
c3 = acc_expanded_m2n2(c3, a3, b1);
c4 = acc_expanded_m2n2(c4, a4, b1);
c1 = acc_expanded_m2n2(c1, a5, b2);
c2 = acc_expanded_m2n2(c2, a6, b2);
c3 = acc_expanded_m2n2(c3, a7, b2);
c4 = acc_expanded_m2n2(c4, a8, b2);
}
if (K) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12);
float32x4_t b1 = vld1q_f32(sb);
c1 = acc_expanded_m2n2(c1, a1, b1);
c2 = acc_expanded_m2n2(c2, a2, b1);
c3 = acc_expanded_m2n2(c3, a3, b1);
c4 = acc_expanded_m2n2(c4, a4, b1);
}
float32x4x4_t e_alpha = expand_alpha(alphar, alphai);
store_expanded_m2n2(C, LDC, c1, e_alpha);
store_expanded_m2n2(C + 4, LDC, c2, e_alpha);
store_expanded_m2n2(C + 8, LDC, c3, e_alpha);
store_expanded_m2n2(C + 12, LDC, c4, e_alpha);
}
static inline void kernel_4x2(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
float32x4x4_t c1, c2;
c1 = c2 = init_expanded_m2n2();
for (; K > 1; K -= 2) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16;
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8;
c1 = acc_expanded_m2n2(c1, a1, b1);
c2 = acc_expanded_m2n2(c2, a2, b1);
c1 = acc_expanded_m2n2(c1, a3, b2);
c2 = acc_expanded_m2n2(c2, a4, b2);
}
if (K) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
float32x4_t b1 = vld1q_f32(sb);
c1 = acc_expanded_m2n2(c1, a1, b1);
c2 = acc_expanded_m2n2(c2, a2, b1);
}
float32x4x4_t e_alpha = expand_alpha(alphar, alphai);
store_expanded_m2n2(C, LDC, c1, e_alpha);
store_expanded_m2n2(C + 4, LDC, c2, e_alpha);
}
static inline void kernel_2x4(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
float32x4x4_t c1, c2;
c1 = c2 = init_expanded_m2n2();
for (; K > 1; K -= 2) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); sa += 8;
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4);
float32x4_t b3 = vld1q_f32(sb + 8), b4 = vld1q_f32(sb + 12); sb += 16;
c1 = acc_expanded_m2n2(c1, a1, b1);
c2 = acc_expanded_m2n2(c2, a1, b2);
c1 = acc_expanded_m2n2(c1, a2, b3);
c2 = acc_expanded_m2n2(c2, a2, b4);
}
if (K) {
float32x4_t a1 = vld1q_f32(sa);
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4);
c1 = acc_expanded_m2n2(c1, a1, b1);
c2 = acc_expanded_m2n2(c2, a1, b2);
}
float32x4x4_t e_alpha = expand_alpha(alphar, alphai);
store_expanded_m2n2(C, LDC, c1, e_alpha);
store_expanded_m2n2(C + LDC * 4, LDC, c2, e_alpha);
}
static inline void kernel_2x2(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
float32x4x4_t c1, c2;
c1 = c2 = init_expanded_m2n2();
for (; K > 1; K -= 2) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); sa += 8;
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8;
c1 = acc_expanded_m2n2(c1, a1, b1);
c2 = acc_expanded_m2n2(c2, a2, b2);
}
c1.val[0] = vaddq_f32(c1.val[0], c2.val[0]);
c1.val[1] = vaddq_f32(c1.val[1], c2.val[1]);
c1.val[2] = vaddq_f32(c1.val[2], c2.val[2]);
c1.val[3] = vaddq_f32(c1.val[3], c2.val[3]);
if (K) {
float32x4_t a1 = vld1q_f32(sa);
float32x4_t b1 = vld1q_f32(sb);
c1 = acc_expanded_m2n2(c1, a1, b1);
}
store_expanded_m2n2(C, LDC, c1, expand_alpha(alphar, alphai));
}
static inline float32x4x2_t acc_expanded_m2n1(float32x4x2_t acc,
float32x4_t a, float32x2_t b) {
acc.val[0] = vfmaq_lane_f32(acc.val[0], a, b, 0);
acc.val[1] = vfmaq_lane_f32(acc.val[1], a, b, 1);
return acc;
}
static inline void store_expanded_m2n1(float *C,
float32x4x2_t acc, float32x4x4_t expanded_alpha) {
float32x4_t ld1 = vld1q_f32(C);
ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[0]);
acc.val[0] = vrev64q_f32(acc.val[0]);
ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[1]);
acc.val[1] = vrev64q_f32(acc.val[1]);
ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[2]);
ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[3]);
vst1q_f32(C, ld1);
}
static inline float32x4x2_t init_expanded_m2n1() {
float32x4x2_t ret = {{ vdupq_n_f32(0), vdupq_n_f32(0) }};
return ret;
}
static inline void kernel_8x1(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K) {
float32x4x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init_expanded_m2n1();
for (; K > 1; K -= 2) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4),
a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12),
a5 = vld1q_f32(sa + 16), a6 = vld1q_f32(sa + 20),
a7 = vld1q_f32(sa + 24), a8 = vld1q_f32(sa + 28); sa += 32;
float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2); sb += 4;
c1 = acc_expanded_m2n1(c1, a1, b1);
c2 = acc_expanded_m2n1(c2, a2, b1);
c3 = acc_expanded_m2n1(c3, a3, b1);
c4 = acc_expanded_m2n1(c4, a4, b1);
c1 = acc_expanded_m2n1(c1, a5, b2);
c2 = acc_expanded_m2n1(c2, a6, b2);
c3 = acc_expanded_m2n1(c3, a7, b2);
c4 = acc_expanded_m2n1(c4, a8, b2);
}
if (K) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4),
a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12);
float32x2_t b1 = vld1_f32(sb);
c1 = acc_expanded_m2n1(c1, a1, b1);
c2 = acc_expanded_m2n1(c2, a2, b1);
c3 = acc_expanded_m2n1(c3, a3, b1);
c4 = acc_expanded_m2n1(c4, a4, b1);
}
float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai);
store_expanded_m2n1(C, c1, expanded_alpha);
store_expanded_m2n1(C + 4, c2, expanded_alpha);
store_expanded_m2n1(C + 8, c3, expanded_alpha);
store_expanded_m2n1(C + 12, c4, expanded_alpha);
}
static inline void kernel_4x1(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K) {
float32x4x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init_expanded_m2n1();
for (; K > 1; K -= 2) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4),
a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16;
float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2); sb += 4;
c1 = acc_expanded_m2n1(c1, a1, b1);
c2 = acc_expanded_m2n1(c2, a2, b1);
c3 = acc_expanded_m2n1(c3, a3, b2);
c4 = acc_expanded_m2n1(c4, a4, b2);
}
c1.val[0] = vaddq_f32(c1.val[0], c3.val[0]);
c1.val[1] = vaddq_f32(c1.val[1], c3.val[1]);
c2.val[0] = vaddq_f32(c2.val[0], c4.val[0]);
c2.val[1] = vaddq_f32(c2.val[1], c4.val[1]);
if (K) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
float32x2_t b1 = vld1_f32(sb);
c1 = acc_expanded_m2n1(c1, a1, b1);
c2 = acc_expanded_m2n1(c2, a2, b1);
}
float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai);
store_expanded_m2n1(C, c1, expanded_alpha);
store_expanded_m2n1(C + 4, c2, expanded_alpha);
}
static inline void kernel_2x1(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K) {
float32x4x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init_expanded_m2n1();
for (; K > 3; K -= 4) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4),
a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16;
float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2),
b3 = vld1_f32(sb + 4), b4 = vld1_f32(sb + 6); sb += 8;
c1 = acc_expanded_m2n1(c1, a1, b1);
c2 = acc_expanded_m2n1(c2, a2, b2);
c3 = acc_expanded_m2n1(c3, a3, b3);
c4 = acc_expanded_m2n1(c4, a4, b4);
}
c1.val[0] = vaddq_f32(c1.val[0], c3.val[0]);
c1.val[1] = vaddq_f32(c1.val[1], c3.val[1]);
c2.val[0] = vaddq_f32(c2.val[0], c4.val[0]);
c2.val[1] = vaddq_f32(c2.val[1], c4.val[1]);
c1.val[0] = vaddq_f32(c1.val[0], c2.val[0]);
c1.val[1] = vaddq_f32(c1.val[1], c2.val[1]);
for (; K; K--) {
float32x4_t a1 = vld1q_f32(sa); sa += 4;
float32x2_t b1 = vld1_f32(sb); sb += 2;
c1 = acc_expanded_m2n1(c1, a1, b1);
}
float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai);
store_expanded_m2n1(C, c1, expanded_alpha);
}
static inline float32x2x4_t expand_alpha_d(float alphar, float alphai) {
float32x2x4_t ret;
const float maskp[] = { -1, 1 };
const float maskn[] = { 1, -1 };
const float32x2_t vrevp = vld1_f32(maskp);
const float32x2_t vrevn = vld1_f32(maskn);
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
ret.val[0] = vdup_n_f32(alphar);
ret.val[1] = vdup_n_f32(-alphai);
ret.val[2] = vmul_f32(ret.val[1], vrevn);
ret.val[3] = vmul_f32(ret.val[0], vrevp);
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
ret.val[0] = vdup_n_f32(alphar);
ret.val[1] = vdup_n_f32(alphai);
ret.val[2] = vmul_f32(ret.val[1], vrevp);
ret.val[3] = vmul_f32(ret.val[0], vrevn);
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
ret.val[2] = vdup_n_f32(alphai);
ret.val[3] = vdup_n_f32(alphar);
ret.val[0] = vmul_f32(ret.val[3], vrevn);
ret.val[1] = vmul_f32(ret.val[2], vrevp);
#else
ret.val[2] = vdup_n_f32(alphai);
ret.val[3] = vdup_n_f32(-alphar);
ret.val[0] = vmul_f32(ret.val[3], vrevp);
ret.val[1] = vmul_f32(ret.val[2], vrevn);
#endif
return ret;
}
static inline float32x2x2_t acc_expanded_m1n1(float32x2x2_t acc,
float32x2_t a, float32x2_t b) {
acc.val[0] = vfma_lane_f32(acc.val[0], a, b, 0);
acc.val[1] = vfma_lane_f32(acc.val[1], a, b, 1);
return acc;
}
static inline void store_expanded_m1n1(float *C,
float32x2x2_t acc, float32x2x4_t expanded_alpha) {
float32x2_t ld1 = vld1_f32(C);
ld1 = vfma_f32(ld1, acc.val[0], expanded_alpha.val[0]);
acc.val[0] = vrev64_f32(acc.val[0]);
ld1 = vfma_f32(ld1, acc.val[1], expanded_alpha.val[1]);
acc.val[1] = vrev64_f32(acc.val[1]);
ld1 = vfma_f32(ld1, acc.val[0], expanded_alpha.val[2]);
ld1 = vfma_f32(ld1, acc.val[1], expanded_alpha.val[3]);
vst1_f32(C, ld1);
}
static inline float32x2x2_t init_expanded_m1n1() {
float32x2x2_t ret = {{ vdup_n_f32(0), vdup_n_f32(0) }};
return ret;
}
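/*
 * Editorial worked example (not part of the original kernel): how the
 * "expanded" accumulators above recombine into a complex update, shown for
 * the m1n1 helpers; the wider m2n1/m2n2 variants follow the same idea.
 * acc.val[0] accumulates a * b_real and acc.val[1] accumulates a * b_imag,
 * so with a = [ar, ai] the lanes hold
 *     acc.val[0] = [ P, Q ] = [ sum(ar*br), sum(ai*br) ]
 *     acc.val[1] = [ R, S ] = [ sum(ar*bi), sum(ai*bi) ]
 * For the plain (NN-style) case expand_alpha_d produces
 *     val[0] = [ alphar,  alphar ]    val[1] = [ -alphai, -alphai ]
 *     val[2] = [ -alphai, alphai ]    val[3] = [ -alphar, alphar ]
 * and store_expanded_m1n1 therefore computes
 *     C_re += alphar*(P - S) - alphai*(Q + R)
 *     C_im += alphar*(Q + R) + alphai*(P - S)
 * i.e. C += alpha * sum(a*b), using only FMLAs and lane reversals. The other
 * preprocessor cases in expand_alpha_d only flip signs, which appears to
 * select the variants with A and/or B conjugated (the usual meaning of these
 * macro sets in OpenBLAS complex kernels).
 */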
static inline void kernel_1x4(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
float32x2x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init_expanded_m1n1();
for (; K; K--) {
float32x2_t a1 = vld1_f32(sa); sa += 2;
c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb));
c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2));
c3 = acc_expanded_m1n1(c3, a1, vld1_f32(sb + 4));
c4 = acc_expanded_m1n1(c4, a1, vld1_f32(sb + 6));
sb += 8;
}
float32x2x4_t expanded_alpha = expand_alpha_d(alphar, alphai);
store_expanded_m1n1(C, c1, expanded_alpha); C += LDC * 2;
store_expanded_m1n1(C, c2, expanded_alpha); C += LDC * 2;
store_expanded_m1n1(C, c3, expanded_alpha); C += LDC * 2;
store_expanded_m1n1(C, c4, expanded_alpha);
}
static inline void kernel_1x2(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
float32x2x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init_expanded_m1n1();
for (; K > 1; K -= 2) {
float32x2_t a1 = vld1_f32(sa), a2 = vld1_f32(sa + 2); sa += 4;
c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb));
c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2));
c3 = acc_expanded_m1n1(c3, a2, vld1_f32(sb + 4));
c4 = acc_expanded_m1n1(c4, a2, vld1_f32(sb + 6));
sb += 8;
}
c1.val[0] = vadd_f32(c1.val[0], c3.val[0]);
c1.val[1] = vadd_f32(c1.val[1], c3.val[1]);
c2.val[0] = vadd_f32(c2.val[0], c4.val[0]);
c2.val[1] = vadd_f32(c2.val[1], c4.val[1]);
if (K) {
float32x2_t a1 = vld1_f32(sa);
c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb));
c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2));
}
float32x2x4_t expanded_alpha = expand_alpha_d(alphar, alphai);
store_expanded_m1n1(C, c1, expanded_alpha); C += LDC * 2;
store_expanded_m1n1(C, c2, expanded_alpha);
}
static inline void kernel_1x1(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K) {
float32x2x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init_expanded_m1n1();
for (; K > 3; K -= 4) {
c1 = acc_expanded_m1n1(c1, vld1_f32(sa), vld1_f32(sb));
c2 = acc_expanded_m1n1(c2, vld1_f32(sa + 2), vld1_f32(sb + 2));
c3 = acc_expanded_m1n1(c3, vld1_f32(sa + 4), vld1_f32(sb + 4));
c4 = acc_expanded_m1n1(c4, vld1_f32(sa + 6), vld1_f32(sb + 6));
sa += 8; sb += 8;
}
c1.val[0] = vadd_f32(c1.val[0], c3.val[0]);
c1.val[1] = vadd_f32(c1.val[1], c3.val[1]);
c2.val[0] = vadd_f32(c2.val[0], c4.val[0]);
c2.val[1] = vadd_f32(c2.val[1], c4.val[1]);
c1.val[0] = vadd_f32(c1.val[0], c2.val[0]);
c1.val[1] = vadd_f32(c1.val[1], c2.val[1]);
for (; K; K--) {
c1 = acc_expanded_m1n1(c1, vld1_f32(sa), vld1_f32(sb));
sa += 2; sb += 2;
}
store_expanded_m1n1(C, c1, expand_alpha_d(alphar, alphai));
}
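/*
 * Editorial summary of the driver below: N is consumed in blocks of 8
 * columns, each handled as two packed 4-column panels (b1_/b2_) fed to the
 * same 8x4/4x4/2x4/1x4 kernels, then in blocks of 4, 2 and 1. Inside each N
 * block, M is consumed in blocks of 8, 4, 2 and 1 rows. sa and sb hold packed
 * complex data, so the pointer strides are 2 * block_size * K floats.
 */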
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai,
FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) {
BLASLONG n_left = N;
for (; n_left >= 8; n_left -= 8) {
const FLOAT *a_ = sa;
FLOAT *c1_ = C;
FLOAT *c2_ = C + LDC * 8;
const FLOAT *b1_ = sb;
const FLOAT *b2_ = sb + K * 8;
BLASLONG m_left = M;
for (; m_left >= 8; m_left -= 8) {
kernel_8x4(a_, b1_, c1_, alphar, alphai, K, LDC);
kernel_8x4(a_, b2_, c2_, alphar, alphai, K, LDC);
a_ += 16 * K;
c1_ += 16;
c2_ += 16;
}
if (m_left >= 4) {
m_left -= 4;
kernel_4x4(a_, b1_, c1_, alphar, alphai, K, LDC);
kernel_4x4(a_, b2_, c2_, alphar, alphai, K, LDC);
a_ += 8 * K;
c1_ += 8;
c2_ += 8;
}
if (m_left >= 2) {
m_left -= 2;
kernel_2x4(a_, b1_, c1_, alphar, alphai, K, LDC);
kernel_2x4(a_, b2_, c2_, alphar, alphai, K, LDC);
a_ += 4 * K;
c1_ += 4;
c2_ += 4;
}
if (m_left) {
kernel_1x4(a_, b1_, c1_, alphar, alphai, K, LDC);
kernel_1x4(a_, b2_, c2_, alphar, alphai, K, LDC);
}
C += 16 * LDC;
sb += 16 * K;
}
if (n_left >= 4) {
n_left -= 4;
const FLOAT *a_ = sa;
FLOAT *c_ = C;
BLASLONG m_left = M;
for (; m_left >= 8; m_left -= 8) {
kernel_8x4(a_, sb, c_, alphar, alphai, K, LDC);
a_ += 16 * K;
c_ += 16;
}
if (m_left >= 4) {
m_left -= 4;
kernel_4x4(a_, sb, c_, alphar, alphai, K, LDC);
a_ += 8 * K;
c_ += 8;
}
if (m_left >= 2) {
m_left -= 2;
kernel_2x4(a_, sb, c_, alphar, alphai, K, LDC);
a_ += 4 * K;
c_ += 4;
}
if (m_left) {
kernel_1x4(a_, sb, c_, alphar, alphai, K, LDC);
}
C += 8 * LDC;
sb += 8 * K;
}
if (n_left >= 2) {
n_left -= 2;
const FLOAT *a_ = sa;
FLOAT *c_ = C;
BLASLONG m_left = M;
for (; m_left >= 8; m_left -= 8) {
kernel_8x2(a_, sb, c_, alphar, alphai, K, LDC);
a_ += 16 * K;
c_ += 16;
}
if (m_left >= 4) {
m_left -= 4;
kernel_4x2(a_, sb, c_, alphar, alphai, K, LDC);
a_ += 8 * K;
c_ += 8;
}
if (m_left >= 2) {
m_left -= 2;
kernel_2x2(a_, sb, c_, alphar, alphai, K, LDC);
a_ += 4 * K;
c_ += 4;
}
if (m_left) {
kernel_1x2(a_, sb, c_, alphar, alphai, K, LDC);
}
C += 4 * LDC;
sb += 4 * K;
}
if (n_left) {
BLASLONG m_left = M;
for (; m_left >= 8; m_left -= 8) {
kernel_8x1(sa, sb, C, alphar, alphai, K);
sa += 16 * K;
C += 16;
}
if (m_left >= 4) {
m_left -= 4;
kernel_4x1(sa, sb, C, alphar, alphai, K);
sa += 8 * K;
C += 8;
}
if (m_left >= 2) {
m_left -= 2;
kernel_2x1(sa, sb, C, alphar, alphai, K);
sa += 4 * K;
C += 4;
}
if (m_left) {
kernel_1x1(sa, sb, C, alphar, alphai, K);
}
}
return 0;
}


@ -0,0 +1,890 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <arm_neon.h>
/**********************************************************
* Function: dgemm_kernel_arm_cortex_a53_4x4_m4n12
* Operation: C[4][12] += alpha * sa[4][K] * sb[K][12]
* Matrix orders:
* sa: column-major (leading dimension == 4)
* sb: 3 concatenated row-major 4-column submatrices
* C: column-major (leading dimension == LDC)
*********************************************************/
static inline void dgemm_kernel_arm_cortex_a53_4x4_m4n12(
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
/** prefetch the 4x12 block of matrix C for read-modify-write access */
__asm__ __volatile__(
"mov x0,%[C]\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]\n\t"
::[C]"r"(C), [LDC]"r"(LDC):"x0");
/** 3 pointers to 3 submatrices of sb respectively */
const FLOAT *b1_ = sb;
const FLOAT *b2_ = sb + K * 4;
const FLOAT *b3_ = sb + K * 8;
/** register mapping of 4x12 elements of C, row-id ==> coordinate-M, column-id ==> coordinate-N */
/** v8.d[0] v10.d[0] v12.d[0] v14.d[0] v16.d[0] v18.d[0] v20.d[0] v22.d[0] v24.d[0] v26.d[0] v28.d[0] v30.d[0] */
/** v8.d[1] v10.d[1] v12.d[1] v14.d[1] v16.d[1] v18.d[1] v20.d[1] v22.d[1] v24.d[1] v26.d[1] v28.d[1] v30.d[1] */
/** v9.d[0] v11.d[0] v13.d[0] v15.d[0] v17.d[0] v19.d[0] v21.d[0] v23.d[0] v25.d[0] v27.d[0] v29.d[0] v31.d[0] */
/** v9.d[1] v11.d[1] v13.d[1] v15.d[1] v17.d[1] v19.d[1] v21.d[1] v23.d[1] v25.d[1] v27.d[1] v29.d[1] v31.d[1] */
__asm__ __volatile__(
"cmp %[K],#0\n\t"
/** fill registers holding elements of C with 0.0 */
"movi v8.16b,#0; movi v9.16b,#0; movi v10.16b,#0; movi v11.16b,#0\n\t"
"movi v12.16b,#0; movi v13.16b,#0; movi v14.16b,#0; movi v15.16b,#0\n\t"
"movi v16.16b,#0; movi v17.16b,#0; movi v18.16b,#0; movi v19.16b,#0\n\t"
"movi v20.16b,#0; movi v21.16b,#0; movi v22.16b,#0; movi v23.16b,#0\n\t"
"movi v24.16b,#0; movi v25.16b,#0; movi v26.16b,#0; movi v27.16b,#0\n\t"
"movi v28.16b,#0; movi v29.16b,#0; movi v30.16b,#0; movi v31.16b,#0\n\t"
"beq 4f; cmp %[K],#2\n\t"
/** register v0-v3 for loading A, v4-v7 for loading B, x0 for transporting data */
"ldp q0,q1,[%[sa]]; ldp q4,q5,[%[b1_]]\n\t"
"ldr d6,[%[b2_]]; ldr x0,[%[b2_],#8]\n\t"
"blt 3f; beq 2f\n\t"
"1:\n\t"
/** main loop with unroll_k = 2, scheduled specifically for the Cortex-A53 NEON pipeline */
"ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t"
"fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t"
"fmla v9.2d,v1.2d,v4.d[0]; prfm pldl1keep,[%[sa],#128]\n\t"
"fmla v10.2d,v0.2d,v4.d[1]\n\t"
"ldr d2,[%[sa],#32]; fmov v7.d[1],x0\n\t"
"fmla v11.2d,v1.2d,v4.d[1]; ldr x0,[%[sa],#40]\n\t"
"fmla v12.2d,v0.2d,v5.d[0]\n\t"
"fmla v13.2d,v1.2d,v5.d[0]\n\t"
"ldr d4,[%[b3_]]; fmov v2.d[1],x0\n\t"
"fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t"
"fmla v15.2d,v1.2d,v5.d[1]\n\t"
"fmla v16.2d,v0.2d,v6.d[0]\n\t"
"ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t"
"fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t"
"fmla v18.2d,v0.2d,v6.d[1]\n\t"
"fmla v19.2d,v1.2d,v6.d[1]\n\t"
"ldr d3,[%[sa],#48]; fmov v5.d[1],x0\n\t"
"fmla v20.2d,v0.2d,v7.d[0]; ldr x0,[%[sa],#56]\n\t"
"fmla v21.2d,v1.2d,v7.d[0]; add %[sa],%[sa],#64\n\t"
"fmla v22.2d,v0.2d,v7.d[1]\n\t"
"ldr d6,[%[b1_],#32]; fmov v3.d[1],x0\n\t"
"fmla v23.2d,v1.2d,v7.d[1]; ldr x0,[%[b1_],#40]\n\t"
"fmla v24.2d,v0.2d,v4.d[0]; prfm pldl1keep,[%[b1_],#128]\n\t"
"fmla v25.2d,v1.2d,v4.d[0]\n\t"
"ldr d7,[%[b1_],#48]; fmov v6.d[1],x0\n\t"
"fmla v26.2d,v0.2d,v4.d[1]; ldr x0,[%[b1_],#56]\n\t"
"fmla v27.2d,v1.2d,v4.d[1]; add %[b1_],%[b1_],#64\n\t"
"fmla v28.2d,v0.2d,v5.d[0]\n\t"
"ldr d4,[%[b2_],#32]; fmov v7.d[1],x0\n\t"
"fmla v29.2d,v1.2d,v5.d[0]; ldr x0,[%[b2_],#40]\n\t"
"fmla v30.2d,v0.2d,v5.d[1]; prfm pldl1keep,[%[b2_],#128]\n\t"
"fmla v31.2d,v1.2d,v5.d[1]\n\t"
"ldr d0,[%[sa]]; fmov v4.d[1],x0\n\t"
"fmla v8.2d,v2.2d,v6.d[0]; ldr x0,[%[sa],#8]\n\t"
"fmla v9.2d,v3.2d,v6.d[0]\n\t"
"fmla v10.2d,v2.2d,v6.d[1]\n\t"
"ldr d5,[%[b2_],#48]; fmov v0.d[1],x0\n\t"
"fmla v11.2d,v3.2d,v6.d[1]; ldr x0,[%[b2_],#56]\n\t"
"fmla v12.2d,v2.2d,v7.d[0]; add %[b2_],%[b2_],#64\n\t"
"fmla v13.2d,v3.2d,v7.d[0]\n\t"
"ldr d6,[%[b3_],#32]; fmov v5.d[1],x0\n\t"
"fmla v14.2d,v2.2d,v7.d[1]; ldr x0,[%[b3_],#40]\n\t"
"fmla v15.2d,v3.2d,v7.d[1]; prfm pldl1keep,[%[b3_],#128]\n\t"
"fmla v16.2d,v2.2d,v4.d[0]\n\t"
"ldr d7,[%[b3_],#48]; fmov v6.d[1],x0\n\t"
"fmla v17.2d,v3.2d,v4.d[0]; ldr x0,[%[b3_],#56]\n\t"
"fmla v18.2d,v2.2d,v4.d[1]; add %[b3_],%[b3_],#64\n\t"
"fmla v19.2d,v3.2d,v4.d[1]\n\t"
"ldr d1,[%[sa],#16]; fmov v7.d[1],x0\n\t"
"fmla v20.2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#24]\n\t"
"fmla v21.2d,v3.2d,v5.d[0]\n\t"
"fmla v22.2d,v2.2d,v5.d[1]\n\t"
"ldr d4,[%[b1_]]; fmov v1.d[1],x0\n\t"
"fmla v23.2d,v3.2d,v5.d[1]; ldr x0,[%[b1_],#8]\n\t"
"fmla v24.2d,v2.2d,v6.d[0]\n\t"
"fmla v25.2d,v3.2d,v6.d[0]\n\t"
"ldr d5,[%[b1_],#16]; fmov v4.d[1],x0\n\t"
"fmla v26.2d,v2.2d,v6.d[1]; ldr x0,[%[b1_],#24]\n\t"
"fmla v27.2d,v3.2d,v6.d[1]; sub %[K],%[K],#2\n\t"
"fmla v28.2d,v2.2d,v7.d[0]\n\t"
"ldr d6,[%[b2_]]; fmov v5.d[1],x0\n\t"
"fmla v29.2d,v3.2d,v7.d[0]; ldr x0,[%[b2_],#8]\n\t"
"fmla v30.2d,v2.2d,v7.d[1]; cmp %[K],#2\n\t"
"fmla v31.2d,v3.2d,v7.d[1]\n\t"
"bgt 1b; blt 3f\n\t"
"2:\n\t"
/** tail part with k = 2 */
"ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t"
"fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t"
"fmla v9.2d,v1.2d,v4.d[0]; prfm pldl1keep,[%[sa],#128]\n\t"
"fmla v10.2d,v0.2d,v4.d[1]\n\t"
"ldr d2,[%[sa],#32]; fmov v7.d[1],x0\n\t"
"fmla v11.2d,v1.2d,v4.d[1]; ldr x0,[%[sa],#40]\n\t"
"fmla v12.2d,v0.2d,v5.d[0]\n\t"
"fmla v13.2d,v1.2d,v5.d[0]\n\t"
"ldr d4,[%[b3_]]; fmov v2.d[1],x0\n\t"
"fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t"
"fmla v15.2d,v1.2d,v5.d[1]\n\t"
"fmla v16.2d,v0.2d,v6.d[0]\n\t"
"ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t"
"fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t"
"fmla v18.2d,v0.2d,v6.d[1]\n\t"
"fmla v19.2d,v1.2d,v6.d[1]\n\t"
"ldr d3,[%[sa],#48]; fmov v5.d[1],x0\n\t"
"fmla v20.2d,v0.2d,v7.d[0]; ldr x0,[%[sa],#56]\n\t"
"fmla v21.2d,v1.2d,v7.d[0]; add %[sa],%[sa],#64\n\t"
"fmla v22.2d,v0.2d,v7.d[1]\n\t"
"ldr d6,[%[b1_],#32]; fmov v3.d[1],x0\n\t"
"fmla v23.2d,v1.2d,v7.d[1]; ldr x0,[%[b1_],#40]\n\t"
"fmla v24.2d,v0.2d,v4.d[0]\n\t"
"fmla v25.2d,v1.2d,v4.d[0]\n\t"
"ldr d7,[%[b1_],#48]; fmov v6.d[1],x0\n\t"
"fmla v26.2d,v0.2d,v4.d[1]; ldr x0,[%[b1_],#56]\n\t"
"fmla v27.2d,v1.2d,v4.d[1]; add %[b1_],%[b1_],#64\n\t"
"fmla v28.2d,v0.2d,v5.d[0]\n\t"
"ldr d4,[%[b2_],#32]; fmov v7.d[1],x0\n\t"
"fmla v29.2d,v1.2d,v5.d[0]; ldr x0,[%[b2_],#40]\n\t"
"fmla v30.2d,v0.2d,v5.d[1]\n\t"
"fmla v31.2d,v1.2d,v5.d[1]\n\t"
"fmov v4.d[1],x0\n\t"
"fmla v8.2d,v2.2d,v6.d[0]\n\t"
"fmla v9.2d,v3.2d,v6.d[0]\n\t"
"fmla v10.2d,v2.2d,v6.d[1]\n\t"
"ldr d5,[%[b2_],#48]\n\t"
"fmla v11.2d,v3.2d,v6.d[1]; ldr x0,[%[b2_],#56]\n\t"
"fmla v12.2d,v2.2d,v7.d[0]; add %[b2_],%[b2_],#64\n\t"
"fmla v13.2d,v3.2d,v7.d[0]\n\t"
"ldr d6,[%[b3_],#32]; fmov v5.d[1],x0\n\t"
"fmla v14.2d,v2.2d,v7.d[1]; ldr x0,[%[b3_],#40]\n\t"
"fmla v15.2d,v3.2d,v7.d[1]\n\t"
"fmla v16.2d,v2.2d,v4.d[0]\n\t"
"ldr d7,[%[b3_],#48]; fmov v6.d[1],x0\n\t"
"fmla v17.2d,v3.2d,v4.d[0]; ldr x0,[%[b3_],#56]\n\t"
"fmla v18.2d,v2.2d,v4.d[1]; add %[b3_],%[b3_],#64\n\t"
"fmla v19.2d,v3.2d,v4.d[1]\n\t"
"fmov v7.d[1],x0\n\t"
"fmla v20.2d,v2.2d,v5.d[0]\n\t"
"fmla v21.2d,v3.2d,v5.d[0]\n\t"
"fmla v22.2d,v2.2d,v5.d[1]\n\t"
"fmla v23.2d,v3.2d,v5.d[1]\n\t"
"fmla v24.2d,v2.2d,v6.d[0]\n\t"
"fmla v25.2d,v3.2d,v6.d[0]\n\t"
"fmla v26.2d,v2.2d,v6.d[1]\n\t"
"fmla v27.2d,v3.2d,v6.d[1]; sub %[K],%[K],#2\n\t"
"fmla v28.2d,v2.2d,v7.d[0]\n\t"
"fmla v29.2d,v3.2d,v7.d[0]\n\t"
"fmla v30.2d,v2.2d,v7.d[1]\n\t"
"fmla v31.2d,v3.2d,v7.d[1]\n\t"
"b 4f\n\t"
"3:\n\t"
/** tail part with k = 1 */
"ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t"
"fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t"
"fmla v9.2d,v1.2d,v4.d[0]; add %[b2_],%[b2_],#32\n\t"
"fmla v10.2d,v0.2d,v4.d[1]\n\t"
"fmov v7.d[1],x0\n\t"
"fmla v11.2d,v1.2d,v4.d[1]; add %[sa],%[sa],#32\n\t"
"fmla v12.2d,v0.2d,v5.d[0]; add %[b1_],%[b1_],#32\n\t"
"fmla v13.2d,v1.2d,v5.d[0]; sub %[K],%[K],#1\n\t"
"ldr d4,[%[b3_]]\n\t"
"fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t"
"fmla v15.2d,v1.2d,v5.d[1]\n\t"
"fmla v16.2d,v0.2d,v6.d[0]\n\t"
"ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t"
"fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t"
"fmla v18.2d,v0.2d,v6.d[1]; add %[b3_],%[b3_],#32\n\t"
"fmla v19.2d,v1.2d,v6.d[1]\n\t"
"fmov v5.d[1],x0\n\t"
"fmla v20.2d,v0.2d,v7.d[0]\n\t"
"fmla v21.2d,v1.2d,v7.d[0]\n\t"
"fmla v22.2d,v0.2d,v7.d[1]\n\t"
"fmla v23.2d,v1.2d,v7.d[1]\n\t"
"fmla v24.2d,v0.2d,v4.d[0]\n\t"
"fmla v25.2d,v1.2d,v4.d[0]\n\t"
"fmla v26.2d,v0.2d,v4.d[1]\n\t"
"fmla v27.2d,v1.2d,v4.d[1]\n\t"
"fmla v28.2d,v0.2d,v5.d[0]\n\t"
"fmla v29.2d,v1.2d,v5.d[0]\n\t"
"fmla v30.2d,v0.2d,v5.d[1]\n\t"
"fmla v31.2d,v1.2d,v5.d[1]\n\t"
/** store 4x12 elements to C */
"4:\n\t"
"ldr d0,%[alpha]; add x0,%[C],%[LDC],LSL #3\n\t"
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t"
"fmla v1.2d,v8.2d,v0.d[0]; fmla v2.2d,v9.2d,v0.d[0]\n\t"
"fmla v3.2d,v10.2d,v0.d[0]; fmla v4.2d,v11.2d,v0.d[0]\n\t"
"stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t"
"stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t"
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t"
"fmla v1.2d,v12.2d,v0.d[0]; fmla v2.2d,v13.2d,v0.d[0]\n\t"
"fmla v3.2d,v14.2d,v0.d[0]; fmla v4.2d,v15.2d,v0.d[0]\n\t"
"stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t"
"stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t"
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t"
"fmla v1.2d,v16.2d,v0.d[0]; fmla v2.2d,v17.2d,v0.d[0]\n\t"
"fmla v3.2d,v18.2d,v0.d[0]; fmla v4.2d,v19.2d,v0.d[0]\n\t"
"stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t"
"stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t"
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t"
"fmla v1.2d,v20.2d,v0.d[0]; fmla v2.2d,v21.2d,v0.d[0]\n\t"
"fmla v3.2d,v22.2d,v0.d[0]; fmla v4.2d,v23.2d,v0.d[0]\n\t"
"stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t"
"stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t"
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t"
"fmla v1.2d,v24.2d,v0.d[0]; fmla v2.2d,v25.2d,v0.d[0]\n\t"
"fmla v3.2d,v26.2d,v0.d[0]; fmla v4.2d,v27.2d,v0.d[0]\n\t"
"stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t"
"stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t"
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t"
"fmla v1.2d,v28.2d,v0.d[0]; fmla v2.2d,v29.2d,v0.d[0]\n\t"
"fmla v3.2d,v30.2d,v0.d[0]; fmla v4.2d,v31.2d,v0.d[0]\n\t"
"stp q1,q2,[%[C]]; stp q3,q4,[x0]\n\t"
:[sa]"+r"(sa), [b1_]"+r"(b1_), [b2_]"+r"(b2_), [b3_]"+r"(b3_), [C]"+r"(C), [K]"+r"(K)
:[LDC]"r"(LDC), [alpha]"m"(alpha)
:"cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
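/*
 * Editorial sketch (illustrative only, not used by the kernel): a plain
 * scalar reference for the operation computed by
 * dgemm_kernel_arm_cortex_a53_4x4_m4n12 above, following the packing layout
 * described in its header comment. The function name is hypothetical.
 */
static inline void dgemm_ref_m4n12(const FLOAT *sa, const FLOAT *sb, FLOAT *C,
    BLASLONG K, BLASLONG LDC, FLOAT alpha) {
  for (BLASLONG panel = 0; panel < 3; panel++) {        /* 3 packed K-by-4 panels of B */
    const FLOAT *b_ = sb + panel * 4 * K;
    for (BLASLONG n = 0; n < 4; n++) {
      for (BLASLONG m = 0; m < 4; m++) {
        FLOAT acc = 0;
        for (BLASLONG k = 0; k < K; k++)
          acc += sa[m + k * 4] * b_[n + k * 4];         /* sa column-major, panel row-major */
        C[m + (panel * 4 + n) * LDC] += alpha * acc;    /* C column-major, leading dim LDC */
      }
    }
  }
}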
/**********************************************************
* Operation:
C[0] += alpha * up[0]; C[1] += alpha * up[1];
C[2] += alpha * down[0]; C[3] += alpha * down[1];
*********************************************************/
static inline void dgemm_store_m4n1(FLOAT *C, float64x2_t up, float64x2_t down, FLOAT alpha) {
float64x2_t t1 = vld1q_f64(C), t2 = vld1q_f64(C + 2);
t1 = vfmaq_n_f64(t1, up, alpha);
t2 = vfmaq_n_f64(t2, down, alpha);
vst1q_f64(C, t1);
vst1q_f64(C + 2, t2);
}
/**********************************************************
* Function: dgemm_kernel_arm64_4x4_m4n8
* Operation: C[4][8] += alpha * sa[4][K] * sb[K][8]
* Matrix orders:
* sa: column-major (leading dimension == 4)
* sb: 2 concatenated row-major 4-column submatrices
* C: column-major (leading dimension == LDC)
*********************************************************/
static inline void dgemm_kernel_arm64_4x4_m4n8(
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
const FLOAT *b1_ = sb;
const FLOAT *b2_ = sb + K * 4;
/** register naming: c + m_id + n_id, m_id=1~2, n_id=1~8 */
float64x2_t c11, c12, c13, c14, c15, c16, c17, c18;
float64x2_t c21, c22, c23, c24, c25, c26, c27, c28;
c11 = c12 = c13 = c14 = c15 = c16 = c17 = c18 = vdupq_n_f64(0);
c21 = c22 = c23 = c24 = c25 = c26 = c27 = c28 = vdupq_n_f64(0);
for (; K; K--) {
float64x2_t a1 = vld1q_f64(sa);
float64x2_t a2 = vld1q_f64(sa + 2); sa += 4;
float64x2_t b1 = vld1q_f64(b1_);
c11 = vfmaq_laneq_f64(c11, a1, b1, 0);
c21 = vfmaq_laneq_f64(c21, a2, b1, 0);
c12 = vfmaq_laneq_f64(c12, a1, b1, 1);
c22 = vfmaq_laneq_f64(c22, a2, b1, 1);
float64x2_t b2 = vld1q_f64(b1_ + 2); b1_ += 4;
c13 = vfmaq_laneq_f64(c13, a1, b2, 0);
c23 = vfmaq_laneq_f64(c23, a2, b2, 0);
c14 = vfmaq_laneq_f64(c14, a1, b2, 1);
c24 = vfmaq_laneq_f64(c24, a2, b2, 1);
float64x2_t b3 = vld1q_f64(b2_);
c15 = vfmaq_laneq_f64(c15, a1, b3, 0);
c25 = vfmaq_laneq_f64(c25, a2, b3, 0);
c16 = vfmaq_laneq_f64(c16, a1, b3, 1);
c26 = vfmaq_laneq_f64(c26, a2, b3, 1);
float64x2_t b4 = vld1q_f64(b2_ + 2); b2_ += 4;
c17 = vfmaq_laneq_f64(c17, a1, b4, 0);
c27 = vfmaq_laneq_f64(c27, a2, b4, 0);
c18 = vfmaq_laneq_f64(c18, a1, b4, 1);
c28 = vfmaq_laneq_f64(c28, a2, b4, 1);
}
dgemm_store_m4n1(C, c11, c21, alpha); C += LDC;
dgemm_store_m4n1(C, c12, c22, alpha); C += LDC;
dgemm_store_m4n1(C, c13, c23, alpha); C += LDC;
dgemm_store_m4n1(C, c14, c24, alpha); C += LDC;
dgemm_store_m4n1(C, c15, c25, alpha); C += LDC;
dgemm_store_m4n1(C, c16, c26, alpha); C += LDC;
dgemm_store_m4n1(C, c17, c27, alpha); C += LDC;
dgemm_store_m4n1(C, c18, c28, alpha);
}
/**********************************************************
* Function: dgemm_kernel_arm64_4x4_m4n4
* Operation: C[4][4] += alpha * sa[4][K] * sb[K][4]
* Matrix orders:
* sa: column-major (leading dimension == 4)
* sb: row-major (leading dimension == 4)
* C: column-major (leading dimension == LDC)
*********************************************************/
static inline void dgemm_kernel_arm64_4x4_m4n4(
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c11, c21, c12, c22, c13, c23, c14, c24;
c11 = c21 = c12 = c22 = c13 = c23 = c14 = c24 = vdupq_n_f64(0);
for (; K; K--) {
float64x2_t a1 = vld1q_f64(sa);
float64x2_t a2 = vld1q_f64(sa + 2); sa += 4;
float64x2_t b1 = vld1q_f64(sb);
float64x2_t b2 = vld1q_f64(sb + 2); sb += 4;
c11 = vfmaq_laneq_f64(c11, a1, b1, 0);
c21 = vfmaq_laneq_f64(c21, a2, b1, 0);
c12 = vfmaq_laneq_f64(c12, a1, b1, 1);
c22 = vfmaq_laneq_f64(c22, a2, b1, 1);
c13 = vfmaq_laneq_f64(c13, a1, b2, 0);
c23 = vfmaq_laneq_f64(c23, a2, b2, 0);
c14 = vfmaq_laneq_f64(c14, a1, b2, 1);
c24 = vfmaq_laneq_f64(c24, a2, b2, 1);
}
dgemm_store_m4n1(C, c11, c21, alpha); C += LDC;
dgemm_store_m4n1(C, c12, c22, alpha); C += LDC;
dgemm_store_m4n1(C, c13, c23, alpha); C += LDC;
dgemm_store_m4n1(C, c14, c24, alpha);
}
static inline void dgemm_kernel_arm64_4x4_m4n2(
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c11_1, c11_2, c21_1, c21_2, c12_1, c12_2, c22_1, c22_2;
c11_1 = c11_2 = c21_1 = c21_2 = c12_1 = c12_2 = c22_1 = c22_2 = vdupq_n_f64(0);
for (; K > 1; K -= 2) {
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4;
float64x2_t a1_1 = vld1q_f64(sa), a2_1 = vld1q_f64(sa + 2),
a1_2 = vld1q_f64(sa + 4), a2_2 = vld1q_f64(sa + 6); sa += 8;
c11_1 = vfmaq_laneq_f64(c11_1, a1_1, b1, 0);
c21_1 = vfmaq_laneq_f64(c21_1, a2_1, b1, 0);
c12_1 = vfmaq_laneq_f64(c12_1, a1_1, b1, 1);
c22_1 = vfmaq_laneq_f64(c22_1, a2_1, b1, 1);
c11_2 = vfmaq_laneq_f64(c11_2, a1_2, b2, 0);
c21_2 = vfmaq_laneq_f64(c21_2, a2_2, b2, 0);
c12_2 = vfmaq_laneq_f64(c12_2, a1_2, b2, 1);
c22_2 = vfmaq_laneq_f64(c22_2, a2_2, b2, 1);
}
c11_1 = vaddq_f64(c11_1, c11_2);
c21_1 = vaddq_f64(c21_1, c21_2);
c12_1 = vaddq_f64(c12_1, c12_2);
c22_1 = vaddq_f64(c22_1, c22_2);
if (K) {
float64x2_t b1 = vld1q_f64(sb); sb += 2;
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4;
c11_1 = vfmaq_laneq_f64(c11_1, a1, b1, 0);
c21_1 = vfmaq_laneq_f64(c21_1, a2, b1, 0);
c12_1 = vfmaq_laneq_f64(c12_1, a1, b1, 1);
c22_1 = vfmaq_laneq_f64(c22_1, a2, b1, 1);
}
dgemm_store_m4n1(C, c11_1, c21_1, alpha); C += LDC;
dgemm_store_m4n1(C, c12_1, c22_1, alpha);
}
static inline void dgemm_kernel_arm64_4x4_m4n1(
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c11_1, c11_2, c21_1, c21_2;
c11_1 = c11_2 = c21_1 = c21_2 = vdupq_n_f64(0);
for (; K > 1; K -= 2) {
float64x2_t b1 = vld1q_f64(sb); sb += 2;
c11_1 = vfmaq_laneq_f64(c11_1, vld1q_f64(sa), b1, 0);
c21_1 = vfmaq_laneq_f64(c21_1, vld1q_f64(sa + 2), b1, 0);
c11_2 = vfmaq_laneq_f64(c11_2, vld1q_f64(sa + 4), b1, 1);
c21_2 = vfmaq_laneq_f64(c21_2, vld1q_f64(sa + 6), b1, 1);
sa += 8;
}
c11_1 = vaddq_f64(c11_1, c11_2);
c21_1 = vaddq_f64(c21_1, c21_2);
if (K) {
double b1 = *sb++;
c11_1 = vfmaq_n_f64(c11_1, vld1q_f64(sa), b1);
c21_1 = vfmaq_n_f64(c21_1, vld1q_f64(sa + 2), b1);
sa += 4;
}
dgemm_store_m4n1(C, c11_1, c21_1, alpha);
}
static inline void dgemm_kernel_arm64_4x4_m2n12(
const FLOAT *sa, const FLOAT *sb, FLOAT *c,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c01, c02, c03, c04, c11, c12, c13, c14, c21, c22, c23, c24;
c01 = c02 = c03 = c04 = c11 = c12 = c13 = c14 =
c21 = c22 = c23 = c24 = vdupq_n_f64(0);
const FLOAT *b1_ = sb;
const FLOAT *b2_ = sb + 4 * K;
const FLOAT *b3_ = b2_ + 4 * K;
for (; K; K--) {
const float64x2_t a1 = vld1q_f64(sa); sa += 2;
float64x2_t b1 = vld1q_f64(b1_), b2 = vld1q_f64(b1_ + 2); b1_ += 4;
c01 = vfmaq_laneq_f64(c01, a1, b1, 0);
c02 = vfmaq_laneq_f64(c02, a1, b1, 1);
c03 = vfmaq_laneq_f64(c03, a1, b2, 0);
c04 = vfmaq_laneq_f64(c04, a1, b2, 1);
b1 = vld1q_f64(b2_); b2 = vld1q_f64(b2_ + 2); b2_ += 4;
c11 = vfmaq_laneq_f64(c11, a1, b1, 0);
c12 = vfmaq_laneq_f64(c12, a1, b1, 1);
c13 = vfmaq_laneq_f64(c13, a1, b2, 0);
c14 = vfmaq_laneq_f64(c14, a1, b2, 1);
b1 = vld1q_f64(b3_); b2 = vld1q_f64(b3_ + 2); b3_ += 4;
c21 = vfmaq_laneq_f64(c21, a1, b1, 0);
c22 = vfmaq_laneq_f64(c22, a1, b1, 1);
c23 = vfmaq_laneq_f64(c23, a1, b2, 0);
c24 = vfmaq_laneq_f64(c24, a1, b2, 1);
}
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c01, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c02, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c03, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c04, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c11, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c12, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c13, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c14, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c21, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c22, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c23, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c24, alpha));
}
static inline void dgemm_kernel_arm64_4x4_m2n8(
const FLOAT *sa, const FLOAT *sb, FLOAT *c,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c01, c02, c03, c04, c11, c12, c13, c14;
c01 = c02 = c03 = c04 = c11 = c12 = c13 = c14 = vdupq_n_f64(0);
const FLOAT *b1_ = sb;
const FLOAT *b2_ = sb + 4 * K;
for (; K; K--) {
const float64x2_t a1 = vld1q_f64(sa); sa += 2;
float64x2_t b1 = vld1q_f64(b1_), b2 = vld1q_f64(b1_ + 2); b1_ += 4;
c01 = vfmaq_laneq_f64(c01, a1, b1, 0);
c02 = vfmaq_laneq_f64(c02, a1, b1, 1);
c03 = vfmaq_laneq_f64(c03, a1, b2, 0);
c04 = vfmaq_laneq_f64(c04, a1, b2, 1);
b1 = vld1q_f64(b2_); b2 = vld1q_f64(b2_ + 2); b2_ += 4;
c11 = vfmaq_laneq_f64(c11, a1, b1, 0);
c12 = vfmaq_laneq_f64(c12, a1, b1, 1);
c13 = vfmaq_laneq_f64(c13, a1, b2, 0);
c14 = vfmaq_laneq_f64(c14, a1, b2, 1);
}
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c01, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c02, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c03, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c04, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c11, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c12, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c13, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c14, alpha));
}
static inline void dgemm_kernel_arm64_4x4_m2n4(
const FLOAT *sa, const FLOAT *sb, FLOAT *c,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c1_1, c1_2, c2_1, c2_2, c3_1, c3_2, c4_1, c4_2;
c1_1 = c1_2 = c2_1 = c2_2 = c3_1 = c3_2 = c4_1 = c4_2 = vdupq_n_f64(0);
for (; K > 1; K -= 2) {
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4;
float64x2_t b1_1 = vld1q_f64(sb), b2_1 = vld1q_f64(sb + 2);
float64x2_t b1_2 = vld1q_f64(sb + 4), b2_2 = vld1q_f64(sb + 6); sb += 8;
c1_1 = vfmaq_laneq_f64(c1_1, a1, b1_1, 0);
c2_1 = vfmaq_laneq_f64(c2_1, a1, b1_1, 1);
c3_1 = vfmaq_laneq_f64(c3_1, a1, b2_1, 0);
c4_1 = vfmaq_laneq_f64(c4_1, a1, b2_1, 1);
c1_2 = vfmaq_laneq_f64(c1_2, a2, b1_2, 0);
c2_2 = vfmaq_laneq_f64(c2_2, a2, b1_2, 1);
c3_2 = vfmaq_laneq_f64(c3_2, a2, b2_2, 0);
c4_2 = vfmaq_laneq_f64(c4_2, a2, b2_2, 1);
}
c1_1 = vaddq_f64(c1_1, c1_2);
c2_1 = vaddq_f64(c2_1, c2_2);
c3_1 = vaddq_f64(c3_1, c3_2);
c4_1 = vaddq_f64(c4_1, c4_2);
if (K) {
float64x2_t a1 = vld1q_f64(sa); sa += 2;
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4;
c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0);
c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1);
c3_1 = vfmaq_laneq_f64(c3_1, a1, b2, 0);
c4_1 = vfmaq_laneq_f64(c4_1, a1, b2, 1);
}
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1_1, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c2_1, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c3_1, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c4_1, alpha));
}
static inline void dgemm_kernel_arm64_4x4_m2n2(
const FLOAT *sa, const FLOAT *sb, FLOAT *c,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c1_1, c1_2, c2_1, c2_2;
c1_1 = c1_2 = c2_1 = c2_2 = vdupq_n_f64(0);
for (; K > 1; K -= 2) {
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4;
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4;
c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0);
c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1);
c1_2 = vfmaq_laneq_f64(c1_2, a2, b2, 0);
c2_2 = vfmaq_laneq_f64(c2_2, a2, b2, 1);
}
c1_1 = vaddq_f64(c1_1, c1_2);
c2_1 = vaddq_f64(c2_1, c2_2);
if (K) {
float64x2_t a1 = vld1q_f64(sa); sa += 2;
float64x2_t b1 = vld1q_f64(sb); sb += 2;
c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0);
c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1);
}
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1_1, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c2_1, alpha));
}
static inline void dgemm_kernel_arm64_4x4_m2n1(
const FLOAT *sa, const FLOAT *sb, FLOAT *c,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = vdupq_n_f64(0);
for (; K > 3; K -= 4) {
float64x2_t b12 = vld1q_f64(sb), b34 = vld1q_f64(sb + 2); sb += 4;
c1 = vfmaq_laneq_f64(c1, vld1q_f64(sa), b12, 0);
c2 = vfmaq_laneq_f64(c2, vld1q_f64(sa + 2), b12, 1);
c3 = vfmaq_laneq_f64(c3, vld1q_f64(sa + 4), b34, 0);
c4 = vfmaq_laneq_f64(c4, vld1q_f64(sa + 6), b34, 1);
sa += 8;
}
c1 = vaddq_f64(c1, c2);
c3 = vaddq_f64(c3, c4);
c1 = vaddq_f64(c1, c3);
for (; K; K--) {
c1 = vfmaq_n_f64(c1, vld1q_f64(sa), *sb++);
sa += 2;
}
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1, alpha));
}
static inline void dgemm_store_m1n2(double *C, float64x2_t vc,
double alpha, BLASLONG LDC) {
double c0 = vgetq_lane_f64(vc, 0);
double c1 = vgetq_lane_f64(vc, 1);
C[0] += c0 * alpha;
C[LDC] += c1 * alpha;
}
static inline void dgemm_kernel_arm64_4x4_m1n12(
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c1, c2, c3, c4, c5, c6;
c1 = c2 = c3 = c4 = c5 = c6 = vdupq_n_f64(0);
const double *b1_ = sb;
const double *b2_ = sb + 4 * K;
const double *b3_ = b2_ + 4 * K;
for (; K; K--) {
const double a1 = *sa++;
c1 = vfmaq_n_f64(c1, vld1q_f64(b1_), a1);
c2 = vfmaq_n_f64(c2, vld1q_f64(b1_ + 2), a1); b1_ += 4;
c3 = vfmaq_n_f64(c3, vld1q_f64(b2_), a1);
c4 = vfmaq_n_f64(c4, vld1q_f64(b2_ + 2), a1); b2_ += 4;
c5 = vfmaq_n_f64(c5, vld1q_f64(b3_), a1);
c6 = vfmaq_n_f64(c6, vld1q_f64(b3_ + 2), a1); b3_ += 4;
}
dgemm_store_m1n2(C, c1, alpha, LDC); C += LDC * 2;
dgemm_store_m1n2(C, c2, alpha, LDC); C += LDC * 2;
dgemm_store_m1n2(C, c3, alpha, LDC); C += LDC * 2;
dgemm_store_m1n2(C, c4, alpha, LDC); C += LDC * 2;
dgemm_store_m1n2(C, c5, alpha, LDC); C += LDC * 2;
dgemm_store_m1n2(C, c6, alpha, LDC);
}
static inline void dgemm_kernel_arm64_4x4_m1n8(
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = vdupq_n_f64(0);
const double *b1_ = sb;
const double *b2_ = sb + 4 * K;
for (; K; K--) {
const double a1 = *sa++;
c1 = vfmaq_n_f64(c1, vld1q_f64(b1_), a1);
c2 = vfmaq_n_f64(c2, vld1q_f64(b1_ + 2), a1); b1_ += 4;
c3 = vfmaq_n_f64(c3, vld1q_f64(b2_), a1);
c4 = vfmaq_n_f64(c4, vld1q_f64(b2_ + 2), a1); b2_ += 4;
}
dgemm_store_m1n2(C, c1, alpha, LDC); C += LDC * 2;
dgemm_store_m1n2(C, c2, alpha, LDC); C += LDC * 2;
dgemm_store_m1n2(C, c3, alpha, LDC); C += LDC * 2;
dgemm_store_m1n2(C, c4, alpha, LDC);
}
static inline void dgemm_kernel_arm64_4x4_m1n4(
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c1_1, c1_2, c2_1, c2_2;
c1_1 = c1_2 = c2_1 = c2_2 = vdupq_n_f64(0);
for (; K > 1; K -= 2) {
float64x2_t a1 = vld1q_f64(sa); sa += 2;
c1_1 = vfmaq_laneq_f64(c1_1, vld1q_f64(sb), a1, 0);
c2_1 = vfmaq_laneq_f64(c2_1, vld1q_f64(sb + 2), a1, 0);
c1_2 = vfmaq_laneq_f64(c1_2, vld1q_f64(sb + 4), a1, 1);
c2_2 = vfmaq_laneq_f64(c2_2, vld1q_f64(sb + 6), a1, 1); sb += 8;
}
c1_1 = vaddq_f64(c1_1, c1_2);
c2_1 = vaddq_f64(c2_1, c2_2);
if (K) {
double a1 = *sa++;
c1_1 = vfmaq_n_f64(c1_1, vld1q_f64(sb), a1);
c2_1 = vfmaq_n_f64(c2_1, vld1q_f64(sb + 2), a1);
sb += 4;
}
dgemm_store_m1n2(C, c1_1, alpha, LDC); C += LDC * 2;
dgemm_store_m1n2(C, c2_1, alpha, LDC);
}
static inline void dgemm_kernel_arm64_4x4_m1n2(
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = vdupq_n_f64(0);
for (; K > 3; K -= 4) {
float64x2_t a12 = vld1q_f64(sa), a34 = vld1q_f64(sa + 2); sa += 4;
c1 = vfmaq_laneq_f64(c1, vld1q_f64(sb), a12, 0);
c2 = vfmaq_laneq_f64(c2, vld1q_f64(sb + 2), a12, 1);
c3 = vfmaq_laneq_f64(c3, vld1q_f64(sb + 4), a34, 0);
c4 = vfmaq_laneq_f64(c4, vld1q_f64(sb + 6), a34, 1); sb += 8;
}
c1 = vaddq_f64(c1, c2);
c3 = vaddq_f64(c3, c4);
c1 = vaddq_f64(c1, c3);
for (; K; K--) {
c1 = vfmaq_n_f64(c1, vld1q_f64(sb), *sa++);
sb += 2;
}
dgemm_store_m1n2(C, c1, alpha, LDC);
}
static inline void dgemm_kernel_arm64_4x4_m1n1(
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = vdupq_n_f64(0);
for (; K > 7; K -= 8) {
c1 = vfmaq_f64(c1, vld1q_f64(sb), vld1q_f64(sa));
c2 = vfmaq_f64(c2, vld1q_f64(sb + 2), vld1q_f64(sa + 2));
c3 = vfmaq_f64(c3, vld1q_f64(sb + 4), vld1q_f64(sa + 4));
c4 = vfmaq_f64(c4, vld1q_f64(sb + 6), vld1q_f64(sa + 6));
sa += 8; sb += 8;
}
c1 = vaddq_f64(c1, c2);
c3 = vaddq_f64(c3, c4);
c1 = vaddq_f64(c1, c3);
double cs1 = vpaddd_f64(c1);
for (; K; K--) {
cs1 += (*sa++) * (*sb++);
}
C[0] += cs1 * alpha;
}
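/*
 * Editorial summary of the driver below: N is consumed in blocks of 12
 * columns (three packed K-by-4 panels of B handled by one fused kernel call
 * per M block), then at most one block of 8 or 4, then blocks of 2 and 1.
 * Inside each N block, M is consumed in blocks of 4, 2 and 1 rows, with sa
 * advancing by block_size * K doubles per M block.
 */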
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha,
FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) {
for (; N >= 12; N -= 12) {
BLASLONG m_left = M;
const FLOAT *a_ = sa;
FLOAT *c_ = C;
for (; m_left >= 4; m_left -= 4) {
dgemm_kernel_arm_cortex_a53_4x4_m4n12(a_, sb, c_, K, LDC, alpha);
c_ += 4;
a_ += 4 * K;
}
if (m_left >= 2) {
m_left -= 2;
dgemm_kernel_arm64_4x4_m2n12(a_, sb, c_, K, LDC, alpha);
c_ += 2;
a_ += 2 * K;
}
if (m_left) {
dgemm_kernel_arm64_4x4_m1n12(a_, sb, c_, K, LDC, alpha);
}
sb += 12 * K;
C += 12 * LDC;
}
if (N >= 8) {
N -= 8;
BLASLONG m_left = M;
const FLOAT *a_ = sa;
FLOAT *c_ = C;
for (; m_left >= 4; m_left -= 4) {
dgemm_kernel_arm64_4x4_m4n8(a_, sb, c_, K, LDC, alpha);
c_ += 4;
a_ += 4 * K;
}
if (m_left >= 2) {
m_left -= 2;
dgemm_kernel_arm64_4x4_m2n8(a_, sb, c_, K, LDC, alpha);
c_ += 2;
a_ += 2 * K;
}
if (m_left) {
dgemm_kernel_arm64_4x4_m1n8(a_, sb, c_, K, LDC, alpha);
}
sb += 8 * K;
C += 8 * LDC;
} else if (N >= 4) {
N -= 4;
BLASLONG m_left = M;
const FLOAT *a_ = sa;
FLOAT *c_ = C;
for (; m_left >= 4; m_left -= 4) {
dgemm_kernel_arm64_4x4_m4n4(a_, sb, c_, K, LDC, alpha);
c_ += 4;
a_ += 4 * K;
}
if (m_left >= 2) {
m_left -= 2;
dgemm_kernel_arm64_4x4_m2n4(a_, sb, c_, K, LDC, alpha);
c_ += 2;
a_ += 2 * K;
}
if (m_left) {
dgemm_kernel_arm64_4x4_m1n4(a_, sb, c_, K, LDC, alpha);
}
sb += 4 * K;
C += 4 * LDC;
}
if (N >= 2) {
N -= 2;
BLASLONG m_left = M;
const FLOAT *a_ = sa;
FLOAT *c_ = C;
for (; m_left >= 4; m_left -= 4) {
dgemm_kernel_arm64_4x4_m4n2(a_, sb, c_, K, LDC, alpha);
c_ += 4;
a_ += 4 * K;
}
if (m_left >= 2) {
m_left -= 2;
dgemm_kernel_arm64_4x4_m2n2(a_, sb, c_, K, LDC, alpha);
c_ += 2;
a_ += 2 * K;
}
if (m_left) {
dgemm_kernel_arm64_4x4_m1n2(a_, sb, c_, K, LDC, alpha);
}
sb += 2 * K;
C += 2 * LDC;
}
if (N) {
BLASLONG m_left = M;
const FLOAT *a_ = sa;
FLOAT *c_ = C;
for (; m_left >= 4; m_left -= 4) {
dgemm_kernel_arm64_4x4_m4n1(a_, sb, c_, K, LDC, alpha);
c_ += 4;
a_ += 4 * K;
}
if (m_left >= 2) {
m_left -= 2;
dgemm_kernel_arm64_4x4_m2n1(a_, sb, c_, K, LDC, alpha);
c_ += 2;
a_ += 2 * K;
}
if (m_left) {
dgemm_kernel_arm64_4x4_m1n1(a_, sb, c_, K, LDC, alpha);
}
}
return 0;
}


@ -0,0 +1,874 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
/* X0 X1 X2 s0 X3 x4 x5 x6 */
/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/
#define origM x0
#define origN x1
#define origK x2
#define origPA x3
#define origPB x4
#define pC x5
#define LDC x6
#define temp x7
#define counterL x8
#define counterI x9
#define counterJ x10
#define pB x11
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define lanes x15
#define pA x16
#define alpha x17
#define alpha0 d10
#define alphaZ z2.d
#define A_PRE_SIZE 1536
#define B_PRE_SIZE 512
#define C_PRE_SIZE 128
// 00 origM
// 01 origN
// 02 origK
// 03 origPA
// 04 origPB
// 05 pC
// 06 origLDC -> LDC
// 07 temp
// 08 counterL
// 09 counterI
// 10 counterJ
// 11 pB
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 lanes
// 16 pA
// 17
// 18 must save
// 19 must save
// 20 must save
// 21 must save
// 22 must save
// 23 must save
// 24 must save
// 25 must save
// 26 must save
// 27 must save
// 28 must save
// 29 frame
// 30 link
// 31 sp
//v00 ALPHA -> pA0_0
//v01 pA0_1
//v02 ALPHA0
//v03
//v04
//v05
//v06
//v07
//v08 must save pB0_0
//v09 must save pB0_1
//v10 must save pB0_2
//v11 must save pB0_3
//v12 must save pB0_4
//v13 must save pB0_5
//v14 must save pB0_6
//v15 must save pB0_7
//v16 must save C0
//v17 must save C1
//v18 must save C2
//v19 must save C3
//v20 must save C4
//v21 must save C5
//v22 must save C6
//v23 must save C7
/*******************************************************************************
* Macro definitions
*******************************************************************************/
.macro INITv1x8
dup z16.d, #0
dup z17.d, #0
dup z18.d, #0
dup z19.d, #0
dup z20.d, #0
dup z21.d, #0
dup z22.d, #0
dup z23.d, #0
.endm
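// Editorial note on the v1x8 compute macros below: they form a small software
// pipeline. KERNELv1x8_I preloads the first A vector into z0 (and the next
// into z1) and starts the FMLAs with z0 while refilling the B scalars z8-z15;
// KERNELv1x8_M1 computes with z0 while loading the next A vector into z1;
// KERNELv1x8_M2 computes with z1 while reloading z0; KERNELv1x8_E drains the
// last loaded vector without fetching more data. The main loops alternate
// M1/M2 so loads and multiply-accumulates overlap.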
.macro KERNELv1x8_I
ld1d z0.d, p1/z, [pA]
ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one
add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8
ld1rd z8.d, p0/z, [pB]
ld1rd z9.d, p0/z, [pB, 8]
ld1rd z10.d, p0/z, [pB, 16]
ld1rd z11.d, p0/z, [pB, 24]
ld1rd z12.d, p0/z, [pB, 32]
ld1rd z13.d, p0/z, [pB, 40]
ld1rd z14.d, p0/z, [pB, 48]
ld1rd z15.d, p0/z, [pB, 56]
add pB, pB, 64
fmla z16.d, p1/m, z0.d, z8.d
ld1rd z8.d, p0/z, [pB]
fmla z17.d, p1/m, z0.d, z9.d
ld1rd z9.d, p0/z, [pB, 8]
fmla z18.d, p1/m, z0.d, z10.d
ld1rd z10.d, p0/z, [pB, 16]
fmla z19.d, p1/m, z0.d, z11.d
ld1rd z11.d, p0/z, [pB, 24]
fmla z20.d, p1/m, z0.d, z12.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
ld1rd z12.d, p0/z, [pB, 32]
fmla z21.d, p1/m, z0.d, z13.d
ld1rd z13.d, p0/z, [pB, 40]
fmla z22.d, p1/m, z0.d, z14.d
ld1rd z14.d, p0/z, [pB, 48]
fmla z23.d, p1/m, z0.d, z15.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
ld1rd z15.d, p0/z, [pB, 56]
add pB, pB, 64
.endm
.macro KERNELv1x8_M1
ld1d z1.d, p1/z, [pA]
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
fmla z16.d, p1/m, z0.d, z8.d
ld1rd z8.d, p0/z, [pB]
fmla z17.d, p1/m, z0.d, z9.d
ld1rd z9.d, p0/z, [pB, 8]
fmla z18.d, p1/m, z0.d, z10.d
ld1rd z10.d, p0/z, [pB, 16]
fmla z19.d, p1/m, z0.d, z11.d
ld1rd z11.d, p0/z, [pB, 24]
fmla z20.d, p1/m, z0.d, z12.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
ld1rd z12.d, p0/z, [pB, 32]
fmla z21.d, p1/m, z0.d, z13.d
ld1rd z13.d, p0/z, [pB, 40]
fmla z22.d, p1/m, z0.d, z14.d
ld1rd z14.d, p0/z, [pB, 48]
fmla z23.d, p1/m, z0.d, z15.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
ld1rd z15.d, p0/z, [pB, 56]
add pB, pB, 64
.endm
.macro KERNELv1x8_M2
ld1d z0.d, p1/z, [pA]
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
fmla z16.d, p1/m, z1.d, z8.d
ld1rd z8.d, p0/z, [pB]
fmla z17.d, p1/m, z1.d, z9.d
ld1rd z9.d, p0/z, [pB, 8]
fmla z18.d, p1/m, z1.d, z10.d
ld1rd z10.d, p0/z, [pB, 16]
fmla z19.d, p1/m, z1.d, z11.d
ld1rd z11.d, p0/z, [pB, 24]
fmla z20.d, p1/m, z1.d, z12.d
ld1rd z12.d, p0/z, [pB, 32]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla z21.d, p1/m, z1.d, z13.d
ld1rd z13.d, p0/z, [pB, 40]
fmla z22.d, p1/m, z1.d, z14.d
ld1rd z14.d, p0/z, [pB, 48]
fmla z23.d, p1/m, z1.d, z15.d
ld1rd z15.d, p0/z, [pB, 56]
add pB, pB, 64
.endm
.macro KERNELv1x8_E
fmla z16.d, p1/m, z1.d, z8.d
fmla z17.d, p1/m, z1.d, z9.d
fmla z18.d, p1/m, z1.d, z10.d
fmla z19.d, p1/m, z1.d, z11.d
fmla z20.d, p1/m, z1.d, z12.d
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla z21.d, p1/m, z1.d, z13.d
fmla z22.d, p1/m, z1.d, z14.d
fmla z23.d, p1/m, z1.d, z15.d
.endm
.macro KERNELv1x8_SUB
ld1d z0.d, p1/z, [pA]
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
ld1rd z8.d, p0/z, [pB]
ld1rd z9.d, p0/z, [pB, 8]
ld1rd z10.d, p0/z, [pB, 16]
ld1rd z11.d, p0/z, [pB, 24]
ld1rd z12.d, p0/z, [pB, 32]
ld1rd z13.d, p0/z, [pB, 40]
ld1rd z14.d, p0/z, [pB, 48]
ld1rd z15.d, p0/z, [pB, 56]
add pB, pB, 64
fmla z16.d, p1/m, z0.d, z8.d
fmla z17.d, p1/m, z0.d, z9.d
fmla z18.d, p1/m, z0.d, z10.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla z19.d, p1/m, z0.d, z11.d
fmla z20.d, p1/m, z0.d, z12.d
fmla z21.d, p1/m, z0.d, z13.d
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla z22.d, p1/m, z0.d, z14.d
fmla z23.d, p1/m, z0.d, z15.d
.endm
.macro SAVEv1x8
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow1, pCRow0, LDC
ld1d z24.d, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaZ
st1d z24.d, p1, [pCRow0]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1d z25.d, p1/z, [pCRow1]
fmla z25.d, p1/m, z17.d, alphaZ
st1d z25.d, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC
ld1d z26.d, p1/z, [pCRow2]
fmla z26.d, p1/m, z18.d, alphaZ
st1d z26.d, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1d z27.d, p1/z, [pCRow1]
fmla z27.d, p1/m, z19.d, alphaZ
st1d z27.d, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC
ld1d z28.d, p1/z, [pCRow2]
fmla z28.d, p1/m, z20.d, alphaZ
st1d z28.d, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1d z29.d, p1/z, [pCRow1]
fmla z29.d, p1/m, z21.d, alphaZ
st1d z29.d, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC
ld1d z30.d, p1/z, [pCRow2]
fmla z30.d, p1/m, z22.d, alphaZ
st1d z30.d, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld1d z31.d, p1/z, [pCRow1]
fmla z31.d, p1/m, z23.d, alphaZ
st1d z31.d, p1, [pCRow1]
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
.endm
/******************************************************************************/
.macro INITv1x4
dup z16.d, #0
dup z17.d, #0
dup z18.d, #0
dup z19.d, #0
.endm
.macro KERNELv1x4_SUB
ld1d z0.d, p1/z, [pA]
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
ld1rd z8.d, p0/z, [pB]
ld1rd z9.d, p0/z, [pB, 8]
ld1rd z10.d, p0/z, [pB, 16]
ld1rd z11.d, p0/z, [pB, 24]
add pB, pB, 32
fmla z16.d, p1/m, z0.d, z8.d
fmla z17.d, p1/m, z0.d, z9.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla z18.d, p1/m, z0.d, z10.d
fmla z19.d, p1/m, z0.d, z11.d
.endm
.macro SAVEv1x4
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow1, pCRow0, LDC
ld1d z24.d, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaZ
st1d z24.d, p1, [pCRow0]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1d z25.d, p1/z, [pCRow1]
fmla z25.d, p1/m, z17.d, alphaZ
st1d z25.d, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC
ld1d z26.d, p1/z, [pCRow2]
fmla z26.d, p1/m, z18.d, alphaZ
st1d z26.d, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld1d z27.d, p1/z, [pCRow1]
fmla z27.d, p1/m, z19.d, alphaZ
st1d z27.d, p1, [pCRow1]
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
.endm
/******************************************************************************/
.macro INITv1x2
dup z16.d, #0
dup z17.d, #0
.endm
.macro KERNELv1x2_SUB
ld1d z0.d, p1/z, [pA]
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
ld1rd z8.d, p0/z, [pB]
ld1rd z9.d, p0/z, [pB, 8]
add pB, pB, 16
fmla z16.d, p1/m, z0.d, z8.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla z17.d, p1/m, z0.d, z9.d
.endm
.macro SAVEv1x2
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow1, pCRow0, LDC
ld1d z24.d, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaZ
st1d z24.d, p1, [pCRow0]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld1d z25.d, p1/z, [pCRow1]
fmla z25.d, p1/m, z17.d, alphaZ
st1d z25.d, p1, [pCRow1]
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
.endm
/******************************************************************************/
.macro INITv1x1
dup z16.d, #0
.endm
.macro KERNELv1x1_SUB
ld1d z0.d, p1/z, [pA]
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
ld1rd z8.d, p0/z, [pB]
add pB, pB, 8
fmla z16.d, p1/m, z0.d, z8.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
.endm
.macro SAVEv1x1
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld1d z24.d, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaZ
st1d z24.d, p1, [pCRow0]
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
.align 5
add sp, sp, #-(11 * 16)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alpha, d0
dup alphaZ, alpha
lsl LDC, LDC, #3 // ldc = ldc * 8
ptrue p0.d // create true predicate
mov pB, origPB
// Loop over N
mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0
ble .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/
/* Repeat this as long as there are 8 left in N */
.align 5
.Ldgemm_kernel_L8_BEGIN:
mov pCRow0, pC
add pC, pC, LDC, lsl #3 // add 8 x LDC
mov pA, origPA // pA = start of A array
.Ldgemm_kernel_L8_Mv1_BEGIN:
/* The loop over M is done in an SVE fashion; predication means the last M % SVE_LEN rows are handled in a single sweep by the same code path */
mov counterI, #0
whilelt p1.d, counterI, origM
cntp lanes, p0, p1.d // lanes contains the number of active SVE lanes in the M dimension
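// Editorial note: whilelt sets one predicate lane per remaining row
// (counterI < origM), so full vectors of rows are processed while enough
// remain and the final partial block goes through the same code path.
// cntp stores that active-lane count in "lanes", which the macros use to
// advance pA and pCRow0 by the number of rows actually processed.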
.align 5
.Ldgemm_kernel_L8_Mv1_20:
mov pB, origPB
INITv1x8 // fill with zeros
asr counterL , origK, #3 // L = K / 8
cmp counterL , #2 // are there at least 2 blocks of 8 to do?
blt .Ldgemm_kernel_L8_Mv1_32
KERNELv1x8_I
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
subs counterL, counterL, #2 // subtract 2
ble .Ldgemm_kernel_L8_Mv1_22a
.align 5
.Ldgemm_kernel_L8_Mv1_22:
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L8_Mv1_22
.align 5
.Ldgemm_kernel_L8_Mv1_22a:
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_E
b .Ldgemm_kernel_L8_Mv1_44
.align 5
.Ldgemm_kernel_L8_Mv1_32:
tst counterL, #1
ble .Ldgemm_kernel_L8_Mv1_40
KERNELv1x8_I
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_E
b .Ldgemm_kernel_L8_Mv1_44
.Ldgemm_kernel_L8_Mv1_40:
INITv1x8
.Ldgemm_kernel_L8_Mv1_44:
ands counterL , origK, #7
ble .Ldgemm_kernel_L8_Mv1_100
.align 5
.Ldgemm_kernel_L8_Mv1_46:
KERNELv1x8_SUB
subs counterL, counterL, #1
bne .Ldgemm_kernel_L8_Mv1_46
.Ldgemm_kernel_L8_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x8
.Ldgemm_kernel_L8_Mv1_END:
incd counterI
whilelt p1.d, counterI, origM //SVE instruction
cntp lanes, p0, p1.d // lanes contains the number of active SVE lanes in the M dimension
b.any .Ldgemm_kernel_L8_Mv1_20
.Ldgemm_kernel_L8_END:
lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 8 * 8
subs counterJ, counterJ , #1 // j--
bgt .Ldgemm_kernel_L8_BEGIN
/******************************************************************************/
/* Repeat the same thing if 4 left in N */
.align 5
.Ldgemm_kernel_L4_BEGIN:
mov counterJ , origN
tst counterJ , #4
ble .Ldgemm_kernel_L2_BEGIN
mov pCRow0, pC
add pC, pC, LDC, lsl #2 // add 4 x LDC
mov pA, origPA // pA = start of A array
.Ldgemm_kernel_L4_Mv1_BEGIN:
mov counterI, #0
whilelt p1.d, counterI, origM //SVE instruction
cntp lanes, p0, p1.d
.align 5
.Ldgemm_kernel_L4_Mv1_20:
mov pB, origPB
INITv1x4 // fill with zeros
asr counterL , origK, #3 // L = K / 8
cmp counterL , #0 // is there at least one block of 8 to do?
ble .Ldgemm_kernel_L4_Mv1_44
.align 5
.Ldgemm_kernel_L4_Mv1_22:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L4_Mv1_22
.Ldgemm_kernel_L4_Mv1_44:
ands counterL , origK, #7
ble .Ldgemm_kernel_L4_Mv1_100
.align 5
.Ldgemm_kernel_L4_Mv1_46:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
subs counterL, counterL, #1
bne .Ldgemm_kernel_L4_Mv1_46
.Ldgemm_kernel_L4_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x4
.Ldgemm_kernel_L4_Mv1_END:
incd counterI
whilelt p1.d, counterI, origM //SVE instruction
cntp lanes, p0, p1.d
b.any .Ldgemm_kernel_L4_Mv1_20
.Ldgemm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
/******************************************************************************/
/* Repeat the same thing if 2 left in N */
.align 5
.Ldgemm_kernel_L2_BEGIN:
mov counterJ , origN
tst counterJ , #2
ble .Ldgemm_kernel_L1_BEGIN
mov pCRow0, pC
add pC, pC, LDC, lsl #1 // add 2 x LDC
mov pA, origPA // pA = start of A array
.Ldgemm_kernel_L2_Mv1_BEGIN:
mov counterI, #0
whilelt p1.d, counterI, origM //SVE instruction
cntp lanes, p0, p1.d
.align 5
.Ldgemm_kernel_L2_Mv1_20:
mov pB, origPB
INITv1x2 // fill with zeros
asr counterL , origK, #3 // L = K / 8
cmp counterL , #0 // is there at least 8 to do?
ble .Ldgemm_kernel_L2_Mv1_44
.align 5
.Ldgemm_kernel_L2_Mv1_22:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L2_Mv1_22
.Ldgemm_kernel_L2_Mv1_44:
ands counterL , origK, #7
ble .Ldgemm_kernel_L2_Mv1_100
.align 5
.Ldgemm_kernel_L2_Mv1_46:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x2_SUB
subs counterL, counterL, #1
bne .Ldgemm_kernel_L2_Mv1_46
.Ldgemm_kernel_L2_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x2
.Ldgemm_kernel_L2_Mv1_END:
incd counterI
whilelt p1.d, counterI, origM //SVE instruction
cntp lanes, p0, p1.d
b.any .Ldgemm_kernel_L2_Mv1_20
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
/* Repeat the same thing if 1 left in N */
.align 5
.Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC
add pC, pC, LDC // add 1 x LDC
mov pA, origPA // pA = start of A array
.Ldgemm_kernel_L1_Mv1_BEGIN:
mov counterI, #0
whilelt p1.d, counterI, origM //SVE instruction
cntp lanes, p0, p1.d
.align 5
.Ldgemm_kernel_L1_Mv1_20:
mov pB, origPB
INITv1x1 // fill with zeros
asr counterL , origK, #3 // L = K / 8
cmp counterL , #0 // is there at least 8 to do?
ble .Ldgemm_kernel_L1_Mv1_44
.align 5
.Ldgemm_kernel_L1_Mv1_22:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L1_Mv1_22
.Ldgemm_kernel_L1_Mv1_44:
ands counterL , origK, #7
ble .Ldgemm_kernel_L1_Mv1_100
.align 5
.Ldgemm_kernel_L1_Mv1_46:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x1_SUB
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L1_Mv1_46
.Ldgemm_kernel_L1_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x1
.Ldgemm_kernel_L1_Mv1_END:
incd counterI
whilelt p1.d, counterI, origM //SVE instruction
cntp lanes, p0, p1.d
b.any .Ldgemm_kernel_L1_Mv1_20
.Ldgemm_kernel_L1_END:
/******************************************************************************/
.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
ldp d12, d13, [sp, #(2 * 16)]
ldp d14, d15, [sp, #(3 * 16)]
ldp d16, d17, [sp, #(4 * 16)]
ldp x18, x19, [sp, #(5 * 16)]
ldp x20, x21, [sp, #(6 * 16)]
ldp x22, x23, [sp, #(7 * 16)]
ldp x24, x25, [sp, #(8 * 16)]
ldp x26, x27, [sp, #(9 * 16)]
ldr x28, [sp, #(10 * 16)]
add sp, sp, #(11*16)
ret
EPILOGUE
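For reference, the vector-length-agnostic loop over M used by the kernel above (whilelt to build the predicate, cntp to count the active lanes, incd to advance, with the tail handled by the same predicated code path) can be summarised in plain C with ACLE intrinsics. This is a minimal sketch only: it assumes a simple per-element update in place of the real GEMM body, and the function name is invented here, not part of OpenBLAS.

#include <arm_sve.h>
#include <stdint.h>

/* Sketch of the predicated loop structure: c[i] += alpha * a[i] for 0 <= i < m. */
static void sve_m_loop_sketch(int64_t m, double alpha, const double *a, double *c)
{
    svbool_t p0 = svptrue_b64();                /* all-true predicate, as "ptrue p0.d"        */
    int64_t  i  = 0;
    svbool_t p1 = svwhilelt_b64(i, m);          /* predicate for the current chunk of M       */
    while (svptest_any(p0, p1)) {
        uint64_t lanes = svcntp_b64(p0, p1);    /* active lanes, as the kernel's "cntp"       */
        svfloat64_t va = svld1_f64(p1, a + i);  /* predicated loads (ld1d z, p1/z, [...])     */
        svfloat64_t vc = svld1_f64(p1, c + i);
        vc = svmla_n_f64_m(p1, vc, va, alpha);  /* merging multiply-add on the active lanes   */
        svst1_f64(p1, c + i, vc);               /* predicated store                           */
        i += (int64_t)lanes;                    /* advance past the processed lanes (the kernel uses incd) */
        p1 = svwhilelt_b64(i, m);               /* the final partial chunk reuses this path   */
    }
}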

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,79 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
// TODO: write in assembly with proper unrolling of inner loop
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG j;
IFLOAT *aoffset, *aoffset1, *boffset;
svint64_t lda_vec = svindex_s64(0LL, lda);
uint64_t sve_size = svcntd();
aoffset = a;
boffset = b;
j = 0;
svbool_t pg = svwhilelt_b64(j, n);
uint64_t active = svcntp_b64(svptrue_b64(), pg);
do {
aoffset1 = aoffset;
uint64_t i_cnt = m;
while (i_cnt--) {
svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec);
svst1_f64(pg, (double *) boffset, a_vec);
aoffset1++;
boffset += active;
}
aoffset += sve_size * lda;
j += svcntd();
pg = svwhilelt_b64(j, n);
active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg));
return 0;
}
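As a cross-check of the gather-based copy above, the packing it produces can be written in scalar form. The following reference is a hypothetical sketch, not OpenBLAS code: it assumes the caller passes the SVE vector length in doubles (svcntd() in the kernel above) as the panel width vl, and it writes the same output layout, i.e. column panels of up to vl columns stored row by row.

#include "common.h"

/* Hypothetical scalar reference of the SVE gather copy above. */
static int ncopy_sve_ref(BLASLONG m, BLASLONG n, BLASLONG vl,
                         IFLOAT *a, BLASLONG lda, IFLOAT *b)
{
    for (BLASLONG j = 0; j < n; j += vl) {               /* one panel of up to vl columns          */
        BLASLONG active = (n - j < vl) ? (n - j) : vl;    /* the last panel may be narrower         */
        for (BLASLONG i = 0; i < m; i++)                  /* row by row within the panel            */
            for (BLASLONG k = 0; k < active; k++)
                *b++ = a[i + (j + k) * lda];              /* the element svld1_gather_index fetches */
    }
    return 0;
}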

View File

@ -0,0 +1,77 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
// TODO: write in assembly with proper unrolling of inner loop
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG j;
IFLOAT *aoffset, *aoffset1, *boffset;
uint64_t sve_size = svcntd();
aoffset = a;
boffset = b;
j = 0;
svbool_t pg = svwhilelt_b64(j, n);
uint64_t active = svcntp_b64(svptrue_b64(), pg);
do {
aoffset1 = aoffset;
uint64_t i_cnt = m;
while (i_cnt--) {
svfloat64_t a_vec = svld1(pg, (double *)aoffset1);
svst1_f64(pg, (double *) boffset, a_vec);
aoffset1 += lda;
boffset += active;
}
aoffset += sve_size;
j += svcntd();
pg = svwhilelt_b64(j, n);
active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg));
return 0;
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,874 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
/* X0 X1 X2 s0 X3 x4 x5 x6 */
/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/
#define origM x0
#define origN x1
#define origK x2
#define origPA x3
#define origPB x4
#define pC x5
#define LDC x6
#define temp x7
#define counterL x8
#define counterI x9
#define counterJ x10
#define pB x11
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define lanes x15
#define pA x16
#define alpha w17
#define alpha0 s10
#define alphaZ z2.s
#define A_PRE_SIZE 1536
#define B_PRE_SIZE 512
#define C_PRE_SIZE 128
// 00 origM
// 01 origN
// 02 origK
// 03 origPA
// 04 origPB
// 05 pC
// 06 origLDC -> LDC
// 07 temp
// 08 counterL
// 09 counterI
// 10 counterJ
// 11 pB
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 lanes
// 16 pA
// 17
// 18 must save
// 19 must save
// 20 must save
// 21 must save
// 22 must save
// 23 must save
// 24 must save
// 25 must save
// 26 must save
// 27 must save
// 28 must save
// 29 frame
// 30 link
// 31 sp
//v00 ALPHA -> pA0_0
//v01 pA0_1
//v02 ALPHA0
//v03
//v04
//v05
//v06
//v07
//v08 must save pB0_0
//v09 must save pB0_1
//v10 must save pB0_2
//v11 must save pB0_3
//v12 must save pB0_4
//v13 must save pB0_5
//v14 must save pB0_6
//v15 must save pB0_7
//v16 must save C0
//v17 must save C1
//v18 must save C2
//v19 must save C3
//v20 must save C4
//v21 must save C5
//v22 must save C6
//v23 must save C7
/*******************************************************************************
* Macro definitions
*******************************************************************************/
.macro INITv1x8
dup z16.s, #0
dup z17.s, #0
dup z18.s, #0
dup z19.s, #0
dup z20.s, #0
dup z21.s, #0
dup z22.s, #0
dup z23.s, #0
.endm
.macro KERNELv1x8_I
ld1w z0.s, p1/z, [pA]
ld1w z1.s, p1/z, [pA, lanes, lsl #2] // next one
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4
ld1rw z8.s, p0/z, [pB]
ld1rw z9.s, p0/z, [pB, 4]
ld1rw z10.s, p0/z, [pB, 8]
ld1rw z11.s, p0/z, [pB, 12]
ld1rw z12.s, p0/z, [pB, 16]
ld1rw z13.s, p0/z, [pB, 20]
ld1rw z14.s, p0/z, [pB, 24]
ld1rw z15.s, p0/z, [pB, 28]
add pB, pB, 32
fmla z16.s, p1/m, z0.s, z8.s
ld1rw z8.s, p0/z, [pB]
fmla z17.s, p1/m, z0.s, z9.s
ld1rw z9.s, p0/z, [pB, 4]
fmla z18.s, p1/m, z0.s, z10.s
ld1rw z10.s, p0/z, [pB, 8]
fmla z19.s, p1/m, z0.s, z11.s
ld1rw z11.s, p0/z, [pB, 12]
fmla z20.s, p1/m, z0.s, z12.s
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
ld1rw z12.s, p0/z, [pB, 16]
fmla z21.s, p1/m, z0.s, z13.s
ld1rw z13.s, p0/z, [pB, 20]
fmla z22.s, p1/m, z0.s, z14.s
ld1rw z14.s, p0/z, [pB, 24]
fmla z23.s, p1/m, z0.s, z15.s
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
ld1rw z15.s, p0/z, [pB, 28]
add pB, pB, 32
.endm
.macro KERNELv1x8_M1
ld1w z1.s, p1/z, [pA]
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
fmla z16.s, p1/m, z0.s, z8.s
ld1rw z8.s, p0/z, [pB]
fmla z17.s, p1/m, z0.s, z9.s
ld1rw z9.s, p0/z, [pB, 4]
fmla z18.s, p1/m, z0.s, z10.s
ld1rw z10.s, p0/z, [pB, 8]
fmla z19.s, p1/m, z0.s, z11.s
ld1rw z11.s, p0/z, [pB, 12]
fmla z20.s, p1/m, z0.s, z12.s
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
ld1rw z12.s, p0/z, [pB, 16]
fmla z21.s, p1/m, z0.s, z13.s
ld1rw z13.s, p0/z, [pB, 20]
fmla z22.s, p1/m, z0.s, z14.s
ld1rw z14.s, p0/z, [pB, 24]
fmla z23.s, p1/m, z0.s, z15.s
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
ld1rw z15.s, p0/z, [pB, 28]
add pB, pB, 32
.endm
.macro KERNELv1x8_M2
ld1w z0.s, p1/z, [pA]
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
fmla z16.s, p1/m, z1.s, z8.s
ld1rw z8.s, p0/z, [pB]
fmla z17.s, p1/m, z1.s, z9.s
ld1rw z9.s, p0/z, [pB, 4]
fmla z18.s, p1/m, z1.s, z10.s
ld1rw z10.s, p0/z, [pB, 8]
fmla z19.s, p1/m, z1.s, z11.s
ld1rw z11.s, p0/z, [pB, 12]
fmla z20.s, p1/m, z1.s, z12.s
ld1rw z12.s, p0/z, [pB, 16]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla z21.s, p1/m, z1.s, z13.s
ld1rw z13.s, p0/z, [pB, 20]
fmla z22.s, p1/m, z1.s, z14.s
ld1rw z14.s, p0/z, [pB, 24]
fmla z23.s, p1/m, z1.s, z15.s
ld1rw z15.s, p0/z, [pB, 28]
add pB, pB, 32
.endm
.macro KERNELv1x8_E
fmla z16.s, p1/m, z1.s, z8.s
fmla z17.s, p1/m, z1.s, z9.s
fmla z18.s, p1/m, z1.s, z10.s
fmla z19.s, p1/m, z1.s, z11.s
fmla z20.s, p1/m, z1.s, z12.s
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla z21.s, p1/m, z1.s, z13.s
fmla z22.s, p1/m, z1.s, z14.s
fmla z23.s, p1/m, z1.s, z15.s
.endm
.macro KERNELv1x8_SUB
ld1w z0.s, p1/z, [pA]
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
ld1rw z8.s, p0/z, [pB]
ld1rw z9.s, p0/z, [pB, 4]
ld1rw z10.s, p0/z, [pB, 8]
ld1rw z11.s, p0/z, [pB, 12]
ld1rw z12.s, p0/z, [pB, 16]
ld1rw z13.s, p0/z, [pB, 20]
ld1rw z14.s, p0/z, [pB, 24]
ld1rw z15.s, p0/z, [pB, 28]
add pB, pB, 32
fmla z16.s, p1/m, z0.s, z8.s
fmla z17.s, p1/m, z0.s, z9.s
fmla z18.s, p1/m, z0.s, z10.s
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla z19.s, p1/m, z0.s, z11.s
fmla z20.s, p1/m, z0.s, z12.s
fmla z21.s, p1/m, z0.s, z13.s
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla z22.s, p1/m, z0.s, z14.s
fmla z23.s, p1/m, z0.s, z15.s
.endm
.macro SAVEv1x8
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow1, pCRow0, LDC
ld1w z24.s, p1/z, [pCRow0]
fmla z24.s, p1/m, z16.s, alphaZ
st1w z24.s, p1, [pCRow0]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1w z25.s, p1/z, [pCRow1]
fmla z25.s, p1/m, z17.s, alphaZ
st1w z25.s, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC
ld1w z26.s, p1/z, [pCRow2]
fmla z26.s, p1/m, z18.s, alphaZ
st1w z26.s, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1w z27.s, p1/z, [pCRow1]
fmla z27.s, p1/m, z19.s, alphaZ
st1w z27.s, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC
ld1w z28.s, p1/z, [pCRow2]
fmla z28.s, p1/m, z20.s, alphaZ
st1w z28.s, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1w z29.s, p1/z, [pCRow1]
fmla z29.s, p1/m, z21.s, alphaZ
st1w z29.s, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC
ld1w z30.s, p1/z, [pCRow2]
fmla z30.s, p1/m, z22.s, alphaZ
st1w z30.s, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld1w z31.s, p1/z, [pCRow1]
fmla z31.s, p1/m, z23.s, alphaZ
st1w z31.s, p1, [pCRow1]
add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4
.endm
/******************************************************************************/
.macro INITv1x4
dup z16.s, #0
dup z17.s, #0
dup z18.s, #0
dup z19.s, #0
.endm
.macro KERNELv1x4_SUB
ld1w z0.s, p1/z, [pA]
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
ld1rw z8.s, p0/z, [pB]
ld1rw z9.s, p0/z, [pB, 4]
ld1rw z10.s, p0/z, [pB, 8]
ld1rw z11.s, p0/z, [pB, 12]
add pB, pB, 16
fmla z16.s, p1/m, z0.s, z8.s
fmla z17.s, p1/m, z0.s, z9.s
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla z18.s, p1/m, z0.s, z10.s
fmla z19.s, p1/m, z0.s, z11.s
.endm
.macro SAVEv1x4
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow1, pCRow0, LDC
ld1w z24.s, p1/z, [pCRow0]
fmla z24.s, p1/m, z16.s, alphaZ
st1w z24.s, p1, [pCRow0]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1w z25.s, p1/z, [pCRow1]
fmla z25.s, p1/m, z17.s, alphaZ
st1w z25.s, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC
ld1w z26.s, p1/z, [pCRow2]
fmla z26.s, p1/m, z18.s, alphaZ
st1w z26.s, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld1w z27.s, p1/z, [pCRow1]
fmla z27.s, p1/m, z19.s, alphaZ
st1w z27.s, p1, [pCRow1]
add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4
.endm
/******************************************************************************/
.macro INITv1x2
dup z16.s, #0
dup z17.s, #0
.endm
.macro KERNELv1x2_SUB
ld1w z0.s, p1/z, [pA]
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
ld1rw z8.s, p0/z, [pB]
ld1rw z9.s, p0/z, [pB, 4]
add pB, pB, 8
fmla z16.s, p1/m, z0.s, z8.s
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla z17.s, p1/m, z0.s, z9.s
.endm
.macro SAVEv1x2
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow1, pCRow0, LDC
ld1w z24.s, p1/z, [pCRow0]
fmla z24.s, p1/m, z16.s, alphaZ
st1w z24.s, p1, [pCRow0]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld1w z25.s, p1/z, [pCRow1]
fmla z25.s, p1/m, z17.s, alphaZ
st1w z25.s, p1, [pCRow1]
add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4
.endm
/******************************************************************************/
.macro INITv1x1
dup z16.s, #0
.endm
.macro KERNELv1x1_SUB
ld1w z0.s, p1/z, [pA]
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
ld1rw z8.s, p0/z, [pB]
add pB, pB, 4
fmla z16.s, p1/m, z0.s, z8.s
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
.endm
.macro SAVEv1x1
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld1w z24.s, p1/z, [pCRow0]
fmla z24.s, p1/m, z16.s, alphaZ
st1w z24.s, p1, [pCRow0]
add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
.align 5
add sp, sp, #-(11 * 16)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alpha, s0
dup alphaZ, alpha
lsl LDC, LDC, #2 // ldc = ldc * 4
ptrue p0.s // create true predicate
mov pB, origPB
// Loop over N
mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0
ble .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/
/* Repeat this as long as there are 8 left in N */
.align 5
.Ldgemm_kernel_L8_BEGIN:
mov pCRow0, pC
add pC, pC, LDC, lsl #3 // add 8 x LDC
mov pA, origPA // pA = start of A array
.Ldgemm_kernel_L8_Mv1_BEGIN:
/* The loop over M is done in an SVE fashion: the final M % SVE_LEN iterations are handled by the same predicated code in a single sweep. */
mov counterI, #0
whilelt p1.s, counterI, origM
cntp lanes, p0, p1.s // lanes contains the number of active SVE lanes in the M dimension
.align 5
.Ldgemm_kernel_L8_Mv1_20:
mov pB, origPB
INITv1x8 // fill with zeros
asr counterL , origK, #3 // L = K / 8
cmp counterL , #2 // are there at least two blocks of 8 to do?
blt .Ldgemm_kernel_L8_Mv1_32
KERNELv1x8_I
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
subs counterL, counterL, #2 // subtract 2
ble .Ldgemm_kernel_L8_Mv1_22a
.align 5
.Ldgemm_kernel_L8_Mv1_22:
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L8_Mv1_22
.align 5
.Ldgemm_kernel_L8_Mv1_22a:
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_E
b .Ldgemm_kernel_L8_Mv1_44
.align 5
.Ldgemm_kernel_L8_Mv1_32:
tst counterL, #1
ble .Ldgemm_kernel_L8_Mv1_40
KERNELv1x8_I
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_E
b .Ldgemm_kernel_L8_Mv1_44
.Ldgemm_kernel_L8_Mv1_40:
INITv1x8
.Ldgemm_kernel_L8_Mv1_44:
ands counterL , origK, #7
ble .Ldgemm_kernel_L8_Mv1_100
.align 5
.Ldgemm_kernel_L8_Mv1_46:
KERNELv1x8_SUB
subs counterL, counterL, #1
bne .Ldgemm_kernel_L8_Mv1_46
.Ldgemm_kernel_L8_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x8
.Ldgemm_kernel_L8_Mv1_END:
incw counterI
whilelt p1.s, counterI, origM //SVE instruction
cntp lanes, p0, p1.s // lanes contains the number of active SVE lanes in the M dimension
b.any .Ldgemm_kernel_L8_Mv1_20
.Ldgemm_kernel_L8_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 8 * 4
subs counterJ, counterJ , #1 // j--
bgt .Ldgemm_kernel_L8_BEGIN
/******************************************************************************/
/* Repeat the same thing if 4 left in N */
.align 5
.Ldgemm_kernel_L4_BEGIN:
mov counterJ , origN
tst counterJ , #4
ble .Ldgemm_kernel_L2_BEGIN
mov pCRow0, pC
add pC, pC, LDC, lsl #2 // add 4 x LDC
mov pA, origPA // pA = start of A array
.Ldgemm_kernel_L4_Mv1_BEGIN:
mov counterI, #0
whilelt p1.s, counterI, origM //SVE instruction
cntp lanes, p0, p1.s
.align 5
.Ldgemm_kernel_L4_Mv1_20:
mov pB, origPB
INITv1x4 // fill with zeros
asr counterL , origK, #3 // L = K / 8
cmp counterL , #0 // is there at least 8 to do?
ble .Ldgemm_kernel_L4_Mv1_44
.align 5
.Ldgemm_kernel_L4_Mv1_22:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L4_Mv1_22
.Ldgemm_kernel_L4_Mv1_44:
ands counterL , origK, #7
ble .Ldgemm_kernel_L4_Mv1_100
.align 5
.Ldgemm_kernel_L4_Mv1_46:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
subs counterL, counterL, #1
bne .Ldgemm_kernel_L4_Mv1_46
.Ldgemm_kernel_L4_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x4
.Ldgemm_kernel_L4_Mv1_END:
incw counterI
whilelt p1.s, counterI, origM //SVE instruction
cntp lanes, p0, p1.s
b.any .Ldgemm_kernel_L4_Mv1_20
.Ldgemm_kernel_L4_END:
lsl temp, origK, #4
add origPB, origPB, temp // B = B + K * 4 * 4
/******************************************************************************/
/* Repeat the same thing if 2 left in N */
.align 5
.Ldgemm_kernel_L2_BEGIN:
mov counterJ , origN
tst counterJ , #2
ble .Ldgemm_kernel_L1_BEGIN
mov pCRow0, pC
add pC, pC, LDC, lsl #1 // add 2 x LDC
mov pA, origPA // pA = start of A array
.Ldgemm_kernel_L2_Mv1_BEGIN:
mov counterI, #0
whilelt p1.s, counterI, origM //SVE instruction
cntp lanes, p0, p1.s
.align 5
.Ldgemm_kernel_L2_Mv1_20:
mov pB, origPB
INITv1x2 // fill with zeros
asr counterL , origK, #3 // L = K / 8
cmp counterL , #0 // is there at least 8 to do?
ble .Ldgemm_kernel_L2_Mv1_44
.align 5
.Ldgemm_kernel_L2_Mv1_22:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L2_Mv1_22
.Ldgemm_kernel_L2_Mv1_44:
ands counterL , origK, #7
ble .Ldgemm_kernel_L2_Mv1_100
.align 5
.Ldgemm_kernel_L2_Mv1_46:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x2_SUB
subs counterL, counterL, #1
bne .Ldgemm_kernel_L2_Mv1_46
.Ldgemm_kernel_L2_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x2
.Ldgemm_kernel_L2_Mv1_END:
incw counterI
whilelt p1.s, counterI, origM //SVE instruction
cntp lanes, p0, p1.s
b.any .Ldgemm_kernel_L2_Mv1_20
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
/******************************************************************************/
/* Repeat the same thing if 1 left in N */
.align 5
.Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC
add pC, pC, LDC // add 1 x LDC
mov pA, origPA // pA = start of A array
.Ldgemm_kernel_L1_Mv1_BEGIN:
mov counterI, #0
whilelt p1.s, counterI, origM //SVE instruction
cntp lanes, p0, p1.s
.align 5
.Ldgemm_kernel_L1_Mv1_20:
mov pB, origPB
INITv1x1 // fill with zeros
asr counterL , origK, #3 // L = K / 8
cmp counterL , #0 // is there at least 8 to do?
ble .Ldgemm_kernel_L1_Mv1_44
.align 5
.Ldgemm_kernel_L1_Mv1_22:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L1_Mv1_22
.Ldgemm_kernel_L1_Mv1_44:
ands counterL , origK, #7
ble .Ldgemm_kernel_L1_Mv1_100
.align 5
.Ldgemm_kernel_L1_Mv1_46:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x1_SUB
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L1_Mv1_46
.Ldgemm_kernel_L1_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x1
.Ldgemm_kernel_L1_Mv1_END:
incw counterI
whilelt p1.s, counterI, origM //SVE instruction
cntp lanes, p0, p1.s
b.any .Ldgemm_kernel_L1_Mv1_20
.Ldgemm_kernel_L1_END:
/******************************************************************************/
.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
ldp d12, d13, [sp, #(2 * 16)]
ldp d14, d15, [sp, #(3 * 16)]
ldp d16, d17, [sp, #(4 * 16)]
ldp x18, x19, [sp, #(5 * 16)]
ldp x20, x21, [sp, #(6 * 16)]
ldp x22, x23, [sp, #(7 * 16)]
ldp x24, x25, [sp, #(8 * 16)]
ldp x26, x27, [sp, #(9 * 16)]
ldr x28, [sp, #(10 * 16)]
add sp, sp, #(11*16)
ret
EPILOGUE
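The SAVEv1x8/SAVEv1x4/SAVEv1x2/SAVEv1x1 macros above all apply the same predicated update to one column of C at a time: load the existing values of C, add alpha times the accumulator, and store back. The minimal ACLE sketch below shows that per-column step in isolation; the function name is invented here, and the real macros additionally interleave prefetches and the pointer arithmetic for the next column.

#include <arm_sve.h>

/* One column of the SAVE step, mirroring
 *   ld1w z24.s, p1/z, [pCRow0]; fmla z24.s, p1/m, z16.s, alphaZ; st1w z24.s, p1, [pCRow0]
 * i.e. c_col[0..lanes) += alpha * acc on the active lanes only. */
static void save_column_sketch(svbool_t p1, float *c_col, svfloat32_t acc, float alpha)
{
    svfloat32_t vc = svld1_f32(p1, c_col);     /* predicated load of the current C column */
    vc = svmla_n_f32_m(p1, vc, acc, alpha);    /* C += alpha * accumulator                */
    svst1_f32(p1, c_col, vc);                  /* predicated store back to C              */
}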

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,78 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
// TODO: write in assembly with proper unrolling of inner loop
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG j;
IFLOAT *aoffset, *aoffset1, *boffset;
svint32_t lda_vec = svindex_s32(0LL, lda);
uint32_t sve_size = svcntw();
aoffset = a;
boffset = b;
j = 0;
svbool_t pg = svwhilelt_b32(j, n);
uint32_t active = svcntp_b32(svptrue_b32(), pg);
do {
aoffset1 = aoffset;
uint32_t i_cnt = m;
while (i_cnt--) {
svfloat32_t a_vec = svld1_gather_index(pg, (float *) aoffset1, lda_vec);
svst1_f32(pg, (float *) boffset, a_vec);
aoffset1++;
boffset += active;
}
aoffset += sve_size * lda;
j += svcntw();
pg = svwhilelt_b32(j, n);
active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg));
return 0;
}

View File

@ -0,0 +1,77 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
// TODO: write in assembly with proper unrolling of inner loop
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG j;
IFLOAT *aoffset, *aoffset1, *boffset;
uint32_t sve_size = svcntw();
aoffset = a;
boffset = b;
j = 0;
svbool_t pg = svwhilelt_b32(j, n);
uint32_t active = svcntp_b32(svptrue_b32(), pg);
do {
aoffset1 = aoffset;
uint32_t i_cnt = m;
while (i_cnt--) {
svfloat32_t a_vec = svld1(pg, (float *) aoffset1);
svst1_f32(pg, (float *) boffset, a_vec);
aoffset1 += lda;
boffset += active;
}
aoffset += sve_size;
j += svcntw();
pg = svwhilelt_b32(j, n);
active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg));
return 0;
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,143 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
BLASLONG i, offset;
#if defined(DOUBLE)
uint64_t sve_size = svcntd();
svint64_t posY_vec = svdup_s64(posY);
svint64_t posX_vec = svdup_s64(posX);
svint64_t lda_vec = svdup_s64(lda);
svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n);
int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL);
do {
offset = posX - posY;
svint64_t vec_off = svdup_s64(offset);
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
svint64_t temp = svadd_z(pg, posX_vec, index);
svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec);
svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda);
svint64_t gat_ind = svsel(cmp, temp1, temp2);
i = m;
while (i>0) {
svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind);
gat_ind = svadd_m(cmp, gat_ind, lda_vec);
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, one_vec);
svst1(pg, b, data_vec);
b += active;
offset --;
vec_off = svsub_z(pg, vec_off, one_vec);
cmp = svcmpgt(pg, vec_off, index_neg);
i--;
}
posX += sve_size;
posX_vec = svdup_s64(posX);
j += sve_size;
pg = svwhilelt_b64(j, n);
active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg));
#else
uint32_t sve_size = svcntw();
svint32_t posY_vec = svdup_s32(posY);
svint32_t posX_vec = svdup_s32(posX);
svint32_t lda_vec = svdup_s32(lda);
svint32_t one_vec = svdup_s32(1);
int32_t N = n;
int32_t j = 0;
svbool_t pg = svwhilelt_b32(j, N);
int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1);
do {
offset = posX - posY;
svint32_t vec_off = svdup_s32(offset);
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
svint32_t temp = svadd_z(pg, posX_vec, index);
svint32_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec);
svint32_t temp2 = svmla_z(pg, posY_vec, temp, lda);
svint32_t gat_ind = svsel(cmp, temp1, temp2);
i = m;
while (i>0) {
svfloat32_t data_vec = svld1_gather_index(pg, a, gat_ind);
gat_ind = svadd_m(cmp, gat_ind, lda_vec);
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, one_vec);
svst1(pg, b, data_vec);
b += active;
offset --;
vec_off = svsub_z(pg, vec_off, one_vec);
cmp = svcmpgt(pg, vec_off, index_neg);
i--;
}
posX += sve_size;
posX_vec = svdup_s32(posX);
j += sve_size;
pg = svwhilelt_b32(j, N);
active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg));
#endif
return 0;
}
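The per-lane index bookkeeping above (svcmpgt to build a condition from the diagonal offset, svsel to pick the starting index per lane, and two merging svadd_m calls that advance different lanes by different strides) is the core of this kernel. The small, self-contained example below demonstrates that intrinsics pattern on its own; it is illustrative only, does not reproduce the full packing semantics, and the function name is made up.

#include <arm_sve.h>
#include <stdint.h>

/* Advance a vector of gather indices: lanes whose condition (off + lane > 0)
 * holds step by lda, the remaining lanes step by 1; this is the same
 * merging-add pattern used in the copy kernel above. */
static svint64_t advance_indices_sketch(svbool_t pg, svint64_t gat_ind,
                                        int64_t off, int64_t lda)
{
    svint64_t lane    = svindex_s64(0, 1);                         /* 0, 1, 2, ...       */
    svint64_t vec_off = svdup_s64(off);
    svbool_t  cmp     = svcmpgt(pg, svadd_z(pg, vec_off, lane), svdup_s64(0));
    gat_ind = svadd_m(cmp, gat_ind, svdup_s64(lda));               /* where cmp is true  */
    gat_ind = svadd_m(svnot_z(pg, cmp), gat_ind, svdup_s64(1));    /* where cmp is false */
    return gat_ind;
}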

View File

@ -0,0 +1,143 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
BLASLONG i, offset;
#if defined(DOUBLE)
uint64_t sve_size = svcntd();
svint64_t posY_vec = svdup_s64(posY);
svint64_t posX_vec = svdup_s64(posX);
svint64_t lda_vec = svdup_s64(lda);
svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n);
int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL);
do {
offset = posX - posY;
svint64_t vec_off = svdup_s64(offset);
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
svint64_t temp = svadd_z(pg, posX_vec, index);
svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec);
svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda);
svint64_t gat_ind = svsel(cmp, temp2, temp1);
i = m;
while (i>0) {
svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind);
gat_ind = svadd_m(cmp, gat_ind, one_vec);
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
svst1(pg, b, data_vec);
b += active;
offset --;
vec_off = svsub_z(pg, vec_off, one_vec);
cmp = svcmpgt(pg, vec_off, index_neg);
i--;
}
posX += sve_size;
posX_vec = svdup_s64(posX);
j += sve_size;
pg = svwhilelt_b64(j, n);
active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg));
#else
uint32_t sve_size = svcntw();
svint32_t posY_vec = svdup_s32(posY);
svint32_t posX_vec = svdup_s32(posX);
svint32_t lda_vec = svdup_s32(lda);
svint32_t one_vec = svdup_s32(1);
int32_t N = n;
int32_t j = 0;
svbool_t pg = svwhilelt_b32(j, N);
int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1);
do {
offset = posX - posY;
svint32_t vec_off = svdup_s32(offset);
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
svint32_t temp = svadd_z(pg, posX_vec, index);
svint32_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec);
svint32_t temp2 = svmla_z(pg, posY_vec, temp, lda);
svint32_t gat_ind = svsel(cmp, temp2, temp1);
i = m;
while (i>0) {
svfloat32_t data_vec = svld1_gather_index(pg, a, gat_ind);
gat_ind = svadd_m(cmp, gat_ind, one_vec);
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
svst1(pg, b, data_vec);
b += active;
offset --;
vec_off = svsub_z(pg, vec_off, one_vec);
cmp = svcmpgt(pg, vec_off, index_neg);
i--;
}
posX += sve_size;
posX_vec = svdup_s32(posX);
j += sve_size;
pg = svwhilelt_b32(j, N);
active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg));
#endif
return 0;
}

View File

@ -0,0 +1,136 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
BLASLONG i, js;
BLASLONG X;
js = 0;
FLOAT *ao;
#ifdef DOUBLE
svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
{
X = posX;
if (posX <= posY) {
ao = a + posY + posX * lda;
} else {
ao = a + posX + posY * lda;
}
i = 0;
do
{
if (X > posY) {
#ifdef DOUBLE
svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
#else
svfloat32_t aj_vec = svld1_gather_index(pn, ao, index);
#endif
svst1(pn, b, aj_vec);
ao ++;
b += n_active;
X ++;
i ++;
} else
if (X < posY) {
ao += lda;
b += n_active;
X ++;
i ++;
} else {
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
#ifdef UNIT
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k < j; k++) {
b[temp++] = *(ao+k*lda+j);
}
b[temp++] = ONE;
for (int k = j+1; k < n_active; k++) {
b[temp++] = ZERO;
}
}
#else
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k <= j; k++) {
b[temp++] = *(ao+k*lda+j);
}
for (int k = j+1; k < n_active; k++) {
b[temp++] = ZERO;
}
}
#endif
ao += n_active;
b += n_active*n_active;
X += n_active;
i += n_active;
}
} while (i < m);
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif
return 0;
}
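The scalar diagonal-block loop above is the one part of this kernel that is not vector-length agnostic, so its output layout is worth spelling out. The helper below is a hypothetical isolation of the UNIT branch (it reuses the FLOAT/BLASLONG/ONE/ZERO definitions from common.h); with A(r,c) denoting *(ao + c*lda + r), a 3x3 block is packed row by row as 1 0 0 / A(1,0) 1 0 / A(2,0) A(2,1) 1.

#include "common.h"

/* Hypothetical helper isolating the UNIT diagonal-block packing used above.
 * Row j of the packed block holds A(j,0) ... A(j,j-1), then 1, then zeros,
 * with A(r,c) = *(ao + c*lda + r). */
static void pack_unit_diag_block(int n_active, FLOAT *ao, BLASLONG lda, FLOAT *b)
{
    int temp = 0;
    for (int j = 0; j < n_active; j++) {
        for (int k = 0; k < j; k++)
            b[temp++] = *(ao + k * lda + j);   /* entries below the diagonal of the block */
        b[temp++] = ONE;                       /* unit diagonal                           */
        for (int k = j + 1; k < n_active; k++)
            b[temp++] = ZERO;                  /* zeros above the diagonal                */
    }
}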

View File

@ -0,0 +1,136 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
BLASLONG i, js;
BLASLONG X;
FLOAT *ao;
js = 0;
#ifdef DOUBLE
svbool_t pn = svwhilelt_b64(js, n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svbool_t pn = svwhilelt_b32(js, n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
{
X = posX;
if (posX <= posY) {
ao = a + posY + posX * lda;
} else {
ao = a + posX + posY * lda;
}
i = 0;
do
{
if (X > posY) {
ao ++;
b += n_active;
X ++;
i ++;
} else
if (X < posY) {
#ifdef DOUBLE
svfloat64_t aj_vec = svld1(pn, ao);
#else
svfloat32_t aj_vec = svld1(pn, ao);
#endif
svst1(pn, b, aj_vec);
ao += lda;
b += n_active;
X ++;
i ++;
} else {
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
#ifdef UNIT
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k < j; k++) {
b[temp++] = ZERO;
}
b[temp++] = ONE;
for (int k = j+1; k < n_active; k++) {
b[temp++] = *(ao+j*lda+k);
}
}
#else
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k < j; k++) {
b[temp++] = ZERO;
}
for (int k = j; k < n_active; k++) {
b[temp++] = *(ao+j*lda+k);
}
}
#endif
ao += n_active * lda;
b += n_active*n_active;
X += n_active;
i += n_active;
}
} while (i < m);
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif
return 0;
}

View File

@ -0,0 +1,136 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
BLASLONG i, js;
BLASLONG X;
js = 0;
FLOAT *ao;
#ifdef DOUBLE
svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
{
X = posX;
if (posX <= posY) {
ao = a + posX + posY * lda;
} else {
ao = a + posY + posX * lda;
}
i = 0;
do
{
if (X < posY) {
#ifdef DOUBLE
svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
#else
svfloat32_t aj_vec = svld1_gather_index(pn, ao, index);
#endif
svst1(pn, b, aj_vec);
ao ++;
b += n_active;
X ++;
i ++;
} else
if (X > posY) {
ao += lda;
b += n_active;
X ++;
i ++;
} else {
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
#ifdef UNIT
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k < j; k++) {
b[temp++] = ZERO;
}
b[temp++] = ONE;
for (int k = j+1; k < n_active; k++) {
b[temp++] = *(ao+k*lda+j);
}
}
#else
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k < j; k++) {
b[temp++] = ZERO;
}
for (int k = j; k < n_active; k++) {
b[temp++] = *(ao+k*lda+j);
}
}
#endif
ao += n_active;
b += n_active*n_active;
X += n_active;
i += n_active;
}
} while (i < m);
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif
return 0;
}

View File

@ -0,0 +1,134 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
BLASLONG i, js;
BLASLONG X;
FLOAT *ao;
js = 0;
#ifdef DOUBLE
svbool_t pn = svwhilelt_b64(js, n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svbool_t pn = svwhilelt_b32(js, n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
{
X = posX;
if (posX <= posY) {
ao = a + posX + posY * lda;
} else {
ao = a + posY + posX * lda;
}
i = 0;
do
{
if (X < posY) {
ao ++;
b += n_active;
X ++;
i ++;
} else
if (X > posY) {
#ifdef DOUBLE
svfloat64_t aj_vec = svld1(pn, ao);
#else
svfloat32_t aj_vec = svld1(pn, ao);
#endif
svst1(pn, b, aj_vec);
ao += lda;
b += n_active;
X ++;
i ++;
} else {
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
#ifdef UNIT
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k < j; k++) {
b[temp++] = *(ao+j*lda+k);
}
b[temp++] = ONE;
for (int k = j+1; k < n_active; k++) {
b[temp++] = ZERO;
}
}
#else
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k <= j; k++) {
b[temp++] = *(ao+j*lda+k);
}
for (int k = j+1; k < n_active; k++) {
b[temp++] = ZERO;
}
}
#endif
ao += n_active * lda;
b += n_active*n_active;
X += n_active;
i += n_active;
}
} while (i < m);
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif
return 0;
}

View File

@ -0,0 +1,736 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <arm_neon.h>
/*******************************************************************************
The complex GEMM kernels in OpenBLAS use static configuration of conjugation
modes via specific macros:
MACRO_NAME | conjugation on matrix A | conjugation on matrix B |
---------- | ----------------------- | ----------------------- |
NN/NT/TN/TT | No | No |
NR/NC/TR/TC | No | Yes |
RN/RT/CN/CT | Yes | No |
RR/RC/CR/CC | Yes | Yes |
"conjugation on matrix A" means the complex conjugates of elements from
matrix A are used for matmul (rather than the original elements). "conjugation
on matrix B" means the complex conjugate of each element from matrix B is taken
for matrix multiplication, respectively.
Complex numbers in arrays or matrices are usually packed together as an
array of struct (without padding):
struct complex_number {
FLOAT real_part;
FLOAT imag_part;
};
For a double-precision complex array ARR[], which is usually declared as a
plain array of double, the real part of its Kth complex number is
ARR[2 * K] and the imaginary part is ARR[2 * K + 1].
This file uses 2 ways to vectorize matrix multiplication of complex numbers:
(1) Expanded-form
During accumulation along direction K, the vector of packed elements of A
(vec_a = { a[0][k].real, a[0][k].imag, ..., a[v-1][k].real, a[v-1][k].imag })
is multiplied by the broadcast real part of b[k][n] and accumulated into
VECTOR I, and by the broadcast imaginary part of b[k][n] and accumulated
into VECTOR II:
VECTOR I  = { Σk(a[0][k].real b[k][n].real), Σk(a[0][k].imag b[k][n].real),
              ..., Σk(a[v-1][k].real b[k][n].real), Σk(a[v-1][k].imag b[k][n].real) }
VECTOR II = { Σk(a[0][k].real b[k][n].imag), Σk(a[0][k].imag b[k][n].imag),
              ..., Σk(a[v-1][k].real b[k][n].imag), Σk(a[v-1][k].imag b[k][n].imag) }
After accumulation, prior to storage, VECTOR II is permuted (real and imag
lanes swapped within each pair) and multiplied by the pattern { -1, 1, ..., -1, 1 }
to get
{ -Σk(a[0][k].imag b[k][n].imag), Σk(a[0][k].real b[k][n].imag),
  ..., -Σk(a[v-1][k].imag b[k][n].imag), Σk(a[v-1][k].real b[k][n].imag) },
which is then added with VECTOR I to get the result vector of elements of C.
2 vector registers are needed for every v elements of C, with
v == sizeof(vector) / sizeof(complex)
(2) Contracted-form
During accumulation along direction K (the operation is identical for each k),
two adjacent vectors of packed elements of A,
{ a[0].r, a[0].i, ..., a[v-1].r, a[v-1].i } and
{ a[v].r, a[v].i, ..., a[2v-1].r, a[2v-1].i },
are loaded from memory and unzipped (VLD2 in arm neon) into one vector of
real parts { a[0].real ... a[2v-1].real } and one vector of imaginary parts
{ a[0].imag ... a[2v-1].imag }. These are combined with the broadcast real
and imaginary parts of b[i] and accumulated as
VECTOR_REAL += a.real * b[i].real - a.imag * b[i].imag
VECTOR_IMAG += a.real * b[i].imag + a.imag * b[i].real
giving { c[0].real ... c[2v-1].real } and { c[0].imag ... c[2v-1].imag }.
After accumulation, VECTOR_REAL and VECTOR_IMAG are zipped (interleaved)
then stored to matrix C directly.
For 2v elements of C, only 2 vector registers are needed, while
4 registers are required for expanded-form.
(v == sizeof(vector) / sizeof(complex))
For AArch64 zgemm, the 4x4 kernel would need 32 128-bit NEON registers just
to hold the elements of C when using expanded-form calculation, which would
cause register spilling. So the contracted-form operation is selected for the
4x4 kernel. All other combinations of unroll parameters (2x4, 4x2, 2x2, and
so on) use expanded-form mode, which brings more NEON registers into use to
hide the latency of the multiply-add instructions.
******************************************************************************/
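/* Illustrative scalar sketch of the two schemes above -- not part of the
   original kernel; the function names are hypothetical and the helpers are
   unused, the real code below works on NEON vectors. Both compute one element
   of C for the no-conjugation (NN) case from packed A/B panels (each complex
   number stored as adjacent real, imag doubles). */
static inline void expanded_form_scalar(const double *a, const double *b,
                                        BLASLONG K, double *c_out) {
  double vec_i[2] = {0, 0};  /* VECTOR I : { Σ a.r*b.r, Σ a.i*b.r } */
  double vec_ii[2] = {0, 0}; /* VECTOR II: { Σ a.r*b.i, Σ a.i*b.i } */
  for (BLASLONG k = 0; k < K; k++) {
    vec_i[0]  += a[2 * k]     * b[2 * k];
    vec_i[1]  += a[2 * k + 1] * b[2 * k];
    vec_ii[0] += a[2 * k]     * b[2 * k + 1];
    vec_ii[1] += a[2 * k + 1] * b[2 * k + 1];
  }
  /* permute VECTOR II, multiply by { -1, 1 } and add to VECTOR I */
  c_out[0] = vec_i[0] - vec_ii[1];
  c_out[1] = vec_i[1] + vec_ii[0];
}
static inline void contracted_form_scalar(const double *a, const double *b,
                                          BLASLONG K, double *c_out) {
  double c_real = 0.0, c_imag = 0.0; /* VECTOR_REAL / VECTOR_IMAG */
  for (BLASLONG k = 0; k < K; k++) {
    c_real += a[2 * k] * b[2 * k]     - a[2 * k + 1] * b[2 * k + 1];
    c_imag += a[2 * k] * b[2 * k + 1] + a[2 * k + 1] * b[2 * k];
  }
  c_out[0] = c_real; /* the vector version zips (interleaves) before storing */
  c_out[1] = c_imag;
}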
static inline float64x2_t set_f64x2(double lo, double hi) {
float64x2_t ret = vdupq_n_f64(0);
ret = vsetq_lane_f64(lo, ret, 0);
ret = vsetq_lane_f64(hi, ret, 1);
return ret;
}
static inline float64x2x2_t expand_alpha(double alpha_r, double alpha_i) {
float64x2x2_t ret = {{ set_f64x2(alpha_r, alpha_i), set_f64x2(-alpha_i, alpha_r) }};
return ret;
}
/*****************************************************************
* operation: *c += alpha * c_value //complex multiplication
* expanded_alpha: {{ alpha_r, alpha_i }, { -alpha_i, alpha_r }}
* expanded_c: {{ arbr, aibr }, { arbi, aibi }}
****************************************************************/
static inline void store_1c(double *c, float64x2x2_t expanded_c,
float64x2x2_t expanded_alpha) {
float64x2_t ld = vld1q_f64(c);
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
double real = vgetq_lane_f64(expanded_c.val[0], 0) - vgetq_lane_f64(expanded_c.val[1], 1);
double imag = vgetq_lane_f64(expanded_c.val[0], 1) + vgetq_lane_f64(expanded_c.val[1], 0);
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
double real = vgetq_lane_f64(expanded_c.val[0], 0) + vgetq_lane_f64(expanded_c.val[1], 1);
double imag = vgetq_lane_f64(expanded_c.val[0], 1) - vgetq_lane_f64(expanded_c.val[1], 0);
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
double real = vgetq_lane_f64(expanded_c.val[0], 0) + vgetq_lane_f64(expanded_c.val[1], 1);
double imag = -vgetq_lane_f64(expanded_c.val[0], 1) + vgetq_lane_f64(expanded_c.val[1], 0);
#else
double real = vgetq_lane_f64(expanded_c.val[0], 0) - vgetq_lane_f64(expanded_c.val[1], 1);
double imag = -vgetq_lane_f64(expanded_c.val[0], 1) - vgetq_lane_f64(expanded_c.val[1], 0);
#endif
ld = vfmaq_n_f64(ld, expanded_alpha.val[0], real);
vst1q_f64(c, vfmaq_n_f64(ld, expanded_alpha.val[1], imag));
}
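/* Scalar equivalent of store_1c for the NN case (illustrative only;
   store_1c_scalar_nn is a hypothetical helper, not used by the kernel):
   with expanded_c == {{ arbr, aibr }, { arbi, aibi }}, the accumulated value
   is acc = (arbr - aibi) + (aibr + arbi) * I, and the update is the complex
   FMA  *c += alpha * acc. */
static inline void store_1c_scalar_nn(double *c, double acc_r, double acc_i,
                                      double alpha_r, double alpha_i) {
  c[0] += alpha_r * acc_r - alpha_i * acc_i;
  c[1] += alpha_i * acc_r + alpha_r * acc_i;
}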
static inline void pref_c_4(const double *c) {
__asm__ __volatile__("prfm pstl1keep,[%0]; prfm pstl1keep,[%0,#56]\n\t"::"r"(c):);
}
static inline float64x2x2_t add_ec(float64x2x2_t ec1, float64x2x2_t ec2) {
float64x2x2_t ret = {{ vaddq_f64(ec1.val[0], ec2.val[0]),
vaddq_f64(ec1.val[1], ec2.val[1]) }};
return ret;
}
static inline float64x2x2_t update_ec(float64x2x2_t ec, float64x2_t a, float64x2_t b) {
float64x2x2_t ret = {{ vfmaq_laneq_f64(ec.val[0], a, b, 0), vfmaq_laneq_f64(ec.val[1], a, b, 1) }};
return ret;
}
static inline float64x2x2_t init() {
float64x2x2_t ret = {{ vdupq_n_f64(0), vdupq_n_f64(0) }};
return ret;
}
static inline void kernel_1x1(const double *sa, const double *sb, double *C,
BLASLONG K, double alphar, double alphai) {
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
float64x2x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init();
for (; K > 3; K -= 4) {
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2),
a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8;
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2),
b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8;
c1 = update_ec(c1, a1, b1);
c2 = update_ec(c2, a2, b2);
c3 = update_ec(c3, a3, b3);
c4 = update_ec(c4, a4, b4);
}
c1 = add_ec(c1, c2);
c3 = add_ec(c3, c4);
c1 = add_ec(c1, c3);
for (; K; K--) {
c1 = update_ec(c1, vld1q_f64(sa), vld1q_f64(sb)); sa += 2; sb += 2;
}
store_1c(C, c1, expanded_alpha);
}
static inline void kernel_2x1(const double *sa, const double *sb, double *C,
BLASLONG K, double alphar, double alphai) {
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
float64x2x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init();
for (; K > 1; K -= 2) {
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2),
a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8;
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4;
c1 = update_ec(c1, a1, b1);
c2 = update_ec(c2, a2, b1);
c3 = update_ec(c3, a3, b2);
c4 = update_ec(c4, a4, b2);
}
c1 = add_ec(c1, c3);
c2 = add_ec(c2, c4);
if (K) {
float64x2_t b1 = vld1q_f64(sb);
c1 = update_ec(c1, vld1q_f64(sa), b1);
c2 = update_ec(c2, vld1q_f64(sa + 2), b1);
}
store_1c(C, c1, expanded_alpha);
store_1c(C + 2, c2, expanded_alpha);
}
static inline void kernel_1x2(const double *sa, const double *sb, double *C,
BLASLONG LDC, BLASLONG K, double alphar, double alphai) {
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
float64x2x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init();
for (; K > 1; K -= 2) {
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4;
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2),
b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8;
c1 = update_ec(c1, a1, b1);
c2 = update_ec(c2, a1, b2);
c3 = update_ec(c3, a2, b3);
c4 = update_ec(c4, a2, b4);
}
c1 = add_ec(c1, c3);
c2 = add_ec(c2, c4);
if (K) {
float64x2_t a1 = vld1q_f64(sa);
c1 = update_ec(c1, a1, vld1q_f64(sb));
c2 = update_ec(c2, a1, vld1q_f64(sb + 2));
}
store_1c(C, c1, expanded_alpha);
store_1c(C + LDC * 2, c2, expanded_alpha);
}
static inline void kernel_2x2(const double *sa, const double *sb, double *C,
BLASLONG LDC, BLASLONG K, double alphar, double alphai) {
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
float64x2x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init();
for (; K; K--) {
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4;
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4;
c1 = update_ec(c1, a1, b1);
c2 = update_ec(c2, a2, b1);
c3 = update_ec(c3, a1, b2);
c4 = update_ec(c4, a2, b2);
}
store_1c(C, c1, expanded_alpha);
store_1c(C + 2, c2, expanded_alpha); C += LDC * 2;
store_1c(C, c3, expanded_alpha);
store_1c(C + 2, c4, expanded_alpha);
}
static inline void kernel_4x1(const double *sa, const double *sb, double *C,
BLASLONG K, double alphar, double alphai) {
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
float64x2x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init();
pref_c_4(C);
for (; K; K--) {
float64x2_t b1 = vld1q_f64(sb); sb += 2;
c1 = update_ec(c1, vld1q_f64(sa), b1);
c2 = update_ec(c2, vld1q_f64(sa + 2), b1);
c3 = update_ec(c3, vld1q_f64(sa + 4), b1);
c4 = update_ec(c4, vld1q_f64(sa + 6), b1);
sa += 8;
}
store_1c(C, c1, expanded_alpha);
store_1c(C + 2, c2, expanded_alpha);
store_1c(C + 4, c3, expanded_alpha);
store_1c(C + 6, c4, expanded_alpha);
}
static inline void kernel_4x2(const double *sa, const double *sb, double *C,
BLASLONG LDC, BLASLONG K, double alphar, double alphai) {
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
float64x2x2_t c1, c2, c3, c4, c5, c6, c7, c8;
c1 = c2 = c3 = c4 = c5 = c6 = c7 = c8 = init();
pref_c_4(C);
pref_c_4(C + LDC * 2);
for (; K; K--) {
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4;
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2),
a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8;
c1 = update_ec(c1, a1, b1);
c2 = update_ec(c2, a2, b1);
c3 = update_ec(c3, a3, b1);
c4 = update_ec(c4, a4, b1);
c5 = update_ec(c5, a1, b2);
c6 = update_ec(c6, a2, b2);
c7 = update_ec(c7, a3, b2);
c8 = update_ec(c8, a4, b2);
}
store_1c(C, c1, expanded_alpha);
store_1c(C + 2, c2, expanded_alpha);
store_1c(C + 4, c3, expanded_alpha);
store_1c(C + 6, c4, expanded_alpha); C += LDC * 2;
store_1c(C, c5, expanded_alpha);
store_1c(C + 2, c6, expanded_alpha);
store_1c(C + 4, c7, expanded_alpha);
store_1c(C + 6, c8, expanded_alpha);
}
static inline void kernel_1x4(const double *sa, const double *sb, double *C,
BLASLONG LDC, BLASLONG K, double alphar, double alphai) {
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
float64x2x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init();
for (; K; K--) {
float64x2_t a1 = vld1q_f64(sa); sa += 2;
c1 = update_ec(c1, a1, vld1q_f64(sb));
c2 = update_ec(c2, a1, vld1q_f64(sb + 2));
c3 = update_ec(c3, a1, vld1q_f64(sb + 4));
c4 = update_ec(c4, a1, vld1q_f64(sb + 6));
sb += 8;
}
store_1c(C, c1, expanded_alpha); C += LDC * 2;
store_1c(C, c2, expanded_alpha); C += LDC * 2;
store_1c(C, c3, expanded_alpha); C += LDC * 2;
store_1c(C, c4, expanded_alpha);
}
static inline void kernel_2x4(const double *sa, const double *sb, double *C,
BLASLONG LDC, BLASLONG K, double alphar, double alphai) {
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
float64x2x2_t c1, c2, c3, c4, c5, c6, c7, c8;
c1 = c2 = c3 = c4 = c5 = c6 = c7 = c8 = init();
for (; K; K--) {
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4;
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2),
b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8;
c1 = update_ec(c1, a1, b1);
c2 = update_ec(c2, a2, b1);
c3 = update_ec(c3, a1, b2);
c4 = update_ec(c4, a2, b2);
c5 = update_ec(c5, a1, b3);
c6 = update_ec(c6, a2, b3);
c7 = update_ec(c7, a1, b4);
c8 = update_ec(c8, a2, b4);
}
store_1c(C, c1, expanded_alpha);
store_1c(C + 2, c2, expanded_alpha); C += LDC * 2;
store_1c(C, c3, expanded_alpha);
store_1c(C + 2, c4, expanded_alpha); C += LDC * 2;
store_1c(C, c5, expanded_alpha);
store_1c(C + 2, c6, expanded_alpha); C += LDC * 2;
store_1c(C, c7, expanded_alpha);
store_1c(C + 2, c8, expanded_alpha);
}
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define FMLA_RI "fmla "
#define FMLA_IR "fmla "
#define FMLA_II "fmls "
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define FMLA_RI "fmls "
#define FMLA_IR "fmla "
#define FMLA_II "fmla "
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define FMLA_RI "fmla "
#define FMLA_IR "fmls "
#define FMLA_II "fmla "
#else
#define FMLA_RI "fmls "
#define FMLA_IR "fmls "
#define FMLA_II "fmls "
#endif
#define FMLA_RR "fmla "
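/* Reference for the FMLA_* selection above: A is loaded with ld2, so v0/v2...
   hold real parts and v1/v3... hold imaginary parts, while B lanes stay
   paired as b.d[0] = real, b.d[1] = imag. The four macros accumulate
     c_real  +=    a_real * b_real    (FMLA_RR, always fmla)
     c_real  +/-=  a_imag * b_imag    (FMLA_II)
     c_imag  +/-=  a_real * b_imag    (FMLA_RI)
     c_imag  +/-=  a_imag * b_real    (FMLA_IR)
   so the sign choices yield a*b (NN/NT/TN/TT), a*conj(b) (NR/NC/TR/TC),
   conj(a)*b (RN/RT/CN/CT) and conj(a)*conj(b) (RR/RC/CR/CC). */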
static inline void store_4c(double *C, float64x2_t up_r, float64x2_t up_i,
float64x2_t lo_r, float64x2_t lo_i, double alphar, double alphai) {
float64x2x2_t up = vld2q_f64(C), lo = vld2q_f64(C + 4);
up.val[0] = vfmaq_n_f64(up.val[0], up_r, alphar);
up.val[1] = vfmaq_n_f64(up.val[1], up_r, alphai);
lo.val[0] = vfmaq_n_f64(lo.val[0], lo_r, alphar);
lo.val[1] = vfmaq_n_f64(lo.val[1], lo_r, alphai);
up.val[0] = vfmsq_n_f64(up.val[0], up_i, alphai);
up.val[1] = vfmaq_n_f64(up.val[1], up_i, alphar);
lo.val[0] = vfmsq_n_f64(lo.val[0], lo_i, alphai);
lo.val[1] = vfmaq_n_f64(lo.val[1], lo_i, alphar);
vst2q_f64(C, up);
vst2q_f64(C + 4, lo);
}
static inline void kernel_4x4(const double *sa, const double *sb, double *C,
BLASLONG LDC, BLASLONG K, double alphar, double alphai) {
float64x2_t c1r, c1i, c2r, c2i;
float64x2_t c3r, c3i, c4r, c4i;
float64x2_t c5r, c5i, c6r, c6i;
float64x2_t c7r, c7i, c8r, c8i;
const double *pref_ = C;
pref_c_4(pref_); pref_ += LDC * 2;
pref_c_4(pref_); pref_ += LDC * 2;
pref_c_4(pref_); pref_ += LDC * 2;
pref_c_4(pref_);
__asm__ __volatile__(
"cmp %[K],#0\n\t"
"movi %[c1r].16b,#0; movi %[c1i].16b,#0; movi %[c2r].16b,#0; movi %[c2i].16b,#0\n\t"
"movi %[c3r].16b,#0; movi %[c3i].16b,#0; movi %[c4r].16b,#0; movi %[c4i].16b,#0\n\t"
"movi %[c5r].16b,#0; movi %[c5i].16b,#0; movi %[c6r].16b,#0; movi %[c6i].16b,#0\n\t"
"movi %[c7r].16b,#0; movi %[c7i].16b,#0; movi %[c8r].16b,#0; movi %[c8i].16b,#0\n\t"
"beq 4f; cmp %[K],#2\n\t"
"ld2 {v0.2d,v1.2d},[%[sa]],#32; ldp q4,q5,[%[sb]],#32\n\t"
"ld2 {v2.2d,v3.2d},[%[sa]],#32; ldr q6,[%[sb]]; ldr d7,[%[sb],#16]\n\t"
"ldr x0,[%[sb],#24]; add %[sb],%[sb],#32\n\t"
"beq 2f; blt 3f\n\t"
"1:\n\t"
"fmov v7.d[1],x0; ldr d8,[%[sa]]\n\t"
FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]; ldr x0,[%[sa],#16]\n\t"
FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t"
FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t"
"fmov v8.d[1],x0; ldr d9,[%[sa],#8]\n\t"
FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]; ldr x0,[%[sa],#24]\n\t"
FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t"
FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t"
"fmov v9.d[1],x0; ldr d10,[%[sa],#32]\n\t"
FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]; ldr x0,[%[sa],#48]\n\t"
FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t"
FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t"
"fmov v10.d[1],x0; ldr d11,[%[sa],#40]\n\t"
FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#56]\n\t"
FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t"
FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t"
"fmov v11.d[1],x0; ldr d12,[%[sb]]\n\t"
FMLA_II "%[c3r].2d,v1.2d,v5.d[1]; ldr x0,[%[sb],#8]\n\t"
FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t"
FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t"
"fmov v12.d[1],x0; ldr d13,[%[sb],#16]\n\t"
FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]; ldr x0,[%[sb],#24]\n\t"
FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t"
FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t"
"fmov v13.d[1],x0; ldr d14,[%[sb],#32]\n\t"
FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]; ldr x0,[%[sb],#40]\n\t"
FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t"
FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t"
"fmov v14.d[1],x0; ldr d15,[%[sb],#48]\n\t"
FMLA_II "%[c6r].2d,v3.2d,v6.d[1]; ldr x0,[%[sb],#56]\n\t"
FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t"
FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t"
"fmov v15.d[1],x0; ldr d4,[%[sb],#64]\n\t"
FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]; ldr x0,[%[sb],#72]\n\t"
FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t"
FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t"
"fmov v4.d[1],x0; ldr d5,[%[sb],#80]\n\t"
FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]; ldr x0,[%[sb],#88]\n\t"
FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t"
FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t"
"fmov v5.d[1],x0; ldr d0,[%[sa],#64]\n\t"
FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]; ldr x0,[%[sa],#80]\n\t"
FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]\n\t"
FMLA_RR "%[c1r].2d,v8.2d,v12.d[0]\n\t"
"fmov v0.d[1],x0; ldr d1,[%[sa],#72]\n\t"
FMLA_RR "%[c2r].2d,v10.2d,v12.d[0]; ldr x0,[%[sa],#88]\n\t"
FMLA_RI "%[c1i].2d,v8.2d,v12.d[1]\n\t"
FMLA_RI "%[c2i].2d,v10.2d,v12.d[1]\n\t"
"fmov v1.d[1],x0; ldr d2,[%[sa],#96]\n\t"
FMLA_II "%[c1r].2d,v9.2d,v12.d[1]; ldr x0,[%[sa],#112]\n\t"
FMLA_II "%[c2r].2d,v11.2d,v12.d[1]\n\t"
FMLA_IR "%[c1i].2d,v9.2d,v12.d[0]\n\t"
"fmov v2.d[1],x0; ldr d3,[%[sa],#104]\n\t"
FMLA_IR "%[c2i].2d,v11.2d,v12.d[0]; ldr x0,[%[sa],#120]\n\t"
FMLA_RR "%[c3r].2d,v8.2d,v13.d[0]\n\t"
FMLA_RR "%[c4r].2d,v10.2d,v13.d[0]\n\t"
"fmov v3.d[1],x0; ldr d6,[%[sb],#96]\n\t"
FMLA_RI "%[c3i].2d,v8.2d,v13.d[1]; ldr x0,[%[sb],#104]\n\t"
FMLA_RI "%[c4i].2d,v10.2d,v13.d[1]\n\t"
FMLA_II "%[c3r].2d,v9.2d,v13.d[1]\n\t"
"fmov v6.d[1],x0; ldr d7,[%[sb],#112]\n\t"
FMLA_II "%[c4r].2d,v11.2d,v13.d[1]; ldr x0,[%[sb],#120]\n\t"
FMLA_IR "%[c3i].2d,v9.2d,v13.d[0]\n\t"
FMLA_IR "%[c4i].2d,v11.2d,v13.d[0]; prfm pldl1keep,[%[sa],#256]\n\t"
FMLA_RR "%[c5r].2d,v8.2d,v14.d[0]\n\t"
FMLA_RR "%[c6r].2d,v10.2d,v14.d[0]; prfm pldl1keep,[%[sa],#320]\n\t"
FMLA_RI "%[c5i].2d,v8.2d,v14.d[1]\n\t"
FMLA_RI "%[c6i].2d,v10.2d,v14.d[1]; prfm pldl1keep,[%[sb],#256]\n\t"
FMLA_II "%[c5r].2d,v9.2d,v14.d[1]\n\t"
FMLA_II "%[c6r].2d,v11.2d,v14.d[1]; prfm pldl1keep,[%[sb],#320]\n\t"
FMLA_IR "%[c5i].2d,v9.2d,v14.d[0]\n\t"
FMLA_IR "%[c6i].2d,v11.2d,v14.d[0]; add %[sa],%[sa],#128\n\t"
FMLA_RR "%[c7r].2d,v8.2d,v15.d[0]\n\t"
FMLA_RR "%[c8r].2d,v10.2d,v15.d[0]; add %[sb],%[sb],#128\n\t"
FMLA_RI "%[c7i].2d,v8.2d,v15.d[1]\n\t"
FMLA_RI "%[c8i].2d,v10.2d,v15.d[1]; sub %[K],%[K],#2\n\t"
FMLA_II "%[c7r].2d,v9.2d,v15.d[1]\n\t"
FMLA_II "%[c8r].2d,v11.2d,v15.d[1]; cmp %[K],#2\n\t"
FMLA_IR "%[c7i].2d,v9.2d,v15.d[0]\n\t"
FMLA_IR "%[c8i].2d,v11.2d,v15.d[0]; bgt 1b; blt 3f\n\t"
"2:\n\t"
"fmov v7.d[1],x0; ldr d8,[%[sa]]\n\t"
FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]; ldr x0,[%[sa],#16]\n\t"
FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t"
FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t"
"fmov v8.d[1],x0; ldr d9,[%[sa],#8]\n\t"
FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]; ldr x0,[%[sa],#24]\n\t"
FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t"
FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t"
"fmov v9.d[1],x0; ldr d10,[%[sa],#32]\n\t"
FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]; ldr x0,[%[sa],#48]\n\t"
FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t"
FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t"
"fmov v10.d[1],x0; ldr d11,[%[sa],#40]\n\t"
FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#56]\n\t"
FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t"
FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t"
"fmov v11.d[1],x0; ldr d12,[%[sb]]\n\t"
FMLA_II "%[c3r].2d,v1.2d,v5.d[1]; ldr x0,[%[sb],#8]\n\t"
FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t"
FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t"
"fmov v12.d[1],x0; ldr d13,[%[sb],#16]\n\t"
FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]; ldr x0,[%[sb],#24]\n\t"
FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t"
FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t"
"fmov v13.d[1],x0; ldr d14,[%[sb],#32]\n\t"
FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]; ldr x0,[%[sb],#40]\n\t"
FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t"
FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t"
"fmov v14.d[1],x0; ldr d15,[%[sb],#48]\n\t"
FMLA_II "%[c6r].2d,v3.2d,v6.d[1]; ldr x0,[%[sb],#56]\n\t"
FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t"
FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t"
"fmov v15.d[1],x0\n\t"
FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]\n\t"
FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t"
FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t"
FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]\n\t"
FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t"
FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t"
FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]\n\t"
FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]\n\t"
FMLA_RR "%[c1r].2d,v8.2d,v12.d[0]\n\t"
FMLA_RR "%[c2r].2d,v10.2d,v12.d[0]\n\t"
FMLA_RI "%[c1i].2d,v8.2d,v12.d[1]\n\t"
FMLA_RI "%[c2i].2d,v10.2d,v12.d[1]\n\t"
FMLA_II "%[c1r].2d,v9.2d,v12.d[1]\n\t"
FMLA_II "%[c2r].2d,v11.2d,v12.d[1]\n\t"
FMLA_IR "%[c1i].2d,v9.2d,v12.d[0]\n\t"
FMLA_IR "%[c2i].2d,v11.2d,v12.d[0]\n\t"
FMLA_RR "%[c3r].2d,v8.2d,v13.d[0]\n\t"
FMLA_RR "%[c4r].2d,v10.2d,v13.d[0]\n\t"
FMLA_RI "%[c3i].2d,v8.2d,v13.d[1]\n\t"
FMLA_RI "%[c4i].2d,v10.2d,v13.d[1]\n\t"
FMLA_II "%[c3r].2d,v9.2d,v13.d[1]\n\t"
FMLA_II "%[c4r].2d,v11.2d,v13.d[1]\n\t"
FMLA_IR "%[c3i].2d,v9.2d,v13.d[0]\n\t"
FMLA_IR "%[c4i].2d,v11.2d,v13.d[0]\n\t"
FMLA_RR "%[c5r].2d,v8.2d,v14.d[0]\n\t"
FMLA_RR "%[c6r].2d,v10.2d,v14.d[0]\n\t"
FMLA_RI "%[c5i].2d,v8.2d,v14.d[1]\n\t"
FMLA_RI "%[c6i].2d,v10.2d,v14.d[1]\n\t"
FMLA_II "%[c5r].2d,v9.2d,v14.d[1]\n\t"
FMLA_II "%[c6r].2d,v11.2d,v14.d[1]\n\t"
FMLA_IR "%[c5i].2d,v9.2d,v14.d[0]\n\t"
FMLA_IR "%[c6i].2d,v11.2d,v14.d[0]; add %[sa],%[sa],#64\n\t"
FMLA_RR "%[c7r].2d,v8.2d,v15.d[0]\n\t"
FMLA_RR "%[c8r].2d,v10.2d,v15.d[0]; add %[sb],%[sb],#64\n\t"
FMLA_RI "%[c7i].2d,v8.2d,v15.d[1]\n\t"
FMLA_RI "%[c8i].2d,v10.2d,v15.d[1]; sub %[K],%[K],#2\n\t"
FMLA_II "%[c7r].2d,v9.2d,v15.d[1]\n\t"
FMLA_II "%[c8r].2d,v11.2d,v15.d[1]\n\t"
FMLA_IR "%[c7i].2d,v9.2d,v15.d[0]\n\t"
FMLA_IR "%[c8i].2d,v11.2d,v15.d[0]; b 4f\n\t"
"3:\n\t"
"fmov v7.d[1],x0\n\t"
FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]\n\t"
FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t"
FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t"
FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]\n\t"
FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t"
FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t"
FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]\n\t"
FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t"
FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t"
FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]\n\t"
FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t"
FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t"
FMLA_II "%[c3r].2d,v1.2d,v5.d[1]\n\t"
FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t"
FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t"
FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]\n\t"
FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t"
FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t"
FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]\n\t"
FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t"
FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t"
FMLA_II "%[c6r].2d,v3.2d,v6.d[1]\n\t"
FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t"
FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t"
FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]\n\t"
FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t"
FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t"
FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]\n\t"
FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t"
FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t"
FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]\n\t"
FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]; sub %[K],%[K],#1\n\t"
"4:\n\t"
:[c1r]"=w"(c1r), [c1i]"=w"(c1i), [c2r]"=w"(c2r), [c2i]"=w"(c2i),
[c3r]"=w"(c3r), [c3i]"=w"(c3i), [c4r]"=w"(c4r), [c4i]"=w"(c4i),
[c5r]"=w"(c5r), [c5i]"=w"(c5i), [c6r]"=w"(c6r), [c6i]"=w"(c6i),
[c7r]"=w"(c7r), [c7i]"=w"(c7i), [c8r]"=w"(c8r), [c8i]"=w"(c8i),
[K]"+r"(K), [sa]"+r"(sa), [sb]"+r"(sb)
::"cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
store_4c(C, c1r, c1i, c2r, c2i, alphar, alphai); C += LDC * 2;
store_4c(C, c3r, c3i, c4r, c4i, alphar, alphai); C += LDC * 2;
store_4c(C, c5r, c5i, c6r, c6i, alphar, alphai); C += LDC * 2;
store_4c(C, c7r, c7i, c8r, c8i, alphar, alphai);
}
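/* Driver below: sa and sb are the packed panels of A and B, with each complex
   element occupying two doubles. A 4-row tile of A therefore advances by
   8*K doubles and a 2-row tile by 4*K; a 4-column panel of B advances by
   8*K doubles and a 2-column panel by 4*K; C advances by 2 doubles per
   complex element, with LDC counted in complex elements. */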
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai,
FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) {
BLASLONG n_left = N;
for (; n_left >= 4; n_left -= 4) {
const FLOAT *a_ = sa;
FLOAT *c_ = C;
BLASLONG m_left = M;
for (; m_left >= 4; m_left -= 4) {
kernel_4x4(a_, sb, c_, LDC, K, alphar, alphai);
a_ += 8 * K;
c_ += 8;
}
if (m_left >= 2) {
m_left -= 2;
kernel_2x4(a_, sb, c_, LDC, K, alphar, alphai);
a_ += 4 * K;
c_ += 4;
}
if (m_left) {
kernel_1x4(a_, sb, c_, LDC, K, alphar, alphai);
}
sb += 8 * K;
C += 8 * LDC;
}
if (n_left >= 2) {
n_left -= 2;
const FLOAT *a_ = sa;
FLOAT *c_ = C;
BLASLONG m_left = M;
for (; m_left >= 4; m_left -= 4) {
kernel_4x2(a_, sb, c_, LDC, K, alphar, alphai);
a_ += 8 * K;
c_ += 8;
}
if (m_left >= 2) {
m_left -= 2;
kernel_2x2(a_, sb, c_, LDC, K, alphar, alphai);
a_ += 4 * K;
c_ += 4;
}
if (m_left) {
kernel_1x2(a_, sb, c_, LDC, K, alphar, alphai);
}
sb += 4 * K;
C += 4 * LDC;
}
if (n_left) {
const FLOAT *a_ = sa;
FLOAT *c_ = C;
BLASLONG m_left = M;
for (; m_left >= 4; m_left -= 4) {
kernel_4x1(a_, sb, c_, K, alphar, alphai);
a_ += 8 * K;
c_ += 8;
}
if (m_left >= 2) {
m_left -= 2;
kernel_2x1(a_, sb, c_, K, alphar, alphai);
a_ += 4 * K;
c_ += 4;
}
if (m_left) {
kernel_1x1(a_, sb, c_, K, alphar, alphai);
}
}
return 0;
}

kernel/mips/KERNEL.generic (new file, 160 lines)

@ -0,0 +1,160 @@
SGEMM_BETA = ../generic/gemm_beta.c
DGEMM_BETA = ../generic/gemm_beta.c
CGEMM_BETA = ../generic/zgemm_beta.c
ZGEMM_BETA = ../generic/zgemm_beta.c
STRMMKERNEL = ../generic/trmmkernel_2x2.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
#Pure C for other kernels
SAMAXKERNEL = ../mips/amax.c
DAMAXKERNEL = ../mips/amax.c
CAMAXKERNEL = ../mips/zamax.c
ZAMAXKERNEL = ../mips/zamax.c
SAMINKERNEL = ../mips/amin.c
DAMINKERNEL = ../mips/amin.c
CAMINKERNEL = ../mips/zamin.c
ZAMINKERNEL = ../mips/zamin.c
SMAXKERNEL = ../mips/max.c
DMAXKERNEL = ../mips/max.c
SMINKERNEL = ../mips/min.c
DMINKERNEL = ../mips/min.c
ISAMAXKERNEL = ../mips/iamax.c
IDAMAXKERNEL = ../mips/iamax.c
ICAMAXKERNEL = ../mips/izamax.c
IZAMAXKERNEL = ../mips/izamax.c
ISAMINKERNEL = ../mips/iamin.c
IDAMINKERNEL = ../mips/iamin.c
ICAMINKERNEL = ../mips/izamin.c
IZAMINKERNEL = ../mips/izamin.c
ISMAXKERNEL = ../mips/imax.c
IDMAXKERNEL = ../mips/imax.c
ISMINKERNEL = ../mips/imin.c
IDMINKERNEL = ../mips/imin.c
SASUMKERNEL = ../mips/asum.c
DASUMKERNEL = ../mips/asum.c
CASUMKERNEL = ../mips/zasum.c
ZASUMKERNEL = ../mips/zasum.c
SSUMKERNEL = ../mips/sum.c
DSUMKERNEL = ../mips/sum.c
CSUMKERNEL = ../mips/zsum.c
ZSUMKERNEL = ../mips/zsum.c
SAXPYKERNEL = ../mips/axpy.c
DAXPYKERNEL = ../mips/axpy.c
CAXPYKERNEL = ../mips/zaxpy.c
ZAXPYKERNEL = ../mips/zaxpy.c
SCOPYKERNEL = ../mips/copy.c
DCOPYKERNEL = ../mips/copy.c
CCOPYKERNEL = ../mips/zcopy.c
ZCOPYKERNEL = ../mips/zcopy.c
SDOTKERNEL = ../mips/dot.c
DDOTKERNEL = ../mips/dot.c
CDOTKERNEL = ../mips/zdot.c
ZDOTKERNEL = ../mips/zdot.c
SNRM2KERNEL = ../mips/nrm2.c
DNRM2KERNEL = ../mips/nrm2.c
CNRM2KERNEL = ../mips/znrm2.c
ZNRM2KERNEL = ../mips/znrm2.c
SROTKERNEL = ../mips/rot.c
DROTKERNEL = ../mips/rot.c
CROTKERNEL = ../mips/zrot.c
ZROTKERNEL = ../mips/zrot.c
SSCALKERNEL = ../mips/scal.c
DSCALKERNEL = ../mips/scal.c
CSCALKERNEL = ../mips/zscal.c
ZSCALKERNEL = ../mips/zscal.c
SSWAPKERNEL = ../mips/swap.c
DSWAPKERNEL = ../mips/swap.c
CSWAPKERNEL = ../mips/zswap.c
ZSWAPKERNEL = ../mips/zswap.c
SGEMVNKERNEL = ../mips/gemv_n.c
DGEMVNKERNEL = ../mips/gemv_n.c
CGEMVNKERNEL = ../mips/zgemv_n.c
ZGEMVNKERNEL = ../mips/zgemv_n.c
SGEMVTKERNEL = ../mips/gemv_t.c
DGEMVTKERNEL = ../mips/gemv_t.c
CGEMVTKERNEL = ../mips/zgemv_t.c
ZGEMVTKERNEL = ../mips/zgemv_t.c
SSYMV_U_KERNEL = ../generic/symv_k.c
SSYMV_L_KERNEL = ../generic/symv_k.c
DSYMV_U_KERNEL = ../generic/symv_k.c
DSYMV_L_KERNEL = ../generic/symv_k.c
QSYMV_U_KERNEL = ../generic/symv_k.c
QSYMV_L_KERNEL = ../generic/symv_k.c
CSYMV_U_KERNEL = ../generic/zsymv_k.c
CSYMV_L_KERNEL = ../generic/zsymv_k.c
ZSYMV_U_KERNEL = ../generic/zsymv_k.c
ZSYMV_L_KERNEL = ../generic/zsymv_k.c
XSYMV_U_KERNEL = ../generic/zsymv_k.c
XSYMV_L_KERNEL = ../generic/zsymv_k.c
ZHEMV_U_KERNEL = ../generic/zhemv_k.c
ZHEMV_L_KERNEL = ../generic/zhemv_k.c
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c


@ -1,7 +1,6 @@
ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__)
ifeq ($(HAVE_GAS), 1)
include $(KERNELDIR)/KERNEL.POWER8
else
#SGEMM_BETA = ../generic/gemm_beta.c
#DGEMM_BETA = ../generic/gemm_beta.c
#CGEMM_BETA = ../generic/zgemm_beta.c
@ -33,6 +32,16 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
SGEMM_SMALL_M_PERMIT = gemm_small_kernel_permit_power10.c
SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_power10.c
SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_nn_power10.c
SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_power10.c
SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_nt_power10.c
SGEMM_SMALL_K_TN = sgemm_small_kernel_tn_power10.c
SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_tn_power10.c
SGEMM_SMALL_K_TT = sgemm_small_kernel_tt_power10.c
SGEMM_SMALL_K_B0_TT = sgemm_small_kernel_tt_power10.c
DGEMMKERNEL = dgemm_kernel_power10.c
DGEMMINCOPY =
DGEMMITCOPY =
@ -43,7 +52,18 @@ DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMM_SMALL_M_PERMIT = gemm_small_kernel_permit_power10.c
DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_power10.c
DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_nt_power10.c
DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_power10.c
DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_nn_power10.c
DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_power10.c
DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c
DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c
DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c
CGEMMKERNEL = cgemm_kernel_power10.S
#CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
@ -218,5 +238,4 @@ QCABS_KERNEL = ../generic/cabs.c
#Dump kernel
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
endif


@ -36,9 +36,12 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
#endif
const float *mvecp = mvec;
/* We have to load reverse mask for big endian. */
/* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
__vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
#else
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
#endif
long ytmp;
__asm__
@ -112,6 +115,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
"xvmaddasp 38, 58, 33 \n\t"
"xvmaddasp 39, 59, 33 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 48, 0(%4) \n\t"
"stxv 49, 16(%4) \n\t"
"stxv 50, 32(%4) \n\t"
"stxv 51, 48(%4) \n\t"
"stxv 34, 64(%4) \n\t"
"stxv 35, 80(%4) \n\t"
"stxv 38, 96(%4) \n\t"
"stxv 39, 112(%4) \n\t"
#else
"stxv 49, 0(%4) \n\t"
"stxv 48, 16(%4) \n\t"
"stxv 51, 32(%4) \n\t"
@ -120,6 +133,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
"stxv 34, 80(%4) \n\t"
"stxv 39, 96(%4) \n\t"
"stxv 38, 112(%4) \n\t"
#endif
"addi %4, %4, 128 \n\t"
"xxperm 52, 40, %x10 \n\t" // exchange real and imag part
@ -163,6 +177,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
"xvmaddasp 38, 58, 33 \n\t"
"xvmaddasp 39, 59, 33 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 48, 0(%4) \n\t"
"stxv 49, 16(%4) \n\t"
"stxv 50, 32(%4) \n\t"
"stxv 51, 48(%4) \n\t"
"stxv 34, 64(%4) \n\t"
"stxv 35, 80(%4) \n\t"
"stxv 38, 96(%4) \n\t"
"stxv 39, 112(%4) \n\t"
#else
"stxv 49, 0(%4) \n\t"
"stxv 48, 16(%4) \n\t"
"stxv 51, 32(%4) \n\t"
@ -171,6 +195,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
"stxv 34, 80(%4) \n\t"
"stxv 39, 96(%4) \n\t"
"stxv 38, 112(%4) \n\t"
#endif
"#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n"
:


@ -46,7 +46,16 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
".align 5 \n"
"one%=: \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 32, 0(%3) \n\t"
"stxv 33, 16(%3) \n\t"
"stxv 34, 32(%3) \n\t"
"stxv 35, 48(%3) \n\t"
"stxv 36, 64(%3) \n\t"
"stxv 37, 80(%3) \n\t"
"stxv 38, 96(%3) \n\t"
"stxv 39, 112(%3) \n\t"
#else
"stxv 33, 0(%3) \n\t"
"stxv 32, 16(%3) \n\t"
"stxv 35, 32(%3) \n\t"
@ -55,11 +64,21 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
"stxv 36, 80(%3) \n\t"
"stxv 39, 96(%3) \n\t"
"stxv 38, 112(%3) \n\t"
#endif
"lxvp 32, 0(%2) \n\t"
"lxvp 34, 32(%2) \n\t"
"lxvp 36, 64(%2) \n\t"
"lxvp 38, 96(%2) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 40, 128(%3) \n\t"
"stxv 41, 144(%3) \n\t"
"stxv 42, 160(%3) \n\t"
"stxv 43, 176(%3) \n\t"
"stxv 44, 192(%3) \n\t"
"stxv 45, 208(%3) \n\t"
"stxv 46, 224(%3) \n\t"
"stxv 47, 240(%3) \n\t"
#else
"stxv 41, 128(%3) \n\t"
"stxv 40, 144(%3) \n\t"
"stxv 43, 160(%3) \n\t"
@ -68,6 +87,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
"stxv 44, 208(%3) \n\t"
"stxv 47, 224(%3) \n\t"
"stxv 46, 240(%3) \n\t"
#endif
"lxvp 40, 128(%2) \n\t"
"lxvp 42, 160(%2) \n\t"
"lxvp 44, 192(%2) \n\t"
@ -81,7 +101,24 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
"bgt one%= \n"
"two%=: \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 32, 0(%3) \n\t"
"stxv 33, 16(%3) \n\t"
"stxv 34, 32(%3) \n\t"
"stxv 35, 48(%3) \n\t"
"stxv 36, 64(%3) \n\t"
"stxv 37, 80(%3) \n\t"
"stxv 38, 96(%3) \n\t"
"stxv 39, 112(%3) \n\t"
"stxv 40, 128(%3) \n\t"
"stxv 41, 144(%3) \n\t"
"stxv 42, 160(%3) \n\t"
"stxv 43, 176(%3) \n\t"
"stxv 44, 192(%3) \n\t"
"stxv 45, 208(%3) \n\t"
"stxv 46, 224(%3) \n\t"
"stxv 47, 240(%3) \n\t"
#else
"stxv 33, 0(%3) \n\t"
"stxv 32, 16(%3) \n\t"
"stxv 35, 32(%3) \n\t"
@ -98,7 +135,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
"stxv 44, 208(%3) \n\t"
"stxv 47, 224(%3) \n\t"
"stxv 46, 240(%3) \n\t"
#endif
"#n=%1 x=%4=%2 y=%0=%3"
:
"=m" (*y),


@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#include "common.h"
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
#include "cdot_microk_power10.c"
#else
#ifndef HAVE_KERNEL_8
@ -120,7 +120,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
if ((inc_x == 1) && (inc_y == 1)) {
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
BLASLONG n1 = n & -16;
#else
BLASLONG n1 = n & -8;


@ -29,7 +29,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static void cdot_kernel_8 (long n, float *x, float *y, float *dot)
{
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
__vector unsigned char mask = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
#else
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
#endif
__asm__
(
"dcbt 0, %2 \n\t"
@ -153,7 +157,11 @@ static void cdot_kernel_8 (long n, float *x, float *y, float *dot)
"xxswapd 33, 34 \n\t"
"xvaddsp 35, 35, 32 \n\t"
"xvaddsp 34, 34, 33 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xxpermdi 34, 35, 34, 0 \n\t"
#else
"xxpermdi 34, 34, 35, 2 \n\t"
#endif
"stxv 34, 0(%6) \n\t"
"#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6"


@ -76,11 +76,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cgemm_macros_power10.S"
#if (_AIX)
.set perm_const1, 0x0405060700010203
.set perm_const2, 0x0c0d0e0f08090a0b
.set save_permute_12, 0x1011121300010203
.set save_permute_11, 0x18191a1b08090a0b
#else
.equ perm_const1, 0x0405060700010203
.equ perm_const2, 0x0c0d0e0f08090a0b
.equ save_permute_12, 0x0c0d0e0f1c1d1e1f
.equ save_permute_11, 0x0405060714151617
#endif
#ifndef NEEDPARAM
@ -172,24 +178,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*load reverse permute mask for big endian
uint128 = 0x0c0d0e0f08090a0b0405060700010203
*/
#if (_AIX)
lis T2, (perm_const2>>48 & 0xFFFF)
lis T1, (perm_const1>>48 & 0xFFFF)
lis T3, (save_permute_12>>48 & 0xFFFF)
lis T4, (save_permute_11>>48 & 0xFFFF)
ori T2, T2, (perm_const2>>32 & 0xFFFF)
ori T1, T1, (perm_const1>>32 & 0xFFFF)
ori T3, T3, (save_permute_12>>32 & 0xFFFF)
ori T4, T4, (save_permute_11>>32 & 0xFFFF)
#else
lis T2, perm_const2@highest
lis T1, perm_const1@highest
lis T3, save_permute_12@highest
lis T4, save_permute_11@highest
ori T2, T2, perm_const2@higher
ori T1, T1, perm_const1@higher
ori T3, T3, save_permute_12@higher
ori T4, T4, save_permute_11@higher
#endif
rldicr T2, T2, 32, 31
rldicr T1, T1, 32, 31
rldicr T3, T3, 32, 31
rldicr T4, T4, 32, 31
#if (_AIX)
oris T2, T2, (perm_const2>>16 & 0xFFFF)
oris T1, T1, (perm_const1>>16 & 0xFFFF)
oris T3, T3, (save_permute_12>>16 & 0xFFFF)
oris T4, T4, (save_permute_11>>16 & 0xFFFF)
ori T2, T2, (perm_const2 & 0xFFFF)
ori T1, T1, (perm_const1 & 0xFFFF)
ori T3, T3, (save_permute_12 & 0xFFFF)
ori T4, T4, (save_permute_11 & 0xFFFF)
#else
oris T2, T2, perm_const2@h
oris T1, T1, perm_const1@h
oris T3, T3, save_permute_12@h
@ -200,7 +226,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ori T1, T1, perm_const1@l
ori T3, T3, save_permute_12@l
ori T4, T4, save_permute_11@l
#endif
li r0,0
li PRE,512
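The lis/ori/rldicr/oris/ori sequence above assembles each 64-bit permute
constant from four 16-bit immediates; the AIX branch spells the pieces out
with shifts and masks because the @highest/@higher/@h/@l relocation operators
used in the other branch are not accepted there. A minimal C sketch of the
same computation (build_const64 is a hypothetical helper, not part of this
source):
#include <stdint.h>
static uint64_t build_const64(uint64_t k) {
    uint64_t t;
    t  = ((k >> 48) & 0xFFFF) << 16;  /* lis    T, (k >> 48 & 0xFFFF)    */
    t |=  (k >> 32) & 0xFFFF;         /* ori    T, T, (k >> 32 & 0xFFFF) */
    t <<= 32;                         /* rldicr T, T, 32, 31             */
    t |= ((k >> 16) & 0xFFFF) << 16;  /* oris   T, T, (k >> 16 & 0xFFFF) */
    t |=   k        & 0xFFFF;         /* ori    T, T, (k & 0xFFFF)       */
    return t;                         /* t == k, e.g. k == perm_const1   */
}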


@ -218,6 +218,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 3, 36, 34
xvf32gerpp 2, 37, 34
xvf32gerpp 1, 32, 34
xvf32gerpp 0, 33, 34
xvf32gerpp 7, 36, 35
xvf32gerpp 6, 37, 35
xvf32gerpp 5, 32, 35
xvf32gerpp 4, 33, 35
#else
xvf32gerpp 3, 36, 35
xvf32gerpp 2, 37, 35
xvf32gerpp 1, 32, 35
@ -226,6 +236,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf32gerpp 6, 37, 34
xvf32gerpp 5, 32, 34
xvf32gerpp 4, 33, 34
#endif
.endm
.macro LOAD4x8_2
@ -255,6 +266,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
xvf32gerpp 3, 36, 34
xvf32gerpp 2, 37, 34
xvf32gerpp 1, 32, 34
xvf32gerpp 0, 33, 34
xvf32gerpp 7, 36, 35
xvf32gerpp 6, 37, 35
xvf32gerpp 5, 32, 35
xvf32gerpp 4, 33, 35
#else
xvf32gerpp 3, 36, 35
xvf32gerpp 2, 37, 35
xvf32gerpp 1, 32, 35
@ -263,11 +284,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf32gerpp 6, 37, 34
xvf32gerpp 5, 32, 34
xvf32gerpp 4, 33, 34
#endif
.if \Complete==0
lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
.endif
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
xvf32gerpp 3, 42, 38
xvf32gerpp 2, 43, 38
xvf32gerpp 1, 40, 38
xvf32gerpp 0, 41, 38
xvf32gerpp 7, 42, 39
xvf32gerpp 6, 43, 39
xvf32gerpp 5, 40, 39
xvf32gerpp 4, 41, 39
#else
xvf32gerpp 3, 42, 39
xvf32gerpp 2, 43, 39
xvf32gerpp 1, 40, 39
@ -276,6 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf32gerpp 6, 43, 38
xvf32gerpp 5, 40, 38
xvf32gerpp 4, 41, 38
#endif
.if \Complete==0
lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG)
lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
@ -393,22 +426,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 1
xxpermdi vs3, vs2, vs10, 1
xxpermdi vs5, vs4, vs12, 1
xxpermdi vs7, vs6, vs14, 1
xxpermdi vs9, vs8, vs0, 1
xxpermdi vs11, vs10, vs2, 1
#else
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs5, vs12, vs4, 2
xxpermdi vs7, vs14, vs6, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
#endif
xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs13, vs12, vs4, 1
xxpermdi vs15, vs14, vs6, 1
#else
xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2
#endif
xvaddsp vs26, vs26, vs7
xvaddsp vs27, vs27, vs5
xvaddsp vs28, vs28, vs11
xvaddsp vs29, vs29, vs9
xvaddsp vs30, vs30, vs15
xvaddsp vs31, vs31, vs13
#else
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
xxpermdi vs25, vs0, vs8, 1
xxpermdi vs24, vs2, vs10, 1
xxpermdi vs27, vs4, vs12, 1
xxpermdi vs26, vs6, vs14, 1
xxpermdi vs29, vs8, vs0, 1
xxpermdi vs28, vs10, vs2, 1
xxpermdi vs31, vs12, vs4, 1
xxpermdi vs30, vs14, vs6, 1
#else
xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2
@ -418,6 +475,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs28, vs2, vs10, 2
xxpermdi vs31, vs4, vs12, 2
xxpermdi vs30, vs6, vs14, 2
#endif
#endif
stxvp vs24, 0(CO)
MULT_APLHA_PART1 vs48, vs56, vs0, vs1
@ -443,22 +501,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 1
xxpermdi vs3, vs2, vs10, 1
xxpermdi vs5, vs4, vs12, 1
xxpermdi vs7, vs6, vs14, 1
xxpermdi vs9, vs8, vs0, 1
xxpermdi vs11, vs10, vs2, 1
#else
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs5, vs12, vs4, 2
xxpermdi vs7, vs14, vs6, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
#endif
xvaddsp vs32, vs32, vs3
xvaddsp vs33, vs33, vs1
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs13, vs12, vs4, 1
xxpermdi vs15, vs14, vs6, 1
#else
xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2
#endif
xvaddsp vs40, vs40, vs7
xvaddsp vs41, vs41, vs5
xvaddsp vs34, vs34, vs11
xvaddsp vs35, vs35, vs9
xvaddsp vs42, vs42, vs15
xvaddsp vs43, vs43, vs13
#else
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
xxpermdi vs33, vs0, vs8, 1
xxpermdi vs32, vs2, vs10, 1
xxpermdi vs41, vs4, vs12, 1
xxpermdi vs40, vs6, vs14, 1
xxpermdi vs35, vs8, vs0, 1
xxpermdi vs34, vs10, vs2, 1
xxpermdi vs43, vs12, vs4, 1
xxpermdi vs42, vs14, vs6, 1
#else
xxpermdi vs33, vs8, vs0, 2
xxpermdi vs32, vs10, vs2, 2
@ -468,6 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs34, vs2, vs10, 2
xxpermdi vs43, vs4, vs12, 2
xxpermdi vs42, vs6, vs14, 2
#endif
#endif
stxvp vs32, 0(T2)
stxvp vs40, 32(T2)
@ -510,10 +593,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 3, 32, 35
xvf32gerpp 2, 33, 35
xvf32gerpp 1, 32, 34
xvf32gerpp 0, 33, 34
#else
xvf32gerpp 3, 32, 34
xvf32gerpp 2, 33, 34
xvf32gerpp 1, 32, 35
xvf32gerpp 0, 33, 35
#endif
.endm
.macro LOAD4x4_2
@ -541,18 +631,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 3, 32, 35
xvf32gerpp 2, 33, 35
xvf32gerpp 1, 32, 34
xvf32gerpp 0, 33, 34
#else
xvf32gerpp 3, 32, 34
xvf32gerpp 2, 33, 34
xvf32gerpp 1, 32, 35
xvf32gerpp 0, 33, 35
#endif
.if \Complete==0
lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 3, 36, 39
xvf32gerpp 2, 37, 39
xvf32gerpp 1, 36, 38
xvf32gerpp 0, 37, 38
#else
xvf32gerpp 3, 36, 38
xvf32gerpp 2, 37, 38
xvf32gerpp 1, 36, 39
xvf32gerpp 0, 37, 39
#endif
.if \Complete==0
lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
@ -606,6 +710,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 1
xxpermdi vs3, vs2, vs10, 1
xxpermdi vs9, vs8, vs0, 1
xxpermdi vs11, vs10, vs2, 1
xxpermdi vs5, vs4, vs12, 1
xxpermdi vs7, vs6, vs14, 1
xxpermdi vs13, vs12, vs4, 1
xxpermdi vs15, vs14, vs6, 1
#else
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs9, vs0, vs8, 2
@ -614,6 +728,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs7, vs14, vs6, 2
xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2
#endif
xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1
xvaddsp vs26, vs26, vs11
@ -622,6 +737,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvaddsp vs29, vs29, vs5
xvaddsp vs30, vs30, vs15
xvaddsp vs31, vs31, vs13
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs25, vs0, vs8, 1
xxpermdi vs24, vs2, vs10, 1
xxpermdi vs27, vs8, vs0, 1
xxpermdi vs26, vs10, vs2, 1
xxpermdi vs29, vs4, vs12, 1
xxpermdi vs28, vs6, vs14, 1
xxpermdi vs31, vs12, vs4, 1
xxpermdi vs30, vs14, vs6, 1
#else
xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2
@ -631,6 +756,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs28, vs14, vs6, 2
xxpermdi vs31, vs4, vs12, 2
xxpermdi vs30, vs6, vs14, 2
#endif
#endif
stxvp vs24, 0(CO)
stxvp vs26, 0(T1)
@ -672,8 +798,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 1, 35, 32
xvf32gerpp 0, 34, 32
#else
xvf32gerpp 1, 34, 32
xvf32gerpp 0, 35, 32
#endif
.endm
.macro LOAD4x2_2
@ -700,13 +831,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 1, 35, 32
xvf32gerpp 0, 34, 32
#else
xvf32gerpp 1, 34, 33
xvf32gerpp 0, 35, 33
#endif
.if \Complete==0
lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 1, 37, 33
xvf32gerpp 0, 36, 33
#else
xvf32gerpp 1, 36, 32
xvf32gerpp 0, 37, 32
#endif
.if \Complete==0
lxvp vs32, DISP4(\Index, \OffsetA)(\AREG)
lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
@ -757,19 +898,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR1
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 0
xxpermdi vs9, vs2, vs10, 0
xxpermdi vs3, vs8, vs0, 3
xxpermdi vs11, vs10, vs2, 3
#else
xxpermdi vs1, vs8, vs0, 0
xxpermdi vs9, vs10, vs2, 0
xxpermdi vs3, vs0, vs8, 3
xxpermdi vs11, vs2, vs10, 3
#endif
xvaddsp vs24, vs24, vs1
xvaddsp vs26, vs26, vs9
xvaddsp vs25, vs25, vs3
xvaddsp vs27, vs27, vs11
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs24, vs0, vs8, 0
xxpermdi vs26, vs2, vs10, 0
xxpermdi vs25, vs8, vs0, 3
xxpermdi vs27, vs10, vs2, 3
#else
xxpermdi vs24, vs8, vs0, 0
xxpermdi vs26, vs10, vs2, 0
xxpermdi vs25, vs0, vs8, 3
xxpermdi vs27, vs2, vs10, 3
#endif
#endif
stxv vs24, 0(CO)
stxv vs25, 0(T1)
@ -811,8 +966,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 34, 32
xvf32gerpp 1, 35, 32
#else
xvf32gerpp 0, 35, 32
xvf32gerpp 1, 34, 32
#endif
.endm
.macro LOAD4x1_2
@ -822,8 +982,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD4x1_2O OffsetA, OffsetB
lxv vs32, (\OffsetA)(AO)
vspltisb v6, 0
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs33, vs32, vs38, 2
xxpermdi vs32, vs32, vs38, 0
#else
xxpermdi vs33, vs32, vs38, 0
xxpermdi vs32, vs32, vs38, 2
#endif
lxvp vs34, (0+\OffsetB)(BO)
lxvp vs36, (32+\OffsetB)(BO)
.endm
@ -842,18 +1007,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 34, 32
xvf32gerpp 1, 35, 32
#else
xvf32gerpp 0, 35, 32
xvf32gerpp 1, 34, 32
#endif
.if \Complete==0
lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 36, 33
xvf32gerpp 1, 37, 33
#else
xvf32gerpp 0, 37, 33
xvf32gerpp 1, 36, 33
#endif
.if \Complete==0
lxv vs32, DISP2(\Index, \OffsetA)(\AREG)
lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs33, vs32, vs38, 2
xxpermdi vs32, vs32, vs38, 0
#else
xxpermdi vs33, vs32, vs38, 0
xxpermdi vs32, vs32, vs38, 2
#endif
.endif
.if \IsLast==1
.if \Complete==1
@ -1001,19 +1181,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 2, 37, 34
xvf32gerpp 3, 36, 34
xvf32gerpp 0, 33, 34
xvf32gerpp 1, 32, 34
#else
xvf32gerpp 2, 37, 35
xvf32gerpp 3, 36, 35
xvf32gerpp 0, 33, 35
xvf32gerpp 1, 32, 35
#endif
.if \Complete==0
lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 2, 41, 35
xvf32gerpp 3, 40, 35
xvf32gerpp 0, 39, 35
xvf32gerpp 1, 38, 35
#else
xvf32gerpp 2, 41, 34
xvf32gerpp 3, 40, 34
xvf32gerpp 0, 39, 34
xvf32gerpp 1, 38, 34
#endif
.if \Complete==0
lxvp vs34, DISP4(\Index, \OffsetB)(\BREG)
@ -1068,22 +1262,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 1
xxpermdi vs3, vs2, vs10, 1
xxpermdi vs5, vs4, vs12, 1
xxpermdi vs7, vs6, vs14, 1
xxpermdi vs9, vs8, vs0, 1
xxpermdi vs11, vs10, vs2, 1
#else
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs5, vs12, vs4, 2
xxpermdi vs7, vs14, vs6, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
#endif
xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs13, vs12, vs4, 1
xxpermdi vs15, vs14, vs6, 1
#else
xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2
#endif
xvaddsp vs26, vs26, vs7
xvaddsp vs27, vs27, vs5
xvaddsp vs28, vs28, vs11
xvaddsp vs29, vs29, vs9
xvaddsp vs30, vs30, vs15
xvaddsp vs31, vs31, vs13
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs25, vs0, vs8, 1
xxpermdi vs24, vs2, vs10, 1
xxpermdi vs27, vs4, vs12, 1
xxpermdi vs26, vs6, vs14, 1
xxpermdi vs29, vs8, vs0, 1
xxpermdi vs28, vs10, vs2, 1
xxpermdi vs31, vs12, vs4, 1
xxpermdi vs30, vs14, vs6, 1
#else
xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2
@ -1093,6 +1311,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs28, vs2, vs10, 2
xxpermdi vs31, vs4, vs12, 2
xxpermdi vs30, vs6, vs14, 2
#endif
#endif
stxvp vs24, 0(CO)
stxvp vs26, 32(CO)
@ -1161,13 +1380,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 33, 34
xvf32gerpp 1, 32, 34
#else
xvf32gerpp 0, 33, 35
xvf32gerpp 1, 32, 35
#endif
.if \Complete==0
lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 37, 35
xvf32gerpp 1, 36, 35
#else
xvf32gerpp 0, 37, 34
xvf32gerpp 1, 36, 34
#endif
.if \Complete==0
lxvp vs34, DISP4(\Index, \OffsetB)(\BREG)
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
@ -1206,19 +1436,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR1
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 1
xxpermdi vs3, vs2, vs10, 1
xxpermdi vs9, vs8, vs0, 1
xxpermdi vs11, vs10, vs2, 1
#else
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
#endif
xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1
xvaddsp vs26, vs26, vs11
xvaddsp vs27, vs27, vs9
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs25, vs0, vs8, 1
xxpermdi vs24, vs2, vs10, 1
xxpermdi vs27, vs8, vs0, 1
xxpermdi vs26, vs10, vs2, 1
#else
xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2
xxpermdi vs27, vs0, vs8, 2
xxpermdi vs26, vs2, vs10, 2
#endif
#endif
stxvp vs24, 0(CO)
stxvp vs26, 0(T1)
@ -1330,13 +1574,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxperm vs8, vs9, save_permute_1
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 0
xxpermdi vs9, vs8, vs0, 3
#else
xxpermdi vs1, vs8, vs0, 0
xxpermdi vs9, vs0, vs8, 3
#endif
xvaddsp vs24, vs24, vs1
xvaddsp vs26, vs26, vs9
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs24, vs0, vs8, 0
xxpermdi vs26, vs8, vs0, 3
#else
xxpermdi vs24, vs8, vs0, 0
xxpermdi vs26, vs0, vs8, 3
#endif
#endif
stxv vs24, 0(CO)
stxv vs26, 0(T1)
@ -1528,8 +1782,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvp vs32, (0+\OffsetA)(AO)
lxvp vs36, (32+\OffsetA)(AO)
vspltisb v10, 0
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs42, 2
xxpermdi vs34, vs34, vs42, 0
#else
xxpermdi vs35, vs34, vs42, 0
xxpermdi vs34, vs34, vs42, 2
#endif
lxvp vs38, (64+\OffsetA)(AO)
lxvp vs40, (64+32+\OffsetA)(AO)
.endm
@ -1567,8 +1826,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf32gerpp 3, 35, 40
.if \Complete==0
lxv vs34, DISP2(\Index, \OffsetB)(\BREG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs42, 2
xxpermdi vs34, vs34, vs42, 0
#else
xxpermdi vs35, vs34, vs42, 0
xxpermdi vs34, vs34, vs42, 2
#endif
lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG)
.endif
.if \IsLast==1
@ -1634,10 +1898,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MULT_APLHA_PART2 vs34, vs42, vs4, vs5
MULT_APLHA_PART2 vs35, vs43, vs6, vs7
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs0, vs1, save_permute_1
xxperm vs2, vs3, save_permute_1
xxperm vs4, vs5, save_permute_1
xxperm vs6, vs7, save_permute_1
#else
xxperm vs0, vs1, vs28
xxperm vs2, vs3, vs28
xxperm vs4, vs5, vs28
xxperm vs6, vs7, vs28
#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs24, vs24, vs2
@ -1648,10 +1919,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvp vs26, 32(CO)
#else
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
stxv vs2, 0(CO)
stxv vs0, 16(CO)
stxv vs6, 32(CO)
stxv vs4, 48(CO)
#else
stxv vs0, 0(CO)
stxv vs2, 16(CO)
stxv vs4, 32(CO)
stxv vs6, 48(CO)
#endif
#endif
addi CO, CO, 64
.endm
@ -1701,8 +1979,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxv vs34, (\OffsetB)(BO)
lxvp vs32, (0+\OffsetA)(AO)
vspltisb v6, 0
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs38, 2
xxpermdi vs34, vs34, vs38, 0
#else
xxpermdi vs35, vs34, vs38, 0
xxpermdi vs34, vs34, vs38, 2
#endif
lxvp vs36, (32+\OffsetA)(AO)
.endm
@ -1729,8 +2012,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf32gerpp 1, 35, 36
.if \Complete==0
lxv vs34, DISP2(\Index, \OffsetB)(\BREG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs38, 2
xxpermdi vs34, vs34, vs38, 0
#else
xxpermdi vs35, vs34, vs38, 0
xxpermdi vs34, vs34, vs38, 2
#endif
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
.endif
.if \IsLast==1
@ -1775,8 +2063,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs33, vs41, vs2, vs3
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs0, vs1, save_permute_1
xxperm vs2, vs3, save_permute_1
#else
xxperm vs0, vs1, vs28
xxperm vs2, vs3, vs28
#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs24, vs24, vs2
@ -1784,8 +2077,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvp vs24, 0(CO)
#else
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
stxv vs2, 0(CO)
stxv vs0, 16(CO)
#else
stxv vs0, 0(CO)
stxv vs2, 16(CO)
#endif
#endif
addi CO, CO, 32
.endm
@ -1904,7 +2202,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MULT_APLHA_PART1 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs0, vs1, save_permute_1
#else
xxperm vs0, vs1, vs28
#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs24, vs24, vs0
@ -2018,7 +2320,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MULT_APLHA_PART1 vs32, vs40, vs37, vs1
MULT_APLHA_PART2 vs32, vs40, vs37, vs1
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs37, vs1, save_permute_1
#else
xxperm vs37, vs1, vs28
#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs36, vs36, vs37

View File

@ -30,7 +30,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i)
{
__vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i};
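/* permute mask that swaps the real and imaginary halves of each complex element; the byte pattern differs between big- and little-endian */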
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
__vector unsigned char mask = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
#else
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
#endif
__asm__
(
"dcbt 0, %2 \n\t"

View File

@ -39,10 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "cswap_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "cswap_microk_power10.c"
#elif defined(POWER10)
#include "cswap_microk_power8.c"
#include "cswap_microk_power10.c"
#endif
#endif

View File

@ -49,14 +49,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "dasum_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "dasum_microk_power10.c"
#elif defined(POWER10)
#include "dasum_microk_power8.c"
#include "dasum_microk_power10.c"
#endif
#endif
#ifndef HAVE_KERNEL_16
static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
@ -114,7 +111,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( inc_x == 1 )
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 32)
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;

View File

@ -0,0 +1,923 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <altivec.h>
typedef __vector unsigned char vec_t;
#if !__has_builtin(__builtin_vsx_assemble_pair)
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
#endif
#if !defined(B0)
#define SAVE_4x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
rc0 = vec_xl(0, C+(N+2)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[2] = vec_madd(result[2], valpha, rc0); \
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
rc0 = vec_xl(0, C+(N+3)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[3] = vec_madd(result[3], valpha, rc0); \
vec_xst(result[3], 0, C+(N+3)*ldc+M);
#define SAVE_4x1_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \
rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[2] = vec_madd(result[2], valpha, rc0); \
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \
rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[3] = vec_madd(result[3], valpha, rc0); \
vec_xst_len(result[3], C+(N+3)*ldc+M, 8);
#define SAVE_2x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst(result[1], 0, C+(N+1)*ldc+M);
#define SAVE_2x1_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst_len(result[1], C+(N+1)*ldc+M, 8);
#define SAVE_1x4_VSR(result, N, M) \
rc0 = vec_xl(0, C+((N)*ldc)+M); \
rc0 = vec_mul(rc0, vbeta); \
result = vec_madd(result, valpha, rc0); \
vec_xst(result, 0, C+((N)*ldc)+M);
#else
#define SAVE_4x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
result[1] = vec_mul(result[1], valpha); \
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
result[2] = vec_mul(result[2], valpha); \
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
result[3] = vec_mul(result[3], valpha); \
vec_xst(result[3], 0, C+(N+3)*ldc+M);
#define SAVE_4x1_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
result[1] = vec_mul(result[1], valpha); \
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \
result[2] = vec_mul(result[2], valpha); \
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \
result[3] = vec_mul(result[3], valpha); \
vec_xst_len(result[3], C+(N+3)*ldc+M, 8);
#define SAVE_2x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
result[1] = vec_mul(result[1], valpha); \
vec_xst(result[1], 0, C+(N+1)*ldc+M);
#define SAVE_2x1_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
result[1] = vec_mul(result[1], valpha); \
vec_xst_len(result[1], C+(N+1)*ldc+M, 8);
#define SAVE_1x4_VSR(result, N, M) \
result = vec_mul(result, valpha); \
vec_xst(result, 0, C+((N)*ldc)+M);
#endif
#define INIT_8ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1); \
__builtin_mma_xxsetaccz(&acc2); \
__builtin_mma_xxsetaccz(&acc3); \
__builtin_mma_xxsetaccz(&acc4); \
__builtin_mma_xxsetaccz(&acc5); \
__builtin_mma_xxsetaccz(&acc6); \
__builtin_mma_xxsetaccz(&acc7);
#define INIT_4ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1); \
__builtin_mma_xxsetaccz(&acc2); \
__builtin_mma_xxsetaccz(&acc3);
#define INIT_2ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1);
#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
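/* LOAD_PAIR hides a compiler difference: GCC 10 assembles the vector pair with
 * __builtin_vsx_assemble_pair, whose operand order differs between AIX and other
 * targets, while other compilers build it directly with __builtin_vsx_build_pair. */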
#if (defined(__GNUC__) && (__GNUC__ == 10))
#if defined(_AIX)
#define LOAD_PAIR(pair, v0, v1) \
__builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1);
#else
#define LOAD_PAIR(pair, v0, v1) \
__builtin_vsx_assemble_pair(&pair, (vec_t)v1, (vec_t)v0);
#endif
#else
#define LOAD_PAIR(pair, v0, v1) \
__builtin_vsx_build_pair(&pair, (vec_t)v0, (vec_t)v1);
#endif
#define LOAD_A_1x8(K, M) \
ra0 = vec_xl(0, A+((K)*lda)+M+0); \
ra1 = vec_xl(0, A+((K)*lda)+M+2); \
ra2 = vec_xl(0, A+((K)*lda)+M+4); \
ra3 = vec_xl(0, A+((K)*lda)+M+6);
#define LOAD_A_1x4(K, M) \
ra0 = vec_xl(0, A+((K)*lda)+M+0); \
ra1 = vec_xl(0, A+((K)*lda)+M+2);
#define LOAD_A_1x2(K, M) \
ra0 = vec_xl(0, A+((K)*lda)+M+0);
#define LOAD_A_1x1(K, M) \
ra0 = vec_splats(A[((K)*lda)+M+0]);
#define LOAD_BTP_8x2(N, K) \
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \
rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \
rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \
t0 = vec_mergeh(rb0, rb1); \
t1 = vec_mergeh(rb2, rb3); \
LOAD_PAIR(pb0, t0, t1); \
t0 = vec_mergel(rb0, rb1); \
t1 = vec_mergel(rb2, rb3); \
LOAD_PAIR(pb2, t0, t1); \
rb4 = vec_xl(0, B+(N+4)*ldb+K+0); \
rb5 = vec_xl(0, B+(N+5)*ldb+K+0); \
rb6 = vec_xl(0, B+(N+6)*ldb+K+0); \
rb7 = vec_xl(0, B+(N+7)*ldb+K+0); \
t0 = vec_mergeh(rb4, rb5); \
t1 = vec_mergeh(rb6, rb7); \
LOAD_PAIR(pb1, t0, t1); \
t0 = vec_mergel(rb4, rb5); \
t1 = vec_mergel(rb6, rb7); \
LOAD_PAIR(pb3, t0, t1);
#define LOAD_BTP_8x1(N, K) \
rb0 = vec_xor(rb0, rb0); \
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \
rb1 = vec_xor(rb1, rb1); \
rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \
rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \
LOAD_PAIR(pb0, rb0, rb1); \
rb2 = vec_xor(rb2, rb2); \
rb2 = vec_insert(B[(N+4)*ldb+K], rb2, 0); \
rb2 = vec_insert(B[(N+5)*ldb+K], rb2, 1); \
rb3 = vec_xor(rb3, rb3); \
rb3 = vec_insert(B[(N+6)*ldb+K], rb3, 0); \
rb3 = vec_insert(B[(N+7)*ldb+K], rb3, 1); \
LOAD_PAIR(pb1, rb2, rb3);
#define LOAD_BTP_4x2(N, K) \
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \
rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \
rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \
t0 = vec_mergeh(rb0, rb1); \
t1 = vec_mergeh(rb2, rb3); \
LOAD_PAIR(pb0, t0, t1); \
t0 = vec_mergel(rb0, rb1); \
t1 = vec_mergel(rb2, rb3); \
LOAD_PAIR(pb1, t0, t1);
#define LOAD_BTP_4x1(N, K) \
rb0 = vec_xor(rb0, rb0); \
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \
rb1 = vec_xor(rb1, rb1); \
rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \
rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \
LOAD_PAIR(pb0, rb0, rb1);
#define LOAD_BTP_2x2(N, K) \
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \
t0 = vec_mergeh(rb0, rb1); \
__builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); \
t1 = vec_mergel(rb0, rb1); \
__builtin_vsx_assemble_pair(&pb1, (vec_t)t1, (vec_t)t1);
#define LOAD_BTP_2x1(N, K) \
rb0 = vec_xor(rb0, rb0); \
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \
__builtin_vsx_assemble_pair(&pb0, (vec_t)rb0, (vec_t)rb0);
#define LOAD_B_1x1(N, K) \
rb0 = vec_splats(B[((N)*ldb)+K]);
#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \
a0, a1, a2, a3, a4, a5, a6, a7) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \
__builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \
__builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \
__builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \
__builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7);
#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3);
#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1);
#define KERNEL_MMA_1ACC(b0, a0) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0);
#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \
result = vec_madd(a0, b0, result); \
result1 = vec_madd(a1, b1, result1); \
result2 = vec_madd(a2, b2, result2); \
result3 = vec_madd(a3, b3, result3);
#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \
result = vec_madd(a0, b0, result); \
result1 = vec_madd(a1, b1, result1);
#define KERNEL_VMADD_1VSR(a0, b0) \
result = vec_madd(a0, b0, result);
#define PACK_B(pb0, pb1, offset) \
*((__vector_pair *)(void *)(packB+(k*8)+0+offset)) = pb0; \
*((__vector_pair *)(void *)(packB+(k*8)+4+offset)) = pb1;
#define LOAD_PACKED_B(pb0, pb1, offset) \
pb0 = *((__vector_pair *)((void *)(packB+(k*8)+0+offset))); \
pb1 = *((__vector_pair *)((void *)(packB+(k*8)+4+offset)));
#ifdef B0
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
#else
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
#endif
{
BLASLONG m, n, k;
BLASLONG m8 = M & ~7;
BLASLONG m4 = M & ~3;
BLASLONG m2 = M & ~1;
BLASLONG n8 = N & ~7;
BLASLONG n4 = N & ~3;
BLASLONG n2 = N & ~1;
BLASLONG k2 = K & ~1;
#if defined(__GNUC__) && !defined(__clang__)
int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 1 : 0;
#else
int has_packing = 0;
#endif
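/* When packing is enabled, the 8-wide transposed B panel assembled for the first
 * m-block of an n-block is stored in packB and reloaded for the remaining m-blocks
 * of that n-block instead of being rebuilt from B. */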
double *packB;
if (has_packing) packB = (double *)malloc(K*8*sizeof(double));
vector double valpha = vec_splats(alpha);
#if !defined(B0)
vector double vbeta = vec_splats(beta);
#endif
for (n = 0; n < n8; n += 8) {
for (m = 0; m < m8; m += 8) {
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
INIT_8ACCS();
register vector double ra0, ra1, ra2, ra3;
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
register vector double t0, t1;
__vector_pair pb0, pb1, pb2, pb3;
if (has_packing) {
if (m == 0) {
for (k = 0; k < k2; k += 2) {
LOAD_A_1x8(k, m);
LOAD_BTP_8x2(n, k);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
PACK_B(pb0, pb1, 0);
LOAD_A_1x8(k+1, m);
KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
PACK_B(pb2, pb3, 8);
}
for (; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_BTP_8x1(n, k);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
PACK_B(pb0, pb1, 0);
}
} else {
for (k = 0; k < k2; k += 2) {
LOAD_A_1x8(k, m);
LOAD_PACKED_B(pb0, pb1, 0);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
LOAD_A_1x8(k+1, m);
LOAD_PACKED_B(pb2, pb3, 8);
KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
}
for (; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_PACKED_B(pb0, pb1, 0);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
}
}
} else {
for (k = 0; k < k2; k += 2) {
LOAD_A_1x8(k, m);
LOAD_BTP_8x2(n, k);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
LOAD_A_1x8(k+1, m);
KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
}
for (; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_BTP_8x1(n, k);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
}
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc2, n+0, m+2);
SAVE_4x2_ACC(&acc4, n+0, m+4);
SAVE_4x2_ACC(&acc6, n+0, m+6);
SAVE_4x2_ACC(&acc1, n+4, m+0);
SAVE_4x2_ACC(&acc3, n+4, m+2);
SAVE_4x2_ACC(&acc5, n+4, m+4);
SAVE_4x2_ACC(&acc7, n+4, m+6);
}
for (; m < m4; m += 4) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1;
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
register vector double t0, t1;
__vector_pair pb0, pb1, pb2, pb3;
if (!has_packing) {
for (k = 0; k < k2; k += 2) {
LOAD_A_1x4(k, m);
LOAD_BTP_8x2(n, k);
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1);
LOAD_A_1x4(k+1, m);
KERNEL_MMA_4ACC(pb2, pb3, pb2, pb3, ra0, ra0, ra1, ra1);
}
for (; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_BTP_8x1(n, k);
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1);
}
} else {
for (k = 0; k < k2; k += 2) {
LOAD_A_1x4(k, m);
LOAD_PACKED_B(pb0, pb1, 0);
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1);
LOAD_A_1x4(k+1, m);
LOAD_PACKED_B(pb2, pb3, 8);
KERNEL_MMA_4ACC(pb2, pb3, pb2, pb3, ra0, ra0, ra1, ra1);
}
for (; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_PACKED_B(pb0, pb1, 0);
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1);
}
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc2, n+0, m+2);
SAVE_4x2_ACC(&acc1, n+4, m+0);
SAVE_4x2_ACC(&acc3, n+4, m+2);
}
for (; m < m2; m += 2) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0;
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
register vector double t0, t1;
__vector_pair pb0, pb1, pb2, pb3;
if (!has_packing) {
for (k = 0; k < k2; k += 2) {
LOAD_A_1x2(k, m);
LOAD_BTP_8x2(n, k);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
LOAD_A_1x2(k+1, m);
KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0);
}
for (; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_BTP_8x1(n, k);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
}
} else {
for (k = 0; k < k2; k += 2) {
LOAD_A_1x2(k, m);
LOAD_PACKED_B(pb0, pb1, 0);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
LOAD_A_1x2(k+1, m);
LOAD_PACKED_B(pb2, pb3, 8);
KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0);
}
for (; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_PACKED_B(pb0, pb1, 0);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
}
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+4, m+0);
}
for (; m < M; m++) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0;
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
register vector double t0, t1;
__vector_pair pb0, pb1, pb2, pb3;
if (!has_packing) {
for (k = 0; k < k2; k += 2) {
LOAD_A_1x1(k, m);
LOAD_BTP_8x2(n, k);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
LOAD_A_1x1(k+1, m);
KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0);
}
for (; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_BTP_8x1(n, k);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
}
} else {
for (k = 0; k < k2; k += 2) {
LOAD_A_1x1(k, m);
LOAD_PACKED_B(pb0, pb1, 0);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
LOAD_A_1x1(k+1, m);
LOAD_PACKED_B(pb2, pb3, 8);
KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0);
}
for (; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_PACKED_B(pb0, pb1, 0);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
}
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x1_ACC(&acc0, n+0, m+0);
SAVE_4x1_ACC(&acc1, n+4, m+0);
}
}
for (; n < n4; n += 4) {
for (m = 0; m < m8; m += 8) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1, ra2, ra3;
register vector double rb0, rb1, rb2, rb3;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_A_1x8(k, m);
LOAD_BTP_4x2(n, k);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
LOAD_A_1x8(k+1, m);
KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra0, ra1, ra2, ra3);
}
for (; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_BTP_4x1(n, k);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+0, m+2);
SAVE_4x2_ACC(&acc2, n+0, m+4);
SAVE_4x2_ACC(&acc3, n+0, m+6);
}
for (; m < m4; m += 4) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0, ra1;
register vector double rb0, rb1, rb2, rb3;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_A_1x4(k, m);
LOAD_BTP_4x2(n, k);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
LOAD_A_1x4(k+1, m);
KERNEL_MMA_2ACC(pb1, pb1, ra0, ra1);
}
for (; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_BTP_4x1(n, k);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+0, m+2);
}
for (; m < m2; m += 2) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0;
register vector double rb0, rb1, rb2, rb3;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_A_1x2(k, m);
LOAD_BTP_4x2(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
LOAD_A_1x2(k+1, m);
KERNEL_MMA_1ACC(pb1, ra0);
}
for (; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_BTP_4x1(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n, m);
}
for (; m < M; m++) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0;
register vector double rb0, rb1, rb2, rb3;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_A_1x1(k, m);
LOAD_BTP_4x2(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
LOAD_A_1x1(k+1, m);
KERNEL_MMA_1ACC(pb1, ra0);
}
for (; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_BTP_4x1(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x1_ACC(&acc0, n, m);
}
}
for (; n < n2; n += 2) {
for (m = 0; m < m8; m += 8) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1, ra2, ra3;
register vector double rb0, rb1;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_A_1x8(k, m);
LOAD_BTP_2x2(n, k);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
LOAD_A_1x8(k+1, m);
KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra0, ra1, ra2, ra3);
}
for (; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_BTP_2x1(n, k);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n+0, m+0);
SAVE_2x2_ACC(&acc1, n+0, m+2);
SAVE_2x2_ACC(&acc2, n+0, m+4);
SAVE_2x2_ACC(&acc3, n+0, m+6);
}
for (; m < m4; m += 4) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0, ra1;
register vector double rb0, rb1;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_A_1x4(k, m);
LOAD_BTP_2x2(n, k);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
LOAD_A_1x4(k+1, m);
KERNEL_MMA_2ACC(pb1, pb1, ra0, ra1);
}
for (; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_BTP_2x1(n, k);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n+0, m+0);
SAVE_2x2_ACC(&acc1, n+0, m+2);
}
for (; m < m2; m += 2) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0;
register vector double rb0, rb1;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_A_1x2(k, m);
LOAD_BTP_2x2(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
LOAD_A_1x2(k+1, m);
KERNEL_MMA_1ACC(pb1, ra0);
}
for (; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_BTP_2x1(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n+0, m+0);
}
for (; m < M; m++) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0;
register vector double rb0, rb1;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_A_1x1(k, m);
LOAD_BTP_2x2(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
LOAD_A_1x1(k+1, m);
KERNEL_MMA_1ACC(pb1, ra0);
}
for (; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_BTP_2x1(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x1_ACC(&acc0, n+0, m+0);
}
}
for (; n < N; n++) {
for (m = 0; m < m8; m += 8) {
vector double result = ((vector double){0.,0.});
vector double result1 = ((vector double){0.,0.});
vector double result2 = ((vector double){0.,0.});
vector double result3 = ((vector double){0.,0.});
register vector double ra0, ra1, ra2, ra3;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_B_1x1(n, k);
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
SAVE_1x4_VSR(result1, n, m+2);
SAVE_1x4_VSR(result2, n, m+4);
SAVE_1x4_VSR(result3, n, m+6);
}
for (; m < m4; m += 4) {
vector double result = ((vector double){0.,0.});
vector double result1 = ((vector double){0.,0.});
register vector double ra0, ra1;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_B_1x1(n, k);
KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0);
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
SAVE_1x4_VSR(result1, n, m+2);
}
for (; m < m2; m += 2) {
vector double result = ((vector double){0.,0.});
register vector double ra0;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_B_1x1(n, k);
KERNEL_VMADD_1VSR(ra0, rb0);
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
}
for (; m < M; m++) {
FLOAT result = 0.0;
for (k = 0; k < K; k++) {
result += A[m+k*lda] * B[n*ldb+k];
}
result = result * alpha;
#if !defined(B0)
C[n*ldc+m] = (C[n*ldc+m] * beta) + result;
#else
C[n*ldc+m] = result;
#endif
}
}
if (has_packing) free(packB);
return 0;
}
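For orientation, the computation implemented by the small-matrix kernel above is the plain GEMM loop sketched below. This is an editorial reference only, not part of the commit; the name small_gemm_ref and the beta_is_zero flag (standing in for the kernel's B0 build macro) are illustrative.
/* Reference semantics matching the scalar clean-up loop of the kernel above:
 * column-major A (lda) and C (ldc), B stored as B[k + n*ldb],
 * C(m,n) = alpha * sum_k A(m,k)*B(k,n), plus beta*C(m,n) unless beta is dropped. */
static void small_gemm_ref(long M, long N, long K,
                           const double *A, long lda, double alpha,
                           const double *B, long ldb,
                           double beta, double *C, long ldc,
                           int beta_is_zero)
{
    for (long n = 0; n < N; n++)
        for (long m = 0; m < M; m++) {
            double acc = 0.0;
            for (long k = 0; k < K; k++)
                acc += A[m + k * lda] * B[k + n * ldb];
            acc *= alpha;
            C[m + n * ldc] = beta_is_zero ? acc : beta * C[m + n * ldc] + acc;
        }
}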

View File

@ -0,0 +1,581 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <altivec.h>
typedef __vector unsigned char vec_t;
#if !__has_builtin(__builtin_vsx_assemble_pair)
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
#endif
#if !defined(B0)
#define SAVE_4x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
rc0 = vec_xl(0, C+(N+2)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[2] = vec_madd(result[2], valpha, rc0); \
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
rc0 = vec_xl(0, C+(N+3)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[3] = vec_madd(result[3], valpha, rc0); \
vec_xst(result[3], 0, C+(N+3)*ldc+M);
#define SAVE_2x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst(result[1], 0, C+(N+1)*ldc+M);
#define SAVE_1x4_VSR(result, N, M) \
rc0 = vec_xl(0, C+((N)*ldc)+M); \
rc0 = vec_mul(rc0, vbeta); \
result = vec_madd(result, valpha, rc0); \
vec_xst(result, 0, C+((N)*ldc)+M);
#define SAVE_4x1_VSR(result, N, M) \
result = vec_mul(result, valpha); \
C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \
C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1];
#else
#define SAVE_4x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
result[1] = vec_mul(result[1], valpha); \
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
result[2] = vec_mul(result[2], valpha); \
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
result[3] = vec_mul(result[3], valpha); \
vec_xst(result[3], 0, C+(N+3)*ldc+M);
#define SAVE_2x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
result[1] = vec_mul(result[1], valpha); \
vec_xst(result[1], 0, C+(N+1)*ldc+M);
#define SAVE_1x4_VSR(result, N, M) \
result = vec_mul(result, valpha); \
vec_xst(result, 0, C+((N)*ldc)+M);
#define SAVE_4x1_VSR(result, N, M) \
result = vec_mul(result, valpha); \
C[(N+0)*ldc+M] = result[0]; \
C[(N+1)*ldc+M] = result[1];
#endif
#define INIT_8ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1); \
__builtin_mma_xxsetaccz(&acc2); \
__builtin_mma_xxsetaccz(&acc3); \
__builtin_mma_xxsetaccz(&acc4); \
__builtin_mma_xxsetaccz(&acc5); \
__builtin_mma_xxsetaccz(&acc6); \
__builtin_mma_xxsetaccz(&acc7);
#define INIT_4ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1); \
__builtin_mma_xxsetaccz(&acc2); \
__builtin_mma_xxsetaccz(&acc3);
#define INIT_2ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1);
#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
#define LOAD_A_1x8(K, M) \
ra0 = vec_xl(0, A+(K*lda)+M+0); \
ra1 = vec_xl(0, A+(K*lda)+M+2); \
ra2 = vec_xl(0, A+(K*lda)+M+4); \
ra3 = vec_xl(0, A+(K*lda)+M+6);
#define LOAD_A_1x4(K, M) \
ra0 = vec_xl(0, A+(K*lda)+M+0); \
ra1 = vec_xl(0, A+(K*lda)+M+2);
#define LOAD_A_1x2(K, M) ra0 = vec_xl(0, A+(K*lda)+M);
#define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M]);
#define LOAD_BP_1x8(K, N) \
pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \
pb1 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+4]));
#define LOAD_BP_1x4(K, N) \
pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0]));
#define LOAD_BP_1x2(K, N) \
t0 = vec_xl(0, B+(K*ldb)+N); \
__builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0);
#define LOAD_B_1x8(K, N) \
rb0 = vec_xl(0, B+(K*ldb)+N+0); \
rb1 = vec_xl(0, B+(K*ldb)+N+2); \
rb2 = vec_xl(0, B+(K*ldb)+N+4); \
rb3 = vec_xl(0, B+(K*ldb)+N+6);
#define LOAD_B_1x4(K, N) \
rb0 = vec_xl(0, B+(K*ldb)+N+0); \
rb1 = vec_xl(0, B+(K*ldb)+N+2);
#define LOAD_B_1x2(K, N) \
rb0 = vec_xl(0, B+(K*ldb)+N+0);
#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[K*ldb+N]);
#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \
a0, a1, a2, a3, a4, a5, a6, a7) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \
__builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \
__builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \
__builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \
__builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7);
#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3);
#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1);
#define KERNEL_MMA_1ACC(b0, a0) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0);
#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \
result = vec_madd(a0, b0, result); \
result1 = vec_madd(a1, b1, result1); \
result2 = vec_madd(a2, b2, result2); \
result3 = vec_madd(a3, b3, result3);
#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \
result = vec_madd(a0, b0, result); \
result1 = vec_madd(a1, b1, result1);
#define KERNEL_VMADD_1VSR(a0, b0) \
result = vec_madd(a0, b0, result);
#ifdef B0
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
#else
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
#endif
{
BLASLONG m, n, k;
BLASLONG m8 = M & ~7;
BLASLONG m4 = M & ~3;
BLASLONG m2 = M & ~1;
BLASLONG n8 = N & ~7;
BLASLONG n4 = N & ~3;
BLASLONG n2 = N & ~1;
vector double valpha = vec_splats(alpha);
#if !defined(B0)
vector double vbeta = vec_splats(beta);
#endif
for (m = 0; m < m8; m += 8) {
for (n = 0; n < n8; n += 8) {
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
INIT_8ACCS();
register vector double ra0, ra1, ra2, ra3;
__vector_pair pb0, pb1;
for (k = 0; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_BP_1x8(k, n);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc2, n+0, m+2);
SAVE_4x2_ACC(&acc4, n+0, m+4);
SAVE_4x2_ACC(&acc6, n+0, m+6);
SAVE_4x2_ACC(&acc1, n+4, m+0);
SAVE_4x2_ACC(&acc3, n+4, m+2);
SAVE_4x2_ACC(&acc5, n+4, m+4);
SAVE_4x2_ACC(&acc7, n+4, m+6);
}
for (; n < n4; n += 4) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1, ra2, ra3;
__vector_pair pb0;
for (k = 0; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_BP_1x4(k, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+0, m+2);
SAVE_4x2_ACC(&acc2, n+0, m+4);
SAVE_4x2_ACC(&acc3, n+0, m+6);
}
for (; n < n2; n += 2) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1, ra2, ra3;
register vector double t0;
__vector_pair pb0;
for (k = 0; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_BP_1x2(k, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n+0, m+0);
SAVE_2x2_ACC(&acc1, n+0, m+2);
SAVE_2x2_ACC(&acc2, n+0, m+4);
SAVE_2x2_ACC(&acc3, n+0, m+6);
}
for (; n < N; n++) {
register vector double result = ((vector double){0.,0.});
register vector double result1 = ((vector double){0.,0.});
register vector double result2 = ((vector double){0.,0.});
register vector double result3 = ((vector double){0.,0.});
register vector double ra0, ra1, ra2, ra3;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_B_1x1(k, n);
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
SAVE_1x4_VSR(result1, n, m+2);
SAVE_1x4_VSR(result2, n, m+4);
SAVE_1x4_VSR(result3, n, m+6);
}
}
for (; m < m4; m += 4) {
for (n = 0; n < n8; n += 8) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1;
__vector_pair pb0, pb1;
for (k = 0; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_BP_1x8(k, n);
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc2, n+0, m+2);
SAVE_4x2_ACC(&acc1, n+4, m+0);
SAVE_4x2_ACC(&acc3, n+4, m+2);
}
for (; n < n4; n += 4) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0, ra1;
__vector_pair pb0;
for (k = 0; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_BP_1x4(k, n);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+0, m+2);
}
for (; n < n2; n += 2) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0, ra1;
register vector double t0;
__vector_pair pb0;
for (k = 0; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_BP_1x2(k, n);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n+0, m+0);
SAVE_2x2_ACC(&acc1, n+0, m+2);
}
for (; n < N; n++) {
register vector double result = ((vector double){0.,0.});
register vector double result1 = ((vector double){0.,0.});
register vector double ra0, ra1;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_B_1x1(k, n);
KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0);
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
SAVE_1x4_VSR(result1, n, m+2);
}
}
for (; m < m2; m += 2) {
for (n = 0; n < n8; n += 8) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0;
__vector_pair pb0, pb1;
for (k = 0; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_BP_1x8(k, n);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+4, m+0);
}
for (; n < n4; n += 4) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0;
__vector_pair pb0;
for (k = 0; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_BP_1x4(k, n);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n, m);
}
for (; n < n2; n += 2) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0;
register vector double t0;
__vector_pair pb0;
for (k = 0; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_BP_1x2(k, n);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n, m);
}
for (; n < N; n++) {
register vector double result = ((vector double){0.,0.});
register vector double ra0;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_B_1x1(k, n);
KERNEL_VMADD_1VSR(ra0, rb0);
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
}
}
for (; m < M; m++) {
for (n = 0; n < n8; n += 8) {
register vector double result = ((vector double){0.,0.});
register vector double result1 = ((vector double){0.,0.});
register vector double result2 = ((vector double){0.,0.});
register vector double result3 = ((vector double){0.,0.});
register vector double ra0;
register vector double rb0, rb1, rb2, rb3;
for (k = 0; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_B_1x8(k, n);
KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3);
}
SAVE_4x1_VSR(result, n, m);
SAVE_4x1_VSR(result1, n+2, m);
SAVE_4x1_VSR(result2, n+4, m);
SAVE_4x1_VSR(result3, n+6, m);
}
for (; n < n4; n += 4) {
register vector double result = ((vector double){0.,0.});
register vector double result1 = ((vector double){0.,0.});
register vector double ra0;
register vector double rb0, rb1;
for (k = 0; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_B_1x4(k, n);
KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1);
}
SAVE_4x1_VSR(result, n, m);
SAVE_4x1_VSR(result1, n+2, m);
}
for (; n < n2; n += 2) {
register vector double result = ((vector double){0.,0.});
register vector double ra0;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_B_1x2(k, n);
KERNEL_VMADD_1VSR(ra0, rb0);
}
SAVE_4x1_VSR(result, n, m);
}
for (; n < N; n++) {
FLOAT result = 0.0;
for (k = 0; k < K; k++) {
result += A[k*lda+m] * B[k*ldb+n];
}
result = result * alpha;
#if !defined(B0)
C[n*ldc+m] = (C[n*ldc+m] * beta) + result;
#else
C[n*ldc+m] = result;
#endif
}
}
return 0;
}
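The building block shared by these kernels is the POWER10 MMA outer-product update exposed as __builtin_mma_xvf64gerpp: it accumulates the product of four doubles (a __vector_pair) and two doubles (one VSX register) into a 4x2 accumulator. The stand-alone sketch below is not part of the commit; it shows one such update end to end, and the row/column mapping in the final comment mirrors how the kernels above disassemble and store their accumulators. It assumes a POWER10 target (e.g. GCC 11+ with -mcpu=power10).
#include <altivec.h>
#include <stdio.h>
typedef __vector unsigned char vec_t;
int main(void)
{
    double a[2] = {1.0, 2.0};               /* two A values                  */
    double b[4] = {10.0, 20.0, 30.0, 40.0}; /* four B values                 */

    vector double va  = vec_xl(0, a);
    vector double vb0 = vec_xl(0, b);       /* b[0], b[1]                    */
    vector double vb1 = vec_xl(0, b + 2);   /* b[2], b[3]                    */

    __vector_pair pb;
    __builtin_vsx_build_pair(&pb, (vec_t)vb0, (vec_t)vb1);

    __vector_quad acc;
    __builtin_mma_xxsetaccz(&acc);                  /* acc = 0               */
    __builtin_mma_xvf64gerpp(&acc, pb, (vec_t)va);  /* acc += outer(b, a)    */

    vector double rows[4];
    __builtin_mma_disassemble_acc((void *)rows, &acc);
    for (int i = 0; i < 4; i++)
        printf("%g %g\n", rows[i][0], rows[i][1]);  /* row i: b[i]*a[0], b[i]*a[1] */
    return 0;
}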

View File

@ -0,0 +1,882 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <altivec.h>
typedef __vector unsigned char vec_t;
#if !__has_builtin(__builtin_vsx_assemble_pair)
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
#endif
#if !defined(B0)
#define SAVE_4x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
rc0 = vec_xl(0, C+(N+2)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[2] = vec_madd(result[2], valpha, rc0); \
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
rc0 = vec_xl(0, C+(N+3)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[3] = vec_madd(result[3], valpha, rc0); \
vec_xst(result[3], 0, C+(N+3)*ldc+M);
#define SAVE_4x1_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \
rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[2] = vec_madd(result[2], valpha, rc0); \
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \
rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[3] = vec_madd(result[3], valpha, rc0); \
vec_xst_len(result[3], C+(N+3)*ldc+M, 8);
#define SAVE_2x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst(result[1], 0, C+(N+1)*ldc+M);
#define SAVE_2x1_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst_len(result[1], C+(N+1)*ldc+M, 8);
#define SAVE_1x4_VSR(result, N, M) \
rc0 = vec_xl(0, C+((N)*ldc)+M); \
rc0 = vec_mul(rc0, vbeta); \
result = vec_madd(result, valpha, rc0); \
vec_xst(result, 0, C+((N)*ldc)+M);
#else
#define SAVE_4x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
result[1] = vec_mul(result[1], valpha); \
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
result[2] = vec_mul(result[2], valpha); \
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
result[3] = vec_mul(result[3], valpha); \
vec_xst(result[3], 0, C+(N+3)*ldc+M);
#define SAVE_4x1_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
result[1] = vec_mul(result[1], valpha); \
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \
result[2] = vec_mul(result[2], valpha); \
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \
result[3] = vec_mul(result[3], valpha); \
vec_xst_len(result[3], C+(N+3)*ldc+M, 8);
#define SAVE_2x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
result[1] = vec_mul(result[1], valpha); \
vec_xst(result[1], 0, C+(N+1)*ldc+M);
#define SAVE_2x1_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
result[1] = vec_mul(result[1], valpha); \
vec_xst_len(result[1], C+(N+1)*ldc+M, 8);
#define SAVE_1x4_VSR(result, N, M) \
result = vec_mul(result, valpha); \
vec_xst(result, 0, C+((N)*ldc)+M);
#endif
#define INIT_8ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1); \
__builtin_mma_xxsetaccz(&acc2); \
__builtin_mma_xxsetaccz(&acc3); \
__builtin_mma_xxsetaccz(&acc4); \
__builtin_mma_xxsetaccz(&acc5); \
__builtin_mma_xxsetaccz(&acc6); \
__builtin_mma_xxsetaccz(&acc7);
#define INIT_4ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1); \
__builtin_mma_xxsetaccz(&acc2); \
__builtin_mma_xxsetaccz(&acc3);
#define INIT_2ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1);
#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
#if (defined(__GNUC__) && (__GNUC__ == 10))
#if defined(_AIX)
#define LOAD_PAIR(pair, v0, v1) \
__builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1);
#else
#define LOAD_PAIR(pair, v0, v1) \
__builtin_vsx_assemble_pair(&pair, (vec_t)v1, (vec_t)v0);
#endif
#else
#define LOAD_PAIR(pair, v0, v1) \
__builtin_vsx_build_pair(&pair, (vec_t)v0, (vec_t)v1);
#endif
#define LOAD_AT_8x2(M, K) \
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \
t0 = vec_mergeh(ra0, ra1); \
t1 = vec_mergel(ra0, ra1); \
ra0 = t0; \
ra1 = t1; \
ra2 = vec_xl(0, A+(M+2)*lda+K+0); \
ra3 = vec_xl(0, A+(M+3)*lda+K+0); \
t0 = vec_mergeh(ra2, ra3); \
t1 = vec_mergel(ra2, ra3); \
ra2 = t0; \
ra3 = t1; \
ra4 = vec_xl(0, A+(M+4)*lda+K+0); \
ra5 = vec_xl(0, A+(M+5)*lda+K+0); \
t0 = vec_mergeh(ra4, ra5); \
t1 = vec_mergel(ra4, ra5); \
ra4 = t0; \
ra5 = t1; \
ra6 = vec_xl(0, A+(M+6)*lda+K+0); \
ra7 = vec_xl(0, A+(M+7)*lda+K+0); \
t0 = vec_mergeh(ra6, ra7); \
t1 = vec_mergel(ra6, ra7); \
ra6 = t0; \
ra7 = t1;
#define LOAD_AT_8x1(M, K) \
ra0 = vec_xor(ra0, ra0); \
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \
ra1 = vec_xor(ra1, ra1); \
ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \
ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \
ra2 = vec_xor(ra2, ra2); \
ra2 = vec_insert(A[(M+4)*lda+K], ra2, 0); \
ra2 = vec_insert(A[(M+5)*lda+K], ra2, 1); \
ra3 = vec_xor(ra3, ra3); \
ra3 = vec_insert(A[(M+6)*lda+K], ra3, 0); \
ra3 = vec_insert(A[(M+7)*lda+K], ra3, 1);
#define LOAD_AT_4x2(M, K) \
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \
ra2 = vec_xl(0, A+(M+2)*lda+K+0); \
ra3 = vec_xl(0, A+(M+3)*lda+K+0); \
t0 = vec_mergeh(ra0, ra1); \
t1 = vec_mergeh(ra2, ra3); \
t2 = vec_mergel(ra0, ra1); \
t3 = vec_mergel(ra2, ra3); \
ra0 = t0; \
ra1 = t2; \
ra2 = t1; \
ra3 = t3;
#define LOAD_AT_4x1(M, K) \
ra0 = vec_xor(ra0, ra0); \
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \
ra1 = vec_xor(ra1, ra1); \
ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \
ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1);
#define LOAD_AT_2x2(M, K) \
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \
t0 = vec_mergeh(ra0, ra1); \
t1 = vec_mergel(ra0, ra1); \
ra0 = t0; \
ra1 = t1;
#define LOAD_AT_2x1(M, K) \
ra0 = vec_xor(ra0, ra0); \
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1);
#define LOAD_A_1x1(K, M) \
ra0 = vec_splats(A[((M+0)*lda)+K+0]);
#define LOAD_BTP_8x2(N, K) \
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \
rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \
rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \
t0 = vec_mergeh(rb0, rb1); \
t1 = vec_mergeh(rb2, rb3); \
LOAD_PAIR(pb0, t0, t1); \
t0 = vec_mergel(rb0, rb1); \
t1 = vec_mergel(rb2, rb3); \
LOAD_PAIR(pb2, t0, t1); \
rb4 = vec_xl(0, B+(N+4)*ldb+K+0); \
rb5 = vec_xl(0, B+(N+5)*ldb+K+0); \
rb6 = vec_xl(0, B+(N+6)*ldb+K+0); \
rb7 = vec_xl(0, B+(N+7)*ldb+K+0); \
t0 = vec_mergeh(rb4, rb5); \
t1 = vec_mergeh(rb6, rb7); \
LOAD_PAIR(pb1, t0, t1); \
t0 = vec_mergel(rb4, rb5); \
t1 = vec_mergel(rb6, rb7); \
LOAD_PAIR(pb3, t0, t1);
#define LOAD_BTP_8x1(N, K) \
rb0 = vec_xor(rb0, rb0); \
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \
rb1 = vec_xor(rb1, rb1); \
rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \
rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \
LOAD_PAIR(pb0, rb0, rb1); \
rb0 = vec_xor(rb0, rb0); \
rb0 = vec_insert(B[(N+4)*ldb+K], rb0, 0); \
rb0 = vec_insert(B[(N+5)*ldb+K], rb0, 1); \
rb1 = vec_xor(rb1, rb1); \
rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 0); \
rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 1); \
LOAD_PAIR(pb1, rb0, rb1);
#define LOAD_BTP_4x2(N, K) \
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \
rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \
rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \
t0 = vec_mergeh(rb0, rb1); \
t1 = vec_mergeh(rb2, rb3); \
LOAD_PAIR(pb0, t0, t1); \
t0 = vec_mergel(rb0, rb1); \
t1 = vec_mergel(rb2, rb3); \
LOAD_PAIR(pb1, t0, t1);
#define LOAD_BTP_4x1(N, K) \
rb0 = vec_xor(rb0, rb0); \
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \
rb1 = vec_xor(rb1, rb1); \
rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \
rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \
LOAD_PAIR(pb0, rb0, rb1);
#define LOAD_BTP_2x2(N, K) \
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \
t0 = vec_mergeh(rb0, rb1); \
__builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); \
t1 = vec_mergel(rb0, rb1); \
__builtin_vsx_assemble_pair(&pb1, (vec_t)t1, (vec_t)t1);
#define LOAD_BTP_2x1(N, K) \
rb0 = vec_xor(rb0, rb0); \
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \
__builtin_vsx_assemble_pair(&pb0, (vec_t)rb0, (vec_t)rb0);
#define LOAD_B_1x1(N, K) rb0 = vec_splats(B[((N)*ldb)+K]);
#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \
a0, a1, a2, a3, a4, a5, a6, a7) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \
__builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \
__builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \
__builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \
__builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7);
#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3);
#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1);
#define KERNEL_MMA_1ACC(b0, a0) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0);
#define KERNEL_MMA_1ACC_(acc, b0, a0) \
__builtin_mma_xvf64gerpp(&acc, b0, (vec_t)a0);
#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \
result = vec_madd(a0, b0, result); \
result1 = vec_madd(a1, b1, result1); \
result2 = vec_madd(a2, b2, result2); \
result3 = vec_madd(a3, b3, result3);
#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \
result = vec_madd(a0, b0, result); \
result1 = vec_madd(a1, b1, result1);
#define KERNEL_VMADD_1VSR(a0, b0) \
result = vec_madd(a0, b0, result);
#ifdef B0
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
#else
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
#endif
{
BLASLONG m, n, k;
BLASLONG m8 = M & ~7;
BLASLONG m4 = M & ~3;
BLASLONG m2 = M & ~1;
BLASLONG n8 = N & ~7;
BLASLONG n4 = N & ~3;
BLASLONG n2 = N & ~1;
BLASLONG k2 = K & ~1;
vector double valpha = vec_splats(alpha);
#if !defined(B0)
vector double vbeta = vec_splats(beta);
#endif
for (m = 0; m < m8; m += 8) {
for (n = 0; n < n8; n += 8) {
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
INIT_8ACCS();
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
register vector double t0, t1;
__vector_pair pb0, pb1, pb2, pb3;
for (k = 0; k < k2; k += 2) {
LOAD_AT_8x2(m, k);
LOAD_BTP_8x2(n, k);
KERNEL_MMA_8ACC(pb0, pb0, pb0, pb0, pb1, pb1, pb1, pb1,
ra0, ra2, ra4, ra6, ra0, ra2, ra4, ra6);
KERNEL_MMA_8ACC(pb2, pb2, pb2, pb2, pb3, pb3, pb3, pb3,
ra1, ra3, ra5, ra7, ra1, ra3, ra5, ra7);
}
// workaround to avoid register spilling
for (; k < K; k++) {
LOAD_AT_4x1(m, k);
LOAD_BTP_4x1(n, k);
KERNEL_MMA_1ACC_(acc0, pb0, ra0);
KERNEL_MMA_1ACC_(acc1, pb0, ra1);
LOAD_AT_4x1(m+4, k);
KERNEL_MMA_1ACC_(acc2, pb0, ra0);
KERNEL_MMA_1ACC_(acc3, pb0, ra1);
LOAD_AT_4x1(m, k);
LOAD_BTP_4x1(n+4, k);
KERNEL_MMA_1ACC_(acc4, pb0, ra0);
KERNEL_MMA_1ACC_(acc5, pb0, ra1);
LOAD_AT_4x1(m+4, k);
KERNEL_MMA_1ACC_(acc6, pb0, ra0);
KERNEL_MMA_1ACC_(acc7, pb0, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc2, n+0, m+4);
SAVE_4x2_ACC(&acc4, n+4, m+0);
SAVE_4x2_ACC(&acc6, n+4, m+4);
SAVE_4x2_ACC(&acc1, n+0, m+2);
SAVE_4x2_ACC(&acc3, n+0, m+6);
SAVE_4x2_ACC(&acc5, n+4, m+2);
SAVE_4x2_ACC(&acc7, n+4, m+6);
}
for (; n < n4; n += 4) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
register vector double rb0, rb1, rb2, rb3;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_AT_8x2(m, k);
LOAD_BTP_4x2(n, k);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6);
KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra1, ra3, ra5, ra7);
}
for (; k < K; k++) {
LOAD_AT_8x1(m, k);
LOAD_BTP_4x1(n, k);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc2, n+0, m+4);
SAVE_4x2_ACC(&acc1, n+0, m+2);
SAVE_4x2_ACC(&acc3, n+0, m+6);
}
for (; n < n2; n += 2) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
register vector double rb0, rb1;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_AT_8x2(m, k);
LOAD_BTP_2x2(n, k);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6);
KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra1, ra3, ra5, ra7);
}
for (; k < K; k++) {
LOAD_AT_8x1(m, k);
LOAD_BTP_2x1(n, k);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n+0, m+0);
SAVE_2x2_ACC(&acc2, n+0, m+4);
SAVE_2x2_ACC(&acc1, n+0, m+2);
SAVE_2x2_ACC(&acc3, n+0, m+6);
}
for (; n < N; n++) {
register vector double result = ((vector double){0.,0.});
register vector double result1 = ((vector double){0.,0.});
register vector double result2 = ((vector double){0.,0.});
register vector double result3 = ((vector double){0.,0.});
register vector double ra0, ra1, ra2, ra3;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_AT_8x1(m, k);
LOAD_B_1x1(n, k);
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
SAVE_1x4_VSR(result1, n, m+2);
SAVE_1x4_VSR(result2, n, m+4);
SAVE_1x4_VSR(result3, n, m+6);
}
}
for (; m < m4; m += 4) {
for (n = 0; n < n8; n += 8) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1, ra2, ra3;
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
register vector double t0, t1, t2, t3;
__vector_pair pb0, pb1, pb2, pb3;
for (k = 0; k < k2; k += 2) {
LOAD_AT_4x2(m, k);
LOAD_BTP_8x2(n, k);
KERNEL_MMA_4ACC(pb0, pb0, pb1, pb1, ra0, ra2, ra0, ra2);
KERNEL_MMA_4ACC(pb2, pb2, pb3, pb3, ra1, ra3, ra1, ra3);
}
for (; k < K; k++) {
LOAD_AT_4x1(m, k);
LOAD_BTP_8x1(n, k);
KERNEL_MMA_4ACC(pb0, pb0, pb1, pb1, ra0, ra1, ra0, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+0, m+2);
SAVE_4x2_ACC(&acc2, n+4, m+0);
SAVE_4x2_ACC(&acc3, n+4, m+2);
}
for (; n < n4; n += 4) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0, ra1, ra2, ra3;
register vector double rb0, rb1, rb2, rb3;
register vector double t0, t1, t2, t3;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_AT_4x2(m, k);
LOAD_BTP_4x2(n, k);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2);
KERNEL_MMA_2ACC(pb1, pb1, ra1, ra3);
}
for (; k < K; k++) {
LOAD_AT_4x1(m, k);
LOAD_BTP_4x1(n, k);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+0, m+2);
}
for (; n < n2; n += 2) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0, ra1, ra2, ra3;
register vector double rb0, rb1;
register vector double t0, t1, t2, t3;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_AT_4x2(m, k);
LOAD_BTP_2x2(n, k);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2);
KERNEL_MMA_2ACC(pb1, pb1, ra1, ra3);
}
for (; k < K; k++) {
LOAD_AT_4x1(m, k);
LOAD_BTP_2x1(n, k);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n+0, m+0);
SAVE_2x2_ACC(&acc1, n+0, m+2);
}
for (; n < N; n++) {
register vector double result = ((vector double){0.,0.});
register vector double result1 = ((vector double){0.,0.});
register vector double ra0, ra1;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_AT_4x1(m, k);
LOAD_B_1x1(n, k);
KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0);
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
SAVE_1x4_VSR(result1, n, m+2);
}
}
for (; m < m2; m += 2) {
for (n = 0; n < n8; n += 8) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0, ra1;
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
register vector double t0, t1;
__vector_pair pb0, pb1, pb2, pb3;
for (k = 0; k < k2; k += 2) {
LOAD_AT_2x2(m, k);
LOAD_BTP_8x2(n, k);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
KERNEL_MMA_2ACC(pb2, pb3, ra1, ra1);
}
for (; k < K; k++) {
LOAD_AT_2x1(m, k);
LOAD_BTP_8x1(n, k);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+4, m+0);
}
for (; n < n4; n += 4) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0, ra1;
register vector double rb0, rb1, rb2, rb3;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_AT_2x2(m, k);
LOAD_BTP_4x2(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
KERNEL_MMA_1ACC(pb1, ra1);
}
for (; k < K; k++) {
LOAD_AT_2x1(m, k);
LOAD_BTP_4x1(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n, m);
}
for (; n < n2; n += 2) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0, ra1;
register vector double rb0, rb1;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_AT_2x2(m, k);
LOAD_BTP_2x2(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
KERNEL_MMA_1ACC(pb1, ra1);
}
for (; k < K; k++) {
LOAD_AT_2x1(m, k);
LOAD_BTP_2x1(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n, m);
}
for (; n < N; n++) {
register vector double result = ((vector double){0.,0.});
register vector double ra0, ra1;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_AT_2x1(m, k);
LOAD_B_1x1(n, k);
KERNEL_VMADD_1VSR(ra0, rb0);
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
}
}
for (; m < M; m++) {
for (n = 0; n < n8; n += 8) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0;
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
register vector double t0, t1;
__vector_pair pb0, pb1, pb2, pb3;
for (k = 0; k < k2; k += 2) {
LOAD_A_1x1(k, m);
LOAD_BTP_8x2(n, k);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
LOAD_A_1x1(k+1, m);
KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0);
}
for (; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_BTP_8x1(n, k);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x1_ACC(&acc0, n+0, m+0);
SAVE_4x1_ACC(&acc1, n+4, m+0);
}
for (; n < n4; n += 4) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0;
register vector double rb0, rb1, rb2, rb3;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_A_1x1(k, m);
LOAD_BTP_4x2(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
LOAD_A_1x1(k+1, m);
KERNEL_MMA_1ACC(pb1, ra0);
}
for (; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_BTP_4x1(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x1_ACC(&acc0, n, m);
}
for (; n < n2; n += 2) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0;
register vector double rb0, rb1;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_A_1x1(k, m);
LOAD_BTP_2x2(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
LOAD_A_1x1(k+1, m);
KERNEL_MMA_1ACC(pb1, ra0);
}
for (; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_BTP_2x1(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x1_ACC(&acc0, n+0, m+0);
}
for (; n < N; n++) {
FLOAT result = 0.0;
for (k = 0; k < K; k++) {
result += A[m*lda+k] * B[n*ldb+k];
}
result = result * alpha;
#if !defined(B0)
C[n*ldc+m] = (C[n*ldc+m] * beta) + result;
#else
C[n*ldc+m] = result;
#endif
}
}
return 0;
}
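The scalar tail loop above (the final n-loop) spells out what every vectorized block in this kernel computes: C[n*ldc+m] = beta*C[n*ldc+m] + alpha * sum_k A[m*lda+k]*B[n*ldb+k], with both A and B indexed in transposed form and the B0 build skipping the beta term. As an illustration only (not part of this commit; the function name is made up), a plain-C reference of the same computation:

/* Scalar reference for the TT small kernel above: C = beta*C + alpha*A^T*B^T,
 * using the same indexing as the scalar tail loop. Illustration only. */
static void dgemm_small_tt_ref(long M, long N, long K,
                               const double *A, long lda, double alpha,
                               const double *B, long ldb, double beta,
                               double *C, long ldc)
{
    for (long n = 0; n < N; n++) {
        for (long m = 0; m < M; m++) {
            double acc = 0.0;
            for (long k = 0; k < K; k++)
                acc += A[m * lda + k] * B[n * ldb + k]; /* A and B both transposed */
            C[n * ldc + m] = beta * C[n * ldc + m] + alpha * acc;
        }
    }
}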

View File

@ -0,0 +1,829 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <altivec.h>
typedef __vector unsigned char vec_t;
#if !__has_builtin(__builtin_vsx_assemble_pair)
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
#endif
#if !defined(B0)
#define SAVE_4x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
rc0 = vec_xl(0, C+(N+2)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[2] = vec_madd(result[2], valpha, rc0); \
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
rc0 = vec_xl(0, C+(N+3)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[3] = vec_madd(result[3], valpha, rc0); \
vec_xst(result[3], 0, C+(N+3)*ldc+M);
#define SAVE_2x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst(result[1], 0, C+(N+1)*ldc+M);
#define SAVE_1x4_VSR(result, N, M) \
rc0 = vec_xl(0, C+((N)*ldc)+M); \
rc0 = vec_mul(rc0, vbeta); \
result = vec_madd(result, valpha, rc0); \
vec_xst(result, 0, C+((N)*ldc)+M);
#define SAVE_4x1_VSR(result, N, M) \
result = vec_mul(result, valpha); \
C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \
C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1];
#else
#define SAVE_4x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
result[1] = vec_mul(result[1], valpha); \
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
result[2] = vec_mul(result[2], valpha); \
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
result[3] = vec_mul(result[3], valpha); \
vec_xst(result[3], 0, C+(N+3)*ldc+M);
#define SAVE_2x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
result[1] = vec_mul(result[1], valpha); \
vec_xst(result[1], 0, C+(N+1)*ldc+M);
#define SAVE_1x4_VSR(result, N, M) \
result = vec_mul(result, valpha); \
vec_xst(result, 0, C+((N)*ldc)+M);
#define SAVE_4x1_VSR(result, N, M) \
result = vec_mul(result, valpha); \
C[(N+0)*ldc+M] = result[0]; \
C[(N+1)*ldc+M] = result[1];
#endif
#define INIT_8ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1); \
__builtin_mma_xxsetaccz(&acc2); \
__builtin_mma_xxsetaccz(&acc3); \
__builtin_mma_xxsetaccz(&acc4); \
__builtin_mma_xxsetaccz(&acc5); \
__builtin_mma_xxsetaccz(&acc6); \
__builtin_mma_xxsetaccz(&acc7);
#define INIT_4ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1); \
__builtin_mma_xxsetaccz(&acc2); \
__builtin_mma_xxsetaccz(&acc3);
#define INIT_2ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1);
#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
#define LOAD_AT_8x2(M, K) \
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \
ra2 = vec_xl(0, A+(M+2)*lda+K+0); \
ra3 = vec_xl(0, A+(M+3)*lda+K+0); \
t0 = vec_mergeh(ra0, ra1); \
t1 = vec_mergeh(ra2, ra3); \
t2 = vec_mergel(ra0, ra1); \
t3 = vec_mergel(ra2, ra3); \
ra0 = t0; \
ra1 = t2; \
ra2 = t1; \
ra3 = t3; \
ra4 = vec_xl(0, A+(M+4)*lda+K+0); \
ra5 = vec_xl(0, A+(M+5)*lda+K+0); \
ra6 = vec_xl(0, A+(M+6)*lda+K+0); \
ra7 = vec_xl(0, A+(M+7)*lda+K+0); \
t0 = vec_mergeh(ra4, ra5); \
t1 = vec_mergeh(ra6, ra7); \
t2 = vec_mergel(ra4, ra5); \
t3 = vec_mergel(ra6, ra7); \
ra4 = t0; \
ra5 = t2; \
ra6 = t1; \
ra7 = t3;
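/* LOAD_AT_8x2 reads two consecutive k-elements from each of eight rows of A
   (stored transposed, A[m*lda+k]) and regroups them with vec_mergeh/vec_mergel
   so that each register holds one k-column for a pair of rows: ra0/ra2/ra4/ra6
   carry column k and ra1/ra3/ra5/ra7 carry column k+1, which is the layout the
   xvf64gerpp accumulators consume. */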
#define LOAD_AT_8x1(M, K) \
ra0 = vec_xor(ra0, ra0); \
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \
ra1 = vec_xor(ra1, ra1); \
ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \
ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \
ra2 = vec_xor(ra2, ra2); \
ra2 = vec_insert(A[(M+4)*lda+K], ra2, 0); \
ra2 = vec_insert(A[(M+5)*lda+K], ra2, 1); \
ra3 = vec_xor(ra3, ra3); \
ra3 = vec_insert(A[(M+6)*lda+K], ra3, 0); \
ra3 = vec_insert(A[(M+7)*lda+K], ra3, 1);
#define LOAD_AT_4x2(M, K) \
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \
ra2 = vec_xl(0, A+(M+2)*lda+K+0); \
ra3 = vec_xl(0, A+(M+3)*lda+K+0); \
t0 = vec_mergeh(ra0, ra1); \
t1 = vec_mergeh(ra2, ra3); \
t2 = vec_mergel(ra0, ra1); \
t3 = vec_mergel(ra2, ra3); \
ra0 = t0; \
ra1 = t2; \
ra2 = t1; \
ra3 = t3;
#define LOAD_AT_4x1(M, K) \
ra0 = vec_xor(ra0, ra0); \
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \
ra1 = vec_xor(ra1, ra1); \
ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \
ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1);
#define LOAD_AT_2x2(M, K) \
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \
t0 = vec_mergeh(ra0, ra1); \
t1 = vec_mergel(ra0, ra1); \
ra0 = t0; \
ra1 = t1;
#define LOAD_AT_2x1(M, K) \
ra0 = vec_xor(ra0, ra0); \
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1);
#define LOAD_A_1x1(M, K) ra0 = vec_splats(A[(M)*lda+K]);
#define LOAD_BP_1x8(K, N) \
pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \
pb1 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+4]));
#define LOAD_BP_1x4(K, N) \
pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0]));
#define LOAD_BP_1x2(K, N) \
t0 = vec_xl(0, B+((K)*ldb)+N); \
__builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0);
#define LOAD_B_1x8(K, N) \
rb0 = vec_xl(0, B+(K*ldb)+N+0); \
rb1 = vec_xl(0, B+(K*ldb)+N+2); \
rb2 = vec_xl(0, B+(K*ldb)+N+4); \
rb3 = vec_xl(0, B+(K*ldb)+N+6);
#define LOAD_B_1x4(K, N) \
rb0 = vec_xl(0, B+(K*ldb)+N+0); \
rb1 = vec_xl(0, B+(K*ldb)+N+2);
#define LOAD_B_1x2(K, N) \
rb0 = vec_xl(0, B+(K*ldb)+N+0);
#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[(K)*ldb+N]);
#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \
a0, a1, a2, a3, a4, a5, a6, a7) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \
__builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \
__builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \
__builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \
__builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7);
#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3);
#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1);
#define KERNEL_MMA_1ACC(b0, a0) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0);
#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \
result = vec_madd(a0, b0, result); \
result1 = vec_madd(a1, b1, result1); \
result2 = vec_madd(a2, b2, result2); \
result3 = vec_madd(a3, b3, result3);
#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \
result = vec_madd(a0, b0, result); \
result1 = vec_madd(a1, b1, result1);
#define KERNEL_VMADD_1VSR(a0, b0) \
result = vec_madd(a0, b0, result);
#define PACK_A(ra0, ra1, ra2, ra3, offset) \
vec_xst(ra0, 0, packA+(k*8)+0+offset); \
vec_xst(ra1, 0, packA+(k*8)+2+offset); \
vec_xst(ra2, 0, packA+(k*8)+4+offset); \
vec_xst(ra3, 0, packA+(k*8)+6+offset);
#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \
ra0 = vec_xl(0, packA+(k*8)+0+offset); \
ra1 = vec_xl(0, packA+(k*8)+2+offset); \
ra2 = vec_xl(0, packA+(k*8)+4+offset); \
ra3 = vec_xl(0, packA+(k*8)+6+offset);
#ifdef B0
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
#else
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
#endif
{
BLASLONG m, n, k;
BLASLONG m8 = M & ~7;
BLASLONG m4 = M & ~3;
BLASLONG m2 = M & ~1;
BLASLONG n8 = N & ~7;
BLASLONG n4 = N & ~3;
BLASLONG n2 = N & ~1;
BLASLONG k2 = K & ~1;
#if defined(__GNUC__) && !defined(__clang__)
int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 1 : 0;
#else
int has_packing = 0;
#endif
double *packA;
if (has_packing) packA = (double *)malloc(K*8*sizeof(double));
vector double valpha = vec_splats(alpha);
#if !defined(B0)
vector double vbeta = vec_splats(beta);
#endif
for (m = 0; m < m8; m += 8) {
for (n = 0; n < n8; n += 8) {
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
INIT_8ACCS();
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
register vector double t0, t1, t2, t3;
__vector_pair pb0, pb1;
if (has_packing) {
if (n == 0) {
for (k = 0; k < k2; k += 2) {
LOAD_AT_8x2(m, k);
LOAD_BP_1x8(k, n);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6);
PACK_A(ra0, ra2, ra4, ra6, 0);
LOAD_BP_1x8(k+1, n);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7);
PACK_A(ra1, ra3, ra5, ra7, 8);
}
for (; k < K; k++) {
LOAD_AT_8x1(m, k);
LOAD_BP_1x8(k, n);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
PACK_A(ra0, ra1, ra2, ra3, 0);
}
} else {
for (k = 0; k < k2; k += 2) {
LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0);
LOAD_BP_1x8(k, n);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6);
LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8);
LOAD_BP_1x8(k+1, n);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7);
}
for (; k < K; k++) {
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
LOAD_BP_1x8(k, n);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
}
}
} else {
for (k = 0; k < k2; k += 2) {
LOAD_AT_8x2(m, k);
LOAD_BP_1x8(k, n);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6);
LOAD_BP_1x8(k+1, n);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7);
}
for (; k < K; k++) {
LOAD_AT_8x1(m, k);
LOAD_BP_1x8(k, n);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
}
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc2, n+0, m+2);
SAVE_4x2_ACC(&acc4, n+0, m+4);
SAVE_4x2_ACC(&acc6, n+0, m+6);
SAVE_4x2_ACC(&acc1, n+4, m+0);
SAVE_4x2_ACC(&acc3, n+4, m+2);
SAVE_4x2_ACC(&acc5, n+4, m+4);
SAVE_4x2_ACC(&acc7, n+4, m+6);
}
for (; n < n4; n += 4) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
register vector double t0, t1, t2, t3;
__vector_pair pb0;
if (!has_packing) {
for (k = 0; k < k2; k += 2) {
LOAD_AT_8x2(m, k);
LOAD_BP_1x4(k, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6);
LOAD_BP_1x4(k+1, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7);
}
for (; k < K; k++) {
LOAD_AT_8x1(m, k);
LOAD_BP_1x4(k, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
}
} else {
for (k = 0; k < k2; k += 2) {
LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0);
LOAD_BP_1x4(k, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6);
LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8);
LOAD_BP_1x4(k+1, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7);
}
for (; k < K; k++) {
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
LOAD_BP_1x4(k, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
}
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+0, m+2);
SAVE_4x2_ACC(&acc2, n+0, m+4);
SAVE_4x2_ACC(&acc3, n+0, m+6);
}
for (; n < n2; n += 2) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
register vector double t0, t1, t2, t3;
__vector_pair pb0;
if (!has_packing) {
for (k = 0; k < k2; k += 2) {
LOAD_AT_8x2(m, k);
LOAD_BP_1x2(k, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6);
LOAD_BP_1x2(k+1, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7);
}
for (; k < K; k++) {
LOAD_AT_8x1(m, k);
LOAD_BP_1x2(k, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
}
} else {
for (k = 0; k < k2; k += 2) {
LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0);
LOAD_BP_1x2(k, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6);
LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8);
LOAD_BP_1x2(k+1, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7);
}
for (; k < K; k++) {
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
LOAD_BP_1x2(k, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
}
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n+0, m+0);
SAVE_2x2_ACC(&acc1, n+0, m+2);
SAVE_2x2_ACC(&acc2, n+0, m+4);
SAVE_2x2_ACC(&acc3, n+0, m+6);
}
for (; n < N; n++) {
register vector double result = ((vector double){0.,0.});
register vector double result1 = ((vector double){0.,0.});
register vector double result2 = ((vector double){0.,0.});
register vector double result3 = ((vector double){0.,0.});
register vector double ra0, ra1, ra2, ra3;
register vector double rb0;
if (!has_packing) {
for (k = 0; k < K; k++) {
LOAD_AT_8x1(m, k);
LOAD_B_1x1(k, n);
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
}
} else {
for (k = 0; k < K; k++) {
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
LOAD_B_1x1(k, n);
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
}
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
SAVE_1x4_VSR(result1, n, m+2);
SAVE_1x4_VSR(result2, n, m+4);
SAVE_1x4_VSR(result3, n, m+6);
}
}
for (; m < m4; m += 4) {
for (n = 0; n < n8; n += 8) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1, ra2, ra3;
register vector double t0, t1, t2, t3;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_AT_4x2(m, k);
LOAD_BP_1x8(k, n);
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra2, ra2);
LOAD_BP_1x8(k+1, n);
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra1, ra1, ra3, ra3);
}
for (; k < K; k++) {
LOAD_AT_4x1(m, k);
LOAD_BP_1x8(k, n);
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc2, n+0, m+2);
SAVE_4x2_ACC(&acc1, n+4, m+0);
SAVE_4x2_ACC(&acc3, n+4, m+2);
}
for (; n < n4; n += 4) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0, ra1, ra2, ra3;
register vector double t0, t1, t2, t3;
__vector_pair pb0;
for (k = 0; k < k2; k += 2) {
LOAD_AT_4x2(m, k);
LOAD_BP_1x4(k, n);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2);
LOAD_BP_1x4(k+1, n);
KERNEL_MMA_2ACC(pb0, pb0, ra1, ra3);
}
for (; k < K; k++) {
LOAD_AT_4x1(m, k);
LOAD_BP_1x4(k, n);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+0, m+2);
}
for (; n < n2; n += 2) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0, ra1, ra2, ra3;
register vector double t0, t1, t2, t3;
__vector_pair pb0;
for (k = 0; k < k2; k += 2) {
LOAD_AT_4x2(m, k);
LOAD_BP_1x2(k, n);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2);
LOAD_BP_1x2(k+1, n);
KERNEL_MMA_2ACC(pb0, pb0, ra1, ra3);
}
for (; k < K; k++) {
LOAD_AT_4x1(m, k);
LOAD_BP_1x2(k, n);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n+0, m+0);
SAVE_2x2_ACC(&acc1, n+0, m+2);
}
for (; n < N; n++) {
register vector double result = ((vector double){0.,0.});
register vector double result1 = ((vector double){0.,0.});
register vector double ra0, ra1;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_AT_4x1(m, k);
LOAD_B_1x1(k, n);
KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0);
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
SAVE_1x4_VSR(result1, n, m+2);
}
}
for (; m < m2; m += 2) {
for (n = 0; n < n8; n += 8) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0, ra1;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_AT_2x2(m, k);
LOAD_BP_1x8(k, n);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
LOAD_BP_1x8(k+1, n);
KERNEL_MMA_2ACC(pb0, pb1, ra1, ra1);
}
for (; k < K; k++) {
LOAD_AT_2x1(m, k);
LOAD_BP_1x8(k, n);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+4, m+0);
}
for (; n < n4; n += 4) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0, ra1;
register vector double t0, t1;
__vector_pair pb0;
for (k = 0; k < k2; k += 2) {
LOAD_AT_2x2(m, k);
LOAD_BP_1x4(k, n);
KERNEL_MMA_1ACC(pb0, ra0);
LOAD_BP_1x4(k+1, n);
KERNEL_MMA_1ACC(pb0, ra1);
}
for (; k < K; k++) {
LOAD_AT_2x1(m, k);
LOAD_BP_1x4(k, n);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n, m);
}
for (; n < n2; n += 2) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0, ra1;
register vector double t0, t1;
__vector_pair pb0;
for (k = 0; k < k2; k += 2) {
LOAD_AT_2x2(m, k);
LOAD_BP_1x2(k, n);
KERNEL_MMA_1ACC(pb0, ra0);
LOAD_BP_1x2(k+1, n);
KERNEL_MMA_1ACC(pb0, ra1);
}
for (; k < K; k++) {
LOAD_AT_2x1(m, k);
LOAD_BP_1x2(k, n);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n, m);
}
for (; n < N; n++) {
register vector double result = ((vector double){0.,0.});
register vector double ra0;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_AT_2x1(m, k);
LOAD_B_1x1(k, n);
KERNEL_VMADD_1VSR(ra0, rb0);
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
}
}
for (; m < M; m++) {
for (n = 0; n < n8; n += 8) {
register vector double result = ((vector double){0.,0.});
register vector double result1 = ((vector double){0.,0.});
register vector double result2 = ((vector double){0.,0.});
register vector double result3 = ((vector double){0.,0.});
register vector double ra0;
register vector double rb0, rb1, rb2, rb3;
for (k = 0; k < K; k++) {
LOAD_A_1x1(m, k);
LOAD_B_1x8(k, n);
KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3);
}
SAVE_4x1_VSR(result, n, m);
SAVE_4x1_VSR(result1, n+2, m);
SAVE_4x1_VSR(result2, n+4, m);
SAVE_4x1_VSR(result3, n+6, m);
}
for (; n < n4; n += 4) {
register vector double result = ((vector double){0.,0.});
register vector double result1 = ((vector double){0.,0.});
register vector double ra0;
register vector double rb0, rb1;
for (k = 0; k < K; k++) {
LOAD_A_1x1(m, k);
LOAD_B_1x4(k, n);
KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1);
}
SAVE_4x1_VSR(result, n, m);
SAVE_4x1_VSR(result1, n+2, m);
}
for (; n < n2; n += 2) {
register vector double result = ((vector double){0.,0.});
register vector double ra0;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x1(m, k);
LOAD_B_1x2(k, n);
KERNEL_VMADD_1VSR(ra0, rb0);
}
SAVE_4x1_VSR(result, n, m);
}
for (; n < N; n++) {
FLOAT result = 0.0;
for (k = 0; k < K; k++) {
result += A[m*lda+k] * B[k*ldb+n];
}
result = result * alpha;
#if !defined(B0)
C[n*ldc+m] = (C[n*ldc+m] * beta) + result;
#else
C[n*ldc+m] = result;
#endif
}
}
if(has_packing) free(packA);
return 0;
}
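The has_packing path above packs each 8-row panel of A only while the first n-block is being computed (the n == 0 branch issues PACK_A alongside the FMAs), and every later n-block replays the panel through LOAD_PACKED_A instead of redoing the strided loads and merge transposes. A scalar sketch of that pack-once, reuse-many pattern (illustration only: it drops alpha/beta and the M/N tails, and the function name is made up):

#include <stdlib.h>

/* C += A^T * B over full 8-row panels of A, packing each panel during its
 * first n iteration and reusing the packed copy for all later n. */
static void gemm_tn_pack_sketch(long M, long N, long K,
                                const double *A, long lda,
                                const double *B, long ldb,
                                double *C, long ldc)
{
    double *packA = (double *)malloc((size_t)K * 8 * sizeof(double));
    if (packA == NULL) return;
    for (long m = 0; m + 8 <= M; m += 8) {
        for (long n = 0; n < N; n++) {
            for (long k = 0; k < K; k++) {
                if (n == 0)                      /* pack only once per panel */
                    for (long i = 0; i < 8; i++)
                        packA[k * 8 + i] = A[(m + i) * lda + k];
                for (long i = 0; i < 8; i++)     /* consume the packed panel */
                    C[n * ldc + m + i] += packA[k * 8 + i] * B[k * ldb + n];
            }
        }
    }
    free(packA);
}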

View File

@ -40,18 +40,27 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
XXSPLTD_S(32,%x9,0) // alpha, alpha
"sldi %6, %13, 3 \n\t" // lda * sizeof (double)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmuldp 34, 40, 32 \n\t" // x0 * alpha, x1 * alpha
"xvmuldp 35, 41, 32 \n\t" // x2 * alpha, x3 * alpha
#else
"xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha
"xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha
#endif
"add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda
"add %6, %6, %6 \n\t" // 2 * lda
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha
XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha
#else
XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha
XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha
#endif
"add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda
"add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda
@ -286,6 +295,16 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
"add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda
"add %10, %10, %10 \n\t" // 2 * lda
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha
XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha
XXSPLTD_S(48,39,0) // x6 * alpha, x6 * alpha
XXSPLTD_S(49,39,1) // x7 * alpha, x7 * alpha
XXSPLTD_S(39,38,1) // x5 * alpha, x5 * alpha
XXSPLTD_S(38,38,0) // x4 * alpha, x4 * alpha
#else
XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha
@ -294,6 +313,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha
XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha
XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha
#endif
"add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda
"add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda
@ -319,30 +339,69 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
"one%=: \n\t"
"lxvp 36, 0( %2) \n\t" // y0, y1
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 40, 32 \n\t"
"xvmaddadp 37, 41, 32 \n\t"
#else
"xvmaddadp 36, 40, 34 \n\t"
"xvmaddadp 37, 41, 34 \n\t"
#endif
"lxvpx 40, %3, %11 \n\t" // a0[0], a0[1]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 42, 33 \n\t"
"xvmaddadp 37, 43, 33 \n\t"
#else
"xvmaddadp 36, 42, 35 \n\t"
"xvmaddadp 37, 43, 35 \n\t"
#endif
"lxvpx 42, %4, %11 \n\t" // a1[0], a1[1]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 44, 34 \n\t"
"xvmaddadp 37, 45, 34 \n\t"
#else
"xvmaddadp 36, 44, 32 \n\t"
"xvmaddadp 37, 45, 32 \n\t"
#endif
"lxvpx 44, %5, %11 \n\t" // a2[0], a2[1]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 46, 35 \n\t"
"xvmaddadp 37, 47, 35 \n\t"
#else
"xvmaddadp 36, 46, 33 \n\t"
"xvmaddadp 37, 47, 33 \n\t"
#endif
"lxvpx 46, %6, %11 \n\t" // a3[0], a3[1]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 50, 38 \n\t"
"xvmaddadp 37, 51, 38 \n\t"
#else
"xvmaddadp 36, 50, 48 \n\t"
"xvmaddadp 37, 51, 48 \n\t"
#endif
"lxvpx 50, %7, %11 \n\t" // a4[0]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 52, 39 \n\t"
"xvmaddadp 37, 53, 39 \n\t"
#else
"xvmaddadp 36, 52, 49 \n\t"
"xvmaddadp 37, 53, 49 \n\t"
#endif
"lxvpx 52, %8, %11 \n\t" // a5[0]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 54, 48 \n\t"
"xvmaddadp 37, 55, 48 \n\t"
#else
"xvmaddadp 36, 54, 38 \n\t"
"xvmaddadp 37, 55, 38 \n\t"
#endif
"lxvpx 54, %9, %11 \n\t" // a6[0]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 56, 49 \n\t"
"xvmaddadp 37, 57, 49 \n\t"
#else
"xvmaddadp 36, 56, 39 \n\t"
"xvmaddadp 37, 57, 39 \n\t"
#endif
"lxvpx 56, %10, %11 \n\t" // a7[0]
"addi %11, %11, 32 \n\t"
@ -355,6 +414,24 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
"two%=: \n\t"
"lxvp 36, 0( %2) \n\t" // y0, y1
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 40, 32 \n\t"
"xvmaddadp 37, 41, 32 \n\t"
"xvmaddadp 36, 42, 33 \n\t"
"xvmaddadp 37, 43, 33 \n\t"
"xvmaddadp 36, 44, 34 \n\t"
"xvmaddadp 37, 45, 34 \n\t"
"xvmaddadp 36, 46, 35 \n\t"
"xvmaddadp 37, 47, 35 \n\t"
"xvmaddadp 36, 50, 38 \n\t"
"xvmaddadp 37, 51, 38 \n\t"
"xvmaddadp 36, 52, 39 \n\t"
"xvmaddadp 37, 53, 39 \n\t"
"xvmaddadp 36, 54, 48 \n\t"
"xvmaddadp 37, 55, 48 \n\t"
"xvmaddadp 36, 56, 49 \n\t"
"xvmaddadp 37, 57, 49 \n\t"
#else
"xvmaddadp 36, 40, 34 \n\t"
"xvmaddadp 37, 41, 34 \n\t"
"xvmaddadp 36, 42, 35 \n\t"
@ -371,6 +448,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
"xvmaddadp 37, 55, 38 \n\t"
"xvmaddadp 36, 56, 39 \n\t"
"xvmaddadp 37, 57, 39 \n\t"
#endif
"stxvp 36, 0( %2) \n\t" // y0, y1
:

View File

@ -279,34 +279,58 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"lxvp 40, 32(%[y]) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXMRGHD_S(42,34,35)
XXMRGLD_S(43,34,35)
XXMRGHD_S(44,4,5)
XXMRGLD_S(45,4,5)
#else
XXMRGLD_S(42,35,34)
XXMRGHD_S(43,35,34)
XXMRGLD_S(44,5,4)
XXMRGHD_S(45,5,4)
#endif
"xvadddp 42,42,43 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXMRGHD_S(46,6,7)
XXMRGLD_S(47,6,7)
#else
XXMRGLD_S(46,7,6)
XXMRGHD_S(47,7,6)
#endif
"xvadddp 44,44,45 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXMRGHD_S(48,8,9)
XXMRGLD_S(49,8,9)
#else
XXMRGLD_S(48,9,8)
XXMRGHD_S(49,9,8)
#endif
"xvadddp 46,46,47 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 38,42,36 \n\t"
"xvmaddadp 39,44,36 \n\t"
#else
"xvmaddadp 39,42,36 \n\t"
"xvmaddadp 38,44,36 \n\t"
#endif
"xvadddp 48,48,49 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 41,48,36 \n\t"
#else
"xvmaddadp 41,46,36 \n\t"
#endif
"stxvp 38, 0(%[y]) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 40,46,36 \n\t"
#else
"xvmaddadp 40,48,36 \n\t"
#endif
"stxvp 40, 32(%[y]) \n\t"
: [memy] "+m" (*(double (*)[8])y),

View File

@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "drot_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "drot_microk_power10.c"
#elif defined(POWER10)
#include "drot_microk_power8.c"
#include "drot_microk_power10.c"
#endif
#endif
@ -117,7 +115,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
if ( (inc_x == 1) && (inc_y == 1) )
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;

View File

@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "dscal_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "dscal_microk_power10.c"
#elif defined(POWER10)
#include "dscal_microk_power8.c"
#include "dscal_microk_power10.c"
#endif
#endif
@ -104,7 +102,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( da == 0.0 )
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
@ -138,7 +136,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
else
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;

View File

@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "dswap_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "swap_microk_power10.c"
#elif defined(POWER10)
#include "dswap_microk_power8.c"
#include "swap_microk_power10.c"
#endif
#endif
@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
if ( (inc_x == 1) && (inc_y == 1 ))
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;

View File

@ -0,0 +1,84 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta)
{
double MNK = (double) M * (double) N * (double) K;
#if defined(DOUBLE) // dgemm
// gcc11 (minor <= 2) has an issue when multiple assemble_pairs are used. This
// issue affects both dgemm_nn and dgemm_tn.
#if (defined(__GNUC__) && (__GNUC__ == 11 && __GNUC_MINOR__ <= 2))
if (!transb)
return 0;
#endif
if (MNK <= 54.0*54.0*54.0)
return 1;
#else // sgemm
#if defined(__GNUC__) && defined(__clang__)
// clang generates code with register spilling in the packing region, so this
// optimization is disabled for clang. Since on-demand packing is one of the
// reasons the small kernels outperform the normal flow as MNK grows, with it
// disabled the MNK range handled by the clang-generated code is reduced.
if (MNK > 84.0*84.0*84.0)
return 0;
if (transa && !transb) {
// sgemm_tn works better when packing on-demand is used
if (MNK <= 64.0*64.0*64.0 && K >= 4)
return 1;
else
return 0;
}
#else // gcc
if (MNK > 100.0*100.0*100.0)
return 0;
#endif
// When multiple threads are available, the regular multi-threaded path
// outperforms (or at least approaches) the small kernel, so only permit
// the small kernel for smaller MNK.
if (num_cpu_avail(3) > 1) {
if (MNK <= 64.0*64.0*64.0)
return 1;
} else {
return 1;
}
#endif
return 0;
}
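For the DGEMM branch above, the small kernel is permitted whenever MNK = M*N*K stays at or below 54*54*54 = 157464 (and, when built with gcc 11.0-11.2, only for transposed B because of the assemble_pair issue). A standalone check of that cutoff, as an illustration only:

#include <stdio.h>

int main(void)
{
    long dims[][3] = { {48, 48, 48}, {54, 54, 54}, {64, 64, 64} };
    const double threshold = 54.0 * 54.0 * 54.0;   /* 157464 */
    for (int i = 0; i < 3; i++) {
        double MNK = (double)dims[i][0] * (double)dims[i][1] * (double)dims[i][2];
        printf("%ldx%ldx%ld -> %s\n", dims[i][0], dims[i][1], dims[i][2],
               MNK <= threshold ? "small kernel permitted" : "regular GEMM path");
    }
    return 0;
}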

View File

@ -49,10 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "sasum_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "sasum_microk_power10.c"
#elif defined(POWER10)
#include "sasum_microk_power8.c"
#include "sasum_microk_power10.c"
#endif
#endif
@ -114,7 +112,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( inc_x == 1 )
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;

File diff suppressed because it is too large

View File

@ -0,0 +1,887 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <altivec.h>
typedef __vector unsigned char vec_t;
#if !defined(B0)
#define SAVE_4x4_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
rc0 = vec_xl(0, C+(N+2)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[2] = vec_madd(result[2], valpha, rc0); \
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
rc0 = vec_xl(0, C+(N+3)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[3] = vec_madd(result[3], valpha, rc0); \
vec_xst(result[3], 0, C+(N+3)*ldc+M);
#define SAVE_4x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \
rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[2] = vec_madd(result[2], valpha, rc0); \
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \
rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[3] = vec_madd(result[3], valpha, rc0); \
vec_xst_len(result[3], C+(N+3)*ldc+M, 8);
#define SAVE_2x4_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst(result[1], 0, C+(N+1)*ldc+M);
#define SAVE_1x4_VSR(result, N, M) \
rc0 = vec_xl(0, C+((N)*ldc)+M); \
rc0 = vec_mul(rc0, vbeta); \
result = vec_madd(result, valpha, rc0); \
vec_xst(result, 0, C+((N)*ldc)+M);
#define SAVE_2x2_VSR(result, N, M) \
rc0 = vec_xl_len(C+(N*ldc)+M, 8); \
rc0 = vec_insert(C[(N+1)*ldc+M+0], rc0, 2); \
rc0 = vec_insert(C[(N+1)*ldc+M+1], rc0, 3); \
rc0 = vec_mul(rc0, vbeta); \
result = vec_madd(result, valpha, rc0); \
vec_xst_len(result, C+(N*ldc)+M, 8); \
C[(N+1)*ldc+M+0] = result[2]; \
C[(N+1)*ldc+M+1] = result[3];
#define SAVE_1x2_VSR(result, N, M) \
rc0 = vec_xl_len(C+(N*ldc)+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result = vec_madd(result, valpha, rc0); \
vec_xst_len(result, C+(N*ldc)+M, 8);
#define SAVE_4x1_VSR(result, N, M) \
result = vec_mul(result, valpha); \
C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \
C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; \
C[(N+2)*ldc+M] = (C[(N+2)*ldc+M] * beta) + result[2]; \
C[(N+3)*ldc+M] = (C[(N+3)*ldc+M] * beta) + result[3];
#define SAVE_2x1_VSR(result, N, M) \
result = vec_mul(result, valpha); \
C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \
C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1];
#else
#define SAVE_4x4_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
result[1] = vec_mul(result[1], valpha); \
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
result[2] = vec_mul(result[2], valpha); \
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
result[3] = vec_mul(result[3], valpha); \
vec_xst(result[3], 0, C+(N+3)*ldc+M);
#define SAVE_4x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
result[1] = vec_mul(result[1], valpha); \
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \
result[2] = vec_mul(result[2], valpha); \
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \
result[3] = vec_mul(result[3], valpha); \
vec_xst_len(result[3], C+(N+3)*ldc+M, 8);
#define SAVE_2x4_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
result[1] = vec_mul(result[1], valpha); \
vec_xst(result[1], 0, C+(N+1)*ldc+M);
#define SAVE_1x4_VSR(result, N, M) \
result = vec_mul(result, valpha); \
vec_xst(result, 0, C+((N)*ldc)+M);
#define SAVE_2x2_VSR(result, N, M) \
result = vec_mul(result, valpha); \
vec_xst_len(result, C+(N*ldc)+M, 8); \
C[(N+1)*ldc+M+0] = result[2]; \
C[(N+1)*ldc+M+1] = result[3];
#define SAVE_1x2_VSR(result, N, M) \
result = vec_mul(result, valpha); \
vec_xst_len(result, C+(N*ldc)+M, 8);
#define SAVE_4x1_VSR(result, N, M) \
result = vec_mul(result, valpha); \
C[(N+0)*ldc+M] = result[0]; \
C[(N+1)*ldc+M] = result[1]; \
C[(N+2)*ldc+M] = result[2]; \
C[(N+3)*ldc+M] = result[3];
#define SAVE_2x1_VSR(result, N, M) \
result = vec_mul(result, valpha); \
C[(N+0)*ldc+M] = result[0]; \
C[(N+1)*ldc+M] = result[1];
#endif
#define INIT_8ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1); \
__builtin_mma_xxsetaccz(&acc2); \
__builtin_mma_xxsetaccz(&acc3); \
__builtin_mma_xxsetaccz(&acc4); \
__builtin_mma_xxsetaccz(&acc5); \
__builtin_mma_xxsetaccz(&acc6); \
__builtin_mma_xxsetaccz(&acc7);
#define INIT_4ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1); \
__builtin_mma_xxsetaccz(&acc2); \
__builtin_mma_xxsetaccz(&acc3);
#define INIT_2ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1);
#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
#define LOAD_A_1x16(K, M) \
ra0 = vec_xl(0, A+(K*lda)+M+0); \
ra1 = vec_xl(0, A+(K*lda)+M+4); \
ra2 = vec_xl(0, A+(K*lda)+M+8); \
ra3 = vec_xl(0, A+(K*lda)+M+12);
#define LOAD_A_1x8(K, M) \
ra0 = vec_xl(0, A+(K*lda)+M+0); \
ra1 = vec_xl(0, A+(K*lda)+M+4);
#define LOAD_A_1x4(K, M) ra0 = vec_xl(0, A+(K*lda)+M);
#define LOAD_A_2x2(K, M) \
ra0 = vec_splats(A[K*lda+M+0]); \
ra0 = vec_insert(A[K*lda+M+1], ra0, 1); \
ra0 = vec_insert(A[K*lda+M+1], ra0, 3);
#define LOAD_A_1x2(K, M) ra0 = vec_xl_len(A+(K*lda)+M, 8);
#define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M+0]);
#define LOAD_B_1x16(K, N) \
rb0 = vec_xl(0, B+(K*ldb)+N+0); \
rb1 = vec_xl(0, B+(K*ldb)+N+4); \
rb2 = vec_xl(0, B+(K*ldb)+N+8); \
rb3 = vec_xl(0, B+(K*ldb)+N+12);
#define LOAD_B_1x8(K, N) \
rb0 = vec_xl(0, B+(K*ldb)+N+0); \
rb1 = vec_xl(0, B+(K*ldb)+N+4);
#define LOAD_B_1x4(K, N) rb0 = vec_xl(0, B+(K*ldb)+N);
#define LOAD_B_2x2(K, N) \
rb0 = vec_splats(B[K*ldb+N]); \
rb0 = vec_insert(B[K*ldb+N+1], rb0, 2); \
rb0 = vec_insert(B[K*ldb+N+1], rb0, 3);
#define LOAD_B_1x2(K, N) rb0 = vec_xl_len(B+(K*ldb)+N, 8);
#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[K*ldb+N]);
#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \
a0, a1, a2, a3, a4, a5, a6, a7) \
__builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \
__builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \
__builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \
__builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); \
__builtin_mma_xvf32gerpp(&acc4, (vec_t)b4, (vec_t)a4); \
__builtin_mma_xvf32gerpp(&acc5, (vec_t)b5, (vec_t)a5); \
__builtin_mma_xvf32gerpp(&acc6, (vec_t)b6, (vec_t)a6); \
__builtin_mma_xvf32gerpp(&acc7, (vec_t)b7, (vec_t)a7);
#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \
__builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \
__builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \
__builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \
__builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3);
#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \
__builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \
__builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1);
#define KERNEL_MMA_1ACC(b0, a0) \
__builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0);
#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \
result = vec_madd(a0, b0, result); \
result1 = vec_madd(a1, b1, result1); \
result2 = vec_madd(a2, b2, result2); \
result3 = vec_madd(a3, b3, result3);
#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \
result = vec_madd(a0, b0, result); \
result1 = vec_madd(a1, b1, result1);
#define KERNEL_VMADD_1VSR(a0, b0) \
result = vec_madd(a0, b0, result);
#define PACK_A(ra0, ra1, ra2, ra3, offset) \
vec_xst(ra0, 0, packA+(k*16)+0+offset); \
vec_xst(ra1, 0, packA+(k*16)+4+offset); \
vec_xst(ra2, 0, packA+(k*16)+8+offset); \
vec_xst(ra3, 0, packA+(k*16)+12+offset);
#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \
ra0 = vec_xl(0, packA+(k*16)+0+offset); \
ra1 = vec_xl(0, packA+(k*16)+4+offset); \
ra2 = vec_xl(0, packA+(k*16)+8+offset); \
ra3 = vec_xl(0, packA+(k*16)+12+offset);
#ifdef B0
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
#else
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
#endif
{
BLASLONG m, n, k;
BLASLONG m16 = M & ~15;
BLASLONG m8 = M & ~7;
BLASLONG m4 = M & ~3;
BLASLONG m2 = M & ~1;
BLASLONG n16 = N & ~15;
BLASLONG n8 = N & ~7;
BLASLONG n4 = N & ~3;
BLASLONG n2 = N & ~1;
vector float valpha = vec_splats(alpha);
#if !defined(B0)
vector float vbeta = vec_splats(beta);
#endif
#if defined(__GNUC__) && !defined(__clang__)
int has_packing = (M >= 40 && N >= 40 && K >= 40) ? 1 : 0;
#else
int has_packing = 0;
#endif
float *packA;
if (has_packing) packA = (float *)malloc(K*16*sizeof(float));
for (m = 0; m < m16; m += 16) {
for (n = 0; n < n8; n += 8) {
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
INIT_8ACCS();
register vector float ra0, ra1, ra2, ra3;
register vector float rb0, rb1;
if (has_packing) {
if (n == 0) {
for (k = 0; k < K; k++) {
LOAD_A_1x16(k, m);
LOAD_B_1x8(k, n);
KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
PACK_A(ra0, ra1, ra2, ra3, 0);
}
} else {
for (k = 0; k < K; k++) {
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
LOAD_B_1x8(k, n);
KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
}
}
} else {
for (k = 0; k < K; k++) {
LOAD_A_1x16(k, m);
LOAD_B_1x8(k, n);
KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
}
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_4x4_ACC(&acc0, n+0, m+0);
SAVE_4x4_ACC(&acc2, n+0, m+4);
SAVE_4x4_ACC(&acc4, n+0, m+8);
SAVE_4x4_ACC(&acc6, n+0, m+12);
SAVE_4x4_ACC(&acc1, n+4, m+0);
SAVE_4x4_ACC(&acc3, n+4, m+4);
SAVE_4x4_ACC(&acc5, n+4, m+8);
SAVE_4x4_ACC(&acc7, n+4, m+12);
}
for (; n < n4; n += 4) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector float ra0, ra1, ra2, ra3;
register vector float rb0;
if (!has_packing) {
for (k = 0; k < K; k++) {
LOAD_A_1x16(k, m);
LOAD_B_1x4(k, n);
KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3);
}
} else {
for (k = 0; k < K; k++) {
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
LOAD_B_1x4(k, n);
KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3);
}
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_4x4_ACC(&acc0, n+0, m+0);
SAVE_4x4_ACC(&acc1, n+0, m+4);
SAVE_4x4_ACC(&acc2, n+0, m+8);
SAVE_4x4_ACC(&acc3, n+0, m+12);
}
for (; n < n2; n += 2) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector float ra0, ra1, ra2, ra3;
register vector float rb0;
if (!has_packing) {
for (k = 0; k < K; k++) {
LOAD_A_1x16(k, m);
LOAD_B_1x2(k, n);
KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3);
}
} else {
for (k = 0; k < K; k++) {
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
LOAD_B_1x2(k, n);
KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3);
}
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_2x4_ACC(&acc0, n, m+0);
SAVE_2x4_ACC(&acc1, n, m+4);
SAVE_2x4_ACC(&acc2, n, m+8);
SAVE_2x4_ACC(&acc3, n, m+12);
}
for (; n < N; n++) {
vector float result = ((vector float){0., 0., 0., 0.});
vector float result1 = ((vector float){0., 0., 0., 0.});
vector float result2 = ((vector float){0., 0., 0., 0.});
vector float result3 = ((vector float){0., 0., 0., 0.});
register vector float ra0, ra1, ra2, ra3;
register vector float rb0;
if (!has_packing) {
for (k = 0; k < K; k++) {
LOAD_A_1x16(k, m);
LOAD_B_1x1(k, n);
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
}
} else {
for (k = 0; k < K; k++) {
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
LOAD_B_1x1(k, n);
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
}
}
#if !defined(B0)
register vector float rc0;
#endif
SAVE_1x4_VSR(result, n, m);
SAVE_1x4_VSR(result1, n, m+4);
SAVE_1x4_VSR(result2, n, m+8);
SAVE_1x4_VSR(result3, n, m+12);
}
}
for (; m < m8; m += 8) {
for (n = 0; n < n16; n += 16) {
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
INIT_8ACCS();
register vector float ra0, ra1;
register vector float rb0, rb1, rb2, rb3;
for (k = 0; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_B_1x16(k, n);
KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3,
ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1);
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_4x4_ACC(&acc0, n+0, m+0);
SAVE_4x4_ACC(&acc4, n+0, m+4);
SAVE_4x4_ACC(&acc1, n+4, m+0);
SAVE_4x4_ACC(&acc5, n+4, m+4);
SAVE_4x4_ACC(&acc2, n+8, m+0);
SAVE_4x4_ACC(&acc6, n+8, m+4);
SAVE_4x4_ACC(&acc3, n+12, m+0);
SAVE_4x4_ACC(&acc7, n+12, m+4);
}
for (; n < n8; n += 8) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector float ra0, ra1;
register vector float rb0, rb1;
for (k = 0; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_B_1x8(k, n);
KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra1, ra1);
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_4x4_ACC(&acc0, n+0, m+0);
SAVE_4x4_ACC(&acc2, n+0, m+4);
SAVE_4x4_ACC(&acc1, n+4, m+0);
SAVE_4x4_ACC(&acc3, n+4, m+4);
}
for (; n < n4; n += 4) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector float ra0, ra1;
register vector float rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_B_1x4(k, n);
KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1);
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_4x4_ACC(&acc0, n+0, m+0);
SAVE_4x4_ACC(&acc1, n+0, m+4);
}
for (; n < n2; n += 2) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector float ra0, ra1;
register vector float rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_B_1x2(k, n);
KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1);
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_2x4_ACC(&acc0, n, m+0);
SAVE_2x4_ACC(&acc1, n, m+4);
}
for (; n < N; n++) {
vector float result = ((vector float){0.,0.,0.,0.});
vector float result1 = ((vector float){0.,0.,0.,0.});
register vector float ra0, ra1;
register vector float rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_B_1x1(k, n);
KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0);
}
#if !defined(B0)
register vector float rc0;
#endif
SAVE_1x4_VSR(result, n, m);
SAVE_1x4_VSR(result1, n, m+4);
}
}
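/* Row remainder: 4 rows at a time. */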
for (; m < m4; m += 4) {
for (n = 0; n < n16; n += 16) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector float ra0;
register vector float rb0, rb1, rb2, rb3;
for (k = 0; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_B_1x16(k, n);
KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0);
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_4x4_ACC(&acc0, n+0, m+0);
SAVE_4x4_ACC(&acc1, n+4, m+0);
SAVE_4x4_ACC(&acc2, n+8, m+0);
SAVE_4x4_ACC(&acc3, n+12, m+0);
}
for (; n < n8; n += 8) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector float ra0;
register vector float rb0, rb1;
for (k = 0; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_B_1x8(k, n);
KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0);
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_4x4_ACC(&acc0, n+0, m+0);
SAVE_4x4_ACC(&acc1, n+4, m+0);
}
for (; n < n4; n += 4) {
__vector_quad acc0;
INIT_1ACC();
register vector float ra0;
register vector float rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_B_1x4(k, n);
KERNEL_MMA_1ACC(rb0, ra0);
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_4x4_ACC(&acc0, n+0, m+0);
}
for (; n < n2; n += 2) {
__vector_quad acc0;
INIT_1ACC();
register vector float ra0;
register vector float rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_B_1x2(k, n);
KERNEL_MMA_1ACC(rb0, ra0);
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_2x4_ACC(&acc0, n, m);
}
for (; n < N; n++) {
vector float result = ((vector float){0.,0.,0.,0.});
register vector float ra0;
register vector float rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_B_1x1(k, n);
KERNEL_VMADD_1VSR(ra0, rb0);
}
#if !defined(B0)
register vector float rc0;
#endif
SAVE_1x4_VSR(result, n, m);
}
}
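/* Row remainder: 2 rows at a time; MMA accumulators for n >= 4, VSX multiply-add for the narrower tails. */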
for (; m < m2; m += 2) {
for (n = 0; n < n16; n += 16) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector float ra0;
register vector float rb0, rb1, rb2, rb3;
for (k = 0; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_B_1x16(k, n);
KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0);
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+4, m+0);
SAVE_4x2_ACC(&acc2, n+8, m+0);
SAVE_4x2_ACC(&acc3, n+12, m+0);
}
for (; n < n8; n += 8) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector float ra0;
register vector float rb0, rb1;
for (k = 0; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_B_1x8(k, n);
KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0);
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+4, m+0);
}
for (; n < n4; n += 4) {
__vector_quad acc0;
INIT_1ACC();
register vector float ra0;
register vector float rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_B_1x4(k, n);
KERNEL_MMA_1ACC(rb0, ra0);
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
}
for (; n < n2; n += 2) {
vector float result = ((vector float){0.,0.,0.,0.});
register vector float ra0;
register vector float rb0;
for (k = 0; k < K; k++) {
LOAD_A_2x2(k, m);
LOAD_B_2x2(k, n);
KERNEL_VMADD_1VSR(ra0, rb0);
}
#if !defined(B0)
register vector float rc0;
#endif
SAVE_2x2_VSR(result, n, m);
}
for (; n < N; n++) {
vector float result = ((vector float){0.,0.,0.,0.});
register vector float ra0;
register vector float rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_B_1x1(k, n);
KERNEL_VMADD_1VSR(ra0, rb0);
}
#if !defined(B0)
register vector float rc0;
#endif
SAVE_1x2_VSR(result, n, m);
}
}
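/* Final row: vectorized over the columns where possible, scalar accumulation for the last column. */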
for (; m < M; m++) {
for (n = 0; n < n16; n += 16) {
vector float result = ((vector float){0.,0.,0.,0.});
vector float result1 = ((vector float){0.,0.,0.,0.});
vector float result2 = ((vector float){0.,0.,0.,0.});
vector float result3 = ((vector float){0.,0.,0.,0.});
register vector float ra0;
register vector float rb0, rb1, rb2, rb3;
for (k = 0; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_B_1x16(k, n);
KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3);
}
SAVE_4x1_VSR(result, n+0, m);
SAVE_4x1_VSR(result1, n+4, m);
SAVE_4x1_VSR(result2, n+8, m);
SAVE_4x1_VSR(result3, n+12, m);
}
for (; n < n8; n += 8) {
vector float result = ((vector float){0.,0.,0.,0.});
vector float result1 = ((vector float){0.,0.,0.,0.});
register vector float ra0;
register vector float rb0, rb1;
for (k = 0; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_B_1x8(k, n);
KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1);
}
SAVE_4x1_VSR(result, n+0, m);
SAVE_4x1_VSR(result1, n+4, m);
}
for (; n < n4; n += 4) {
vector float result = ((vector float){0.,0.,0.,0.});
register vector float ra0;
register vector float rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_B_1x4(k, n);
KERNEL_VMADD_1VSR(ra0, rb0);
}
SAVE_4x1_VSR(result, n+0, m);
}
for (; n < n2; n += 2) {
vector float result = ((vector float){0.,0.,0.,0.});
register vector float ra0;
register vector float rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_B_1x2(k, n);
KERNEL_VMADD_1VSR(ra0, rb0);
}
SAVE_2x1_VSR(result, n+0, m);
}
for (; n < N; n++) {
FLOAT result = 0.0f;
for (k = 0; k < K; k++) {
result += A[k*lda+m] * B[k*ldb+n];
}
result = result * alpha;
#if !defined(B0)
C[n*ldc+m] = (C[n*ldc+m] * beta) + result;
#else
C[n*ldc+m] = result;
#endif
}
}
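/* Release the temporary buffer allocated when A had to be repacked. */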
if (has_packing) free(packA);

return 0;
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "srot_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "srot_microk_power10.c"
#elif defined(POWER10)
#include "srot_microk_power8.c"
#include "srot_microk_power10.c"
#endif
#endif
@ -119,7 +117,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
if ( (inc_x == 1) && (inc_y == 1) )
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
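The align computation above counts how many leading elements must be handled before y reaches a 32-byte boundary; a minimal sketch of the same arithmetic, assuming a 4-byte FLOAT and a hypothetical helper name:

/* Hypothetical standalone check of the alignment arithmetic used above
   (assumes FLOAT is a 4-byte float). */
#include <stdint.h>
static inline long leading_elems(const float *y)
{
    /* bytes to the next 32-byte boundary, converted to elements;
       "& 0x7" maps the already-aligned case (8 elements) to 0 */
    return ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
}
/* leading_elems((float *)0x1008) == 6; leading_elems((float *)0x1000) == 0 */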

View File

@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "sscal_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "sscal_microk_power10.c"
#elif defined(POWER10)
#include "sscal_microk_power8.c"
#include "sscal_microk_power10.c"
#endif
#endif
@ -106,7 +104,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( da == 0.0 )
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
@ -140,7 +138,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
else
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;

View File

@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "sswap_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "swap_microk_power10.c"
#elif defined(POWER10)
#include "sswap_microk_power8.c"
#include "swap_microk_power10.c"
#endif
#endif
@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
if ( (inc_x == 1) && (inc_y == 1 ))
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 64 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;

View File

@ -30,9 +30,17 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
double alpha_r, double alpha_i)
{
#if !defined(CONJ)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
static const double mvec[2] = { -1.0, 1.0 };
#else
static const double mvec[2] = { 1.0, -1.0 };
#endif
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
static const double mvec[2] = { 1.0, -1.0 };
#else
static const double mvec[2] = { -1.0, 1.0 };
#endif
#endif
const double *mvecp = mvec;
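For reference, the sign vector selects between the plain and conjugated complex AXPY update; a minimal scalar sketch of what the mask implements (a hypothetical helper, not the actual vector kernel; which sign lands in which lane follows the element order the vector loads produce, hence the endianness-dependent initializers):

/* Hypothetical scalar reference for the masked complex AXPY above
   (not the POWER10 kernel itself).  sr/si play the role of the two
   mvec entries: (-1,+1) yields y += alpha * x, (+1,-1) yields the
   CONJ variant (conjugated alpha). */
static void zaxpy_ref(long n, const double *x, double *y,
                      double alpha_r, double alpha_i,
                      double sr, double si)
{
    for (long i = 0; i < n; i++) {
        double xr = x[2 * i], xi = x[2 * i + 1];
        y[2 * i]     += alpha_r * xr + sr * alpha_i * xi; /* real part */
        y[2 * i + 1] += alpha_r * xi + si * alpha_i * xr; /* imaginary part */
    }
}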

Some files were not shown because too many files have changed in this diff