Merge pull request #3488 from xianyi/develop

Update from develop branch for 0.3.19 release
Martin Kroeker 2021-12-19 20:54:49 +01:00 committed by GitHub
commit 488911486a
163 changed files with 24400 additions and 998 deletions

View File

@@ -3,10 +3,13 @@
 ##
 cmake_minimum_required(VERSION 2.8.5)
 project(OpenBLAS C ASM)
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
 set(OpenBLAS_PATCH_VERSION 19)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 # Adhere to GNU filesystem layout conventions
@@ -20,51 +23,68 @@ endif()
 #######
 if(MSVC)
 option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
 endif()
 option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
 option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
 option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
 option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
 option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF)
 if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
 option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
 else()
 set(NO_AFFINITY 1)
 endif()
 option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF)
 option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF)
+option(BUILD_STATIC_LIBS "Build static library" OFF)
+if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS)
+set(BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE)
+endif()
+if((BUILD_STATIC_LIBS AND BUILD_SHARED_LIBS) AND MSVC)
+message(WARNING "Could not enable both BUILD_STATIC_LIBS and BUILD_SHARED_LIBS with MSVC, Disable BUILD_SHARED_LIBS")
+set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build static library" FORCE)
+endif()
 # Add a prefix or suffix to all exported symbol names in the shared library.
 # Avoids conflicts with other BLAS libraries, especially when using
 # 64 bit integer interfaces in OpenBLAS.
 set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in the shared library to avoid conflicts with other BLAS libraries" )
 set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" )
 #######
 if(BUILD_WITHOUT_LAPACK)
 set(NO_LAPACK 1)
 set(NO_LAPACKE 1)
 endif()
 if(BUILD_WITHOUT_CBLAS)
 set(NO_CBLAS 1)
 endif()
 #######
 if(MSVC AND MSVC_STATIC_CRT)
 set(CompilerFlags
 CMAKE_CXX_FLAGS
 CMAKE_CXX_FLAGS_DEBUG
 CMAKE_CXX_FLAGS_RELEASE
 CMAKE_C_FLAGS
 CMAKE_C_FLAGS_DEBUG
 CMAKE_C_FLAGS_RELEASE
 )
 foreach(CompilerFlag ${CompilerFlags})
 string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
 endforeach()
 endif()
 message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")
@@ -98,7 +118,7 @@ endif ()
 # set which float types we want to build for
 if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16)
 # if none are defined, build for all
 # set(BUILD_BFLOAT16 true)
 set(BUILD_SINGLE true)
 set(BUILD_DOUBLE true)
 set(BUILD_COMPLEX true)
@@ -143,9 +163,10 @@ endif ()
 set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
 set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
 if(MSVC)
 set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug)
 set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release)
 endif ()
 # get obj vars into format that add_library likes: $<TARGET_OBJS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html)
 set(TARGET_OBJS "")
 foreach (SUBDIR ${SUBDIRS})
@@ -183,12 +204,61 @@ if (${DYNAMIC_ARCH})
 endif ()
 # add objects to the openblas lib
-add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
-target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)
+if(NOT NO_LAPACK)
+add_library(LAPACK OBJECT ${LA_SOURCES})
+list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACK>")
+endif()
+if(NOT NO_LAPACKE)
+add_library(LAPACKE OBJECT ${LAPACKE_SOURCES})
+list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACKE>")
+endif()
+if(BUILD_RELAPACK)
+add_library(RELAPACK OBJECT ${RELA_SOURCES})
+list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:RELAPACK>")
+endif()
+set(OpenBLAS_LIBS "")
+if(BUILD_STATIC_LIBS)
+add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
+target_include_directories(${OpenBLAS_LIBNAME}_static INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)
+list(APPEND OpenBLAS_LIBS ${OpenBLAS_LIBNAME}_static)
+endif()
+if(BUILD_SHARED_LIBS)
+add_library(${OpenBLAS_LIBNAME}_shared SHARED ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
+target_include_directories(${OpenBLAS_LIBNAME}_shared INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)
+list(APPEND OpenBLAS_LIBS ${OpenBLAS_LIBNAME}_shared)
+endif()
+if(BUILD_STATIC_LIBS)
+add_library(${OpenBLAS_LIBNAME} ALIAS ${OpenBLAS_LIBNAME}_static)
+else()
+add_library(${OpenBLAS_LIBNAME} ALIAS ${OpenBLAS_LIBNAME}_shared)
+endif()
+set_target_properties(${OpenBLAS_LIBS} PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME})
 # Android needs to explicitly link against libm
 if(ANDROID)
-target_link_libraries(${OpenBLAS_LIBNAME} m)
+if(BUILD_STATIC_LIBS)
+target_link_libraries(${OpenBLAS_LIBNAME}_static m)
+endif()
+if(BUILD_SHARED_LIBS)
+target_link_libraries(${OpenBLAS_LIBNAME}_shared m)
+endif()
+endif()
+if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS)
+set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
+if (NOT NOFORTRAN)
+set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
+set (CMAKE_Fortran_CREATE_SHARED_LIBRARY
+"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
+"sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
+"sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
+"sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'")
+else ()
+set (CMAKE_C_CREATE_SHARED_LIBRARY
+"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
+"sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
+endif ()
+endif ()
 endif()
 # Handle MSVC exports
@@ -197,21 +267,21 @@ if(MSVC AND BUILD_SHARED_LIBS)
 include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
 else()
 # Creates verbose .def file (51KB vs 18KB)
-set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true)
+set_target_properties(${OpenBLAS_LIBNAME}_shared PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true)
 endif()
 endif()
 # Set output for libopenblas
-set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
-set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
-set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS")
+set_target_properties( ${OpenBLAS_LIBS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
+set_target_properties( ${OpenBLAS_LIBS} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
+set_target_properties( ${OpenBLAS_LIBS} PROPERTIES EXPORT_NAME "OpenBLAS")
 foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
 string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
-set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
-set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
-set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
+set_target_properties( ${OpenBLAS_LIBS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
+set_target_properties( ${OpenBLAS_LIBS} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
+set_target_properties( ${OpenBLAS_LIBS} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
 endforeach()
 enable_testing()
@@ -220,10 +290,17 @@ if (USE_THREAD)
 # Add threading library to linker
 find_package(Threads)
 if (THREADS_HAVE_PTHREAD_ARG)
-set_property(TARGET ${OpenBLAS_LIBNAME} PROPERTY COMPILE_OPTIONS "-pthread")
-set_property(TARGET ${OpenBLAS_LIBNAME} PROPERTY INTERFACE_COMPILE_OPTIONS "-pthread")
+set_target_properties(${OpenBLAS_LIBS} PROPERTIES
+COMPILE_OPTIONS "-pthread"
+INTERFACE_COMPILE_OPTIONS "-pthread"
+)
 endif()
-target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})
+if(BUILD_STATIC_LIBS)
+target_link_libraries(${OpenBLAS_LIBNAME}_static ${CMAKE_THREAD_LIBS_INIT})
+endif()
+if(BUILD_SHARED_LIBS)
+target_link_libraries(${OpenBLAS_LIBNAME}_shared ${CMAKE_THREAD_LIBS_INIT})
+endif()
 endif()
 #if (MSVC OR NOT NOFORTRAN)
@@ -239,97 +316,109 @@ if (NOT NOFORTRAN)
 add_subdirectory(ctest)
 endif()
 add_subdirectory(lapack-netlib/TESTING)
 if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
 add_subdirectory(cpp_thread_test)
 endif()
 endif()
-set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
+set_target_properties(${OpenBLAS_LIBS} PROPERTIES
 VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}
 SOVERSION ${OpenBLAS_MAJOR_VERSION}
 )
 if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
 if (NOT MSVC)
-target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition")
+target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition")
 else()
 set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE")
 endif()
 endif()
 if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
 if (NOT DEFINED ARCH)
 set(ARCH_IN "x86_64")
 else()
 set(ARCH_IN ${ARCH})
 endif()
 if (${CORE} STREQUAL "generic")
 set(ARCH_IN "GENERIC")
 endif ()
 if (NOT DEFINED EXPRECISION)
 set(EXPRECISION_IN 0)
 else()
 set(EXPRECISION_IN ${EXPRECISION})
 endif()
 if (NOT DEFINED NO_CBLAS)
 set(NO_CBLAS_IN 0)
 else()
 set(NO_CBLAS_IN ${NO_CBLAS})
 endif()
 if (NOT DEFINED NO_LAPACK)
 set(NO_LAPACK_IN 0)
 else()
 set(NO_LAPACK_IN ${NO_LAPACK})
 endif()
 if (NOT DEFINED NO_LAPACKE)
 set(NO_LAPACKE_IN 0)
 else()
 set(NO_LAPACKE_IN ${NO_LAPACKE})
 endif()
 if (NOT DEFINED NEED2UNDERSCORES)
 set(NEED2UNDERSCORES_IN 0)
 else()
 set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES})
 endif()
 if (NOT DEFINED ONLY_CBLAS)
 set(ONLY_CBLAS_IN 0)
 else()
 set(ONLY_CBLAS_IN ${ONLY_CBLAS})
 endif()
 if (NOT DEFINED BU)
 set(BU _)
 endif()
 if (NOT ${SYMBOLPREFIX} STREQUAL "")
 message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
 endif()
 if (NOT ${SYMBOLSUFFIX} STREQUAL "")
 message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
 endif()
-add_custom_command(TARGET ${OpenBLAS_LIBNAME} POST_BUILD
+add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
 COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def
 COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
 COMMENT "renaming symbols"
 )
 endif()
 # Install project
 # Install libraries
-install(TARGETS ${OpenBLAS_LIBNAME}
-EXPORT "OpenBLAS${SUFFIX64}Targets"
-RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
-ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
+if(BUILD_SHARED_LIBS AND BUILD_STATIC_LIBS)
+install(TARGETS ${OpenBLAS_LIBNAME}_shared
+EXPORT "OpenBLAS${SUFFIX64}Targets"
+RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
+install(TARGETS ${OpenBLAS_LIBNAME}_static
+ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
+else()
+install(TARGETS ${OpenBLAS_LIBS}
+EXPORT "OpenBLAS${SUFFIX64}Targets"
+RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
+endif()
 # Install headers
 set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
@@ -365,36 +454,41 @@ if(NOT NOFORTRAN)
 endif()
 if(NOT NO_CBLAS)
 message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
 set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h)
 file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
 string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
 if (NOT ${SYMBOLPREFIX} STREQUAL "")
 string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
 string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
 string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
 string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
 endif()
 if (NOT ${SYMBOLSUFFIX} STREQUAL "")
 string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
 string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
 string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
 string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
 endif()
 file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}")
 install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 endif()
 if(NOT NO_LAPACKE)
 message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}")
-add_dependencies( ${OpenBLAS_LIBNAME} genlapacke)
-FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h")
-install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+if(BUILD_STATIC_LIBS)
+add_dependencies( ${OpenBLAS_LIBNAME}_static genlapacke)
+endif()
+if(BUILD_SHARED_LIBS)
+add_dependencies( ${OpenBLAS_LIBNAME}_shared genlapacke)
+endif()
+FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h")
+install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 ADD_CUSTOM_TARGET(genlapacke
 COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
 )
 install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
 endif()
 # Install pkg-config files
@@ -419,4 +513,3 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
 install(EXPORT "${PN}${SUFFIX64}Targets"
 NAMESPACE "${PN}${SUFFIX64}::"
 DESTINATION ${CMAKECONFIG_INSTALL_DIR})
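For reference, a minimal out-of-tree configure-and-build sketch using the BUILD_STATIC_LIBS / BUILD_SHARED_LIBS options added above (directory name and build type are arbitrary; on MSVC the two cannot be combined, as the warning in the diff notes):

    mkdir build && cd build
    cmake -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON ..
    cmake --build . --config Release

Both targets consume the same TARGET_OBJS object libraries, so enabling both does not compile the sources twice.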

View File

@@ -197,3 +197,7 @@ In chronological order:
 * River Dillon <oss@outerpassage.net>
 * [2021-07-10] fix compilation with musl libc
+* Bine Brank <https://github.com/binebrank>
+* [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE
+* [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM

View File

@@ -1,4 +1,51 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.19
+19-Dec-2021
+general:
+- reverted unsafe TRSV/ZRSV optimizations introduced in 0.3.16
+- fixed a potential thread race in the thread buffer reallocation routines
+  that were introduced in 0.3.18
+- fixed miscounting of thread pool size on Linux with OMP_PROC_BIND=TRUE
+- fixed CBLAS interfaces for CSROT/ZSROT and CROTG/ZROTG
+- made automatic library suffix for CMAKE builds with INTERFACE64 available
+  to CBLAS-only builds
+x86_64:
+- DYNAMIC_ARCH builds now fall back to the cpu with most similar capabilities
+  when an unknown CPUID is encountered, instead of defaulting to Prescott
+- added cpu detection for Intel Alder Lake
+- added cpu detection for Intel Sapphire Rapids
+- added an optimized SBGEMM kernel for Sapphire Rapids
+- fixed DYNAMIC_ARCH builds on OSX with CMAKE
+- worked around DYNAMIC_ARCH builds made on Sandybridge failing on SkylakeX
+- fixed missing thread initialization for static builds on Windows/MSVC
+- fixed an excessive read in ZSYMV
+POWER:
+- added support for POWER10 in big-endian mode
+- added support for building with CMAKE
+- added optimized SGEMM and DGEMM kernels for small matrix sizes
+ARMV8:
+- added basic support and cputype detection for Fujitsu A64FX
+- added a generic ARMV8SVE target
+- added SVE-enabled SGEMM and DGEMM kernels for ARMV8SVE and A64FX
+- added optimized CGEMM and ZGEMM kernels for Cortex A53 and A55 cpus
+- fixed cpuid detection for Apple M1 and improved performance
+- improved compiler flag setting in CMAKE builds
+RISCV64:
+- fixed improper initialization in CSCAL/ZSCAL for strided access patterns
+MIPS:
+- added a GENERIC target for MIPS32
+- added support for cross-compiling to MIPS32 on x86_64 using CMAKE
+MIPS64:
+- fixed misdetection of MSA capability
 ====================================================================
 Version 0.3.18
 02-Oct-2021
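Several of the fixes above only take effect on particular targets (the DYNAMIC_ARCH fallback, Alder Lake and Sapphire Rapids detection), so it can be useful to check what a given binary actually selected. A small sketch using the config query functions exported through cblas.h:

    #include <stdio.h>
    #include <cblas.h>

    int main(void) {
        /* version and build options, e.g. "OpenBLAS 0.3.19 ..." */
        printf("%s\n", openblas_get_config());
        /* kernel set chosen at runtime (relevant for DYNAMIC_ARCH builds) */
        printf("core: %s\n", openblas_get_corename());
        return 0;
    }

Link with -lopenblas and run it on the machine in question.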

View File

@@ -32,7 +32,7 @@ export NOFORTRAN
 export NO_LAPACK
 endif
-LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
+LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
 SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test

View File

@@ -1,6 +1,9 @@
 ifneq ($(C_COMPILER), PGI)
-ifneq ($(GCCVERSIONGT4), 1)
+ifeq ($(C_COMPILER), CLANG)
+ISCLANG=1
+endif
+ifneq (1, $(filter 1,$(GCCVERSIONGT4) $(ISCLANG)))
 CCOMMON_OPT += -march=armv8-a
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8-a
@@ -17,6 +20,13 @@ FCOMMON_OPT += -march=armv8-a
 endif
 endif
+ifeq ($(CORE), ARMV8SVE)
+CCOMMON_OPT += -march=armv8-a+sve
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=armv8-a+sve
+endif
+endif
 ifeq ($(CORE), CORTEXA53)
 CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
 ifneq ($(F_COMPILER), NAG)
@@ -48,7 +58,7 @@ endif
 # Use a72 tunings because Neoverse-N1 is only available
 # in GCC>=9
 ifeq ($(CORE), NEOVERSEN1)
-ifeq ($(GCCVERSIONGTEQ7), 1)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
 ifeq ($(GCCVERSIONGTEQ9), 1)
 CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
 ifneq ($(F_COMPILER), NAG)
@@ -70,7 +80,7 @@ endif
 # Use a53 tunings because a55 is only available in GCC>=8.1
 ifeq ($(CORE), CORTEXA55)
-ifeq ($(GCCVERSIONGTEQ7), 1)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
 ifeq ($(GCCVERSIONGTEQ8), 1)
 CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
 ifneq ($(F_COMPILER), NAG)
@@ -132,7 +142,7 @@ FCOMMON_OPT += -march=armv8.3-a
 endif
 endif
-ifeq ($(GCCVERSIONGTEQ9), 1)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
 ifeq ($(CORE), TSV110)
 CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
 ifneq ($(F_COMPILER), NAG)
@@ -150,6 +160,15 @@ endif
 endif
 endif
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
+ifeq ($(CORE), A64FX)
+CCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx
+endif
+endif
+endif
 endif
 endif
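A usage sketch for the targets wired up above (job count and compilers are arbitrary; the A64FX tuning path needs GCC >= 11 or clang, per the version check in the diff):

    make TARGET=ARMV8SVE CC=gcc FC=gfortran -j8
    make TARGET=A64FX CC=clang NOFORTRAN=1 -j8

With clang, ISCLANG=1 now satisfies the checks that previously required a sufficiently new GCC.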

View File

@@ -3,7 +3,7 @@
 #
 # This library's version
-VERSION = 0.3.18
+VERSION = 0.3.18.dev
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

View File

@@ -9,11 +9,10 @@ ifndef TOPDIR
 TOPDIR = .
 endif
-# If ARCH is not set, we use the host system's architecture for getarch compile options.
-ifndef ARCH
+# we need to use the host system's architecture for getarch compile options even especially when cross-compiling
 HOSTARCH := $(shell uname -m)
-else
-HOSTARCH = $(ARCH)
+ifeq ($(HOSTARCH), amd64)
+HOSTARCH=x86_64
 endif
 # Catch conflicting usage of ARCH in some BSD environments
@@ -102,7 +101,7 @@ GETARCH_FLAGS += -DUSER_TARGET
 ifeq ($(TARGET), GENERIC)
 ifeq ($(DYNAMIC_ARCH), 1)
 override NO_EXPRECISION=1
-export NO_EXPRECiSION
+export NO_EXPRECISION
 endif
 endif
 endif
@@ -119,6 +118,9 @@ endif
 ifeq ($(TARGET), COOPERLAKE)
 GETARCH_FLAGS := -DFORCE_NEHALEM
 endif
+ifeq ($(TARGET), SAPPHIRERAPIDS)
+GETARCH_FLAGS := -DFORCE_NEHALEM
+endif
 ifeq ($(TARGET), SANDYBRIDGE)
 GETARCH_FLAGS := -DFORCE_NEHALEM
 endif
@@ -143,8 +145,13 @@ endif
 ifeq ($(TARGET), POWER8)
 GETARCH_FLAGS := -DFORCE_POWER6
 endif
+ifeq ($(TARGET), POWER9)
+GETARCH_FLAGS := -DFORCE_POWER6
+endif
+ifeq ($(TARGET), POWER10)
+GETARCH_FLAGS := -DFORCE_POWER6
+endif
 endif
 #TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
 #
@@ -164,6 +171,9 @@ endif
 ifeq ($(TARGET_CORE), COOPERLAKE)
 GETARCH_FLAGS := -DFORCE_NEHALEM
 endif
+ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
+GETARCH_FLAGS := -DFORCE_NEHALEM
+endif
 ifeq ($(TARGET_CORE), SANDYBRIDGE)
 GETARCH_FLAGS := -DFORCE_NEHALEM
 endif
@@ -251,6 +261,8 @@ endif
 #For small matrix optimization
 ifeq ($(ARCH), x86_64)
 SMALL_MATRIX_OPT = 1
+else ifeq ($(CORE), POWER10)
+SMALL_MATRIX_OPT = 1
 endif
 ifeq ($(SMALL_MATRIX_OPT), 1)
 CCOMMON_OPT += -DSMALL_MATRIX_OPT
@@ -260,6 +272,10 @@ endif
 ifndef GOTOBLAS_MAKEFILE
 export GOTOBLAS_MAKEFILE = 1
+# Determine if the assembler is GNU Assembler
+HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?)
+GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS)
 # Generating Makefile.conf and config.h
 DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
@@ -307,7 +323,7 @@ else
 SMP = 1
 endif
 else
-ifeq ($(NUM_THREAD), 1)
+ifeq ($(NUM_THREADS), 1)
 SMP =
 else
 SMP = 1
@@ -892,15 +908,25 @@ endif
 ifeq ($(C_COMPILER), PGI)
 PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20)
-PGCVERSIONGTEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \>= 20)
-PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -c 4-5` == 11)
+PGCVERSIONEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 20)
+PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |cut -d "-" -f 1 |sed -e "s/[^0-9.]//g" |cut -c 4-5` \>= 11)
 PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONEQ20)$(PGCMINORVERSIONGE11)
-ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 110 111 011))
+ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 100 101 011))
 NEWPGI := 1
+PGCVERSIONGT21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 21)
+PGCVERSIONEQ21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 21)
+PGCVERSIONCHECK2 := $(PGCVERSIONGT21)$(PGCVERSIONEQ21)$(PGCMINORVERSIONGE11)
+ifeq ($(PGCVERSIONCHECK2), $(filter $(PGCVERSIONCHECK2), 100 101 011))
+NEWPGI2 := 1
+endif
 endif
 ifdef BINARY64
 ifeq ($(ARCH), x86_64)
+ifneq ($(NEWPGI2),1)
 CCOMMON_OPT += -tp p7-64
+else
+CCOMMON_OPT += -tp px
+endif
 ifneq ($(NEWPGI),1)
 CCOMMON_OPT += -D__MMX__ -Mnollvm
 endif
@@ -915,7 +941,11 @@ endif
 endif
 endif
 else
+ifneq ($(NEWPGI2),1)
 CCOMMON_OPT += -tp p7
+else
+CCOMMON_OPT += -tp px
+endif
 endif
 endif
@@ -1092,8 +1122,12 @@ FCOMMON_OPT += -i8
 endif
 endif
 ifeq ($(ARCH), x86_64)
+ifneq ($(NEWPGI2),1)
 FCOMMON_OPT += -tp p7-64
 else
+FCOMMON_OPT += -tp px
+endif
+else
 ifeq ($(ARCH), power)
 ifeq ($(CORE), POWER6)
 $(warning NVIDIA HPC compilers do not support POWER6.)
@@ -1643,8 +1677,10 @@ export HAVE_VFP
 export HAVE_VFPV3
 export HAVE_VFPV4
 export HAVE_NEON
-export HAVE_MSA
-export MSA_FLAGS
+ifndef NO_MSA
+export HAVE_MSA
+export MSA_FLAGS
+endif
 export KERNELDIR
 export FUNCTION_PROFILE
 export TARGET_CORE
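The HOSTARCH change above mainly matters when cross-compiling, where getarch runs on the build host while the library targets another architecture; an illustrative invocation (the toolchain prefix is an example, not part of this diff):

    make HOSTCC=gcc CC=aarch64-linux-gnu-gcc FC=aarch64-linux-gnu-gfortran TARGET=ARMV8

A single-threaded library can still be requested with NUM_THREADS=1, the variable whose spelling the SMP check above now tests correctly.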

View File

@@ -81,6 +81,40 @@ CCOMMON_OPT += -march=cooperlake
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=cooperlake
 endif
+else # gcc not support, fallback to avx512
+CCOMMON_OPT += -march=skylake-avx512
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=skylake-avx512
+endif
+endif
+endif
+ifeq ($(OSNAME), CYGWIN_NT)
+CCOMMON_OPT += -fno-asynchronous-unwind-tables
+FCOMMON_OPT += -fno-asynchronous-unwind-tables
+endif
+ifeq ($(OSNAME), WINNT)
+ifeq ($(C_COMPILER), GCC)
+CCOMMON_OPT += -fno-asynchronous-unwind-tables
+FCOMMON_OPT += -fno-asynchronous-unwind-tables
+endif
+endif
+endif
+endif
+ifeq ($(CORE), SAPPHIRERAPIDS)
+ifndef NO_AVX512
+ifeq ($(C_COMPILER), GCC)
+# sapphire rapids support was added in 11
+ifeq ($(GCCVERSIONGTEQ11), 1)
+CCOMMON_OPT += -march=sapphirerapids
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=sapphirerapids
+endif
+else # gcc not support, fallback to avx512
+CCOMMON_OPT += -march=skylake-avx512
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=skylake-avx512
+endif
 endif
 endif
 ifeq ($(OSNAME), CYGWIN_NT)
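As with the Cooperlake block above, the new Sapphire Rapids block only emits -march=sapphirerapids when the compiler is recent enough; an illustrative build (compiler names are examples):

    make TARGET=SAPPHIRERAPIDS CC=gcc-11 FC=gfortran-11

With an older GCC the same command silently falls back to -march=skylake-avx512.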

View File

@@ -23,6 +23,7 @@ HASWELL
 SKYLAKEX
 ATOM
 COOPERLAKE
+SAPPHIRERAPIDS
 b)AMD CPU:
 ATHLON

View File

@@ -29,15 +29,15 @@ environment:
 global:
 CONDA_INSTALL_LOCN: C:\\Miniconda36-x64
 matrix:
-- COMPILER: clang-cl
-WITH_FORTRAN: ON
-- COMPILER: clang-cl
-DYNAMIC_ARCH: ON
-WITH_FORTRAN: OFF
-- COMPILER: cl
-- COMPILER: MinGW64-gcc-7.2.0-mingw
-DYNAMIC_ARCH: OFF
-WITH_FORTRAN: ignore
+# - COMPILER: clang-cl
+# WITH_FORTRAN: ON
+# - COMPILER: clang-cl
+# DYNAMIC_ARCH: ON
+# WITH_FORTRAN: OFF
+# - COMPILER: cl
+# - COMPILER: MinGW64-gcc-7.2.0-mingw
+# DYNAMIC_ARCH: OFF
+# WITH_FORTRAN: ignore
 - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
 COMPILER: MinGW-gcc-6.3.0-32
 - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
@@ -46,6 +46,7 @@ environment:
 install:
 - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
+- if [%COMPILER%]==[clang-cl] conda update --yes -n base conda
 - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
 - if [%COMPILER%]==[clang-cl] conda config --set auto_update_conda false
 - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1
@@ -64,8 +65,8 @@ before_build:
 - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
 - if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
 - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
-- if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
-- if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
+- if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DMSVC_STATIC_CRT=ON ..
+- if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
 - if [%USE_OPENMP%]==[ON] cmake -DUSE_OPENMP=ON ..
 - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..

View File

@@ -76,6 +76,49 @@ jobs:
 dir
 openblas_utest.exe
+- job: Windows_mingw_gmake
+pool:
+vmImage: 'windows-latest'
+steps:
+- script: |
+mingw32-make CC=gcc FC=gfortran DYNAMIC_ARCH=1 DYNAMIC_LIST="NEHALEM SANDYBRIDGE HASWELL"
+- job: Windows_clang_cmake
+pool:
+vmImage: 'windows-latest'
+steps:
+- script: |
+set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%"
+set "LIB=C:\Miniconda\Library\lib;%LIB%"
+set "CPATH=C:\Miniconda\Library\include;%CPATH%
+conda config --add channels conda-forge --force
+conda config --set auto_update_conda false
+conda install --yes ninja
+call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+mkdir build
+cd build
+cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DNOFORTRAN=1 -DMSVC_STATIC_CRT=ON ..
+cmake --build . --config Release
+ctest
+- job: Windows_flang_clang
+pool:
+vmImage: 'windows-latest'
+steps:
+- script: |
+set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%"
+set "LIB=C:\Miniconda\Library\lib;%LIB%"
+set "CPATH=C:\Miniconda\Library\include;%CPATH%"
+conda config --add channels conda-forge --force
+conda config --set auto_update_conda false
+conda install --yes --quiet ninja flang
+mkdir build
+cd build
+call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
+cmake --build . --config Release
+ctest
 - job: OSX_OpenMP
 pool:
 vmImage: 'macOS-10.15'
@@ -122,7 +165,7 @@ jobs:
 make
 ctest
-- job: OSX_OpenMP_Clang_gf_cmake
+- job: OSX_dynarch_cmake
 pool:
 vmImage: 'macOS-10.15'
 variables:
@@ -130,12 +173,10 @@ jobs:
 LIBRARY_PATH: /usr/local/opt/llvm/lib
 steps:
 - script: |
-brew update
-brew install llvm libomp
 mkdir build
 cd build
-cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNO_AVX512=1 ..
-make
+cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON ..
+cmake --build .
 ctest
 - job: OSX_Ifort_Clang
@@ -179,7 +220,7 @@ jobs:
 brew update
 brew install --cask android-ndk
 export ANDROID_NDK_HOME=/usr/local/share/android-ndk
-make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4
+make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/llvm-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4
 - job: OSX_IOS_ARMV8
 pool:
@@ -206,9 +247,9 @@ jobs:
 vmImage: 'ubuntu-latest'
 steps:
 - script: |
-wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.1/alpine-chroot-install \
-&& echo '7c7e3fa378e69aecc7f5f01bbc759e5f0a9d9b74 alpine-chroot-install' | sha1sum -c \
+wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \
+&& echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc alpine-chroot-install' | sha1sum -c \
 || exit 1
 alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
 sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo'
 alpine make DYNAMIC_ARCH=1 BINARY=64

View File

@@ -125,7 +125,7 @@ int main(int argc, char *argv[]){
 fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
 for(j = 0; j < m; j++){
 for(i = 0; i < n * COMPSIZE; i++){
-a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
 }
 }
@@ -162,7 +162,7 @@ int main(int argc, char *argv[]){
 fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
 for(j = 0; j < m; j++){
 for(i = 0; i < n * COMPSIZE; i++){
-a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
 }
 }

View File

@@ -109,7 +109,7 @@ if (${ARCH} STREQUAL "ia64")
 endif ()
 endif ()
-if (MIPS64)
+if (MIPS32 OR MIPS64)
 set(NO_BINARY_MODE 1)
 endif ()

View File

@@ -15,6 +15,11 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS
 if (NO_BINARY_MODE)
+if (MIPS32)
+set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=32")
+set(BINARY_DEFINED 1)
+endif ()
 if (MIPS64)
 if (BINARY64)
 set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=64")
@@ -126,6 +131,65 @@ if (${CORE} STREQUAL COOPERLAKE)
 endif ()
 endif ()
+if (${CORE} STREQUAL SAPPHIRERAPIDS)
+if (NOT DYNAMIC_ARCH)
+if (NOT NO_AVX512)
+execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0)
+set (CCOMMON_OPT "${CCOMMON_OPT} -march=sapphirerapids")
+else ()
+set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512")
+endif()
+endif ()
+endif ()
+endif ()
+if (${CORE} STREQUAL A64FX)
+if (NOT DYNAMIC_ARCH)
+execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0)
+set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx")
+else ()
+set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
+endif()
+endif ()
+endif ()
+if (${CORE} STREQUAL ARMV8SVE)
+if (NOT DYNAMIC_ARCH)
+set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
+endif ()
+endif ()
+if (${CORE} STREQUAL POWER10)
+if (NOT DYNAMIC_ARCH)
+execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2)
+set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math")
+else ()
+message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10." )
+endif()
+endif ()
+endif ()
+if (${CORE} STREQUAL POWER9)
+if (NOT DYNAMIC_ARCH)
+execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0)
+set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math")
+else ()
+set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
+message(WARNING "Compiler GCC.${GCC_VERSION} does not fully support Power9.")
+endif ()
+endif ()
+endif ()
+if (${CORE} STREQUAL POWER8)
+if (NOT DYNAMIC_ARCH)
+set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
+endif ()
+endif ()
 if (NOT DYNAMIC_ARCH)
 if (HAVE_AVX2)
 set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2")
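The same core names can be selected in CMake builds; a minimal sketch for the targets handled by the new blocks above (build directory and generator are arbitrary):

    cmake -DTARGET=POWER10 ..    # needs GCC >= 10.2 per the check above
    cmake -DTARGET=A64FX ..      # armv8.2-a+sve, plus -mtune=a64fx on GCC >= 11
    cmake -DTARGET=ARMV8SVE ..   # plain armv8-a+sve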

View File

@@ -3,11 +3,6 @@
 ## Description: Ported from portion of OpenBLAS/Makefile.system
 ## Sets Fortran related variables.
-if (INTERFACE64)
-set(SUFFIX64 64)
-set(SUFFIX64_UNDERSCORE _64)
-endif()
 if (${F_COMPILER} STREQUAL "FLANG")
 set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
 if (BINARY64 AND INTERFACE64)

View File

@@ -1,214 +1,218 @@
 # helper functions for the kernel CMakeLists.txt
+function(SetFallback KERNEL SOURCE_PATH)
+if (NOT (DEFINED ${KERNEL}))
+set(${KERNEL} ${SOURCE_PATH} PARENT_SCOPE)
+endif ()
+endfunction()
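SetFallback assigns the named kernel variable only when nothing has defined it yet, so values provided elsewhere (for example by a target's KERNEL file) are left alone while everything still undefined picks up the generic default. A minimal illustration (daxpy_custom.S is a made-up file name):

    set(DAXPYKERNEL daxpy_custom.S)   # already defined elsewhere
    SetFallback(DAXPYKERNEL axpy.S)   # no effect, keeps daxpy_custom.S
    SetFallback(SAXPYKERNEL axpy.S)   # undefined until now, becomes axpy.S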
+# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file.
 macro(SetDefaultL1)
-set(SAMAXKERNEL amax.S)
-set(DAMAXKERNEL amax.S)
-set(QAMAXKERNEL amax.S)
-set(CAMAXKERNEL zamax.S)
-set(ZAMAXKERNEL zamax.S)
-set(XAMAXKERNEL zamax.S)
-set(SAMINKERNEL amin.S)
-set(DAMINKERNEL amin.S)
-set(QAMINKERNEL amin.S)
-set(CAMINKERNEL zamin.S)
-set(ZAMINKERNEL zamin.S)
-set(XAMINKERNEL zamin.S)
-set(SMAXKERNEL max.S)
-set(DMAXKERNEL max.S)
-set(QMAXKERNEL max.S)
-set(SMINKERNEL min.S)
-set(DMINKERNEL min.S)
-set(QMINKERNEL min.S)
-set(ISAMAXKERNEL iamax.S)
-set(IDAMAXKERNEL iamax.S)
-set(IQAMAXKERNEL iamax.S)
-set(ICAMAXKERNEL izamax.S)
-set(IZAMAXKERNEL izamax.S)
-set(IXAMAXKERNEL izamax.S)
-set(ISAMINKERNEL iamin.S)
-set(IDAMINKERNEL iamin.S)
-set(IQAMINKERNEL iamin.S)
-set(ICAMINKERNEL izamin.S)
-set(IZAMINKERNEL izamin.S)
-set(IXAMINKERNEL izamin.S)
-set(ISMAXKERNEL iamax.S)
-set(IDMAXKERNEL iamax.S)
-set(IQMAXKERNEL iamax.S)
-set(ISMINKERNEL iamin.S)
-set(IDMINKERNEL iamin.S)
-set(IQMINKERNEL iamin.S)
-set(SASUMKERNEL asum.S)
-set(DASUMKERNEL asum.S)
-set(CASUMKERNEL zasum.S)
-set(ZASUMKERNEL zasum.S)
-set(QASUMKERNEL asum.S)
-set(XASUMKERNEL zasum.S)
-set(SAXPYKERNEL axpy.S)
-set(DAXPYKERNEL axpy.S)
-set(CAXPYKERNEL zaxpy.S)
-set(ZAXPYKERNEL zaxpy.S)
-set(QAXPYKERNEL axpy.S)
-set(XAXPYKERNEL zaxpy.S)
-set(SCOPYKERNEL copy.S)
-set(DCOPYKERNEL copy.S)
-set(CCOPYKERNEL zcopy.S)
-set(ZCOPYKERNEL zcopy.S)
-set(QCOPYKERNEL copy.S)
-set(XCOPYKERNEL zcopy.S)
-set(SDOTKERNEL dot.S)
-set(DDOTKERNEL dot.S)
-set(CDOTKERNEL zdot.S)
-set(ZDOTKERNEL zdot.S)
-set(QDOTKERNEL dot.S)
-set(XDOTKERNEL zdot.S)
-set(SNRM2KERNEL nrm2.S)
-set(DNRM2KERNEL nrm2.S)
-set(QNRM2KERNEL nrm2.S)
-set(CNRM2KERNEL znrm2.S)
-set(ZNRM2KERNEL znrm2.S)
-set(XNRM2KERNEL znrm2.S)
-set(SROTKERNEL rot.S)
-set(DROTKERNEL rot.S)
-set(QROTKERNEL rot.S)
-set(CROTKERNEL zrot.S)
-set(ZROTKERNEL zrot.S)
-set(XROTKERNEL zrot.S)
-set(SSCALKERNEL scal.S)
-set(DSCALKERNEL scal.S)
-set(CSCALKERNEL zscal.S)
-set(ZSCALKERNEL zscal.S)
-set(QSCALKERNEL scal.S)
-set(XSCALKERNEL zscal.S)
-set(SSWAPKERNEL swap.S)
-set(DSWAPKERNEL swap.S)
-set(CSWAPKERNEL zswap.S)
-set(ZSWAPKERNEL zswap.S)
-set(QSWAPKERNEL swap.S)
-set(XSWAPKERNEL zswap.S)
-set(SGEMVNKERNEL gemv_n.S)
-set(SGEMVTKERNEL gemv_t.S)
-set(DGEMVNKERNEL gemv_n.S)
-set(DGEMVTKERNEL gemv_t.S)
-set(CGEMVNKERNEL zgemv_n.S)
-set(CGEMVTKERNEL zgemv_t.S)
-set(ZGEMVNKERNEL zgemv_n.S)
-set(ZGEMVTKERNEL zgemv_t.S)
-set(QGEMVNKERNEL gemv_n.S)
-set(QGEMVTKERNEL gemv_t.S)
-set(XGEMVNKERNEL zgemv_n.S)
-set(XGEMVTKERNEL zgemv_t.S)
-set(SCABS_KERNEL ../generic/cabs.c)
-set(DCABS_KERNEL ../generic/cabs.c)
-set(QCABS_KERNEL ../generic/cabs.c)
-set(LSAME_KERNEL ../generic/lsame.c)
-set(SAXPBYKERNEL ../arm/axpby.c)
-set(DAXPBYKERNEL ../arm/axpby.c)
-set(CAXPBYKERNEL ../arm/zaxpby.c)
-set(ZAXPBYKERNEL ../arm/zaxpby.c)
-set(SSUMKERNEL sum.S)
-set(DSUMKERNEL sum.S)
-set(CSUMKERNEL zsum.S)
-set(ZSUMKERNEL zsum.S)
-set(QSUMKERNEL sum.S)
-set(XSUMKERNEL zsum.S)
+SetFallback(SAMAXKERNEL amax.S)
+SetFallback(DAMAXKERNEL amax.S)
+SetFallback(QAMAXKERNEL amax.S)
+SetFallback(CAMAXKERNEL zamax.S)
+SetFallback(ZAMAXKERNEL zamax.S)
+SetFallback(XAMAXKERNEL zamax.S)
+SetFallback(SAMINKERNEL amin.S)
+SetFallback(DAMINKERNEL amin.S)
+SetFallback(QAMINKERNEL amin.S)
+SetFallback(CAMINKERNEL zamin.S)
+SetFallback(ZAMINKERNEL zamin.S)
+SetFallback(XAMINKERNEL zamin.S)
+SetFallback(SMAXKERNEL max.S)
+SetFallback(DMAXKERNEL max.S)
+SetFallback(QMAXKERNEL max.S)
+SetFallback(SMINKERNEL min.S)
+SetFallback(DMINKERNEL min.S)
+SetFallback(QMINKERNEL min.S)
+SetFallback(ISAMAXKERNEL iamax.S)
+SetFallback(IDAMAXKERNEL iamax.S)
+SetFallback(IQAMAXKERNEL iamax.S)
+SetFallback(ICAMAXKERNEL izamax.S)
+SetFallback(IZAMAXKERNEL izamax.S)
+SetFallback(IXAMAXKERNEL izamax.S)
+SetFallback(ISAMINKERNEL iamin.S)
+SetFallback(IDAMINKERNEL iamin.S)
+SetFallback(IQAMINKERNEL iamin.S)
+SetFallback(ICAMINKERNEL izamin.S)
+SetFallback(IZAMINKERNEL izamin.S)
+SetFallback(IXAMINKERNEL izamin.S)
+SetFallback(ISMAXKERNEL iamax.S)
+SetFallback(IDMAXKERNEL iamax.S)
+SetFallback(IQMAXKERNEL iamax.S)
+SetFallback(ISMINKERNEL iamin.S)
+SetFallback(IDMINKERNEL iamin.S)
+SetFallback(IQMINKERNEL iamin.S)
+SetFallback(SASUMKERNEL asum.S)
+SetFallback(DASUMKERNEL asum.S)
+SetFallback(CASUMKERNEL zasum.S)
+SetFallback(ZASUMKERNEL zasum.S)
+SetFallback(QASUMKERNEL asum.S)
+SetFallback(XASUMKERNEL zasum.S)
+SetFallback(SAXPYKERNEL axpy.S)
+SetFallback(DAXPYKERNEL axpy.S)
+SetFallback(CAXPYKERNEL zaxpy.S)
+SetFallback(ZAXPYKERNEL zaxpy.S)
+SetFallback(QAXPYKERNEL axpy.S)
+SetFallback(XAXPYKERNEL zaxpy.S)
+SetFallback(SCOPYKERNEL copy.S)
+SetFallback(DCOPYKERNEL copy.S)
+SetFallback(CCOPYKERNEL zcopy.S)
+SetFallback(ZCOPYKERNEL zcopy.S)
+SetFallback(QCOPYKERNEL copy.S)
+SetFallback(XCOPYKERNEL zcopy.S)
+SetFallback(SDOTKERNEL dot.S)
+SetFallback(DDOTKERNEL dot.S)
+SetFallback(CDOTKERNEL zdot.S)
+SetFallback(ZDOTKERNEL zdot.S)
+SetFallback(QDOTKERNEL dot.S)
+SetFallback(XDOTKERNEL zdot.S)
+SetFallback(SNRM2KERNEL nrm2.S)
+SetFallback(DNRM2KERNEL nrm2.S)
+SetFallback(QNRM2KERNEL nrm2.S)
+SetFallback(CNRM2KERNEL znrm2.S)
+SetFallback(ZNRM2KERNEL znrm2.S)
+SetFallback(XNRM2KERNEL znrm2.S)
+SetFallback(SROTKERNEL rot.S)
+SetFallback(DROTKERNEL rot.S)
+SetFallback(QROTKERNEL rot.S)
+SetFallback(CROTKERNEL zrot.S)
+SetFallback(ZROTKERNEL zrot.S)
+SetFallback(XROTKERNEL zrot.S)
+SetFallback(SSCALKERNEL scal.S)
+SetFallback(DSCALKERNEL scal.S)
+SetFallback(CSCALKERNEL zscal.S)
+SetFallback(ZSCALKERNEL zscal.S)
+SetFallback(QSCALKERNEL scal.S)
+SetFallback(XSCALKERNEL zscal.S)
+SetFallback(SSWAPKERNEL swap.S)
+SetFallback(DSWAPKERNEL swap.S)
+SetFallback(CSWAPKERNEL zswap.S)
+SetFallback(ZSWAPKERNEL zswap.S)
+SetFallback(QSWAPKERNEL swap.S)
+SetFallback(XSWAPKERNEL zswap.S)
+SetFallback(SGEMVNKERNEL gemv_n.S)
+SetFallback(SGEMVTKERNEL gemv_t.S)
+SetFallback(DGEMVNKERNEL gemv_n.S)
+SetFallback(DGEMVTKERNEL gemv_t.S)
+SetFallback(CGEMVNKERNEL zgemv_n.S)
+SetFallback(CGEMVTKERNEL zgemv_t.S)
+SetFallback(ZGEMVNKERNEL zgemv_n.S)
+SetFallback(ZGEMVTKERNEL zgemv_t.S)
+SetFallback(QGEMVNKERNEL gemv_n.S)
+SetFallback(QGEMVTKERNEL gemv_t.S)
+SetFallback(XGEMVNKERNEL zgemv_n.S)
+SetFallback(XGEMVTKERNEL zgemv_t.S)
+SetFallback(SCABS_KERNEL ../generic/cabs.c)
+SetFallback(DCABS_KERNEL ../generic/cabs.c)
+SetFallback(QCABS_KERNEL ../generic/cabs.c)
+SetFallback(LSAME_KERNEL ../generic/lsame.c)
+SetFallback(SAXPBYKERNEL ../arm/axpby.c)
+SetFallback(DAXPBYKERNEL ../arm/axpby.c)
+SetFallback(CAXPBYKERNEL ../arm/zaxpby.c)
+SetFallback(ZAXPBYKERNEL ../arm/zaxpby.c)
+SetFallback(SSUMKERNEL sum.S)
+SetFallback(DSUMKERNEL sum.S)
+SetFallback(CSUMKERNEL zsum.S)
+SetFallback(ZSUMKERNEL zsum.S)
+SetFallback(QSUMKERNEL sum.S)
+SetFallback(XSUMKERNEL zsum.S)
 if (BUILD_BFLOAT16)
-set(SHAMINKERNEL ../arm/amin.c)
-set(SHAMAXKERNEL ../arm/amax.c)
-set(SHMAXKERNEL ../arm/max.c)
-set(SHMINKERNEL ../arm/min.c)
-set(ISHAMAXKERNEL ../arm/iamax.c)
-set(ISHAMINKERNEL ../arm/iamin.c)
-set(ISHMAXKERNEL ../arm/imax.c)
-set(ISHMINKERNEL ../arm/imin.c)
-set(SHASUMKERNEL ../arm/asum.c)
-set(SHAXPYKERNEL ../arm/axpy.c)
-set(SHAXPBYKERNEL ../arm/axpby.c)
-set(SHCOPYKERNEL ../arm/copy.c)
-set(SBDOTKERNEL ../x86_64/sbdot.c)
-set(SHROTKERNEL ../arm/rot.c)
-set(SHSCALKERNEL ../arm/scal.c)
-set(SHNRM2KERNEL ../arm/nrm2.c)
-set(SHSUMKERNEL ../arm/sum.c)
-set(SHSWAPKERNEL ../arm/swap.c)
-set(TOBF16KERNEL ../x86_64/tobf16.c)
-set(BF16TOKERNEL ../x86_64/bf16to.c)
-set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c)
-set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c)
+SetFallback(SHAMINKERNEL ../arm/amin.c)
+SetFallback(SHAMAXKERNEL ../arm/amax.c)
+SetFallback(SHMAXKERNEL ../arm/max.c)
+SetFallback(SHMINKERNEL ../arm/min.c)
+SetFallback(ISHAMAXKERNEL ../arm/iamax.c)
+SetFallback(ISHAMINKERNEL ../arm/iamin.c)
+SetFallback(ISHMAXKERNEL ../arm/imax.c)
+SetFallback(ISHMINKERNEL ../arm/imin.c)
+SetFallback(SHASUMKERNEL ../arm/asum.c)
+SetFallback(SHAXPYKERNEL ../arm/axpy.c)
+SetFallback(SHAXPBYKERNEL ../arm/axpby.c)
+SetFallback(SHCOPYKERNEL ../arm/copy.c)
+SetFallback(SBDOTKERNEL ../x86_64/sbdot.c)
+SetFallback(SHROTKERNEL ../arm/rot.c)
+SetFallback(SHSCALKERNEL ../arm/scal.c)
+SetFallback(SHNRM2KERNEL ../arm/nrm2.c)
+SetFallback(SHSUMKERNEL ../arm/sum.c)
+SetFallback(SHSWAPKERNEL ../arm/swap.c)
+SetFallback(TOBF16KERNEL ../x86_64/tobf16.c)
+SetFallback(BF16TOKERNEL ../x86_64/bf16to.c)
+SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c)
+SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c)
 endif ()
 endmacro ()
 macro(SetDefaultL2)
-set(SGEMVNKERNEL ../arm/gemv_n.c)
-set(SGEMVTKERNEL ../arm/gemv_t.c)
-set(DGEMVNKERNEL gemv_n.S)
-set(DGEMVTKERNEL gemv_t.S)
-set(CGEMVNKERNEL zgemv_n.S)
-set(CGEMVTKERNEL zgemv_t.S)
-set(ZGEMVNKERNEL zgemv_n.S)
-set(ZGEMVTKERNEL zgemv_t.S)
-set(QGEMVNKERNEL gemv_n.S)
-set(QGEMVTKERNEL gemv_t.S)
-set(XGEMVNKERNEL zgemv_n.S)
-set(XGEMVTKERNEL zgemv_t.S)
-set(SGERKERNEL ../generic/ger.c)
-set(DGERKERNEL ../generic/ger.c)
-set(QGERKERNEL ../generic/ger.c)
-set(CGERUKERNEL ../generic/zger.c)
-set(CGERCKERNEL ../generic/zger.c)
-set(ZGERUKERNEL ../generic/zger.c)
-set(ZGERCKERNEL ../generic/zger.c)
-set(XGERUKERNEL ../generic/zger.c)
-set(XGERCKERNEL ../generic/zger.c)
-set(SSYMV_U_KERNEL ../generic/symv_k.c)
-set(SSYMV_L_KERNEL ../generic/symv_k.c)
-set(DSYMV_U_KERNEL ../generic/symv_k.c)
-set(DSYMV_L_KERNEL ../generic/symv_k.c)
-set(QSYMV_U_KERNEL ../generic/symv_k.c)
-set(QSYMV_L_KERNEL ../generic/symv_k.c)
-set(CSYMV_U_KERNEL ../generic/zsymv_k.c)
-set(CSYMV_L_KERNEL ../generic/zsymv_k.c)
-set(ZSYMV_U_KERNEL ../generic/zsymv_k.c)
-set(ZSYMV_L_KERNEL ../generic/zsymv_k.c)
-set(XSYMV_U_KERNEL ../generic/zsymv_k.c)
-set(XSYMV_L_KERNEL ../generic/zsymv_k.c)
-set(CHEMV_U_KERNEL ../generic/zhemv_k.c)
-set(CHEMV_L_KERNEL ../generic/zhemv_k.c)
-set(CHEMV_V_KERNEL ../generic/zhemv_k.c)
-set(CHEMV_M_KERNEL ../generic/zhemv_k.c)
-set(ZHEMV_U_KERNEL ../generic/zhemv_k.c)
-set(ZHEMV_L_KERNEL ../generic/zhemv_k.c)
-set(ZHEMV_V_KERNEL ../generic/zhemv_k.c)
-set(ZHEMV_M_KERNEL ../generic/zhemv_k.c)
-set(XHEMV_U_KERNEL ../generic/zhemv_k.c)
-set(XHEMV_L_KERNEL ../generic/zhemv_k.c)
-set(XHEMV_V_KERNEL ../generic/zhemv_k.c)
-set(XHEMV_M_KERNEL ../generic/zhemv_k.c)
+SetFallback(SGEMVNKERNEL ../arm/gemv_n.c)
+SetFallback(SGEMVTKERNEL ../arm/gemv_t.c)
+SetFallback(DGEMVNKERNEL gemv_n.S)
+SetFallback(DGEMVTKERNEL gemv_t.S)
+SetFallback(CGEMVNKERNEL zgemv_n.S)
+SetFallback(CGEMVTKERNEL zgemv_t.S)
+SetFallback(ZGEMVNKERNEL zgemv_n.S)
+SetFallback(ZGEMVTKERNEL zgemv_t.S)
+SetFallback(QGEMVNKERNEL gemv_n.S)
+SetFallback(QGEMVTKERNEL gemv_t.S)
+SetFallback(XGEMVNKERNEL zgemv_n.S)
+SetFallback(XGEMVTKERNEL zgemv_t.S)
+SetFallback(SGERKERNEL ../generic/ger.c)
+SetFallback(DGERKERNEL ../generic/ger.c)
+SetFallback(QGERKERNEL ../generic/ger.c)
+SetFallback(CGERUKERNEL ../generic/zger.c)
+SetFallback(CGERCKERNEL ../generic/zger.c)
+SetFallback(ZGERUKERNEL ../generic/zger.c)
+SetFallback(ZGERCKERNEL ../generic/zger.c)
+SetFallback(XGERUKERNEL ../generic/zger.c)
+SetFallback(XGERCKERNEL ../generic/zger.c)
+SetFallback(SSYMV_U_KERNEL ../generic/symv_k.c)
+SetFallback(SSYMV_L_KERNEL ../generic/symv_k.c)
+SetFallback(DSYMV_U_KERNEL ../generic/symv_k.c)
+SetFallback(DSYMV_L_KERNEL ../generic/symv_k.c)
+SetFallback(QSYMV_U_KERNEL ../generic/symv_k.c)
+SetFallback(QSYMV_L_KERNEL ../generic/symv_k.c)
+SetFallback(CSYMV_U_KERNEL ../generic/zsymv_k.c)
+SetFallback(CSYMV_L_KERNEL ../generic/zsymv_k.c)
+SetFallback(ZSYMV_U_KERNEL ../generic/zsymv_k.c)
+SetFallback(ZSYMV_L_KERNEL ../generic/zsymv_k.c)
+SetFallback(XSYMV_U_KERNEL ../generic/zsymv_k.c)
+SetFallback(XSYMV_L_KERNEL ../generic/zsymv_k.c)
+SetFallback(CHEMV_U_KERNEL ../generic/zhemv_k.c)
+SetFallback(CHEMV_L_KERNEL ../generic/zhemv_k.c)
+SetFallback(CHEMV_V_KERNEL ../generic/zhemv_k.c)
+SetFallback(CHEMV_M_KERNEL ../generic/zhemv_k.c)
+SetFallback(ZHEMV_U_KERNEL ../generic/zhemv_k.c)
+SetFallback(ZHEMV_L_KERNEL ../generic/zhemv_k.c)
+SetFallback(ZHEMV_V_KERNEL ../generic/zhemv_k.c)
+SetFallback(ZHEMV_M_KERNEL ../generic/zhemv_k.c)
+SetFallback(XHEMV_U_KERNEL ../generic/zhemv_k.c)
+SetFallback(XHEMV_L_KERNEL ../generic/zhemv_k.c)
+SetFallback(XHEMV_V_KERNEL ../generic/zhemv_k.c)
+SetFallback(XHEMV_M_KERNEL ../generic/zhemv_k.c)
if (BUILD_BFLOAT16) if (BUILD_BFLOAT16)
set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c)
set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c)
set(SHGERKERNEL ../generic/ger.c) SetFallback(SHGERKERNEL ../generic/ger.c)
endif () endif ()
endmacro () endmacro ()
macro(SetDefaultL3) macro(SetDefaultL3)
set(SGEADD_KERNEL ../generic/geadd.c) SetFallback(SGEADD_KERNEL ../generic/geadd.c)
set(DGEADD_KERNEL ../generic/geadd.c) SetFallback(DGEADD_KERNEL ../generic/geadd.c)
set(CGEADD_KERNEL ../generic/zgeadd.c) SetFallback(CGEADD_KERNEL ../generic/zgeadd.c)
set(ZGEADD_KERNEL ../generic/zgeadd.c) SetFallback(ZGEADD_KERNEL ../generic/zgeadd.c)
if (BUILD_BFLOAT16) if (BUILD_BFLOAT16)
set(SHGEADD_KERNEL ../generic/geadd.c) SetFallback(SHGEADD_KERNEL ../generic/geadd.c)
set(SBGEMMKERNEL ../generic/gemmkernel_2x2.c) SetFallback(SBGEMMKERNEL ../generic/gemmkernel_2x2.c)
set(SBGEMM_BETA ../generic/gemm_beta.c) SetFallback(SBGEMM_BETA ../generic/gemm_beta.c)
set(SBGEMMINCOPY ../generic/gemm_ncopy_2.c) SetFallback(SBGEMMINCOPY ../generic/gemm_ncopy_2.c)
set(SBGEMMITCOPY ../generic/gemm_tcopy_2.c) SetFallback(SBGEMMITCOPY ../generic/gemm_tcopy_2.c)
set(SBGEMMONCOPY ../generic/gemm_ncopy_2.c) SetFallback(SBGEMMONCOPY ../generic/gemm_ncopy_2.c)
set(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c) SetFallback(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c)
set(SBGEMMINCOPYOBJ sbgemm_incopy.o) SetFallback(SBGEMMINCOPYOBJ sbgemm_incopy.o)
set(SBGEMMITCOPYOBJ sbgemm_itcopy.o) SetFallback(SBGEMMITCOPYOBJ sbgemm_itcopy.o)
set(SBGEMMONCOPYOBJ sbgemm_oncopy.o) SetFallback(SBGEMMONCOPYOBJ sbgemm_oncopy.o)
set(SBGEMMOTCOPYOBJ sbgemm_otcopy.o) SetFallback(SBGEMMOTCOPYOBJ sbgemm_otcopy.o)
endif () endif ()
endmacro () endmacro ()
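The hunk above swaps plain set() calls for a SetFallback() helper throughout the default-kernel macros, so a kernel source already chosen by a target-specific KERNEL file keeps priority over these generic defaults. The helper itself is defined elsewhere in the CMake tree; a minimal sketch of the behaviour the call sites imply, assuming it simply guards the assignment:

    # Hypothetical sketch only; the real SetFallback lives in the project's CMake utilities.
    macro(SetFallback KERNEL_VAR FILENAME)
      if (NOT DEFINED ${KERNEL_VAR})
        set(${KERNEL_VAR} ${FILENAME})
      endif ()
    endmacro ()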
@ -416,7 +416,7 @@ endif ()
set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4) set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16) set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "VORTEX") elseif ("${TCORE}" STREQUAL "VORTEX")
file(APPEND ${TARGET_CONF_TEMP} file(APPEND ${TARGET_CONF_TEMP}
"#define ARMV8\n" "#define ARMV8\n"
"#define L1_CODE_SIZE\t32768\n" "#define L1_CODE_SIZE\t32768\n"
@ -439,6 +439,34 @@ elseif ("${TCORE}" STREQUAL "VORTEX")
set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4) set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16) set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "P5600")
file(APPEND ${TARGET_CONF_TEMP}
"#define L2_SIZE 1048576\n"
"#define DTB_SIZE 4096\n"
"#define DTB_DEFAULT_ENTRIES 64\n")
set(SGEMM_UNROLL_M 2)
set(SGEMM_UNROLL_N 2)
set(DGEMM_UNROLL_M 2)
set(DGEMM_UNROLL_N 2)
set(CGEMM_UNROLL_M 2)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 2)
set(SYMV_P 16)
elseif ("${TCORE}" MATCHES "MIPS")
file(APPEND ${TARGET_CONF_TEMP}
"#define L2_SIZE 262144\n"
"#define DTB_SIZE 4096\n"
"#define DTB_DEFAULT_ENTRIES 64\n")
set(SGEMM_UNROLL_M 2)
set(SGEMM_UNROLL_N 2)
set(DGEMM_UNROLL_M 2)
set(DGEMM_UNROLL_N 2)
set(CGEMM_UNROLL_M 2)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 2)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "POWER6") elseif ("${TCORE}" STREQUAL "POWER6")
file(APPEND ${TARGET_CONF_TEMP} file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE 32768\n" "#define L1_DATA_SIZE 32768\n"
@ -33,7 +33,7 @@ endif ()
if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
message(STATUS "Compiling a ${BINARY}-bit binary.") message(STATUS "Compiling a ${BINARY}-bit binary.")
set(NO_AVX 1) set(NO_AVX 1)
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE") if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE" OR ${TARGET} STREQUAL "SAPPHIRERAPIDS")
set(TARGET "NEHALEM") set(TARGET "NEHALEM")
endif () endif ()
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
@ -42,6 +42,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53" OR ${TARGET} STREQUAL "CORTEXA55") if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53" OR ${TARGET} STREQUAL "CORTEXA55")
set(TARGET "ARMV7") set(TARGET "ARMV7")
endif () endif ()
if (${TARGET} STREQUAL "POWER8" OR ${TARGET} STREQUAL "POWER9" OR ${TARGET} STREQUAL "POWER10")
set(TARGET "POWER6")
endif ()
endif () endif ()
@ -102,6 +105,18 @@ if (CMAKE_C_COMPILER STREQUAL loongcc)
set(GETARCH_FLAGS "${GETARCH_FLAGS} -static") set(GETARCH_FLAGS "${GETARCH_FLAGS} -static")
endif () endif ()
if (POWER)
set(NO_WARMUP 1)
set(HAVE_GAS 1)
if (CMAKE_ASM_COMPILER_ID STREQUAL "GNU")
set(HAVE_GAS 0)
elseif (CMAKE_ASM_COMPILER_ID STREQUAL "Clang")
set(CCOMMON_OPT "${CCOMMON_OPT} -fno-integrated-as")
set(HAVE_GAS 0)
endif ()
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DHAVE_GAS=${HAVE_GAS}")
endif ()
#if don't use Fortran, it will only compile CBLAS. #if don't use Fortran, it will only compile CBLAS.
if (ONLY_CBLAS) if (ONLY_CBLAS)
set(NO_LAPACK 1) set(NO_LAPACK 1)
@ -163,6 +178,22 @@ if (DEFINED TARGET)
endif() endif()
endif() endif()
endif() endif()
if (${TARGET} STREQUAL SAPPHIRERAPIDS AND NOT NO_AVX512)
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 11.0)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids")
else()
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
endif()
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang")
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 12.0)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids")
else()
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
endif()
endif()
endif()
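For the new SAPPHIRERAPIDS target the -march flag is gated on the compiler version, with -march=skylake-avx512 as the fallback when GCC is not newer than 11.0 or Clang not newer than 12.0. A standalone sketch of the same fallback pattern; ARCH_FLAG is a purely illustrative variable, not one used by the build:

    # Illustrative only: pick the newest -march the compiler understands.
    if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 11.0)
      set(ARCH_FLAG "-march=sapphirerapids")
    else ()
      set(ARCH_FLAG "-march=skylake-avx512")   # older compilers do not know the new target name
    endif ()
    message(STATUS "kernel arch flag: ${ARCH_FLAG}")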
if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
endif() endif()
@ -206,6 +237,27 @@ if (DEFINED TARGET)
if (DEFINED HAVE_SSE4_1) if (DEFINED HAVE_SSE4_1)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1") set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1")
endif() endif()
if (${TARGET} STREQUAL POWER10)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math")
else ()
message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10.")
endif()
endif()
if (${TARGET} STREQUAL POWER9)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math")
else ()
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
message(WARNING "Compiler GCC.${GCC_VERSION} does not support fully Power9.")
endif()
endif()
if (${TARGET} STREQUAL POWER8)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
endif()
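For the POWER targets the flags come from the GCC version reported by -dumpversion: 10.2 or newer is required for -mcpu=power10, 5.0 or newer selects -mcpu=power9, and older compilers fall back to -mcpu=power8 with a warning. A minimal sketch of that probe; OUTPUT_STRIP_TRAILING_WHITESPACE is added here on the assumption that a trailing newline would otherwise upset the version comparison:

    # Sketch of the -dumpversion probe; the message text is illustrative.
    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion
                    OUTPUT_VARIABLE GCC_VERSION
                    OUTPUT_STRIP_TRAILING_WHITESPACE)
    if (GCC_VERSION VERSION_GREATER 10.2 OR GCC_VERSION VERSION_EQUAL 10.2)
      message(STATUS "GCC ${GCC_VERSION} can target POWER10")
    else ()
      message(STATUS "GCC ${GCC_VERSION} limited to POWER8/POWER9 code generation")
    endif ()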
endif() endif()
if (DEFINED BINARY) if (DEFINED BINARY)
message(STATUS "Compiling a ${BINARY}-bit binary.") message(STATUS "Compiling a ${BINARY}-bit binary.")
@ -223,6 +275,11 @@ include("${PROJECT_SOURCE_DIR}/cmake/arch.cmake")
# C Compiler dependent settings # C Compiler dependent settings
include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake") include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
if (INTERFACE64)
set(SUFFIX64 64)
set(SUFFIX64_UNDERSCORE _64)
endif()
if (NOT NOFORTRAN) if (NOT NOFORTRAN)
# Fortran Compiler dependent settings # Fortran Compiler dependent settings
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
@ -258,7 +315,7 @@ if (NEED_PIC)
endif() endif()
endif () endif ()
if (X86_64) if (X86_64 OR ${CORE} STREQUAL POWER10)
set(SMALL_MATRIX_OPT TRUE) set(SMALL_MATRIX_OPT TRUE)
endif () endif ()
if (SMALL_MATRIX_OPT) if (SMALL_MATRIX_OPT)
@ -266,7 +323,7 @@ if (SMALL_MATRIX_OPT)
endif () endif ()
if (DYNAMIC_ARCH) if (DYNAMIC_ARCH)
if (X86 OR X86_64 OR ARM64 OR PPC) if (X86 OR X86_64 OR ARM64 OR POWER)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
if (DYNAMIC_OLDER) if (DYNAMIC_OLDER)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
@ -20,11 +20,11 @@ endif()
if(CMAKE_COMPILER_IS_GNUCC AND WIN32) if(MINGW)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine
OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE OUTPUT_VARIABLE OPENBLAS_MINGW_TARGET_MACHINE
OUTPUT_STRIP_TRAILING_WHITESPACE) OUTPUT_STRIP_TRAILING_WHITESPACE)
if(OPENBLAS_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64") if(OPENBLAS_MINGW_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64")
set(MINGW64 1) set(MINGW64 1)
endif() endif()
endif() endif()
@ -35,7 +35,7 @@ if(CMAKE_CL_64 OR MINGW64)
elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING)) elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING))
set(X86 1) set(X86 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
set(PPC 1) set(POWER 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
set(MIPS64 1) set(MIPS64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*")
@ -73,6 +73,8 @@ elseif (${CMAKE_CROSSCOMPILING})
else () else ()
set(X86 1) set(X86 1)
endif() endif()
elseif (${TARGET} STREQUAL "P5600" OR ${TARGET} MATCHES "MIPS.*")
set(MIPS32 1)
elseif (${TARGET} STREQUAL "ARMV7") elseif (${TARGET} STREQUAL "ARMV7")
set(ARM 1) set(ARM 1)
else() else()
@ -86,8 +88,12 @@ if (X86_64)
set(ARCH "x86_64") set(ARCH "x86_64")
elseif(X86) elseif(X86)
set(ARCH "x86") set(ARCH "x86")
elseif(PPC) elseif(POWER)
set(ARCH "power") set(ARCH "power")
elseif(MIPS32)
set(ARCH "mips")
elseif(MIPS64)
set(ARCH "mips64")
elseif(ARM) elseif(ARM)
set(ARCH "arm") set(ARCH "arm")
elseif(ARM64) elseif(ARM64)
@ -97,7 +103,7 @@ else()
endif () endif ()
if (NOT BINARY) if (NOT BINARY)
if (X86_64 OR ARM64 OR PPC OR MIPS64 OR LOONGARCH64) if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64)
set(BINARY 64) set(BINARY 64)
else () else ()
set(BINARY 32) set(BINARY 32)
@ -15,35 +15,83 @@ endfunction ()
# Reads a Makefile into CMake vars. # Reads a Makefile into CMake vars.
macro(ParseMakefileVars MAKEFILE_IN) macro(ParseMakefileVars MAKEFILE_IN)
message(STATUS "Reading vars from ${MAKEFILE_IN}...") message(STATUS "Reading vars from ${MAKEFILE_IN}...")
set (IfElse 0) set (C_COMPILER ${CMAKE_C_COMPILER_ID})
set (ElseSeen 0) set (IfElse 0)
set (ElseSeen 0)
set (SkipIfs 0)
set (SkipElse 0)
file(STRINGS ${MAKEFILE_IN} makefile_contents) file(STRINGS ${MAKEFILE_IN} makefile_contents)
foreach (makefile_line ${makefile_contents}) foreach (makefile_line ${makefile_contents})
#message(STATUS "parsing ${makefile_line}") #message(STATUS "parsing ${makefile_line}")
if (${IfElse} GREATER 0) # Skip the entire scope of the else statement given that the if statement that precedes it has the valid condition.
# The variable SkipIfs is used to identify which endif statement closes the scope of the else statement.
if (${SkipElse} EQUAL 1)
#message(STATUS "skipping ${makefile_line}")
string(REGEX MATCH "(ifeq|ifneq|ifdef|ifndef) .*$" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
MATH(EXPR SkipIfs "${SkipIfs}+1")
endif ()
string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}") string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "") if (NOT "${line_match}" STREQUAL "")
# message(STATUS "ENDIF ${makefile_line}") if (${SkipIfs} EQUAL 0)
set (IfElse 0) set (SkipElse 0)
set (ElseSeen 0) else ()
MATH(EXPR SkipIfs "${SkipIfs}-1")
endif ()
endif ()
continue ()
endif ()
# The variable IfElse is greater than 0 if and only if the previously parsed line is an if statement.
if (${IfElse} GREATER 0)
# If the current scope is the one that has to be skipped, the if/endif/else statements
# along with it till the endif that closes the current scope have to be ignored as well.
string(REGEX MATCH "(ifeq|ifneq|ifdef|ifndef) .*$" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
if ((${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR (${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1))
#message(STATUS "skipping ${makefile_line}")
MATH(EXPR SkipIfs "${SkipIfs}+1")
continue ()
endif ()
endif ()
string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
if (${SkipIfs} EQUAL 0)
#message(STATUS "ENDIF ${makefile_line}")
set (IfElse 0)
set (ElseSeen 0)
else ()
#message(STATUS "skipping ${makefile_line}")
MATH(EXPR SkipIfs "${SkipIfs}-1")
endif ()
continue () continue ()
endif () endif ()
string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}") string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "") if (NOT "${line_match}" STREQUAL "")
# message(STATUS "ELSE ${makefile_line}") if (${SkipIfs} EQUAL 0)
set (ElseSeen 1) #message(STATUS "ELSE ${makefile_line}")
continue () set (ElseSeen 1)
endif() else ()
if ( (${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR ( ${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1)) #message(STATUS "skipping ${makefile_line}")
# message(STATUS "skipping ${makefile_line}") endif ()
continue () continue ()
endif()
# Skip the lines that are not part of the path that has to be taken.
if ((${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR (${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1) OR (${SkipIfs} GREATER 0))
#message(STATUS "skipping ${makefile_line}")
continue ()
endif () endif ()
endif () endif ()
# Skip commented lines (the ones that start with '#')
string(REGEX MATCH "[ \t]*\\#.*$" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
#message(STATUS "skipping ${makefile_line}")
continue ()
endif ()
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "") if (NOT "${line_match}" STREQUAL "")
#message(STATUS "match on ${line_match}") #message(STATUS "match on ${line_match}")
set(var_name ${CMAKE_MATCH_1}) set(var_name ${CMAKE_MATCH_1})
# set(var_value ${CMAKE_MATCH_2}) #set(var_value ${CMAKE_MATCH_2})
string(STRIP ${CMAKE_MATCH_2} var_value) string(STRIP ${CMAKE_MATCH_2} var_value)
# check for Makefile variables in the string, e.g. $(TSUFFIX) # check for Makefile variables in the string, e.g. $(TSUFFIX)
string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value}) string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value})
@ -54,39 +102,93 @@ macro(ParseMakefileVars MAKEFILE_IN)
string(REPLACE "$(${make_var})" "${${make_var}}" var_value ${var_value}) string(REPLACE "$(${make_var})" "${${make_var}}" var_value ${var_value})
endforeach () endforeach ()
set(${var_name} ${var_value}) set(${var_name} ${var_value})
else () continue ()
string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") endif ()
if (NOT "${line_match}" STREQUAL "") # Include a new file to be parsed
#message(STATUS "match on include ${line_match}") string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}")
ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) if (NOT "${line_match}" STREQUAL "")
else () #message(STATUS "match on include ${line_match}")
# message(STATUS "unmatched line ${line_match}") ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1})
string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") continue ()
if (NOT "${line_match}" STREQUAL "") endif ()
# message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") # The if statement that precedes this else has the path taken
if (DEFINED ${${CMAKE_MATCH_1}} AND ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}) # Thus, this else statement has to be skipped.
# message (STATUS "condition is true") string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}")
set (IfElse 1) if (NOT "${line_match}" STREQUAL "")
else () #message(STATUS "skipping ${makefile_line}")
set (IfElse 2) set (SkipElse 1)
endif () continue()
endif()
# Example 1: ifdef HAVE_MSA
# Example 2: ifndef ZNRM2KERNEL
string(REGEX MATCH "(ifdef|ifndef) ([0-9_A-Z]+)" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
#message(STATUS "${CMAKE_MATCH_1} first: ${CMAKE_MATCH_2}")
set (ElseSeen 0)
if (DEFINED ${CMAKE_MATCH_2})
if (${CMAKE_MATCH_1} STREQUAL "ifdef")
#message (STATUS "condition is true")
set (IfElse 1)
else () else ()
string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") set (IfElse 2)
if (NOT "${line_match}" STREQUAL "") endif ()
# message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") else ()
if ( ${CMAKE_MATCH_1} STREQUAL C_COMPILER) if (${CMAKE_MATCH_1} STREQUAL "ifdef")
set (CMAKE_MATCH_1 CMAKE_C_COMPILER) set (IfElse 2)
endif () else ()
if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})) #message (STATUS "condition is true")
# message (STATUS "condition is true") set (IfElse 1)
set (IfElse 1)
else ()
set (IfElse 2)
endif ()
endif ()
endif () endif ()
endif () endif ()
continue ()
endif () endif ()
# Example 1: ifeq ($(SGEMM_UNROLL_M), 16)
# Example 2: ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8)
# Example 3: ifeq ($(__BYTE_ORDER__)$(ELF_VERSION),__ORDER_BIG_ENDIAN__2)
# Ignore the second group since (?:...) does not work on cmake
string(REGEX MATCH "ifeq \\(\\$\\(([0-9_A-Z]+)\\)(([0-9_A-Za-z]*)\\$\\(([0-9_A-Z]+)\\))?,[ \t]*([0-9_A-Za-z]+)\\)" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
#message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_3} third: ${CMAKE_MATCH_4} fourth: ${CMAKE_MATCH_5}")
if (DEFINED ${CMAKE_MATCH_1})
if (DEFINED ${CMAKE_MATCH_4})
set (STR ${${CMAKE_MATCH_1}}${CMAKE_MATCH_3}${${CMAKE_MATCH_4}})
else ()
set (STR ${${CMAKE_MATCH_1}})
endif ()
if (${STR} STREQUAL ${CMAKE_MATCH_5})
#message (STATUS "condition is true")
set (IfElse 1)
continue ()
endif ()
endif ()
set (IfElse 2)
continue ()
endif ()
# Example 1 (Group 3): ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
# Example 2 (Group 4): ifneq ($(C_COMPILER), PGI)
string(REGEX MATCH "ifneq \\(\\$\\(([0-9_A-Z]+)\\),[ \t]*(\\$\\(([0-9_A-Z]+)\\)|([0-9_A-Z]+))\\)" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
#message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_3} third: ${CMAKE_MATCH_4}")
set (ElseSeen 0)
set (HasValidGroup 0)
if (DEFINED ${CMAKE_MATCH_3})
set (HasValidGroup 1)
set (STR ${${CMAKE_MATCH_3}})
elseif (NOT ${CMAKE_MATCH_4} STREQUAL "")
set (HasValidGroup 1)
set (STR ${CMAKE_MATCH_4})
endif ()
if (DEFINED ${CMAKE_MATCH_1} AND ${HasValidGroup} EQUAL 1)
if (NOT (${${CMAKE_MATCH_1}} STREQUAL ${STR}))
#message (STATUS "condition is true")
set (IfElse 1)
continue ()
endif ()
endif ()
set (IfElse 2)
continue ()
endif ()
#message(STATUS "unmatched line ${line_match}")
endforeach () endforeach ()
endmacro () endmacro ()
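After this rewrite, ParseMakefileVars handles nested conditionals, ifdef/ifndef, compound ifeq left-hand sides and comment lines while importing Makefile assignments into CMake variables. A hedged usage sketch; the KERNELDIR path and the kernel file name used in the comment are assumptions for illustration:

    # Illustrative call: import kernel selections from a target's KERNEL file.
    set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/x86_64")
    ParseMakefileVars("${KERNELDIR}/KERNEL.HASWELL")
    # Assignments such as "SGEMMKERNEL = sgemm_kernel_16x4_haswell.S" now appear as CMake
    # variables, with $(VAR) references expanded and only the branches whose
    # ifeq/ifneq/ifdef/ifndef conditions held retained.
    message(STATUS "selected SGEMM kernel: ${SGEMMKERNEL}")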
@ -1,13 +1,14 @@
include ../Makefile.rule TOPDIR = ..
include $(TOPDIR)/Makefile.system
all :: dgemv_tester dgemm_tester all :: dgemv_tester dgemm_tester
dgemv_tester : dgemv_tester :
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester
./dgemv_tester ./dgemv_tester
dgemm_tester : dgemv_tester dgemm_tester : dgemv_tester
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester
./dgemm_tester ./dgemm_tester
clean :: clean ::
@ -120,6 +120,7 @@
#define CORE_SKYLAKEX 28 #define CORE_SKYLAKEX 28
#define CORE_DHYANA 29 #define CORE_DHYANA 29
#define CORE_COOPERLAKE 30 #define CORE_COOPERLAKE 30
#define CORE_SAPPHIRERAPIDS 31
#define HAVE_SSE (1 << 0) #define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1) #define HAVE_SSE2 (1 << 1)
@ -145,6 +146,7 @@
#define HAVE_AVX512VL (1 << 21) #define HAVE_AVX512VL (1 << 21)
#define HAVE_AVX2 (1 << 22) #define HAVE_AVX2 (1 << 22)
#define HAVE_AVX512BF16 (1 << 23) #define HAVE_AVX512BF16 (1 << 23)
#define HAVE_AMXBF16 (1 << 24)
#define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_I 1
#define CACHE_INFO_L1_D 2 #define CACHE_INFO_L1_D 2
@ -222,6 +224,7 @@ typedef struct {
#define CPUTYPE_SKYLAKEX 52 #define CPUTYPE_SKYLAKEX 52
#define CPUTYPE_DHYANA 53 #define CPUTYPE_DHYANA 53
#define CPUTYPE_COOPERLAKE 54 #define CPUTYPE_COOPERLAKE 54
#define CPUTYPE_SAPPHIRERAPIDS 55
#define CPUTYPE_HYGON_UNKNOWN 99 #define CPUTYPE_HYGON_UNKNOWN 99
@ -26,10 +26,12 @@
*****************************************************************************/ *****************************************************************************/
#include <string.h> #include <string.h>
#ifdef OS_DARWIN #ifdef __APPLE__
#include <sys/sysctl.h> #include <sys/sysctl.h>
int32_t value; int32_t value;
size_t length=sizeof(value); size_t length=sizeof(value);
int64_t value64;
size_t length64=sizeof(value64);
#endif #endif
#define CPU_UNKNOWN 0 #define CPU_UNKNOWN 0
@ -53,6 +55,8 @@ size_t length=sizeof(value);
#define CPU_EMAG8180 10 #define CPU_EMAG8180 10
// Apple // Apple
#define CPU_VORTEX 13 #define CPU_VORTEX 13
// Fujitsu
#define CPU_A64FX 15
static char *cpuname[] = { static char *cpuname[] = {
"UNKNOWN", "UNKNOWN",
@ -69,7 +73,8 @@ static char *cpuname[] = {
"NEOVERSEN1", "NEOVERSEN1",
"THUNDERX3T110", "THUNDERX3T110",
"VORTEX", "VORTEX",
"CORTEXA55" "CORTEXA55",
"A64FX"
}; };
static char *cpuname_lower[] = { static char *cpuname_lower[] = {
@ -87,7 +92,8 @@ static char *cpuname_lower[] = {
"neoversen1", "neoversen1",
"thunderx3t110", "thunderx3t110",
"vortex", "vortex",
"cortexa55" "cortexa55",
"a64fx"
}; };
int get_feature(char *search) int get_feature(char *search)
@ -183,6 +189,9 @@ int detect(void)
// Ampere // Ampere
else if (strstr(cpu_implementer, "0x50") && strstr(cpu_part, "0x000")) else if (strstr(cpu_implementer, "0x50") && strstr(cpu_part, "0x000"))
return CPU_EMAG8180; return CPU_EMAG8180;
// Fujitsu
else if (strstr(cpu_implementer, "0x46") && strstr(cpu_part, "0x001"))
return CPU_A64FX;
} }
p = (char *) NULL ; p = (char *) NULL ;
@ -212,9 +221,9 @@ int detect(void)
} }
#else #else
#ifdef DARWIN #ifdef __APPLE__
sysctlbyname("hw.cpufamily",&value,&length,NULL,0); sysctlbyname("hw.cpufamily",&value,&length,NULL,0);
if (value ==131287967) return CPU_VORTEX; if (value ==131287967|| value == 458787763 ) return CPU_VORTEX;
#endif #endif
return CPU_ARMV8; return CPU_ARMV8;
#endif #endif
@ -265,7 +274,7 @@ int n=0;
printf("#define NUM_CORES %d\n",n); printf("#define NUM_CORES %d\n",n);
#endif #endif
#ifdef DARWIN #ifdef __APPLE__
sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0);
printf("#define NUM_CORES %d\n",value); printf("#define NUM_CORES %d\n",value);
#endif #endif
@ -285,154 +294,166 @@ void get_cpuconfig(void)
switch (d) switch (d)
{ {
case CPU_CORTEXA53: case CPU_CORTEXA53:
case CPU_CORTEXA55: case CPU_CORTEXA55:
printf("#define %s\n", cpuname[d]); printf("#define %s\n", cpuname[d]);
// Fall-through // Fall-through
case CPU_ARMV8: case CPU_ARMV8:
// Minimum parameters for ARMv8 (based on A53) // Minimum parameters for ARMv8 (based on A53)
printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n"); printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 262144\n"); printf("#define L2_SIZE 262144\n");
printf("#define L2_LINESIZE 64\n"); printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n"); printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n"); printf("#define L2_ASSOCIATIVE 4\n");
break; break;
case CPU_CORTEXA57: case CPU_CORTEXA57:
case CPU_CORTEXA72: case CPU_CORTEXA72:
case CPU_CORTEXA73: case CPU_CORTEXA73:
// Common minimum settings for these Arm cores // Common minimum settings for these Arm cores
// Can change a lot, but we need to be conservative // Can change a lot, but we need to be conservative
// TODO: detect info from /sys if possible // TODO: detect info from /sys if possible
printf("#define %s\n", cpuname[d]); printf("#define %s\n", cpuname[d]);
printf("#define L1_CODE_SIZE 49152\n"); printf("#define L1_CODE_SIZE 49152\n");
printf("#define L1_CODE_LINESIZE 64\n"); printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_CODE_ASSOCIATIVE 3\n"); printf("#define L1_CODE_ASSOCIATIVE 3\n");
printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n"); printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L1_DATA_ASSOCIATIVE 2\n"); printf("#define L1_DATA_ASSOCIATIVE 2\n");
printf("#define L2_SIZE 524288\n"); printf("#define L2_SIZE 524288\n");
printf("#define L2_LINESIZE 64\n"); printf("#define L2_LINESIZE 64\n");
printf("#define L2_ASSOCIATIVE 16\n"); printf("#define L2_ASSOCIATIVE 16\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n"); printf("#define DTB_SIZE 4096\n");
break; break;
case CPU_NEOVERSEN1: case CPU_NEOVERSEN1:
printf("#define %s\n", cpuname[d]); printf("#define %s\n", cpuname[d]);
printf("#define L1_CODE_SIZE 65536\n"); printf("#define L1_CODE_SIZE 65536\n");
printf("#define L1_CODE_LINESIZE 64\n"); printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_CODE_ASSOCIATIVE 4\n"); printf("#define L1_CODE_ASSOCIATIVE 4\n");
printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 64\n"); printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L1_DATA_ASSOCIATIVE 4\n"); printf("#define L1_DATA_ASSOCIATIVE 4\n");
printf("#define L2_SIZE 1048576\n"); printf("#define L2_SIZE 1048576\n");
printf("#define L2_LINESIZE 64\n"); printf("#define L2_LINESIZE 64\n");
printf("#define L2_ASSOCIATIVE 16\n"); printf("#define L2_ASSOCIATIVE 16\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n"); printf("#define DTB_SIZE 4096\n");
break; break;
case CPU_FALKOR: case CPU_FALKOR:
printf("#define FALKOR\n"); printf("#define FALKOR\n");
printf("#define L1_CODE_SIZE 65536\n"); printf("#define L1_CODE_SIZE 65536\n");
printf("#define L1_CODE_LINESIZE 64\n"); printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 128\n"); printf("#define L1_DATA_LINESIZE 128\n");
printf("#define L2_SIZE 524288\n"); printf("#define L2_SIZE 524288\n");
printf("#define L2_LINESIZE 64\n"); printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n"); printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 16\n"); printf("#define L2_ASSOCIATIVE 16\n");
break; break;
case CPU_THUNDERX: case CPU_THUNDERX:
printf("#define THUNDERX\n"); printf("#define THUNDERX\n");
printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 128\n"); printf("#define L1_DATA_LINESIZE 128\n");
printf("#define L2_SIZE 16777216\n"); printf("#define L2_SIZE 16777216\n");
printf("#define L2_LINESIZE 128\n"); printf("#define L2_LINESIZE 128\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n"); printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 16\n"); printf("#define L2_ASSOCIATIVE 16\n");
break; break;
case CPU_THUNDERX2T99: case CPU_THUNDERX2T99:
printf("#define THUNDERX2T99 \n"); printf("#define THUNDERX2T99 \n");
printf("#define L1_CODE_SIZE 32768 \n"); printf("#define L1_CODE_SIZE 32768 \n");
printf("#define L1_CODE_LINESIZE 64 \n"); printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 8 \n"); printf("#define L1_CODE_ASSOCIATIVE 8 \n");
printf("#define L1_DATA_SIZE 32768 \n"); printf("#define L1_DATA_SIZE 32768 \n");
printf("#define L1_DATA_LINESIZE 64 \n"); printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 8 \n"); printf("#define L1_DATA_ASSOCIATIVE 8 \n");
printf("#define L2_SIZE 262144 \n"); printf("#define L2_SIZE 262144 \n");
printf("#define L2_LINESIZE 64 \n"); printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n"); printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define L3_SIZE 33554432 \n"); printf("#define L3_SIZE 33554432 \n");
printf("#define L3_LINESIZE 64 \n"); printf("#define L3_LINESIZE 64 \n");
printf("#define L3_ASSOCIATIVE 32 \n"); printf("#define L3_ASSOCIATIVE 32 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n"); printf("#define DTB_SIZE 4096 \n");
break; break;
case CPU_TSV110: case CPU_TSV110:
printf("#define TSV110 \n"); printf("#define TSV110 \n");
printf("#define L1_CODE_SIZE 65536 \n"); printf("#define L1_CODE_SIZE 65536 \n");
printf("#define L1_CODE_LINESIZE 64 \n"); printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 4 \n"); printf("#define L1_CODE_ASSOCIATIVE 4 \n");
printf("#define L1_DATA_SIZE 65536 \n"); printf("#define L1_DATA_SIZE 65536 \n");
printf("#define L1_DATA_LINESIZE 64 \n"); printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 4 \n"); printf("#define L1_DATA_ASSOCIATIVE 4 \n");
printf("#define L2_SIZE 524228 \n"); printf("#define L2_SIZE 524228 \n");
printf("#define L2_LINESIZE 64 \n"); printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n"); printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n"); printf("#define DTB_SIZE 4096 \n");
break; break;
case CPU_EMAG8180: case CPU_EMAG8180:
// Minimum parameters for ARMv8 (based on A53) // Minimum parameters for ARMv8 (based on A53)
printf("#define EMAG8180\n"); printf("#define EMAG8180\n");
printf("#define L1_CODE_SIZE 32768\n"); printf("#define L1_CODE_SIZE 32768\n");
printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n"); printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 262144\n"); printf("#define L2_SIZE 262144\n");
printf("#define L2_LINESIZE 64\n"); printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n"); printf("#define DTB_SIZE 4096\n");
break; break;
case CPU_THUNDERX3T110: case CPU_THUNDERX3T110:
printf("#define THUNDERX3T110 \n"); printf("#define THUNDERX3T110 \n");
printf("#define L1_CODE_SIZE 65536 \n"); printf("#define L1_CODE_SIZE 65536 \n");
printf("#define L1_CODE_LINESIZE 64 \n"); printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 8 \n"); printf("#define L1_CODE_ASSOCIATIVE 8 \n");
printf("#define L1_DATA_SIZE 32768 \n"); printf("#define L1_DATA_SIZE 32768 \n");
printf("#define L1_DATA_LINESIZE 64 \n"); printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 8 \n"); printf("#define L1_DATA_ASSOCIATIVE 8 \n");
printf("#define L2_SIZE 524288 \n"); printf("#define L2_SIZE 524288 \n");
printf("#define L2_LINESIZE 64 \n"); printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n"); printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define L3_SIZE 94371840 \n"); printf("#define L3_SIZE 94371840 \n");
printf("#define L3_LINESIZE 64 \n"); printf("#define L3_LINESIZE 64 \n");
printf("#define L3_ASSOCIATIVE 32 \n"); printf("#define L3_ASSOCIATIVE 32 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n"); printf("#define DTB_SIZE 4096 \n");
break; break;
#ifdef DARWIN #ifdef __APPLE__
case CPU_VORTEX: case CPU_VORTEX:
printf("#define VORTEX \n"); printf("#define VORTEX \n");
sysctlbyname("hw.l1icachesize",&value,&length,NULL,0); sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0);
printf("#define L1_CODE_SIZE %d \n",value); printf("#define L1_CODE_SIZE %lld \n",value64);
sysctlbyname("hw.cachelinesize",&value,&length,NULL,0); sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0);
printf("#define L1_CODE_LINESIZE %d \n",value); printf("#define L1_CODE_LINESIZE %lld \n",value64);
sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0); sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0);
printf("#define L1_DATA_SIZE %d \n",value); printf("#define L1_DATA_SIZE %lld \n",value64);
sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0); sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0);
printf("#define L2_SIZE %d \n",value); printf("#define L2_SIZE %lld \n",value64);
break; printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
#endif #endif
case CPU_A64FX:
printf("#define A64FX\n");
printf("#define L1_CODE_SIZE 65535\n");
printf("#define L1_DATA_SIZE 65535\n");
printf("#define L1_DATA_LINESIZE 256\n");
printf("#define L2_SIZE 8388608\n");
printf("#define L2_LINESIZE 256\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
} }
get_cpucount(); get_cpucount();
} }
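The Apple (__APPLE__) path now reads the cache geometry through 64-bit sysctl values (value64/length64) and queries hw.l2cachesize instead of hw.l2dcachesize. A self-contained sketch of that pattern, for macOS only; hw.l1dcachesize and hw.l2cachesize are standard Darwin sysctl names:

    /* Minimal macOS-only sketch of the 64-bit sysctl reads used above. */
    #include <sys/sysctl.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        int64_t v = 0;
        size_t len = sizeof(v);
        if (sysctlbyname("hw.l1dcachesize", &v, &len, NULL, 0) == 0)
            printf("#define L1_DATA_SIZE %lld\n", (long long)v);
        len = sizeof(v);
        if (sysctlbyname("hw.l2cachesize", &v, &len, NULL, 0) == 0)
            printf("#define L2_SIZE %lld\n", (long long)v);
        return 0;
    }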
@ -165,6 +165,7 @@ void get_cpuconfig(void){
}else{ }else{
printf("#define UNKNOWN\n"); printf("#define UNKNOWN\n");
} }
if (!get_feature(msa)) printf("#define NO_MSA\n");
} }
void get_libname(void){ void get_libname(void){
@ -178,3 +179,38 @@ void get_libname(void){
printf("mips\n"); printf("mips\n");
} }
} }
int get_feature(char *search)
{
#ifdef __linux
FILE *infile;
char buffer[2048], *p,*t;
p = (char *) NULL ;
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile))
{
if (!strncmp("Features", buffer, 8))
{
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if( p == NULL ) return 0;
t = strtok(p," ");
while( t = strtok(NULL," "))
{
if (!strcmp(t, search)) { return(1); }
}
#endif
return(0);
}
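get_feature() scans the "Features" line of /proc/cpuinfo for a named capability; here it is used to emit NO_MSA when the MIPS SIMD Architecture is absent. As written, the first token after the colon is fetched but never compared, since only tokens from the second strtok() call onward are checked. A hedged, Linux-only sketch of the same probe that also tests the first token and guards the missing-colon case:

    /* Sketch only: same /proc/cpuinfo probe, checking every feature token. */
    #include <stdio.h>
    #include <string.h>

    static int has_feature(const char *search) {
        FILE *infile = fopen("/proc/cpuinfo", "r");
        char buffer[2048], *p = NULL, *c, *t;
        if (!infile) return 0;
        while (fgets(buffer, sizeof(buffer), infile)) {
            if (!strncmp("Features", buffer, 8)) {
                c = strchr(buffer, ':');
                if (c) p = c + 2;
                break;
            }
        }
        fclose(infile);
        if (p == NULL) return 0;
        for (t = strtok(p, " \n"); t != NULL; t = strtok(NULL, " \n"))
            if (!strcmp(t, search)) return 1;
        return 0;
    }

    int main(void) { printf("msa: %d\n", has_feature("msa")); return 0; }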
@ -104,17 +104,17 @@ int detect(void){
} }
} }
fclose(infile); fclose(infile);
if(p != NULL){ if (p != NULL){
if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){
return CPU_LOONGSON3R3; return CPU_LOONGSON3R3;
}else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ } else if (strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){
return CPU_LOONGSON3R4; return CPU_LOONGSON3R4;
} else{ } else{
return CPU_SICORTEX; return CPU_SICORTEX;
}
} }
#endif #endif
return CPU_UNKNOWN; return CPU_UNKNOWN;
}
} }
char *get_corename(void){ char *get_corename(void){
@ -201,6 +201,7 @@ void get_cpuconfig(void){
printf("#define DTB_SIZE 4096\n"); printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 8\n"); printf("#define L2_ASSOCIATIVE 8\n");
} }
if (!get_feature(msa)) printf("#define NO_MSA\n");
} }
void get_libname(void){ void get_libname(void){
@ -218,3 +219,38 @@ void get_libname(void){
printf("mips64\n"); printf("mips64\n");
} }
} }
int get_feature(char *search)
{
#ifdef __linux
FILE *infile;
char buffer[2048], *p,*t;
p = (char *) NULL ;
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile))
{
if (!strncmp("Features", buffer, 8))
{
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if( p == NULL ) return 0;
t = strtok(p," ");
while( t = strtok(NULL," "))
{
if (!strcmp(t, search)) { return(1); }
}
#endif
return(0);
}
@ -1,3 +1,4 @@
//{
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */ /* All rights reserved. */
@ -266,6 +267,31 @@ int support_avx512_bf16(){
#endif #endif
} }
#define BIT_AMX_TILE 0x01000000
#define BIT_AMX_BF16 0x00400000
#define BIT_AMX_ENBD 0x00060000
int support_amx_bf16() {
#if !defined(NO_AVX) && !defined(NO_AVX512)
int eax, ebx, ecx, edx;
int ret=0;
if (!support_avx512())
return 0;
// CPUID.7.0:EDX indicates AMX support
cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
if ((edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) {
// CPUID.D.0:EAX[17:18] indicates AMX enabled
cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
if ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD)
ret = 1;
}
return ret;
#else
return 0;
#endif
}
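support_amx_bf16() requires both the AMX-TILE and AMX-BF16 bits in CPUID.(EAX=7,ECX=0):EDX and then checks the tile-state components reported in leaf 0xD before HAVE_AMXBF16 is advertised. A self-contained sketch of the leaf-7 probe using the compiler's <cpuid.h> helper (GCC/Clang on x86-64); full usability additionally depends on the OS enabling the AMX state, which the code above approximates via leaf 0xD:

    /* Sketch: bit 22 = AMX-BF16, bit 24 = AMX-TILE, matching BIT_AMX_BF16/BIT_AMX_TILE above. */
    #include <cpuid.h>
    #include <stdio.h>

    int main(void) {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
            return 1;                      /* leaf 7 not available */
        int amx_tile = (edx >> 24) & 1;
        int amx_bf16 = (edx >> 22) & 1;
        printf("AMX-TILE=%d AMX-BF16=%d\n", amx_tile, amx_bf16);
        return 0;
    }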
int get_vendor(void){ int get_vendor(void){
int eax, ebx, ecx, edx; int eax, ebx, ecx, edx;
char vendor[13]; char vendor[13];
@ -353,6 +379,7 @@ int get_cputype(int gettype){
if (support_avx2()) feature |= HAVE_AVX2; if (support_avx2()) feature |= HAVE_AVX2;
if (support_avx512()) feature |= HAVE_AVX512VL; if (support_avx512()) feature |= HAVE_AVX512VL;
if (support_avx512_bf16()) feature |= HAVE_AVX512BF16; if (support_avx512_bf16()) feature |= HAVE_AVX512BF16;
if (support_amx_bf16()) feature |= HAVE_AMXBF16;
if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3;
#endif #endif
@ -1429,10 +1456,10 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
} }
break; break;
case 9:
case 8: case 8:
switch (model) { switch (model) {
case 12: // Tiger Lake case 12: // Tiger Lake
case 13: // Tiger Lake (11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz)
if(support_avx512()) if(support_avx512())
return CPUTYPE_SKYLAKEX; return CPUTYPE_SKYLAKEX;
if(support_avx2()) if(support_avx2())
@ -1448,19 +1475,10 @@ int get_cpuname(void){
return CPUTYPE_SANDYBRIDGE; return CPUTYPE_SANDYBRIDGE;
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
} case 15: // Sapphire Rapids
case 10: //family 6 exmodel 10 if(support_avx512_bf16())
switch (model) { return CPUTYPE_COOPERLAKE;
case 5: // Comet Lake H and S if(support_avx512())
case 6: // Comet Lake U
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
case 7: // Rocket Lake
if(support_avx512())
return CPUTYPE_SKYLAKEX; return CPUTYPE_SKYLAKEX;
if(support_avx2()) if(support_avx2())
return CPUTYPE_HASWELL; return CPUTYPE_HASWELL;
@ -1468,8 +1486,57 @@ int get_cpuname(void){
return CPUTYPE_SANDYBRIDGE; return CPUTYPE_SANDYBRIDGE;
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
} }
break; break;
case 9:
switch (model) {
case 7: // Alder Lake desktop
case 10: // Alder Lake mobile
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
case 13: // Ice Lake NNPI
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
case 14: // Kaby Lake and refreshes
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
}
break;
case 10: //family 6 exmodel 10
switch (model) {
case 5: // Comet Lake H and S
case 6: // Comet Lake U
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
case 7: // Rocket Lake
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
}
break;
} }
break; break;
case 0x7: case 0x7:
@ -2042,32 +2109,7 @@ int get_coretype(void){
return CORE_NEHALEM; return CORE_NEHALEM;
} }
break; break;
case 10:
switch (model) {
case 5: // Comet Lake H and S
case 6: // Comet Lake U
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
case 7:// Rocket Lake
#ifndef NO_AVX512
if(support_avx512())
return CORE_SKYLAKEX;
#endif
#ifndef NO_AVX2
if(support_avx2())
return CORE_HASWELL;
#endif
if(support_avx())
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM;
}
case 5: case 5:
switch (model) { switch (model) {
case 6: case 6:
@ -2121,6 +2163,7 @@ int get_coretype(void){
return CORE_NEHALEM; return CORE_NEHALEM;
} }
break; break;
case 6: case 6:
if (model == 6) if (model == 6)
#ifndef NO_AVX512 #ifndef NO_AVX512
@ -2135,7 +2178,7 @@ int get_coretype(void){
else else
return CORE_NEHALEM; return CORE_NEHALEM;
#endif #endif
if (model == 10) if (model == 10 || model == 12)
#ifndef NO_AVX512 #ifndef NO_AVX512
if(support_avx512_bf16()) if(support_avx512_bf16())
return CORE_COOPERLAKE; return CORE_COOPERLAKE;
@ -2151,10 +2194,11 @@ int get_coretype(void){
return CORE_NEHALEM; return CORE_NEHALEM;
#endif #endif
break; break;
case 7: case 7:
if (model == 10) if (model == 10)
return CORE_NEHALEM; return CORE_NEHALEM;
if (model == 14) if (model == 13 || model == 14) // Ice Lake
#ifndef NO_AVX512 #ifndef NO_AVX512
return CORE_SKYLAKEX; return CORE_SKYLAKEX;
#else #else
@ -2168,9 +2212,9 @@ int get_coretype(void){
return CORE_NEHALEM; return CORE_NEHALEM;
#endif #endif
break; break;
case 9:
case 8: case 8:
if (model == 12) { // Tiger Lake if (model == 12 || model == 13) { // Tiger Lake
if(support_avx512()) if(support_avx512())
return CORE_SKYLAKEX; return CORE_SKYLAKEX;
if(support_avx2()) if(support_avx2())
@ -2180,7 +2224,7 @@ int get_coretype(void){
else else
return CORE_NEHALEM; return CORE_NEHALEM;
} }
if (model == 14) { // Kaby Lake if (model == 14) { // Kaby Lake mobile
if(support_avx()) if(support_avx())
#ifndef NO_AVX2 #ifndef NO_AVX2
return CORE_HASWELL; return CORE_HASWELL;
@ -2190,12 +2234,82 @@ int get_coretype(void){
else else
return CORE_NEHALEM; return CORE_NEHALEM;
} }
} if (model == 15) { // Sapphire Rapids
if(support_avx512_bf16())
return CPUTYPE_COOPERLAKE;
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
}
break; break;
case 9:
if (model == 7 || model == 10) { // Alder Lake
if(support_avx2())
return CORE_HASWELL;
if(support_avx())
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM;
}
if (model == 13) { // Ice Lake NNPI
if(support_avx512())
return CORE_SKYLAKEX;
if(support_avx2())
return CORE_HASWELL;
if(support_avx())
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM;
}
if (model == 14) { // Kaby Lake desktop
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
}
break;
case 10:
switch (model) {
case 5: // Comet Lake H and S
case 6: // Comet Lake U
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
case 7:// Rocket Lake
#ifndef NO_AVX512
if(support_avx512())
return CORE_SKYLAKEX;
#endif
#ifndef NO_AVX2
if(support_avx2())
return CORE_HASWELL;
#endif
if(support_avx())
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM;
}
case 15: case 15:
if (model <= 0x2) return CORE_NORTHWOOD; if (model <= 0x2) return CORE_NORTHWOOD;
else return CORE_PRESCOTT; else return CORE_PRESCOTT;
}
} }
} }
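The nested switches in get_cpuname() and get_coretype() key on the extended-model nibble and the low model nibble from CPUID leaf 1, so Alder Lake desktop (display model 0x97) lands in exmodel 9 / model 7 and Sapphire Rapids (0x8F) in exmodel 8 / model 15. A hedged sketch (GCC/Clang on x86) of how those fields are decoded per the Intel SDM:

    /* Sketch of the display family/model decoding that the switches above rely on. */
    #include <cpuid.h>
    #include <stdio.h>

    int main(void) {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return 1;
        unsigned int family   = (eax >> 8)  & 0xF;
        unsigned int model    = (eax >> 4)  & 0xF;
        unsigned int exfamily = (eax >> 20) & 0xFF;
        unsigned int exmodel  = (eax >> 16) & 0xF;
        unsigned int disp_family = (family == 0xF) ? family + exfamily : family;
        unsigned int disp_model  = (family == 0xF || family == 0x6)
                                   ? (exmodel << 4) + model : model;
        printf("family 0x%x, model 0x%x (exmodel %u, model %u)\n",
               disp_family, disp_model, exmodel, model);
        return 0;
    }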
@ -2389,6 +2503,7 @@ void get_cpuconfig(void){
if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n"); if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n");
if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n"); if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n");
if (features & HAVE_AVX512BF16 ) printf("#define HAVE_AVX512BF16\n"); if (features & HAVE_AVX512BF16 ) printf("#define HAVE_AVX512BF16\n");
if (features & HAVE_AMXBF16 ) printf("#define HAVE_AMXBF16\n");
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
@ -2460,9 +2575,11 @@ void get_sse(void){
if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n"); if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n");
if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n"); if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n");
if (features & HAVE_AVX512BF16 ) printf("HAVE_AVX512BF16=1\n"); if (features & HAVE_AVX512BF16 ) printf("HAVE_AVX512BF16=1\n");
if (features & HAVE_AMXBF16 ) printf("HAVE_AMXBF16=1\n");
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n"); if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n");
} }
//}
@ -27,57 +27,11 @@
#include <string.h> #include <string.h>
#define CPU_GENERIC 0 #include "cpuid_zarch.h"
#define CPU_Z13 1
#define CPU_Z14 2
#define CPU_Z15 3
static char *cpuname[] = {
"ZARCH_GENERIC",
"Z13",
"Z14",
"Z15"
};
static char *cpuname_lower[] = {
"zarch_generic",
"z13",
"z14",
"z15"
};
int detect(void)
{
FILE *infile;
char buffer[512], *p;
p = (char *)NULL;
infile = fopen("/proc/sysinfo", "r");
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("Type", buffer, 4)){
p = strchr(buffer, ':') + 2;
#if 0
fprintf(stderr, "%s\n", p);
#endif
break;
}
}
fclose(infile);
if (strstr(p, "2964")) return CPU_Z13;
if (strstr(p, "2965")) return CPU_Z13;
if (strstr(p, "3906")) return CPU_Z14;
if (strstr(p, "3907")) return CPU_Z14;
if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14
if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14
return CPU_GENERIC;
}
void get_libname(void) void get_libname(void)
{ {
int d = detect(); int d = detect();
printf("%s", cpuname_lower[d]); printf("%s", cpuname_lower[d]);
} }
cpuid_zarch.h (new file, 101 lines)
@ -0,0 +1,101 @@
#include <stdlib.h>
#define CPU_GENERIC 0
#define CPU_Z13 1
#define CPU_Z14 2
#define CPU_Z15 3
static char *cpuname[] = {
"ZARCH_GENERIC",
"Z13",
"Z14",
"Z15"
};
static char *cpuname_lower[] = {
"zarch_generic",
"z13",
"z14",
"z15"
};
// Guard the use of getauxval() on glibc version >= 2.16
#ifdef __GLIBC__
#include <features.h>
#if __GLIBC_PREREQ(2, 16)
#include <sys/auxv.h>
#define HAVE_GETAUXVAL 1
static unsigned long get_hwcap(void)
{
unsigned long hwcap = getauxval(AT_HWCAP);
char *maskenv;
// honor requests for not using specific CPU features in LD_HWCAP_MASK
maskenv = getenv("LD_HWCAP_MASK");
if (maskenv)
hwcap &= strtoul(maskenv, NULL, 0);
return hwcap;
// note that a missing auxval is interpreted as no capabilities
// available, which is safe.
}
#else // __GLIBC_PREREQ(2, 16)
#warn "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16"
static unsigned long get_hwcap(void) {
// treat missing support for getauxval() as no capabilities available,
// which is safe.
return 0;
}
#endif // __GLIBC_PREREQ(2, 16)
#endif // __GLIBC
static int detect(void)
{
unsigned long hwcap = get_hwcap();
// Choose the architecture level for optimized kernels based on hardware
// capability bits (just like glibc chooses optimized implementations).
//
// The hardware capability bits that are used here indicate both
// hardware support for a particular ISA extension and the presence of
// software support to enable its use. For example, when HWCAP_S390_VX
// is set then both the CPU can execute SIMD instructions and the Linux
// kernel can manage applications using the vector registers and SIMD
// instructions.
//
// See glibc's sysdeps/s390/dl-procinfo.h for an overview (also in
// sysdeps/unix/sysv/linux/s390/bits/hwcap.h) of the defined hardware
// capability bits. They are derived from the information that the
// "store facility list (extended)" instructions provide.
// (https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/s390/dl-procinfo.h;hb=HEAD)
//
// currently used:
// HWCAP_S390_VX - vector facility for z/Architecture (introduced with
// IBM z13), enables level CPU_Z13 (SIMD)
// HWCAP_S390_VXE - vector enhancements facility 1 (introduced with IBM
// z14), together with VX enables level CPU_Z14
// (single-precision SIMD instructions)
//
// When you add optimized kernels that make use of other ISA extensions
// (e.g., for exploiting the vector-enhancements facility 2 that was introduced
// with IBM z15), then add a new architecture level (e.g., CPU_Z15) and gate
// it on the hwcap that represents it here (e.g., HWCAP_S390_VXRS_EXT2
// for the z15 vector enhancements).
//
// To learn the value of hwcaps on a given system, set the environment
// variable LD_SHOW_AUXV and let ld.so dump it (e.g., by running
// LD_SHOW_AUXV=1 /bin/true).
// Also, the init function for dynamic arch support will print hwcaps
// when OPENBLAS_VERBOSE is set to 2 or higher.
if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE))
return CPU_Z14;
if (hwcap & HWCAP_S390_VX)
return CPU_Z13;
return CPU_GENERIC;
}
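The new detect() chooses the s390x architecture level from AT_HWCAP rather than from the machine-type string in /proc/sysinfo. A minimal sketch of the same probe; it requires glibc 2.16+ for getauxval(), and the fallback HWCAP constant values are assumptions for toolchains whose headers predate them:

    /* Hedged sketch of the AT_HWCAP probe used by the new s390x detect(). */
    #include <sys/auxv.h>
    #include <stdio.h>

    #ifndef HWCAP_S390_VX
    #define HWCAP_S390_VX  2048    /* vector facility (z13); value assumed from kernel uapi */
    #endif
    #ifndef HWCAP_S390_VXE
    #define HWCAP_S390_VXE 8192    /* vector enhancements 1 (z14); value assumed likewise */
    #endif

    int main(void) {
        unsigned long hwcap = getauxval(AT_HWCAP);
        if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE))
            puts("Z14 level (SIMD + single-precision vector ops)");
        else if (hwcap & HWCAP_S390_VX)
            puts("Z13 level (SIMD)");
        else
            puts("generic zarch level");
        return 0;
    }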
@ -333,7 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#else #else
for(jjs = js; jjs < js + min_j; jjs += min_jj){ for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs; min_jj = min_j + js - jjs;
#if defined(SKYLAKEX) || defined(COOPERLAKE) #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */ /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else #else
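On the AVX-512 targets (now including SAPPHIRERAPIDS) the inner loop caps the width of each B panel at 6*GEMM_UNROLL_N, per the comment above, because those GEMM microkernels only reach best performance with at least six unroll widths of B; the non-AVX-512 branch that follows uses a smaller cap. A toy illustration of the capping arithmetic, assuming GEMM_UNROLL_N = 8 and a simplified tail handling purely for the numbers:

    /* Toy illustration of the min_jj capping; unroll factor and tail rule are assumed. */
    #include <stdio.h>

    int main(void) {
        const int GEMM_UNROLL_N = 8, min_j = 200;
        for (int jjs = 0; jjs < min_j; ) {
            int min_jj = min_j - jjs;
            if (min_jj >= 6 * GEMM_UNROLL_N) min_jj = 6 * GEMM_UNROLL_N;  /* 48-wide strips */
            else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;      /* then 8-wide */
            printf("columns %d..%d\n", jjs, jjs + min_jj - 1);
            jjs += min_jj;
        }
        return 0;
    }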
@ -367,7 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Split local region of B into parts */ /* Split local region of B into parts */
for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){ for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){
min_jj = MIN(n_to, js + div_n) - jjs; min_jj = MIN(n_to, js + div_n) - jjs;
#if defined(SKYLAKEX) || defined(COOPERLAKE) #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else #else
@@ -138,7 +138,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
-#if defined(SKYLAKEX) || defined(COOPERLAKE)
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
@@ -215,7 +215,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
-#if defined(SKYLAKEX) || defined(COOPERLAKE)
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
@@ -320,7 +320,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
-#if defined(SKYLAKEX) || defined(COOPERLAKE)
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
@@ -399,7 +399,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
-#if defined(SKYLAKEX) || defined(COOPERLAKE)
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else


@@ -122,7 +122,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = 0; jjs < ls - js; jjs += min_jj){
min_jj = ls - js - jjs;
-#if defined(SKYLAKEX) || defined(COOPERLAKE)
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
@@ -146,7 +146,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = 0; jjs < min_l; jjs += min_jj){
min_jj = min_l - jjs;
-#if defined(SKYLAKEX) || defined(COOPERLAKE)
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
@@ -203,7 +203,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
-#if defined(SKYLAKEX) || defined(COOPERLAKE)
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
@@ -258,7 +258,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = 0; jjs < min_l; jjs += min_jj){
min_jj = min_l - jjs;
-#if defined(SKYLAKEX) || defined(COOPERLAKE)
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
@@ -283,7 +283,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){
min_jj = js - ls - min_l - jjs;
-#if defined(SKYLAKEX) || defined(COOPERLAKE)
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
@@ -344,7 +344,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
-#if defined(SKYLAKEX) || defined(COOPERLAKE)
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else


@@ -49,6 +49,8 @@ GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 ""
if (DYNAMIC_ARCH)
if (ARM64)
list(APPEND COMMON_SOURCES dynamic_arm64.c)
+elseif (POWER)
+list(APPEND COMMON_SOURCES dynamic_power.c)
else ()
list(APPEND COMMON_SOURCES dynamic.c)
endif ()


@@ -40,7 +40,7 @@
#include <stdlib.h>
#include "common.h"
-#if defined(OS_CYGWIN_NT) && !defined(unlikely)
+#if !defined(unlikely)
#ifdef __GNUC__
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
@@ -391,8 +391,9 @@ int blas_thread_init(void){
int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
-#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
+#if defined(SMP_SERVER)
// Handle lazy re-init of the thread-pool after a POSIX fork
+// on Cygwin or as delayed init when a static library is used
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
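The effect of the relaxed guard is easiest to see with the classic fork pattern; the sketch below is illustrative only (it assumes a CBLAS build of the library) and shows where the lazy blas_thread_init() would be triggered in the child.

```c
/* Sketch of why exec_blas_async() re-checks blas_server_avail: after
 * fork() only the calling thread survives in the child, so the server
 * thread pool has to be rebuilt lazily on the first BLAS call there. */
#include <unistd.h>
#include <sys/wait.h>
#include <cblas.h>

int main(void) {
    double a[4] = {1, 2, 3, 4}, b[4] = {1, 0, 0, 1}, c[4] = {0};
    /* parent: first call may spin up the thread pool */
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                2, 2, 2, 1.0, a, 2, b, 2, 0.0, c, 2);
    pid_t pid = fork();
    if (pid == 0) {
        /* child: the parent's pool is gone; a threaded call here may
         * trigger the lazy blas_thread_init() instead of hanging */
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    2, 2, 2, 1.0, a, 2, b, 2, 0.0, c, 2);
        _exit(0);
    }
    waitpid(pid, NULL, 0);
    return 0;
}
```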


@@ -624,7 +624,7 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM;
}
}
-if (model == 10) {
+if (model == 10 || model == 12){
// Ice Lake SP
if(support_avx512_bf16())
return &gotoblas_COOPERLAKE;
@@ -644,7 +644,7 @@ static gotoblas_t *get_coretype(void){
case 7:
if (model == 10) // Goldmont Plus
return &gotoblas_NEHALEM;
-if (model == 14) {
+if (model == 13 || model == 14) {
// Ice Lake
if (support_avx512())
return &gotoblas_SKYLAKEX;
@@ -661,9 +661,8 @@ static gotoblas_t *get_coretype(void){
}
}
return NULL;
-case 9:
case 8:
-if (model == 12) { // Tiger Lake
+if (model == 12 || model == 13) { // Tiger Lake
if (support_avx512())
return &gotoblas_SKYLAKEX;
if(support_avx2()){
@@ -689,6 +688,50 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
if (model == 15){ // Sapphire Rapids
if(support_avx512_bf16())
return &gotoblas_COOPERLAKE;
if (support_avx512())
return &gotoblas_SKYLAKEX;
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
return NULL;
case 9:
if (model == 7 || model == 10) { // Alder Lake
if(support_avx2()){
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
return &gotoblas_HASWELL;
}
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
if (model == 14 ) { // Kaby Lake, Coffee Lake
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
return NULL;
case 10:
if (model == 5 || model == 6) {
if(support_avx2())
@@ -1018,7 +1061,13 @@ void gotoblas_dynamic_init(void) {
#ifdef ARCH_X86
if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI;
#else
-if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT;
+if (gotoblas == NULL) {
+if (support_avx512_bf16()) gotoblas = &gotoblas_COOPERLAKE;
+else if (support_avx512()) gotoblas = &gotoblas_SKYLAKEX;
+else if (support_avx2()) gotoblas = &gotoblas_HASWELL;
+else if (support_avx()) gotoblas = &gotoblas_SANDYBRIDGE;
+else gotoblas = &gotoblas_PRESCOTT;
+}
/* sanity check, if 64bit pointer we can't have a 32 bit cpu */
if (sizeof(void*) == 8) {
if (gotoblas == &gotoblas_KATMAI ||
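The family/model numbers used in these branches come from CPUID leaf 1. A hedged sketch of the conventional decoding (not the exact helper in dynamic.c) looks like this; for example, Sapphire Rapids reports family 6, display model 0x8F, i.e. extended model 8 and low model 15.

```c
/* Illustrative only: standard Intel family/model decoding from CPUID
 * leaf 1. dynamic.c switches on the extended-model nibble and the low
 * model nibble separately, which is why Sapphire Rapids appears as
 * extended model 8, model 15 above. */
#include <cpuid.h>
#include <stdio.h>

int main(void) {
    unsigned eax, ebx, ecx, edx;
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return 1;
    unsigned family   = (eax >> 8)  & 0xf;
    unsigned model    = (eax >> 4)  & 0xf;
    unsigned exfamily = (eax >> 20) & 0xff;
    unsigned exmodel  = (eax >> 16) & 0xf;
    if (family == 6 || family == 15)
        model |= exmodel << 4;              /* "display" model number */
    printf("family=%u (ext %u)  model=0x%x\n", family, exfamily, model);
    return 0;
}
```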


@@ -1,38 +1,7 @@
#include "common.h"
#include "cpuid_zarch.h"
#include <stdbool.h>
// Guard the use of getauxval() on glibc version >= 2.16
#ifdef __GLIBC__
#include <features.h>
#if __GLIBC_PREREQ(2, 16)
#include <sys/auxv.h>
#define HAVE_GETAUXVAL 1
static unsigned long get_hwcap(void)
{
unsigned long hwcap = getauxval(AT_HWCAP);
char *maskenv;
// honor requests for not using specific CPU features in LD_HWCAP_MASK
maskenv = getenv("LD_HWCAP_MASK");
if (maskenv)
hwcap &= strtoul(maskenv, NULL, 0);
return hwcap;
// note that a missing auxval is interpreted as no capabilities
// available, which is safe.
}
#else // __GLIBC_PREREQ(2, 16)
#warn "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16"
static unsigned long get_hwcap(void) {
// treat missing support for getauxval() as no capabilities available,
// which is safe.
return 0;
}
#endif // __GLIBC_PREREQ(2, 16)
#endif // __GLIBC
extern gotoblas_t gotoblas_ZARCH_GENERIC;
#ifdef DYN_Z13
@@ -44,25 +13,19 @@ extern gotoblas_t gotoblas_Z14;
#define NUM_CORETYPES 4
extern int openblas_verbose();
extern void openblas_warning(int verbose, const char* msg);
static char* corename[] = {
"unknown",
"Z13",
"Z14",
"ZARCH_GENERIC",
};
char* gotoblas_corename(void) {
#ifdef DYN_Z13
-if (gotoblas == &gotoblas_Z13) return corename[1];
+if (gotoblas == &gotoblas_Z13) return cpuname[CPU_Z13];
#endif
#ifdef DYN_Z14
-if (gotoblas == &gotoblas_Z14) return corename[2];
+if (gotoblas == &gotoblas_Z14) return cpuname[CPU_Z14];
#endif
-if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3];
+if (gotoblas == &gotoblas_ZARCH_GENERIC) return cpuname[CPU_GENERIC];
-return corename[0];
+return "unknown";
}
#ifndef HWCAP_S390_VXE
@@ -79,25 +42,28 @@ char* gotoblas_corename(void) {
*/
static gotoblas_t* get_coretype(void) {
-unsigned long hwcap __attribute__((unused)) = get_hwcap();
+int cpu = detect();
-#ifdef DYN_Z14
+switch(cpu) {
// z14 and z15 systems: exploit Vector Facility (SIMD) and
// Vector-Enhancements Facility 1 (float SIMD instructions), if present.
-if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE))
+case CPU_Z14:
+#ifdef DYN_Z14
return &gotoblas_Z14;
#endif
-#ifdef DYN_Z13
// z13: Vector Facility (SIMD for double)
-if (hwcap & HWCAP_S390_VX)
+case CPU_Z13:
+#ifdef DYN_Z13
return &gotoblas_Z13;
#endif
+default:
// fallback in case of missing compiler support, systems before z13, or
// when the OS does not advertise support for the Vector Facility (e.g.,
// missing support in the OS kernel)
return &gotoblas_ZARCH_GENERIC;
+}
}
@@ -108,28 +74,28 @@ static gotoblas_t* force_coretype(char* coretype) {
for (i = 0; i < NUM_CORETYPES; i++)
{
-if (!strncasecmp(coretype, corename[i], 20))
+if (!strncasecmp(coretype, cpuname[i], 20))
{
found = i;
break;
}
}
-if (found == 1) {
+if (found == CPU_Z13) {
#ifdef DYN_Z13
return &gotoblas_Z13;
#else
openblas_warning(1, "Z13 support not compiled in");
return NULL;
#endif
-} else if (found == 2) {
+} else if (found == CPU_Z14) {
#ifdef DYN_Z14
return &gotoblas_Z14;
#else
openblas_warning(1, "Z14 support not compiled in");
return NULL;
#endif
-} else if (found == 3) {
+} else if (found == CPU_GENERIC) {
return &gotoblas_ZARCH_GENERIC;
}
@@ -155,6 +121,11 @@ void gotoblas_dynamic_init(void) {
else
{
gotoblas = get_coretype();
+if (openblas_verbose() >= 2) {
+snprintf(coremsg, sizeof(coremsg), "Choosing kernels based on getauxval(AT_HWCAP)=0x%lx\n",
+getauxval(AT_HWCAP));
+openblas_warning(2, coremsg);
+}
}
if (gotoblas == NULL)
@@ -165,9 +136,11 @@ void gotoblas_dynamic_init(void) {
}
if (gotoblas && gotoblas->init) {
-strncpy(coren, gotoblas_corename(), 20);
-sprintf(coremsg, "Core: %s\n", coren);
-openblas_warning(2, coremsg);
+if (openblas_verbose() >= 2) {
+strncpy(coren, gotoblas_corename(), 20);
+sprintf(coremsg, "Core: %s\n", coren);
+openblas_warning(2, coremsg);
+}
gotoblas->init();
}
else {


@@ -246,6 +246,14 @@ int get_num_procs(void) {
#endif
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
+#if defined(USE_OPENMP)
+#if _OPENMP >= 201511
+nums = omp_get_num_places();
+#endif
+return nums;
+#endif
#if !defined(OS_LINUX)
return nums;
#endif
@@ -1806,6 +1814,15 @@
#endif
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
+#if defined(USE_OPENMP)
+/* if (omp_get_proc_bind() != omp_proc_bind_false) */
+#if _OPENMP >= 201511
+nums = omp_get_num_places();
+#endif
+return nums;
+#endif
#if !defined(OS_LINUX)
return nums;
#endif
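The new OpenMP path relies on omp_get_num_places() (OpenMP 4.5 / 201511). Below is a standalone sketch, not OpenBLAS code, of what that call reports; build with -fopenmp, and note that the result depends on the runtime and on whether OMP_PLACES is set, which is why the sysconf() fallback above is kept.

```c
/* Sketch: with OpenMP >= 4.5, omp_get_num_places() reflects the place
 * list (e.g. OMP_PLACES=cores), which get_num_procs() now prefers over
 * the raw sysconf() processor count. */
#include <omp.h>
#include <stdio.h>

int main(void) {
#if _OPENMP >= 201511
    printf("omp_get_num_places() = %d\n", omp_get_num_places());
#else
    puts("OpenMP runtime older than 4.5; places API not available");
#endif
    return 0;
}
```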
@@ -2854,32 +2871,28 @@ void *blas_memory_alloc(int procpos){
position ++;
} while (position < NUM_BUFFERS);
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
if (memory_overflowed) {
-#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
-LOCK_COMMAND(&alloc_lock);
-#endif
do {
RMB;
#if defined(USE_OPENMP)
if (!newmemory[position-NUM_BUFFERS].used) {
blas_lock(&newmemory[position-NUM_BUFFERS].lock);
#endif
if (!newmemory[position-NUM_BUFFERS].used) goto allocation2;
#if defined(USE_OPENMP)
blas_unlock(&newmemory[position-NUM_BUFFERS].lock);
}
#endif
position ++;
} while (position < 512+NUM_BUFFERS);
}
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
}
goto error;
allocation :
@@ -2904,7 +2917,7 @@ void *blas_memory_alloc(int procpos){
func = &memoryalloc[0];
-while ((func != NULL) && (map_address == (void *) -1)) {
+while ((*func != NULL) && (map_address == (void *) -1)) {
map_address = (*func)((void *)base_address);
@@ -2984,6 +2997,9 @@ void *blas_memory_alloc(int procpos){
return (void *)memory[position].addr;
error:
+#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
+LOCK_COMMAND(&alloc_lock);
+#endif
if (memory_overflowed) goto terminate;
fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n");
memory_overflowed=1;
@@ -2997,7 +3013,6 @@ void *blas_memory_alloc(int procpos){
newmemory[i].used = 0;
newmemory[i].lock = 0;
}
-newmemory[position-NUM_BUFFERS].used = 1;
allocation2:
newmemory[position-NUM_BUFFERS].used = 1;
@@ -3015,7 +3030,7 @@ allocation2:
func = &memoryalloc[0];
-while ((func != NULL) && (map_address == (void *) -1)) {
+while ((*func != NULL) && (map_address == (void *) -1)) {
map_address = (*func)((void *)base_address);
@@ -3069,6 +3084,9 @@ allocation2:
return (void *)newmemory[position-NUM_BUFFERS].addr;
terminate:
+#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
+UNLOCK_COMMAND(&alloc_lock);
+#endif
printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS);
printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n");


@@ -183,7 +183,7 @@ int get_L2_size(void){
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || \
-defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE)
+defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
@@ -269,7 +269,7 @@ void blas_set_parameter(void){
int factor;
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || \
defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || \
-defined(SKYLAKEX) || defined(COOPERLAKE)
+defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
int size = 16;
#else
int size = get_L2_size();
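On the non-hardcoded path, get_L2_size() obtains the cache size from CPUID leaf 0x80000006; a minimal illustrative query (not the exact OpenBLAS code) is:

```c
/* Sketch: CPUID leaf 0x80000006 reports the L2 size in KB in ECX[31:16]
 * and the line size in ECX[7:0]; the fixed "size = 16" path above simply
 * skips this query on CPUs whose blocking is tuned independently. */
#include <cpuid.h>
#include <stdio.h>

int main(void) {
    unsigned eax, ebx, ecx, edx;
    if (!__get_cpuid(0x80000006, &eax, &ebx, &ecx, &edx)) return 1;
    printf("L2 = %u KB, line size = %u bytes\n", ecx >> 16, ecx & 0xff);
    return 0;
}
```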


@@ -469,6 +469,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#endif
#ifdef FORCE_SAPPHIRERAPIDS
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#ifdef NO_AVX512
#ifdef NO_AVX2
#ifdef NO_AVX
#define SUBARCHITECTURE "NEHALEM"
#define ARCHCONFIG "-DNEHALEM " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
#define LIBNAME "nehalem"
#define CORENAME "NEHALEM"
#else
#define SUBARCHITECTURE "SANDYBRIDGE"
#define ARCHCONFIG "-DSANDYBRIDGE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
#define LIBNAME "sandybridge"
#define CORENAME "SANDYBRIDGE"
#endif
#else
#define SUBARCHITECTURE "HASWELL"
#define ARCHCONFIG "-DHASWELL " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
"-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
#define LIBNAME "haswell"
#define CORENAME "HASWELL"
#endif
#else
#define SUBARCHITECTURE "SAPPHIRERAPIDS"
#define ARCHCONFIG "-DSAPPHIRERAPIDS " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
"-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=sapphirerapids"
#define LIBNAME "sapphirerapids"
#define CORENAME "SAPPHIRERAPIDS"
#endif
#endif
#ifdef FORCE_ATOM
#define FORCE
#define FORCE_INTEL
@@ -964,7 +1013,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DP5600 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
-"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
+"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA"
#define LIBNAME "p5600"
#define CORENAME "P5600"
#else
@@ -978,7 +1027,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DMIPS1004K " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
-"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
+"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA"
#define LIBNAME "mips1004K"
#define CORENAME "MIPS1004K"
#else
@@ -992,7 +1041,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DMIPS24K " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=32768 -DL2_LINESIZE=32 " \
-"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
+"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA"
#define LIBNAME "mips24K"
#define CORENAME "MIPS24K"
#else
@@ -1149,6 +1198,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif
#ifdef FORCE_ARMV8SVE
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "ARMV8SVE"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DARMV8SVE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8"
#define LIBNAME "armv8sve"
#define CORENAME "ARMV8SVE"
#endif
#ifdef FORCE_ARMV8
#define FORCE
@@ -1375,6 +1438,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "VORTEX"
#endif
#ifdef FORCE_A64FX
#define ARMV8
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "A64FX"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DA64FX " \
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=256 -DL1_CODE_ASSOCIATIVE=8 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=256 -DL1_DATA_ASSOCIATIVE=8 " \
"-DL2_SIZE=8388608 -DL2_LINESIZE=256 -DL2_ASSOCIATIVE=8 " \
"-DL3_SIZE=0 -DL3_LINESIZE=0 -DL3_ASSOCIATIVE=0 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8"
#define LIBNAME "a64fx"
#define CORENAME "A64FX"
#else
#endif
#ifdef FORCE_ZARCH_GENERIC
#define FORCE
#define ARCHITECTURE "ZARCH"


@@ -188,12 +188,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
if (n == 0) return;
-if (incx == 1 && trans == 0 && n < 50) {
-buffer = NULL;
-(trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer);
-return;
-}
IDEBUG_START;
FUNCTION_PROFILE_START();


@@ -42,14 +42,20 @@
#include "functable.h"
#endif
+#ifndef CBLAS
void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C, FLOAT *S){
BLASLONG n = *N;
BLASLONG incx = *INCX;
BLASLONG incy = *INCY;
FLOAT c = *C;
FLOAT s = *S;
+#else
+void CNAME(blasint n, void *VX, blasint incx, void *VY, blasint incy, FLOAT c, FLOAT s) {
+FLOAT *x = (FLOAT*) VX;
+FLOAT *y = (FLOAT*) VY;
+#endif /* CBLAS */
PRINT_DEBUG_NAME;
if (n <= 0) return;
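With the CBLAS entry point in place, the plane rotation can be applied directly from C; a small usage sketch with the standard cblas_drot prototype:

```c
/* Usage sketch for the C interface added above: applies the Givens
 * rotation (c, s) to the vector pair (x, y) in place. */
#include <stdio.h>
#include <cblas.h>

int main(void) {
    double x[3] = {1.0, 2.0, 3.0};
    double y[3] = {4.0, 5.0, 6.0};
    double c = 0.6, s = 0.8;              /* c*c + s*s == 1 */
    cblas_drot(3, x, 1, y, 1, c, s);
    for (int i = 0; i < 3; i++)
        printf("x[%d]=%.2f y[%d]=%.2f\n", i, x[i], i, y[i]);
    return 0;
}
```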


@@ -4,8 +4,16 @@
#include "functable.h"
#endif
+#ifndef CBLAS
void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
+#else
+void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
+FLOAT *DA = (FLOAT*) VDA;
+FLOAT *DB = (FLOAT*) VDB;
+FLOAT *S = (FLOAT*) VS;
+#endif /* CBLAS */
#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86)
long double da_r = *(DA + 0);


@@ -199,12 +199,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
if (n == 0) return;
-if (incx == 1 && trans == 0 && n < 50) {
-buffer = NULL;
-(trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer);
-return;
-}
IDEBUG_START;
FUNCTION_PROFILE_START();


@@ -9,11 +9,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
if (${DYNAMIC_ARCH})
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
endif ()
-ParseMakefileVars("${KERNELDIR}/KERNEL")
-ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}")
SetDefaultL1()
SetDefaultL2()
SetDefaultL3()
+ParseMakefileVars("${KERNELDIR}/KERNEL")
+ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}")
set(KERNEL_INTERFACE common_level1.h common_level2.h common_level3.h)
if(NOT NO_LAPACK)
@@ -198,7 +198,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
# Makefile.L3
set(USE_TRMM false)
string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE)
-if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE))
+if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE) OR (UC_TARGET_CORE MATCHES SAPPHIRERAPIDS))
set(USE_TRMM true)
endif ()
if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10))
@@ -418,32 +418,50 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type})
# symm for s and d
if (NOT DEFINED ${float_char}SYMMUCOPY_M)
set(SYMMUCOPY_M "generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c")
set(SYMMLCOPY_M "generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c")
else ()
set(SYMMUCOPY_M "${KERNELDIR}/${${float_char}SYMMUCOPY_M}")
set(SYMMLCOPY_M "${KERNELDIR}/${${float_char}SYMMLCOPY_M}")
endif()
GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type}) GenerateNamedObjects(${SYMMUCOPY_M} "" "symm_iutcopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type})
# These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. Also TRANS is not passed in as a define. # These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. Also TRANS is not passed in as a define.
# Could simplify it a bit by pairing up by -UUNIT/-DUNIT. # Could simplify it a bit by pairing up by -UUNIT/-DUNIT.
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) if (NOT DEFINED ${float_char}TRMMUNCOPY_M)
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) set(TRMMUNCOPY_M "generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c")
set(TRMMLNCOPY_M "generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c")
set(TRMMUTCOPY_M "generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c")
set(TRMMLTCOPY_M "generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c")
else ()
set(TRMMUNCOPY_M "${KERNELDIR}/${${float_char}TRMMUNCOPY_M}")
set(TRMMLNCOPY_M "${KERNELDIR}/${${float_char}TRMMLNCOPY_M}")
set(TRMMUTCOPY_M "${KERNELDIR}/${${float_char}TRMMUTCOPY_M}")
set(TRMMLTCOPY_M "${KERNELDIR}/${${float_char}TRMMLTCOPY_M}")
endif ()
GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type})
@@ -578,11 +596,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false ${float_type})
-GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type})
+GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type})
-GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type})
+GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type})
endif ()
if (BUILD_BFLOAT16)
if (NOT DEFINED SBGEMM_SMALL_M_PERMIT)
@@ -616,11 +634,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "BFLOAT16")
-GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16")
+GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16")
-GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16")
+GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16")
endif ()
endif ()


@@ -31,7 +31,22 @@ ifdef NO_AVX2
endif
ifdef TARGET_CORE
-ifeq ($(TARGET_CORE), COOPERLAKE)
+ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
ifeq ($(GCCVERSIONGTEQ10), 1)
override CFLAGS += -march=sapphirerapids
else
override CFLAGS += -march=skylake-avx512 -mavx512f
endif
ifeq ($(OSNAME), CYGWIN_NT)
override CFLAGS += -fno-asynchronous-unwind-tables
endif
ifeq ($(OSNAME), WINNT)
ifeq ($(C_COMPILER), GCC)
override CFLAGS += -fno-asynchronous-unwind-tables
endif
endif
else ifeq ($(TARGET_CORE), COOPERLAKE)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
ifeq ($(GCCVERSIONGTEQ10), 1)
override CFLAGS += -march=cooperlake


@@ -47,6 +47,10 @@ ifeq ($(CORE), COOPERLAKE)
USE_TRMM = 1
endif
+ifeq ($(CORE), SAPPHIRERAPIDS)
+USE_TRMM = 1
+endif
ifeq ($(CORE), ZEN)
USE_TRMM = 1
endif
@ -1479,29 +1483,61 @@ $(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XT
$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@ $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@
ifdef STRMMUNCOPY_M
$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c $(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c $(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
ifdef STRMMLNCOPY_M
$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c $(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c $(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
endif
ifdef STRMMUTCOPY_M
$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c $(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c $(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
ifdef STRMMLTCOPY_M
$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c $(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c $(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
endif
$(KDIR)strmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c $(KDIR)strmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
@ -1527,29 +1563,61 @@ $(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N
$(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c $(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
ifdef DTRMMUNCOPY_M
$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c $(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c $(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
ifdef DTRMMLNCOPY_M
$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c $(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c $(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
endif
ifdef DTRMMUTCOPY_M
$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c $(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c $(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
ifdef DTRMMLTCOPY_M
$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c $(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c $(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
endif
$(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c $(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
@ -1773,11 +1841,21 @@ $(KDIR)ssymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N).
$(KDIR)ssymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c $(KDIR)ssymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@
ifdef SSYMMUCOPY_M
$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMUCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
else
$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c $(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
endif
ifdef SSYMMLCOPY_M
$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMLCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
else
$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c $(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
endif
$(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@
@ -1785,11 +1863,21 @@ $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).
$(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c $(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@
ifdef DSYMMUCOPY_M
$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMUCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
else
$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c $(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
endif
ifdef DSYMMLCOPY_M
$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMLCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
else
$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c $(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
endif
$(KDIR)qsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c $(KDIR)qsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@

kernel/arm64/KERNEL.A64FX (new file, 183 lines)

@ -0,0 +1,183 @@
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S
SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
SGEMMINCOPY = sgemm_ncopy_sve_v1.c
SGEMMITCOPY = sgemm_tcopy_sve_v1.c
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
SSYMMUCOPY_M = symm_ucopy_sve.c
SSYMMLCOPY_M = symm_lcopy_sve.c
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
DSYMMUCOPY_M = symm_ucopy_sve.c
DSYMMLCOPY_M = symm_lcopy_sve.c
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)


@ -0,0 +1,183 @@
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S
SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
SGEMMINCOPY = sgemm_ncopy_sve_v1.c
SGEMMITCOPY = sgemm_tcopy_sve_v1.c
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
SSYMMUCOPY_M = symm_ucopy_sve.c
SSYMMLCOPY_M = symm_lcopy_sve.c
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
DSYMMUCOPY_M = symm_ucopy_sve.c
DSYMMLCOPY_M = symm_lcopy_sve.c
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)


@ -141,7 +141,7 @@ SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
@ -169,7 +169,7 @@ endif
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
@ -182,7 +182,7 @@ CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c


@ -141,7 +141,7 @@ SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
@ -169,7 +169,7 @@ endif
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
@ -182,7 +182,7 @@ CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c


@ -1 +1 @@
include $(KERNELDIR)/KERNEL.ARMV8
include $(KERNELDIR)/KERNEL.NEOVERSEN1


@ -0,0 +1,898 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <arm_neon.h>
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define FMLA_RI "fmla "
#define FMLA_IR "fmla "
#define FMLA_II "fmls "
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define FMLA_RI "fmls "
#define FMLA_IR "fmla "
#define FMLA_II "fmla "
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define FMLA_RI "fmla "
#define FMLA_IR "fmls "
#define FMLA_II "fmla "
#else
#define FMLA_RI "fmls "
#define FMLA_IR "fmls "
#define FMLA_II "fmls "
#endif
#define FMLA_RR "fmla "
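/* Reference sketch (illustrative only, inferred from how the macros are used
 * in the assembly below): FMLA_RR and FMLA_II feed the real accumulators,
 * FMLA_RI and FMLA_IR the imaginary ones, and the fmla/fmls choice per GEMM
 * variant encodes the complex multiply-accumulate sign pattern. For the plain
 * NN/NT/TN/TT case one update is equivalent to
 *   c_re += a_re * b_re - a_im * b_im;
 *   c_im += a_re * b_im + a_im * b_re;
 * the conjugated variants flip the sign of the terms that involve the
 * conjugated operand's imaginary part. */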
static inline void store_m8n1_contracted(float *C,
float32x4_t c1r, float32x4_t c1i, float32x4_t c2r, float32x4_t c2i,
float alphar, float alphai) {
float32x4x2_t ld1 = vld2q_f32(C), ld2 = vld2q_f32(C + 8);
ld1.val[0] = vfmaq_n_f32(ld1.val[0], c1r, alphar);
ld2.val[0] = vfmaq_n_f32(ld2.val[0], c2r, alphar);
ld1.val[1] = vfmaq_n_f32(ld1.val[1], c1r, alphai);
ld2.val[1] = vfmaq_n_f32(ld2.val[1], c2r, alphai);
ld1.val[0] = vfmsq_n_f32(ld1.val[0], c1i, alphai);
ld2.val[0] = vfmsq_n_f32(ld2.val[0], c2i, alphai);
ld1.val[1] = vfmaq_n_f32(ld1.val[1], c1i, alphar);
ld2.val[1] = vfmaq_n_f32(ld2.val[1], c2i, alphar);
vst2q_f32(C, ld1);
vst2q_f32(C + 8, ld2);
}
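/* For reference (illustrative, not used by the code): the store above performs
 * the complex-scaled update C += alpha * acc on interleaved C, i.e. per element
 *   C_re += alphar * acc_re - alphai * acc_im;
 *   C_im += alphai * acc_re + alphar * acc_im;
 * with vld2q_f32/vst2q_f32 de-interleaving the real and imaginary lanes. */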
static inline void kernel_8x4(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
const float *c_pref = C;
float32x4_t c1r, c1i, c2r, c2i, c3r, c3i, c4r, c4i;
float32x4_t c5r, c5i, c6r, c6i, c7r, c7i, c8r, c8i;
/** x0 for filling A, x1-x6 for filling B (x5 and x6 for real, x2 and x4 for imag) */
/** v0-v1 and v10-v11 for B, v2-v9 for A */
__asm__ __volatile__(
"cmp %[K],#0; mov %[c_pref],%[C]\n\t"
"movi %[c1r].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t"
"movi %[c1i].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t"
"movi %[c2r].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
"movi %[c2i].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t"
"movi %[c3r].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t"
"movi %[c3i].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
"movi %[c4r].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t"
"movi %[c4i].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t"
"movi %[c5r].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
"movi %[c5i].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t"
"movi %[c6r].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t"
"movi %[c6i].16b,#0\n\t"
"movi %[c7r].16b,#0; movi %[c7i].16b,#0\n\t"
"movi %[c8r].16b,#0; movi %[c8i].16b,#0\n\t"
"beq 4f\n\t"
"cmp %[K],#2\n\t"
"ldp x1,x2,[%[sb]],#16; ldr q2,[%[sa]],#64\n\t"
"ldp x3,x4,[%[sb]],#16; ldr d3,[%[sa],#-48]\n\t"
"mov w5,w1; mov w6,w3; ldr x0,[%[sa],#-40]\n\t"
"bfi x5,x2,#32,#32; bfi x6,x4,#32,#32; fmov d0,x5\n\t"
"bfxil x2,x1,#32,#32; bfxil x4,x3,#32,#32; fmov v0.d[1],x6\n\t"
"blt 3f; beq 2f\n\t"
"1:\n\t"
"fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t"
FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t"
FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]; ldr x1,[%[sb]],#64\n\t"
FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t"
"fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t"
FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t"
FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]; mov w5,w1\n\t"
FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t"
"fmov v5.d[1],x0; fmov d1,x2\n\t"
FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]; ldr x2,[%[sb],#-56]\n\t"
FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]; ldr x3,[%[sb],#-48]\n\t"
FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t"
"fmov v1.d[1],x4; ldr d6,[%[sa]]\n\t"
FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; ldr x0,[%[sa],#8]\n\t"
FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]; ldr x4,[%[sb],#-40]\n\t"
FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]; bfi x5,x2,#32,#32\n\t"
"fmov v6.d[1],x0; ldr d7,[%[sa],#16]\n\t"
FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]; ldr x0,[%[sa],#24]\n\t"
FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]; mov w6,w3\n\t"
FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]; bfxil x2,x1,#32,#32\n\t"
"fmov v7.d[1],x0; fmov d10,x5\n\t"
FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]; bfi x6,x4,#32,#32\n\t"
FMLA_II "%[c1r].4s,v1.4s,v2.s[1]; ldr x1,[%[sb],#-32]\n\t"
FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]; bfxil x4,x3,#32,#32\n\t"
"fmov v10.d[1],x6; fmov d11,x2\n\t"
FMLA_II "%[c2r].4s,v1.4s,v2.s[3]; ldr x2,[%[sb],#-24]\n\t"
FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]; ldr x3,[%[sb],#-16]\n\t"
FMLA_II "%[c3r].4s,v1.4s,v3.s[1]; mov w5,w1\n\t"
"fmov v11.d[1],x4; ldr d8,[%[sa],#32]\n\t"
FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]; ldr x0,[%[sa],#40]\n\t"
FMLA_II "%[c4r].4s,v1.4s,v3.s[3]; ldr x4,[%[sb],#-8]\n\t"
FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]; bfi x5,x2,#32,#32\n\t"
"fmov v8.d[1],x0; ldr d9,[%[sa],#48]\n\t"
FMLA_II "%[c5r].4s,v1.4s,v4.s[1]; ldr x0,[%[sa],#56]\n\t"
FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]; mov w6,w3\n\t"
FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t"
"fmov v9.d[1],x0; fmov d0,x5\n\t"
FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]; bfi x6,x4,#32,#32\n\t"
FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t"
FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t"
"fmov v0.d[1],x6; ldr d2,[%[sa],#64]\n\t"
FMLA_II "%[c8r].4s,v1.4s,v5.s[3]; ldr x0,[%[sa],#72]\n\t"
FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t"
FMLA_RR "%[c1r].4s,v10.4s,v6.s[0]\n\t"
"fmov v2.d[1],x0; ldr d3,[%[sa],#80]\n\t"
FMLA_IR "%[c1i].4s,v10.4s,v6.s[1]\n\t"
FMLA_RR "%[c2r].4s,v10.4s,v6.s[2]; ldr x0,[%[sa],#88]\n\t"
FMLA_IR "%[c2i].4s,v10.4s,v6.s[3]; bfxil x2,x1,#32,#32\n\t"
FMLA_RR "%[c3r].4s,v10.4s,v7.s[0]; bfxil x4,x3,#32,#32\n\t"
FMLA_IR "%[c3i].4s,v10.4s,v7.s[1]; add %[sa],%[sa],#128\n\t"
FMLA_RR "%[c4r].4s,v10.4s,v7.s[2]; prfm pldl1keep,[%[sb],#128]\n\t"
FMLA_IR "%[c4i].4s,v10.4s,v7.s[3]; sub %[K],%[K],#2\n\t"
FMLA_RR "%[c5r].4s,v10.4s,v8.s[0]; prfm pldl1keep,[%[sa],#128]\n\t"
FMLA_IR "%[c5i].4s,v10.4s,v8.s[1]; prfm pldl1keep,[%[sa],#192]\n\t"
FMLA_RR "%[c6r].4s,v10.4s,v8.s[2]; cmp %[K],#2\n\t"
FMLA_IR "%[c6i].4s,v10.4s,v8.s[3]\n\t"
FMLA_RR "%[c7r].4s,v10.4s,v9.s[0]\n\t" FMLA_IR "%[c7i].4s,v10.4s,v9.s[1]\n\t"
FMLA_RR "%[c8r].4s,v10.4s,v9.s[2]\n\t" FMLA_IR "%[c8i].4s,v10.4s,v9.s[3]\n\t"
FMLA_II "%[c1r].4s,v11.4s,v6.s[1]\n\t" FMLA_RI "%[c1i].4s,v11.4s,v6.s[0]\n\t"
FMLA_II "%[c2r].4s,v11.4s,v6.s[3]\n\t" FMLA_RI "%[c2i].4s,v11.4s,v6.s[2]\n\t"
FMLA_II "%[c3r].4s,v11.4s,v7.s[1]\n\t" FMLA_RI "%[c3i].4s,v11.4s,v7.s[0]\n\t"
FMLA_II "%[c4r].4s,v11.4s,v7.s[3]\n\t" FMLA_RI "%[c4i].4s,v11.4s,v7.s[2]\n\t"
FMLA_II "%[c5r].4s,v11.4s,v8.s[1]\n\t" FMLA_RI "%[c5i].4s,v11.4s,v8.s[0]\n\t"
FMLA_II "%[c6r].4s,v11.4s,v8.s[3]\n\t" FMLA_RI "%[c6i].4s,v11.4s,v8.s[2]\n\t"
FMLA_II "%[c7r].4s,v11.4s,v9.s[1]\n\t" FMLA_RI "%[c7i].4s,v11.4s,v9.s[0]\n\t"
FMLA_II "%[c8r].4s,v11.4s,v9.s[3]\n\t" FMLA_RI "%[c8i].4s,v11.4s,v9.s[2]\n\t"
"bgt 1b; blt 3f\n\t"
"2:\n\t"
"fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t"
FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t"
FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]; ldr x1,[%[sb]],#32\n\t"
FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t"
"fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t"
FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t"
FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]; mov w5,w1\n\t"
FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t"
"fmov v5.d[1],x0; fmov d1,x2\n\t"
FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]; ldr x2,[%[sb],#-24]\n\t"
FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]; ldr x3,[%[sb],#-16]\n\t"
FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t"
"fmov v1.d[1],x4; ldr d6,[%[sa]]\n\t"
FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; ldr x0,[%[sa],#8]\n\t"
FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]; ldr x4,[%[sb],#-8]\n\t"
FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]; bfi x5,x2,#32,#32\n\t"
"fmov v6.d[1],x0; ldr d7,[%[sa],#16]\n\t"
FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]; ldr x0,[%[sa],#24]\n\t"
FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]; mov w6,w3\n\t"
FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]; bfxil x2,x1,#32,#32\n\t"
"fmov v7.d[1],x0; fmov d10,x5\n\t"
FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]; bfi x6,x4,#32,#32\n\t"
FMLA_II "%[c1r].4s,v1.4s,v2.s[1]\n\t"
FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]; bfxil x4,x3,#32,#32\n\t"
"fmov v10.d[1],x6; fmov d11,x2\n\t"
FMLA_II "%[c2r].4s,v1.4s,v2.s[3]\n\t"
FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]\n\t"
FMLA_II "%[c3r].4s,v1.4s,v3.s[1]\n\t"
"fmov v11.d[1],x4; ldr d8,[%[sa],#32]\n\t"
FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]; ldr x0,[%[sa],#40]\n\t"
FMLA_II "%[c4r].4s,v1.4s,v3.s[3]; sub %[K],%[K],#2\n\t"
FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]\n\t"
"fmov v8.d[1],x0; ldr d9,[%[sa],#48]\n\t"
FMLA_II "%[c5r].4s,v1.4s,v4.s[1]; ldr x0,[%[sa],#56]\n\t"
FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]; add %[sa],%[sa],#64\n\t"
FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t"
"fmov v9.d[1],x0\n\t"
FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]\n\t"
FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t"
FMLA_II "%[c8r].4s,v1.4s,v5.s[3]\n\t" FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t"
FMLA_RR "%[c1r].4s,v10.4s,v6.s[0]\n\t" FMLA_IR "%[c1i].4s,v10.4s,v6.s[1]\n\t"
FMLA_RR "%[c2r].4s,v10.4s,v6.s[2]\n\t" FMLA_IR "%[c2i].4s,v10.4s,v6.s[3]\n\t"
FMLA_RR "%[c3r].4s,v10.4s,v7.s[0]\n\t" FMLA_IR "%[c3i].4s,v10.4s,v7.s[1]\n\t"
FMLA_RR "%[c4r].4s,v10.4s,v7.s[2]\n\t" FMLA_IR "%[c4i].4s,v10.4s,v7.s[3]\n\t"
FMLA_RR "%[c5r].4s,v10.4s,v8.s[0]\n\t" FMLA_IR "%[c5i].4s,v10.4s,v8.s[1]\n\t"
FMLA_RR "%[c6r].4s,v10.4s,v8.s[2]\n\t" FMLA_IR "%[c6i].4s,v10.4s,v8.s[3]\n\t"
FMLA_RR "%[c7r].4s,v10.4s,v9.s[0]\n\t" FMLA_IR "%[c7i].4s,v10.4s,v9.s[1]\n\t"
FMLA_RR "%[c8r].4s,v10.4s,v9.s[2]\n\t" FMLA_IR "%[c8i].4s,v10.4s,v9.s[3]\n\t"
FMLA_II "%[c1r].4s,v11.4s,v6.s[1]\n\t" FMLA_RI "%[c1i].4s,v11.4s,v6.s[0]\n\t"
FMLA_II "%[c2r].4s,v11.4s,v6.s[3]\n\t" FMLA_RI "%[c2i].4s,v11.4s,v6.s[2]\n\t"
FMLA_II "%[c3r].4s,v11.4s,v7.s[1]\n\t" FMLA_RI "%[c3i].4s,v11.4s,v7.s[0]\n\t"
FMLA_II "%[c4r].4s,v11.4s,v7.s[3]\n\t" FMLA_RI "%[c4i].4s,v11.4s,v7.s[2]\n\t"
FMLA_II "%[c5r].4s,v11.4s,v8.s[1]\n\t" FMLA_RI "%[c5i].4s,v11.4s,v8.s[0]\n\t"
FMLA_II "%[c6r].4s,v11.4s,v8.s[3]\n\t" FMLA_RI "%[c6i].4s,v11.4s,v8.s[2]\n\t"
FMLA_II "%[c7r].4s,v11.4s,v9.s[1]\n\t" FMLA_RI "%[c7i].4s,v11.4s,v9.s[0]\n\t"
FMLA_II "%[c8r].4s,v11.4s,v9.s[3]\n\t" FMLA_RI "%[c8i].4s,v11.4s,v9.s[2]\n\t"
"b 4f\n\t"
"3:\n\t"
"fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t"
FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t"
FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]\n\t"
FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t"
"fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t"
FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t"
FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]\n\t"
FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t"
"fmov v5.d[1],x0; fmov d1,x2\n\t"
FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]\n\t"
FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]\n\t"
FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t"
"fmov v1.d[1],x4\n\t"
FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; sub %[K],%[K],#1\n\t"
FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]\n\t" FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]\n\t"
FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]\n\t" FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]\n\t"
FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]\n\t" FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]\n\t"
FMLA_II "%[c1r].4s,v1.4s,v2.s[1]\n\t" FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]\n\t"
FMLA_II "%[c2r].4s,v1.4s,v2.s[3]\n\t" FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]\n\t"
FMLA_II "%[c3r].4s,v1.4s,v3.s[1]\n\t" FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]\n\t"
FMLA_II "%[c4r].4s,v1.4s,v3.s[3]\n\t" FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]\n\t"
FMLA_II "%[c5r].4s,v1.4s,v4.s[1]\n\t" FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]\n\t"
FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t" FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]\n\t"
FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t"
FMLA_II "%[c8r].4s,v1.4s,v5.s[3]\n\t" FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t"
"4:\n\t"
"mov %[c_pref],%[C]\n\t"
"zip1 v0.4s,%[c1r].4s,%[c2r].4s; prfm pstl1keep,[%[c_pref]]\n\t"
"zip1 v4.4s,%[c1i].4s,%[c2i].4s; prfm pstl1keep,[%[c_pref],#64]\n\t"
"zip1 v1.4s,%[c3r].4s,%[c4r].4s; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
"zip1 v5.4s,%[c3i].4s,%[c4i].4s; prfm pstl1keep,[%[c_pref]]\n\t"
"zip2 v2.4s,%[c1r].4s,%[c2r].4s; prfm pstl1keep,[%[c_pref],#64]\n\t"
"zip2 v6.4s,%[c1i].4s,%[c2i].4s; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
"zip2 v3.4s,%[c3r].4s,%[c4r].4s; prfm pstl1keep,[%[c_pref]]\n\t"
"zip2 v7.4s,%[c3i].4s,%[c4i].4s; prfm pstl1keep,[%[c_pref],#64]\n\t"
"zip1 %[c1r].2d,v0.2d,v1.2d; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
"zip1 %[c1i].2d,v4.2d,v5.2d; prfm pstl1keep,[%[c_pref]]\n\t"
"zip2 %[c2r].2d,v0.2d,v1.2d; prfm pstl1keep,[%[c_pref],#64]\n\t"
"zip2 %[c2i].2d,v4.2d,v5.2d\n\t"
"zip1 %[c3r].2d,v2.2d,v3.2d; zip1 %[c3i].2d,v6.2d,v7.2d\n\t"
"zip2 %[c4r].2d,v2.2d,v3.2d; zip2 %[c4i].2d,v6.2d,v7.2d\n\t"
"zip1 v0.4s,%[c5r].4s,%[c6r].4s; zip1 v4.4s,%[c5i].4s,%[c6i].4s\n\t"
"zip1 v1.4s,%[c7r].4s,%[c8r].4s; zip1 v5.4s,%[c7i].4s,%[c8i].4s\n\t"
"zip2 v2.4s,%[c5r].4s,%[c6r].4s; zip2 v6.4s,%[c5i].4s,%[c6i].4s\n\t"
"zip2 v3.4s,%[c7r].4s,%[c8r].4s; zip2 v7.4s,%[c7i].4s,%[c8i].4s\n\t"
"zip1 %[c5r].2d,v0.2d,v1.2d; zip1 %[c5i].2d,v4.2d,v5.2d\n\t"
"zip2 %[c6r].2d,v0.2d,v1.2d; zip2 %[c6i].2d,v4.2d,v5.2d\n\t"
"zip1 %[c7r].2d,v2.2d,v3.2d; zip1 %[c7i].2d,v6.2d,v7.2d\n\t"
"zip2 %[c8r].2d,v2.2d,v3.2d; zip2 %[c8i].2d,v6.2d,v7.2d\n\t"
:[c1r]"=w"(c1r), [c1i]"=w"(c1i), [c2r]"=w"(c2r), [c2i]"=w"(c2i),
[c3r]"=w"(c3r), [c3i]"=w"(c3i), [c4r]"=w"(c4r), [c4i]"=w"(c4i),
[c5r]"=w"(c5r), [c5i]"=w"(c5i), [c6r]"=w"(c6r), [c6i]"=w"(c6i),
[c7r]"=w"(c7r), [c7i]"=w"(c7i), [c8r]"=w"(c8r), [c8i]"=w"(c8i),
[K]"+r"(K), [sa]"+r"(sa), [sb]"+r"(sb), [c_pref]"+r"(c_pref)
:[C]"r"(C), [LDC]"r"(LDC)
:"cc","memory","x0","x1","x2","x3","x4","x5","x6",
"v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11");
store_m8n1_contracted(C, c1r, c1i, c5r, c5i, alphar, alphai); C += LDC * 2;
store_m8n1_contracted(C, c2r, c2i, c6r, c6i, alphar, alphai); C += LDC * 2;
store_m8n1_contracted(C, c3r, c3i, c7r, c7i, alphar, alphai); C += LDC * 2;
store_m8n1_contracted(C, c4r, c4i, c8r, c8i, alphar, alphai);
}
static inline float32x4x4_t acc_expanded_m2n2(float32x4x4_t acc,
float32x4_t a, float32x4_t b) {
acc.val[0] = vfmaq_laneq_f32(acc.val[0], a, b, 0);
acc.val[1] = vfmaq_laneq_f32(acc.val[1], a, b, 1);
acc.val[2] = vfmaq_laneq_f32(acc.val[2], a, b, 2);
acc.val[3] = vfmaq_laneq_f32(acc.val[3], a, b, 3);
return acc;
}
static inline float32x4x4_t expand_alpha(float alphar, float alphai) {
float32x4x4_t ret;
const float maskp[] = { -1, 1, -1, 1 };
const float maskn[] = { 1, -1, 1, -1 };
const float32x4_t vrevp = vld1q_f32(maskp);
const float32x4_t vrevn = vld1q_f32(maskn);
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
ret.val[0] = vdupq_n_f32(alphar);
ret.val[1] = vdupq_n_f32(-alphai);
ret.val[2] = vmulq_f32(ret.val[1], vrevn);
ret.val[3] = vmulq_f32(ret.val[0], vrevp);
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
ret.val[0] = vdupq_n_f32(alphar);
ret.val[1] = vdupq_n_f32(alphai);
ret.val[2] = vmulq_f32(ret.val[1], vrevp);
ret.val[3] = vmulq_f32(ret.val[0], vrevn);
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
ret.val[2] = vdupq_n_f32(alphai);
ret.val[3] = vdupq_n_f32(alphar);
ret.val[0] = vmulq_f32(ret.val[3], vrevn);
ret.val[1] = vmulq_f32(ret.val[2], vrevp);
#else
ret.val[2] = vdupq_n_f32(alphai);
ret.val[3] = vdupq_n_f32(-alphar);
ret.val[0] = vmulq_f32(ret.val[3], vrevp);
ret.val[1] = vmulq_f32(ret.val[2], vrevn);
#endif
return ret;
}
static inline void store_expanded_m2n2(float *C, BLASLONG LDC,
float32x4x4_t acc, float32x4x4_t expanded_alpha) {
float32x4_t ld1 = vld1q_f32(C), ld2 = vld1q_f32(C + LDC * 2);
ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[0]);
ld2 = vfmaq_f32(ld2, acc.val[2], expanded_alpha.val[0]);
acc.val[0] = vrev64q_f32(acc.val[0]);
acc.val[2] = vrev64q_f32(acc.val[2]);
ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[1]);
ld2 = vfmaq_f32(ld2, acc.val[3], expanded_alpha.val[1]);
acc.val[1] = vrev64q_f32(acc.val[1]);
acc.val[3] = vrev64q_f32(acc.val[3]);
ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[2]);
ld2 = vfmaq_f32(ld2, acc.val[2], expanded_alpha.val[2]);
ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[3]);
ld2 = vfmaq_f32(ld2, acc.val[3], expanded_alpha.val[3]);
vst1q_f32(C, ld1);
vst1q_f32(C + LDC * 2, ld2);
}
static inline float32x4x4_t init_expanded_m2n2() {
float32x4x4_t ret = {{ vdupq_n_f32(0), vdupq_n_f32(0),
vdupq_n_f32(0), vdupq_n_f32(0) }};
return ret;
}
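/* Reference sketch of the "expanded" scheme used by the intrinsic kernels
 * below (illustrative only): acc.val[0..3] accumulate the interleaved A vector
 * times the four scalar lanes of B (Re(b1), Im(b1), Re(b2), Im(b2)) with no
 * signs applied inside the K loop; store_expanded_m2n2() then combines the
 * accumulators with the expand_alpha() vectors (using vrev64q_f32 to swap
 * real/imaginary lanes) so that, e.g. for the NN case, each C element receives
 *   C_re += alphar * Re(sum a*b) - alphai * Im(sum a*b);
 *   C_im += alphai * Re(sum a*b) + alphar * Im(sum a*b);
 * which is the usual C += alpha * A * B for the selected conjugation variant. */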
static inline void kernel_4x4(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
float32x4x4_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init_expanded_m2n2();
for (; K > 1; K -= 2) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4),
a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16;
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4),
b3 = vld1q_f32(sb + 8), b4 = vld1q_f32(sb + 12); sb += 16;
c1 = acc_expanded_m2n2(c1, a1, b1);
c2 = acc_expanded_m2n2(c2, a2, b1);
c3 = acc_expanded_m2n2(c3, a1, b2);
c4 = acc_expanded_m2n2(c4, a2, b2);
c1 = acc_expanded_m2n2(c1, a3, b3);
c2 = acc_expanded_m2n2(c2, a4, b3);
c3 = acc_expanded_m2n2(c3, a3, b4);
c4 = acc_expanded_m2n2(c4, a4, b4);
}
if (K) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4);
c1 = acc_expanded_m2n2(c1, a1, b1);
c2 = acc_expanded_m2n2(c2, a2, b1);
c3 = acc_expanded_m2n2(c3, a1, b2);
c4 = acc_expanded_m2n2(c4, a2, b2);
}
float32x4x4_t e_alpha = expand_alpha(alphar, alphai);
store_expanded_m2n2(C, LDC, c1, e_alpha);
store_expanded_m2n2(C + 4, LDC, c2, e_alpha);
C += LDC * 4;
store_expanded_m2n2(C, LDC, c3, e_alpha);
store_expanded_m2n2(C + 4, LDC, c4, e_alpha);
}
static inline void kernel_8x2(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
float32x4x4_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init_expanded_m2n2();
for (; K > 1; K -= 2) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12);
float32x4_t a5 = vld1q_f32(sa + 16), a6 = vld1q_f32(sa + 20);
float32x4_t a7 = vld1q_f32(sa + 24), a8 = vld1q_f32(sa + 28); sa += 32;
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8;
c1 = acc_expanded_m2n2(c1, a1, b1);
c2 = acc_expanded_m2n2(c2, a2, b1);
c3 = acc_expanded_m2n2(c3, a3, b1);
c4 = acc_expanded_m2n2(c4, a4, b1);
c1 = acc_expanded_m2n2(c1, a5, b2);
c2 = acc_expanded_m2n2(c2, a6, b2);
c3 = acc_expanded_m2n2(c3, a7, b2);
c4 = acc_expanded_m2n2(c4, a8, b2);
}
if (K) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12);
float32x4_t b1 = vld1q_f32(sb);
c1 = acc_expanded_m2n2(c1, a1, b1);
c2 = acc_expanded_m2n2(c2, a2, b1);
c3 = acc_expanded_m2n2(c3, a3, b1);
c4 = acc_expanded_m2n2(c4, a4, b1);
}
float32x4x4_t e_alpha = expand_alpha(alphar, alphai);
store_expanded_m2n2(C, LDC, c1, e_alpha);
store_expanded_m2n2(C + 4, LDC, c2, e_alpha);
store_expanded_m2n2(C + 8, LDC, c3, e_alpha);
store_expanded_m2n2(C + 12, LDC, c4, e_alpha);
}
static inline void kernel_4x2(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
float32x4x4_t c1, c2;
c1 = c2 = init_expanded_m2n2();
for (; K > 1; K -= 2) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16;
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8;
c1 = acc_expanded_m2n2(c1, a1, b1);
c2 = acc_expanded_m2n2(c2, a2, b1);
c1 = acc_expanded_m2n2(c1, a3, b2);
c2 = acc_expanded_m2n2(c2, a4, b2);
}
if (K) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
float32x4_t b1 = vld1q_f32(sb);
c1 = acc_expanded_m2n2(c1, a1, b1);
c2 = acc_expanded_m2n2(c2, a2, b1);
}
float32x4x4_t e_alpha = expand_alpha(alphar, alphai);
store_expanded_m2n2(C, LDC, c1, e_alpha);
store_expanded_m2n2(C + 4, LDC, c2, e_alpha);
}
static inline void kernel_2x4(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
float32x4x4_t c1, c2;
c1 = c2 = init_expanded_m2n2();
for (; K > 1; K -= 2) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); sa += 8;
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4);
float32x4_t b3 = vld1q_f32(sb + 8), b4 = vld1q_f32(sb + 12); sb += 16;
c1 = acc_expanded_m2n2(c1, a1, b1);
c2 = acc_expanded_m2n2(c2, a1, b2);
c1 = acc_expanded_m2n2(c1, a2, b3);
c2 = acc_expanded_m2n2(c2, a2, b4);
}
if (K) {
float32x4_t a1 = vld1q_f32(sa);
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4);
c1 = acc_expanded_m2n2(c1, a1, b1);
c2 = acc_expanded_m2n2(c2, a1, b2);
}
float32x4x4_t e_alpha = expand_alpha(alphar, alphai);
store_expanded_m2n2(C, LDC, c1, e_alpha);
store_expanded_m2n2(C + LDC * 4, LDC, c2, e_alpha);
}
static inline void kernel_2x2(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
float32x4x4_t c1, c2;
c1 = c2 = init_expanded_m2n2();
for (; K > 1; K -= 2) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); sa += 8;
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8;
c1 = acc_expanded_m2n2(c1, a1, b1);
c2 = acc_expanded_m2n2(c2, a2, b2);
}
c1.val[0] = vaddq_f32(c1.val[0], c2.val[0]);
c1.val[1] = vaddq_f32(c1.val[1], c2.val[1]);
c1.val[2] = vaddq_f32(c1.val[2], c2.val[2]);
c1.val[3] = vaddq_f32(c1.val[3], c2.val[3]);
if (K) {
float32x4_t a1 = vld1q_f32(sa);
float32x4_t b1 = vld1q_f32(sb);
c1 = acc_expanded_m2n2(c1, a1, b1);
}
store_expanded_m2n2(C, LDC, c1, expand_alpha(alphar, alphai));
}
static inline float32x4x2_t acc_expanded_m2n1(float32x4x2_t acc,
float32x4_t a, float32x2_t b) {
acc.val[0] = vfmaq_lane_f32(acc.val[0], a, b, 0);
acc.val[1] = vfmaq_lane_f32(acc.val[1], a, b, 1);
return acc;
}
static inline void store_expanded_m2n1(float *C,
float32x4x2_t acc, float32x4x4_t expanded_alpha) {
float32x4_t ld1 = vld1q_f32(C);
ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[0]);
acc.val[0] = vrev64q_f32(acc.val[0]);
ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[1]);
acc.val[1] = vrev64q_f32(acc.val[1]);
ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[2]);
ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[3]);
vst1q_f32(C, ld1);
}
static inline float32x4x2_t init_expanded_m2n1() {
float32x4x2_t ret = {{ vdupq_n_f32(0), vdupq_n_f32(0) }};
return ret;
}
static inline void kernel_8x1(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K) {
float32x4x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init_expanded_m2n1();
for (; K > 1; K -= 2) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4),
a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12),
a5 = vld1q_f32(sa + 16), a6 = vld1q_f32(sa + 20),
a7 = vld1q_f32(sa + 24), a8 = vld1q_f32(sa + 28); sa += 32;
float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2); sb += 4;
c1 = acc_expanded_m2n1(c1, a1, b1);
c2 = acc_expanded_m2n1(c2, a2, b1);
c3 = acc_expanded_m2n1(c3, a3, b1);
c4 = acc_expanded_m2n1(c4, a4, b1);
c1 = acc_expanded_m2n1(c1, a5, b2);
c2 = acc_expanded_m2n1(c2, a6, b2);
c3 = acc_expanded_m2n1(c3, a7, b2);
c4 = acc_expanded_m2n1(c4, a8, b2);
}
if (K) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4),
a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12);
float32x2_t b1 = vld1_f32(sb);
c1 = acc_expanded_m2n1(c1, a1, b1);
c2 = acc_expanded_m2n1(c2, a2, b1);
c3 = acc_expanded_m2n1(c3, a3, b1);
c4 = acc_expanded_m2n1(c4, a4, b1);
}
float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai);
store_expanded_m2n1(C, c1, expanded_alpha);
store_expanded_m2n1(C + 4, c2, expanded_alpha);
store_expanded_m2n1(C + 8, c3, expanded_alpha);
store_expanded_m2n1(C + 12, c4, expanded_alpha);
}
static inline void kernel_4x1(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K) {
float32x4x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init_expanded_m2n1();
for (; K > 1; K -= 2) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4),
a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16;
float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2); sb += 4;
c1 = acc_expanded_m2n1(c1, a1, b1);
c2 = acc_expanded_m2n1(c2, a2, b1);
c3 = acc_expanded_m2n1(c3, a3, b2);
c4 = acc_expanded_m2n1(c4, a4, b2);
}
c1.val[0] = vaddq_f32(c1.val[0], c3.val[0]);
c1.val[1] = vaddq_f32(c1.val[1], c3.val[1]);
c2.val[0] = vaddq_f32(c2.val[0], c4.val[0]);
c2.val[1] = vaddq_f32(c2.val[1], c4.val[1]);
if (K) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
float32x2_t b1 = vld1_f32(sb);
c1 = acc_expanded_m2n1(c1, a1, b1);
c2 = acc_expanded_m2n1(c2, a2, b1);
}
float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai);
store_expanded_m2n1(C, c1, expanded_alpha);
store_expanded_m2n1(C + 4, c2, expanded_alpha);
}
static inline void kernel_2x1(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K) {
float32x4x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init_expanded_m2n1();
for (; K > 3; K -= 4) {
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4),
a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16;
float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2),
b3 = vld1_f32(sb + 4), b4 = vld1_f32(sb + 6); sb += 8;
c1 = acc_expanded_m2n1(c1, a1, b1);
c2 = acc_expanded_m2n1(c2, a2, b2);
c3 = acc_expanded_m2n1(c3, a3, b3);
c4 = acc_expanded_m2n1(c4, a4, b4);
}
c1.val[0] = vaddq_f32(c1.val[0], c3.val[0]);
c1.val[1] = vaddq_f32(c1.val[1], c3.val[1]);
c2.val[0] = vaddq_f32(c2.val[0], c4.val[0]);
c2.val[1] = vaddq_f32(c2.val[1], c4.val[1]);
c1.val[0] = vaddq_f32(c1.val[0], c2.val[0]);
c1.val[1] = vaddq_f32(c1.val[1], c2.val[1]);
for (; K; K--) {
float32x4_t a1 = vld1q_f32(sa); sa += 4;
float32x2_t b1 = vld1_f32(sb); sb += 2;
c1 = acc_expanded_m2n1(c1, a1, b1);
}
float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai);
store_expanded_m2n1(C, c1, expanded_alpha);
}
static inline float32x2x4_t expand_alpha_d(float alphar, float alphai) {
float32x2x4_t ret;
const float maskp[] = { -1, 1 };
const float maskn[] = { 1, -1 };
const float32x2_t vrevp = vld1_f32(maskp);
const float32x2_t vrevn = vld1_f32(maskn);
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
ret.val[0] = vdup_n_f32(alphar);
ret.val[1] = vdup_n_f32(-alphai);
ret.val[2] = vmul_f32(ret.val[1], vrevn);
ret.val[3] = vmul_f32(ret.val[0], vrevp);
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
ret.val[0] = vdup_n_f32(alphar);
ret.val[1] = vdup_n_f32(alphai);
ret.val[2] = vmul_f32(ret.val[1], vrevp);
ret.val[3] = vmul_f32(ret.val[0], vrevn);
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
ret.val[2] = vdup_n_f32(alphai);
ret.val[3] = vdup_n_f32(alphar);
ret.val[0] = vmul_f32(ret.val[3], vrevn);
ret.val[1] = vmul_f32(ret.val[2], vrevp);
#else
ret.val[2] = vdup_n_f32(alphai);
ret.val[3] = vdup_n_f32(-alphar);
ret.val[0] = vmul_f32(ret.val[3], vrevp);
ret.val[1] = vmul_f32(ret.val[2], vrevn);
#endif
return ret;
}
static inline float32x2x2_t acc_expanded_m1n1(float32x2x2_t acc,
float32x2_t a, float32x2_t b) {
acc.val[0] = vfma_lane_f32(acc.val[0], a, b, 0);
acc.val[1] = vfma_lane_f32(acc.val[1], a, b, 1);
return acc;
}
static inline void store_expanded_m1n1(float *C,
float32x2x2_t acc, float32x2x4_t expanded_alpha) {
float32x2_t ld1 = vld1_f32(C);
ld1 = vfma_f32(ld1, acc.val[0], expanded_alpha.val[0]);
acc.val[0] = vrev64_f32(acc.val[0]);
ld1 = vfma_f32(ld1, acc.val[1], expanded_alpha.val[1]);
acc.val[1] = vrev64_f32(acc.val[1]);
ld1 = vfma_f32(ld1, acc.val[0], expanded_alpha.val[2]);
ld1 = vfma_f32(ld1, acc.val[1], expanded_alpha.val[3]);
vst1_f32(C, ld1);
}
static inline float32x2x2_t init_expanded_m1n1() {
float32x2x2_t ret = {{ vdup_n_f32(0), vdup_n_f32(0) }};
return ret;
}
static inline void kernel_1x4(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
float32x2x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init_expanded_m1n1();
for (; K; K--) {
float32x2_t a1 = vld1_f32(sa); sa += 2;
c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb));
c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2));
c3 = acc_expanded_m1n1(c3, a1, vld1_f32(sb + 4));
c4 = acc_expanded_m1n1(c4, a1, vld1_f32(sb + 6));
sb += 8;
}
float32x2x4_t expanded_alpha = expand_alpha_d(alphar, alphai);
store_expanded_m1n1(C, c1, expanded_alpha); C += LDC * 2;
store_expanded_m1n1(C, c2, expanded_alpha); C += LDC * 2;
store_expanded_m1n1(C, c3, expanded_alpha); C += LDC * 2;
store_expanded_m1n1(C, c4, expanded_alpha);
}
static inline void kernel_1x2(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
float32x2x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init_expanded_m1n1();
for (; K > 1; K -= 2) {
float32x2_t a1 = vld1_f32(sa), a2 = vld1_f32(sa + 2); sa += 4;
c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb));
c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2));
c3 = acc_expanded_m1n1(c3, a2, vld1_f32(sb + 4));
c4 = acc_expanded_m1n1(c4, a2, vld1_f32(sb + 6));
sb += 8;
}
c1.val[0] = vadd_f32(c1.val[0], c3.val[0]);
c1.val[1] = vadd_f32(c1.val[1], c3.val[1]);
c2.val[0] = vadd_f32(c2.val[0], c4.val[0]);
c2.val[1] = vadd_f32(c2.val[1], c4.val[1]);
if (K) {
float32x2_t a1 = vld1_f32(sa);
c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb));
c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2));
}
float32x2x4_t expanded_alpha = expand_alpha_d(alphar, alphai);
store_expanded_m1n1(C, c1, expanded_alpha); C += LDC * 2;
store_expanded_m1n1(C, c2, expanded_alpha);
}
static inline void kernel_1x1(const float *sa, const float *sb, float *C,
float alphar, float alphai, BLASLONG K) {
float32x2x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init_expanded_m1n1();
for (; K > 3; K -= 4) {
c1 = acc_expanded_m1n1(c1, vld1_f32(sa), vld1_f32(sb));
c2 = acc_expanded_m1n1(c2, vld1_f32(sa + 2), vld1_f32(sb + 2));
c3 = acc_expanded_m1n1(c3, vld1_f32(sa + 4), vld1_f32(sb + 4));
c4 = acc_expanded_m1n1(c4, vld1_f32(sa + 6), vld1_f32(sb + 6));
sa += 8; sb += 8;
}
c1.val[0] = vadd_f32(c1.val[0], c3.val[0]);
c1.val[1] = vadd_f32(c1.val[1], c3.val[1]);
c2.val[0] = vadd_f32(c2.val[0], c4.val[0]);
c2.val[1] = vadd_f32(c2.val[1], c4.val[1]);
c1.val[0] = vadd_f32(c1.val[0], c2.val[0]);
c1.val[1] = vadd_f32(c1.val[1], c2.val[1]);
for (; K; K--) {
c1 = acc_expanded_m1n1(c1, vld1_f32(sa), vld1_f32(sb));
sa += 2; sb += 2;
}
store_expanded_m1n1(C, c1, expand_alpha_d(alphar, alphai));
}
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai,
FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) {
BLASLONG n_left = N;
for (; n_left >= 8; n_left -= 8) {
const FLOAT *a_ = sa;
FLOAT *c1_ = C;
FLOAT *c2_ = C + LDC * 8;
const FLOAT *b1_ = sb;
const FLOAT *b2_ = sb + K * 8;
BLASLONG m_left = M;
for (; m_left >= 8; m_left -= 8) {
kernel_8x4(a_, b1_, c1_, alphar, alphai, K, LDC);
kernel_8x4(a_, b2_, c2_, alphar, alphai, K, LDC);
a_ += 16 * K;
c1_ += 16;
c2_ += 16;
}
if (m_left >= 4) {
m_left -= 4;
kernel_4x4(a_, b1_, c1_, alphar, alphai, K, LDC);
kernel_4x4(a_, b2_, c2_, alphar, alphai, K, LDC);
a_ += 8 * K;
c1_ += 8;
c2_ += 8;
}
if (m_left >= 2) {
m_left -= 2;
kernel_2x4(a_, b1_, c1_, alphar, alphai, K, LDC);
kernel_2x4(a_, b2_, c2_, alphar, alphai, K, LDC);
a_ += 4 * K;
c1_ += 4;
c2_ += 4;
}
if (m_left) {
kernel_1x4(a_, b1_, c1_, alphar, alphai, K, LDC);
kernel_1x4(a_, b2_, c2_, alphar, alphai, K, LDC);
}
C += 16 * LDC;
sb += 16 * K;
}
if (n_left >= 4) {
n_left -= 4;
const FLOAT *a_ = sa;
FLOAT *c_ = C;
BLASLONG m_left = M;
for (; m_left >= 8; m_left -= 8) {
kernel_8x4(a_, sb, c_, alphar, alphai, K, LDC);
a_ += 16 * K;
c_ += 16;
}
if (m_left >= 4) {
m_left -= 4;
kernel_4x4(a_, sb, c_, alphar, alphai, K, LDC);
a_ += 8 * K;
c_ += 8;
}
if (m_left >= 2) {
m_left -= 2;
kernel_2x4(a_, sb, c_, alphar, alphai, K, LDC);
a_ += 4 * K;
c_ += 4;
}
if (m_left) {
kernel_1x4(a_, sb, c_, alphar, alphai, K, LDC);
}
C += 8 * LDC;
sb += 8 * K;
}
if (n_left >= 2) {
n_left -= 2;
const FLOAT *a_ = sa;
FLOAT *c_ = C;
BLASLONG m_left = M;
for (; m_left >= 8; m_left -= 8) {
kernel_8x2(a_, sb, c_, alphar, alphai, K, LDC);
a_ += 16 * K;
c_ += 16;
}
if (m_left >= 4) {
m_left -= 4;
kernel_4x2(a_, sb, c_, alphar, alphai, K, LDC);
a_ += 8 * K;
c_ += 8;
}
if (m_left >= 2) {
m_left -= 2;
kernel_2x2(a_, sb, c_, alphar, alphai, K, LDC);
a_ += 4 * K;
c_ += 4;
}
if (m_left) {
kernel_1x2(a_, sb, c_, alphar, alphai, K, LDC);
}
C += 4 * LDC;
sb += 4 * K;
}
if (n_left) {
BLASLONG m_left = M;
for (; m_left >= 8; m_left -= 8) {
kernel_8x1(sa, sb, C, alphar, alphai, K);
sa += 16 * K;
C += 16;
}
if (m_left >= 4) {
m_left -= 4;
kernel_4x1(sa, sb, C, alphar, alphai, K);
sa += 8 * K;
C += 8;
}
if (m_left >= 2) {
m_left -= 2;
kernel_2x1(sa, sb, C, alphar, alphai, K);
sa += 4 * K;
C += 4;
}
if (m_left) {
kernel_1x1(sa, sb, C, alphar, alphai, K);
}
}
return 0;
}


@ -0,0 +1,890 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <arm_neon.h>
/**********************************************************
* Function: dgemm_kernel_arm_cortex_a53_4x4_m4n12
* Operation: C[4][12] += alpha * sa[4][K] * sb[K][12]
* Matrix orders:
* sa: column-major (leading dimension == 4)
* sb: 3 concatenated row-major 4-column submatrices
* C: column-major (leading dimension == LDC)
*********************************************************/
static inline void dgemm_kernel_arm_cortex_a53_4x4_m4n12(
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
/** prefetch 4x12 elements from matrix C for RW purpose */
__asm__ __volatile__(
"mov x0,%[C]\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]\n\t"
::[C]"r"(C), [LDC]"r"(LDC):"x0");
/** 3 pointers to 3 submatrices of sb respectively */
const FLOAT *b1_ = sb;
const FLOAT *b2_ = sb + K * 4;
const FLOAT *b3_ = sb + K * 8;
/** register mapping of 4x12 elements of C, row-id ==> coordinate-M, column-id ==> coordinate-N */
/** v8.d[0] v10.d[0] v12.d[0] v14.d[0] v16.d[0] v18.d[0] v20.d[0] v22.d[0] v24.d[0] v26.d[0] v28.d[0] v30.d[0] */
/** v8.d[1] v10.d[1] v12.d[1] v14.d[1] v16.d[1] v18.d[1] v20.d[1] v22.d[1] v24.d[1] v26.d[1] v28.d[1] v30.d[1] */
/** v9.d[0] v11.d[0] v13.d[0] v15.d[0] v17.d[0] v19.d[0] v21.d[0] v23.d[0] v25.d[0] v27.d[0] v29.d[0] v31.d[0] */
/** v9.d[1] v11.d[1] v13.d[1] v15.d[1] v17.d[1] v19.d[1] v21.d[1] v23.d[1] v25.d[1] v27.d[1] v29.d[1] v31.d[1] */
__asm__ __volatile__(
"cmp %[K],#0\n\t"
/** fill registers holding elements of C with 0.0 */
"movi v8.16b,#0; movi v9.16b,#0; movi v10.16b,#0; movi v11.16b,#0\n\t"
"movi v12.16b,#0; movi v13.16b,#0; movi v14.16b,#0; movi v15.16b,#0\n\t"
"movi v16.16b,#0; movi v17.16b,#0; movi v18.16b,#0; movi v19.16b,#0\n\t"
"movi v20.16b,#0; movi v21.16b,#0; movi v22.16b,#0; movi v23.16b,#0\n\t"
"movi v24.16b,#0; movi v25.16b,#0; movi v26.16b,#0; movi v27.16b,#0\n\t"
"movi v28.16b,#0; movi v29.16b,#0; movi v30.16b,#0; movi v31.16b,#0\n\t"
"beq 4f; cmp %[K],#2\n\t"
/** register v0-v3 for loading A, v4-v7 for loading B, x0 for transporting data */
"ldp q0,q1,[%[sa]]; ldp q4,q5,[%[b1_]]\n\t"
"ldr d6,[%[b2_]]; ldr x0,[%[b2_],#8]\n\t"
"blt 3f; beq 2f\n\t"
"1:\n\t"
/** main loop with unroll_k = 2, specially designed for cortex-A53 NEON pipeline */
"ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t"
"fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t"
"fmla v9.2d,v1.2d,v4.d[0]; prfm pldl1keep,[%[sa],#128]\n\t"
"fmla v10.2d,v0.2d,v4.d[1]\n\t"
"ldr d2,[%[sa],#32]; fmov v7.d[1],x0\n\t"
"fmla v11.2d,v1.2d,v4.d[1]; ldr x0,[%[sa],#40]\n\t"
"fmla v12.2d,v0.2d,v5.d[0]\n\t"
"fmla v13.2d,v1.2d,v5.d[0]\n\t"
"ldr d4,[%[b3_]]; fmov v2.d[1],x0\n\t"
"fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t"
"fmla v15.2d,v1.2d,v5.d[1]\n\t"
"fmla v16.2d,v0.2d,v6.d[0]\n\t"
"ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t"
"fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t"
"fmla v18.2d,v0.2d,v6.d[1]\n\t"
"fmla v19.2d,v1.2d,v6.d[1]\n\t"
"ldr d3,[%[sa],#48]; fmov v5.d[1],x0\n\t"
"fmla v20.2d,v0.2d,v7.d[0]; ldr x0,[%[sa],#56]\n\t"
"fmla v21.2d,v1.2d,v7.d[0]; add %[sa],%[sa],#64\n\t"
"fmla v22.2d,v0.2d,v7.d[1]\n\t"
"ldr d6,[%[b1_],#32]; fmov v3.d[1],x0\n\t"
"fmla v23.2d,v1.2d,v7.d[1]; ldr x0,[%[b1_],#40]\n\t"
"fmla v24.2d,v0.2d,v4.d[0]; prfm pldl1keep,[%[b1_],#128]\n\t"
"fmla v25.2d,v1.2d,v4.d[0]\n\t"
"ldr d7,[%[b1_],#48]; fmov v6.d[1],x0\n\t"
"fmla v26.2d,v0.2d,v4.d[1]; ldr x0,[%[b1_],#56]\n\t"
"fmla v27.2d,v1.2d,v4.d[1]; add %[b1_],%[b1_],#64\n\t"
"fmla v28.2d,v0.2d,v5.d[0]\n\t"
"ldr d4,[%[b2_],#32]; fmov v7.d[1],x0\n\t"
"fmla v29.2d,v1.2d,v5.d[0]; ldr x0,[%[b2_],#40]\n\t"
"fmla v30.2d,v0.2d,v5.d[1]; prfm pldl1keep,[%[b2_],#128]\n\t"
"fmla v31.2d,v1.2d,v5.d[1]\n\t"
"ldr d0,[%[sa]]; fmov v4.d[1],x0\n\t"
"fmla v8.2d,v2.2d,v6.d[0]; ldr x0,[%[sa],#8]\n\t"
"fmla v9.2d,v3.2d,v6.d[0]\n\t"
"fmla v10.2d,v2.2d,v6.d[1]\n\t"
"ldr d5,[%[b2_],#48]; fmov v0.d[1],x0\n\t"
"fmla v11.2d,v3.2d,v6.d[1]; ldr x0,[%[b2_],#56]\n\t"
"fmla v12.2d,v2.2d,v7.d[0]; add %[b2_],%[b2_],#64\n\t"
"fmla v13.2d,v3.2d,v7.d[0]\n\t"
"ldr d6,[%[b3_],#32]; fmov v5.d[1],x0\n\t"
"fmla v14.2d,v2.2d,v7.d[1]; ldr x0,[%[b3_],#40]\n\t"
"fmla v15.2d,v3.2d,v7.d[1]; prfm pldl1keep,[%[b3_],#128]\n\t"
"fmla v16.2d,v2.2d,v4.d[0]\n\t"
"ldr d7,[%[b3_],#48]; fmov v6.d[1],x0\n\t"
"fmla v17.2d,v3.2d,v4.d[0]; ldr x0,[%[b3_],#56]\n\t"
"fmla v18.2d,v2.2d,v4.d[1]; add %[b3_],%[b3_],#64\n\t"
"fmla v19.2d,v3.2d,v4.d[1]\n\t"
"ldr d1,[%[sa],#16]; fmov v7.d[1],x0\n\t"
"fmla v20.2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#24]\n\t"
"fmla v21.2d,v3.2d,v5.d[0]\n\t"
"fmla v22.2d,v2.2d,v5.d[1]\n\t"
"ldr d4,[%[b1_]]; fmov v1.d[1],x0\n\t"
"fmla v23.2d,v3.2d,v5.d[1]; ldr x0,[%[b1_],#8]\n\t"
"fmla v24.2d,v2.2d,v6.d[0]\n\t"
"fmla v25.2d,v3.2d,v6.d[0]\n\t"
"ldr d5,[%[b1_],#16]; fmov v4.d[1],x0\n\t"
"fmla v26.2d,v2.2d,v6.d[1]; ldr x0,[%[b1_],#24]\n\t"
"fmla v27.2d,v3.2d,v6.d[1]; sub %[K],%[K],#2\n\t"
"fmla v28.2d,v2.2d,v7.d[0]\n\t"
"ldr d6,[%[b2_]]; fmov v5.d[1],x0\n\t"
"fmla v29.2d,v3.2d,v7.d[0]; ldr x0,[%[b2_],#8]\n\t"
"fmla v30.2d,v2.2d,v7.d[1]; cmp %[K],#2\n\t"
"fmla v31.2d,v3.2d,v7.d[1]\n\t"
"bgt 1b; blt 3f\n\t"
"2:\n\t"
/** tail part with k = 2 */
"ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t"
"fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t"
"fmla v9.2d,v1.2d,v4.d[0]; prfm pldl1keep,[%[sa],#128]\n\t"
"fmla v10.2d,v0.2d,v4.d[1]\n\t"
"ldr d2,[%[sa],#32]; fmov v7.d[1],x0\n\t"
"fmla v11.2d,v1.2d,v4.d[1]; ldr x0,[%[sa],#40]\n\t"
"fmla v12.2d,v0.2d,v5.d[0]\n\t"
"fmla v13.2d,v1.2d,v5.d[0]\n\t"
"ldr d4,[%[b3_]]; fmov v2.d[1],x0\n\t"
"fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t"
"fmla v15.2d,v1.2d,v5.d[1]\n\t"
"fmla v16.2d,v0.2d,v6.d[0]\n\t"
"ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t"
"fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t"
"fmla v18.2d,v0.2d,v6.d[1]\n\t"
"fmla v19.2d,v1.2d,v6.d[1]\n\t"
"ldr d3,[%[sa],#48]; fmov v5.d[1],x0\n\t"
"fmla v20.2d,v0.2d,v7.d[0]; ldr x0,[%[sa],#56]\n\t"
"fmla v21.2d,v1.2d,v7.d[0]; add %[sa],%[sa],#64\n\t"
"fmla v22.2d,v0.2d,v7.d[1]\n\t"
"ldr d6,[%[b1_],#32]; fmov v3.d[1],x0\n\t"
"fmla v23.2d,v1.2d,v7.d[1]; ldr x0,[%[b1_],#40]\n\t"
"fmla v24.2d,v0.2d,v4.d[0]\n\t"
"fmla v25.2d,v1.2d,v4.d[0]\n\t"
"ldr d7,[%[b1_],#48]; fmov v6.d[1],x0\n\t"
"fmla v26.2d,v0.2d,v4.d[1]; ldr x0,[%[b1_],#56]\n\t"
"fmla v27.2d,v1.2d,v4.d[1]; add %[b1_],%[b1_],#64\n\t"
"fmla v28.2d,v0.2d,v5.d[0]\n\t"
"ldr d4,[%[b2_],#32]; fmov v7.d[1],x0\n\t"
"fmla v29.2d,v1.2d,v5.d[0]; ldr x0,[%[b2_],#40]\n\t"
"fmla v30.2d,v0.2d,v5.d[1]\n\t"
"fmla v31.2d,v1.2d,v5.d[1]\n\t"
"fmov v4.d[1],x0\n\t"
"fmla v8.2d,v2.2d,v6.d[0]\n\t"
"fmla v9.2d,v3.2d,v6.d[0]\n\t"
"fmla v10.2d,v2.2d,v6.d[1]\n\t"
"ldr d5,[%[b2_],#48]\n\t"
"fmla v11.2d,v3.2d,v6.d[1]; ldr x0,[%[b2_],#56]\n\t"
"fmla v12.2d,v2.2d,v7.d[0]; add %[b2_],%[b2_],#64\n\t"
"fmla v13.2d,v3.2d,v7.d[0]\n\t"
"ldr d6,[%[b3_],#32]; fmov v5.d[1],x0\n\t"
"fmla v14.2d,v2.2d,v7.d[1]; ldr x0,[%[b3_],#40]\n\t"
"fmla v15.2d,v3.2d,v7.d[1]\n\t"
"fmla v16.2d,v2.2d,v4.d[0]\n\t"
"ldr d7,[%[b3_],#48]; fmov v6.d[1],x0\n\t"
"fmla v17.2d,v3.2d,v4.d[0]; ldr x0,[%[b3_],#56]\n\t"
"fmla v18.2d,v2.2d,v4.d[1]; add %[b3_],%[b3_],#64\n\t"
"fmla v19.2d,v3.2d,v4.d[1]\n\t"
"fmov v7.d[1],x0\n\t"
"fmla v20.2d,v2.2d,v5.d[0]\n\t"
"fmla v21.2d,v3.2d,v5.d[0]\n\t"
"fmla v22.2d,v2.2d,v5.d[1]\n\t"
"fmla v23.2d,v3.2d,v5.d[1]\n\t"
"fmla v24.2d,v2.2d,v6.d[0]\n\t"
"fmla v25.2d,v3.2d,v6.d[0]\n\t"
"fmla v26.2d,v2.2d,v6.d[1]\n\t"
"fmla v27.2d,v3.2d,v6.d[1]; sub %[K],%[K],#2\n\t"
"fmla v28.2d,v2.2d,v7.d[0]\n\t"
"fmla v29.2d,v3.2d,v7.d[0]\n\t"
"fmla v30.2d,v2.2d,v7.d[1]\n\t"
"fmla v31.2d,v3.2d,v7.d[1]\n\t"
"b 4f\n\t"
"3:\n\t"
/** tail part with k = 1 */
"ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t"
"fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t"
"fmla v9.2d,v1.2d,v4.d[0]; add %[b2_],%[b2_],#32\n\t"
"fmla v10.2d,v0.2d,v4.d[1]\n\t"
"fmov v7.d[1],x0\n\t"
"fmla v11.2d,v1.2d,v4.d[1]; add %[sa],%[sa],#32\n\t"
"fmla v12.2d,v0.2d,v5.d[0]; add %[b1_],%[b1_],#32\n\t"
"fmla v13.2d,v1.2d,v5.d[0]; sub %[K],%[K],#1\n\t"
"ldr d4,[%[b3_]]\n\t"
"fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t"
"fmla v15.2d,v1.2d,v5.d[1]\n\t"
"fmla v16.2d,v0.2d,v6.d[0]\n\t"
"ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t"
"fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t"
"fmla v18.2d,v0.2d,v6.d[1]; add %[b3_],%[b3_],#32\n\t"
"fmla v19.2d,v1.2d,v6.d[1]\n\t"
"fmov v5.d[1],x0\n\t"
"fmla v20.2d,v0.2d,v7.d[0]\n\t"
"fmla v21.2d,v1.2d,v7.d[0]\n\t"
"fmla v22.2d,v0.2d,v7.d[1]\n\t"
"fmla v23.2d,v1.2d,v7.d[1]\n\t"
"fmla v24.2d,v0.2d,v4.d[0]\n\t"
"fmla v25.2d,v1.2d,v4.d[0]\n\t"
"fmla v26.2d,v0.2d,v4.d[1]\n\t"
"fmla v27.2d,v1.2d,v4.d[1]\n\t"
"fmla v28.2d,v0.2d,v5.d[0]\n\t"
"fmla v29.2d,v1.2d,v5.d[0]\n\t"
"fmla v30.2d,v0.2d,v5.d[1]\n\t"
"fmla v31.2d,v1.2d,v5.d[1]\n\t"
/** store 4x12 elements to C */
"4:\n\t"
"ldr d0,%[alpha]; add x0,%[C],%[LDC],LSL #3\n\t"
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t"
"fmla v1.2d,v8.2d,v0.d[0]; fmla v2.2d,v9.2d,v0.d[0]\n\t"
"fmla v3.2d,v10.2d,v0.d[0]; fmla v4.2d,v11.2d,v0.d[0]\n\t"
"stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t"
"stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t"
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t"
"fmla v1.2d,v12.2d,v0.d[0]; fmla v2.2d,v13.2d,v0.d[0]\n\t"
"fmla v3.2d,v14.2d,v0.d[0]; fmla v4.2d,v15.2d,v0.d[0]\n\t"
"stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t"
"stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t"
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t"
"fmla v1.2d,v16.2d,v0.d[0]; fmla v2.2d,v17.2d,v0.d[0]\n\t"
"fmla v3.2d,v18.2d,v0.d[0]; fmla v4.2d,v19.2d,v0.d[0]\n\t"
"stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t"
"stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t"
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t"
"fmla v1.2d,v20.2d,v0.d[0]; fmla v2.2d,v21.2d,v0.d[0]\n\t"
"fmla v3.2d,v22.2d,v0.d[0]; fmla v4.2d,v23.2d,v0.d[0]\n\t"
"stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t"
"stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t"
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t"
"fmla v1.2d,v24.2d,v0.d[0]; fmla v2.2d,v25.2d,v0.d[0]\n\t"
"fmla v3.2d,v26.2d,v0.d[0]; fmla v4.2d,v27.2d,v0.d[0]\n\t"
"stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t"
"stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t"
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t"
"fmla v1.2d,v28.2d,v0.d[0]; fmla v2.2d,v29.2d,v0.d[0]\n\t"
"fmla v3.2d,v30.2d,v0.d[0]; fmla v4.2d,v31.2d,v0.d[0]\n\t"
"stp q1,q2,[%[C]]; stp q3,q4,[x0]\n\t"
:[sa]"+r"(sa), [b1_]"+r"(b1_), [b2_]"+r"(b2_), [b3_]"+r"(b3_), [C]"+r"(C), [K]"+r"(K)
:[LDC]"r"(LDC), [alpha]"m"(alpha)
:"cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
/**********************************************************
* Operation:
C[0] += alpha * up[0]; C[1] += alpha * up[1];
C[2] += alpha * down[0]; C[3] += alpha * down[1];
*********************************************************/
static inline void dgemm_store_m4n1(FLOAT *C, float64x2_t up, float64x2_t down, FLOAT alpha) {
float64x2_t t1 = vld1q_f64(C), t2 = vld1q_f64(C + 2);
t1 = vfmaq_n_f64(t1, up, alpha);
t2 = vfmaq_n_f64(t2, down, alpha);
vst1q_f64(C, t1);
vst1q_f64(C + 2, t2);
}
/**********************************************************
* Function: dgemm_kernel_arm64_4x4_m4n8
* Operation: C[4][8] += alpha * sa[4][K] * sb[K][8]
* Matrix orders:
* sa: column-major (leading dimension == 4)
* sb: 2 concatenated row-major 4-column submatrices
* C: column-major (leading dimension == LDC)
*********************************************************/
static inline void dgemm_kernel_arm64_4x4_m4n8(
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
const FLOAT *b1_ = sb;
const FLOAT *b2_ = sb + K * 4;
/** register naming: c + m_id + n_id, m_id=1~2, n_id=1~8 */
float64x2_t c11, c12, c13, c14, c15, c16, c17, c18;
float64x2_t c21, c22, c23, c24, c25, c26, c27, c28;
c11 = c12 = c13 = c14 = c15 = c16 = c17 = c18 = vdupq_n_f64(0);
c21 = c22 = c23 = c24 = c25 = c26 = c27 = c28 = vdupq_n_f64(0);
for (; K; K--) {
float64x2_t a1 = vld1q_f64(sa);
float64x2_t a2 = vld1q_f64(sa + 2); sa += 4;
float64x2_t b1 = vld1q_f64(b1_);
c11 = vfmaq_laneq_f64(c11, a1, b1, 0);
c21 = vfmaq_laneq_f64(c21, a2, b1, 0);
c12 = vfmaq_laneq_f64(c12, a1, b1, 1);
c22 = vfmaq_laneq_f64(c22, a2, b1, 1);
float64x2_t b2 = vld1q_f64(b1_ + 2); b1_ += 4;
c13 = vfmaq_laneq_f64(c13, a1, b2, 0);
c23 = vfmaq_laneq_f64(c23, a2, b2, 0);
c14 = vfmaq_laneq_f64(c14, a1, b2, 1);
c24 = vfmaq_laneq_f64(c24, a2, b2, 1);
float64x2_t b3 = vld1q_f64(b2_);
c15 = vfmaq_laneq_f64(c15, a1, b3, 0);
c25 = vfmaq_laneq_f64(c25, a2, b3, 0);
c16 = vfmaq_laneq_f64(c16, a1, b3, 1);
c26 = vfmaq_laneq_f64(c26, a2, b3, 1);
float64x2_t b4 = vld1q_f64(b2_ + 2); b2_ += 4;
c17 = vfmaq_laneq_f64(c17, a1, b4, 0);
c27 = vfmaq_laneq_f64(c27, a2, b4, 0);
c18 = vfmaq_laneq_f64(c18, a1, b4, 1);
c28 = vfmaq_laneq_f64(c28, a2, b4, 1);
}
dgemm_store_m4n1(C, c11, c21, alpha); C += LDC;
dgemm_store_m4n1(C, c12, c22, alpha); C += LDC;
dgemm_store_m4n1(C, c13, c23, alpha); C += LDC;
dgemm_store_m4n1(C, c14, c24, alpha); C += LDC;
dgemm_store_m4n1(C, c15, c25, alpha); C += LDC;
dgemm_store_m4n1(C, c16, c26, alpha); C += LDC;
dgemm_store_m4n1(C, c17, c27, alpha); C += LDC;
dgemm_store_m4n1(C, c18, c28, alpha);
}
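/**********************************************************
 * Editor's illustrative sketch (not used by the kernel):
 * a plain scalar version of the m4n8 operation above,
 * assuming the packed layouts from the header comment
 * (sa column-major with leading dimension 4, sb stored as
 * two concatenated K-by-4 row-major panels).
 *********************************************************/
static inline void dgemm_ref_m4n8(
    const FLOAT *sa, const FLOAT *sb, FLOAT *C,
    BLASLONG K, BLASLONG LDC, FLOAT alpha) {
  for (BLASLONG n = 0; n < 8; n++) {
    const FLOAT *b_ = (n < 4) ? sb : sb + 4 * K; /* pick the 4-column panel */
    BLASLONG col = n & 3;                        /* column within the panel */
    for (BLASLONG m = 0; m < 4; m++) {
      FLOAT acc = 0;
      for (BLASLONG k = 0; k < K; k++)
        acc += sa[k * 4 + m] * b_[k * 4 + col];
      C[n * LDC + m] += alpha * acc;
    }
  }
}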
/**********************************************************
* Function: dgemm_kernel_arm64_4x4_m4n4
* Operation: C[4][4] += alpha * sa[4][K] * sb[K][4]
* Matrix orders:
* sa: column-major (leading dimension == 4)
* sb: row-major (leading dimension == 4)
* C: column-major (leading dimension == LDC)
*********************************************************/
static inline void dgemm_kernel_arm64_4x4_m4n4(
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c11, c21, c12, c22, c13, c23, c14, c24;
c11 = c21 = c12 = c22 = c13 = c23 = c14 = c24 = vdupq_n_f64(0);
for (; K; K--) {
float64x2_t a1 = vld1q_f64(sa);
float64x2_t a2 = vld1q_f64(sa + 2); sa += 4;
float64x2_t b1 = vld1q_f64(sb);
float64x2_t b2 = vld1q_f64(sb + 2); sb += 4;
c11 = vfmaq_laneq_f64(c11, a1, b1, 0);
c21 = vfmaq_laneq_f64(c21, a2, b1, 0);
c12 = vfmaq_laneq_f64(c12, a1, b1, 1);
c22 = vfmaq_laneq_f64(c22, a2, b1, 1);
c13 = vfmaq_laneq_f64(c13, a1, b2, 0);
c23 = vfmaq_laneq_f64(c23, a2, b2, 0);
c14 = vfmaq_laneq_f64(c14, a1, b2, 1);
c24 = vfmaq_laneq_f64(c24, a2, b2, 1);
}
dgemm_store_m4n1(C, c11, c21, alpha); C += LDC;
dgemm_store_m4n1(C, c12, c22, alpha); C += LDC;
dgemm_store_m4n1(C, c13, c23, alpha); C += LDC;
dgemm_store_m4n1(C, c14, c24, alpha);
}
static inline void dgemm_kernel_arm64_4x4_m4n2(
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c11_1, c11_2, c21_1, c21_2, c12_1, c12_2, c22_1, c22_2;
c11_1 = c11_2 = c21_1 = c21_2 = c12_1 = c12_2 = c22_1 = c22_2 = vdupq_n_f64(0);
for (; K > 1; K -= 2) {
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4;
float64x2_t a1_1 = vld1q_f64(sa), a2_1 = vld1q_f64(sa + 2),
a1_2 = vld1q_f64(sa + 4), a2_2 = vld1q_f64(sa + 6); sa += 8;
c11_1 = vfmaq_laneq_f64(c11_1, a1_1, b1, 0);
c21_1 = vfmaq_laneq_f64(c21_1, a2_1, b1, 0);
c12_1 = vfmaq_laneq_f64(c12_1, a1_1, b1, 1);
c22_1 = vfmaq_laneq_f64(c22_1, a2_1, b1, 1);
c11_2 = vfmaq_laneq_f64(c11_2, a1_2, b2, 0);
c21_2 = vfmaq_laneq_f64(c21_2, a2_2, b2, 0);
c12_2 = vfmaq_laneq_f64(c12_2, a1_2, b2, 1);
c22_2 = vfmaq_laneq_f64(c22_2, a2_2, b2, 1);
}
c11_1 = vaddq_f64(c11_1, c11_2);
c21_1 = vaddq_f64(c21_1, c21_2);
c12_1 = vaddq_f64(c12_1, c12_2);
c22_1 = vaddq_f64(c22_1, c22_2);
if (K) {
float64x2_t b1 = vld1q_f64(sb); sb += 2;
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4;
c11_1 = vfmaq_laneq_f64(c11_1, a1, b1, 0);
c21_1 = vfmaq_laneq_f64(c21_1, a2, b1, 0);
c12_1 = vfmaq_laneq_f64(c12_1, a1, b1, 1);
c22_1 = vfmaq_laneq_f64(c22_1, a2, b1, 1);
}
dgemm_store_m4n1(C, c11_1, c21_1, alpha); C += LDC;
dgemm_store_m4n1(C, c12_1, c22_1, alpha);
}
static inline void dgemm_kernel_arm64_4x4_m4n1(
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c11_1, c11_2, c21_1, c21_2;
c11_1 = c11_2 = c21_1 = c21_2 = vdupq_n_f64(0);
for (; K > 1; K -= 2) {
float64x2_t b1 = vld1q_f64(sb); sb += 2;
c11_1 = vfmaq_laneq_f64(c11_1, vld1q_f64(sa), b1, 0);
c21_1 = vfmaq_laneq_f64(c21_1, vld1q_f64(sa + 2), b1, 0);
c11_2 = vfmaq_laneq_f64(c11_2, vld1q_f64(sa + 4), b1, 1);
c21_2 = vfmaq_laneq_f64(c21_2, vld1q_f64(sa + 6), b1, 1);
sa += 8;
}
c11_1 = vaddq_f64(c11_1, c11_2);
c21_1 = vaddq_f64(c21_1, c21_2);
if (K) {
double b1 = *sb++;
c11_1 = vfmaq_n_f64(c11_1, vld1q_f64(sa), b1);
c21_1 = vfmaq_n_f64(c21_1, vld1q_f64(sa + 2), b1);
sa += 4;
}
dgemm_store_m4n1(C, c11_1, c21_1, alpha);
}
static inline void dgemm_kernel_arm64_4x4_m2n12(
const FLOAT *sa, const FLOAT *sb, FLOAT *c,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c01, c02, c03, c04, c11, c12, c13, c14, c21, c22, c23, c24;
c01 = c02 = c03 = c04 = c11 = c12 = c13 = c14 =
c21 = c22 = c23 = c24 = vdupq_n_f64(0);
const FLOAT *b1_ = sb;
const FLOAT *b2_ = sb + 4 * K;
const FLOAT *b3_ = b2_ + 4 * K;
for (; K; K--) {
const float64x2_t a1 = vld1q_f64(sa); sa += 2;
float64x2_t b1 = vld1q_f64(b1_), b2 = vld1q_f64(b1_ + 2); b1_ += 4;
c01 = vfmaq_laneq_f64(c01, a1, b1, 0);
c02 = vfmaq_laneq_f64(c02, a1, b1, 1);
c03 = vfmaq_laneq_f64(c03, a1, b2, 0);
c04 = vfmaq_laneq_f64(c04, a1, b2, 1);
b1 = vld1q_f64(b2_); b2 = vld1q_f64(b2_ + 2); b2_ += 4;
c11 = vfmaq_laneq_f64(c11, a1, b1, 0);
c12 = vfmaq_laneq_f64(c12, a1, b1, 1);
c13 = vfmaq_laneq_f64(c13, a1, b2, 0);
c14 = vfmaq_laneq_f64(c14, a1, b2, 1);
b1 = vld1q_f64(b3_); b2 = vld1q_f64(b3_ + 2); b3_ += 4;
c21 = vfmaq_laneq_f64(c21, a1, b1, 0);
c22 = vfmaq_laneq_f64(c22, a1, b1, 1);
c23 = vfmaq_laneq_f64(c23, a1, b2, 0);
c24 = vfmaq_laneq_f64(c24, a1, b2, 1);
}
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c01, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c02, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c03, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c04, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c11, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c12, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c13, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c14, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c21, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c22, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c23, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c24, alpha));
}
static inline void dgemm_kernel_arm64_4x4_m2n8(
const FLOAT *sa, const FLOAT *sb, FLOAT *c,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c01, c02, c03, c04, c11, c12, c13, c14;
c01 = c02 = c03 = c04 = c11 = c12 = c13 = c14 = vdupq_n_f64(0);
const FLOAT *b1_ = sb;
const FLOAT *b2_ = sb + 4 * K;
for (; K; K--) {
const float64x2_t a1 = vld1q_f64(sa); sa += 2;
float64x2_t b1 = vld1q_f64(b1_), b2 = vld1q_f64(b1_ + 2); b1_ += 4;
c01 = vfmaq_laneq_f64(c01, a1, b1, 0);
c02 = vfmaq_laneq_f64(c02, a1, b1, 1);
c03 = vfmaq_laneq_f64(c03, a1, b2, 0);
c04 = vfmaq_laneq_f64(c04, a1, b2, 1);
b1 = vld1q_f64(b2_); b2 = vld1q_f64(b2_ + 2); b2_ += 4;
c11 = vfmaq_laneq_f64(c11, a1, b1, 0);
c12 = vfmaq_laneq_f64(c12, a1, b1, 1);
c13 = vfmaq_laneq_f64(c13, a1, b2, 0);
c14 = vfmaq_laneq_f64(c14, a1, b2, 1);
}
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c01, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c02, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c03, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c04, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c11, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c12, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c13, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c14, alpha));
}
static inline void dgemm_kernel_arm64_4x4_m2n4(
const FLOAT *sa, const FLOAT *sb, FLOAT *c,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c1_1, c1_2, c2_1, c2_2, c3_1, c3_2, c4_1, c4_2;
c1_1 = c1_2 = c2_1 = c2_2 = c3_1 = c3_2 = c4_1 = c4_2 = vdupq_n_f64(0);
for (; K > 1; K -= 2) {
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4;
float64x2_t b1_1 = vld1q_f64(sb), b2_1 = vld1q_f64(sb + 2);
float64x2_t b1_2 = vld1q_f64(sb + 4), b2_2 = vld1q_f64(sb + 6); sb += 8;
c1_1 = vfmaq_laneq_f64(c1_1, a1, b1_1, 0);
c2_1 = vfmaq_laneq_f64(c2_1, a1, b1_1, 1);
c3_1 = vfmaq_laneq_f64(c3_1, a1, b2_1, 0);
c4_1 = vfmaq_laneq_f64(c4_1, a1, b2_1, 1);
c1_2 = vfmaq_laneq_f64(c1_2, a2, b1_2, 0);
c2_2 = vfmaq_laneq_f64(c2_2, a2, b1_2, 1);
c3_2 = vfmaq_laneq_f64(c3_2, a2, b2_2, 0);
c4_2 = vfmaq_laneq_f64(c4_2, a2, b2_2, 1);
}
c1_1 = vaddq_f64(c1_1, c1_2);
c2_1 = vaddq_f64(c2_1, c2_2);
c3_1 = vaddq_f64(c3_1, c3_2);
c4_1 = vaddq_f64(c4_1, c4_2);
if (K) {
float64x2_t a1 = vld1q_f64(sa); sa += 2;
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4;
c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0);
c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1);
c3_1 = vfmaq_laneq_f64(c3_1, a1, b2, 0);
c4_1 = vfmaq_laneq_f64(c4_1, a1, b2, 1);
}
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1_1, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c2_1, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c3_1, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c4_1, alpha));
}
static inline void dgemm_kernel_arm64_4x4_m2n2(
const FLOAT *sa, const FLOAT *sb, FLOAT *c,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c1_1, c1_2, c2_1, c2_2;
c1_1 = c1_2 = c2_1 = c2_2 = vdupq_n_f64(0);
for (; K > 1; K -= 2) {
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4;
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4;
c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0);
c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1);
c1_2 = vfmaq_laneq_f64(c1_2, a2, b2, 0);
c2_2 = vfmaq_laneq_f64(c2_2, a2, b2, 1);
}
c1_1 = vaddq_f64(c1_1, c1_2);
c2_1 = vaddq_f64(c2_1, c2_2);
if (K) {
float64x2_t a1 = vld1q_f64(sa); sa += 2;
float64x2_t b1 = vld1q_f64(sb); sb += 2;
c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0);
c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1);
}
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1_1, alpha)); c += LDC;
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c2_1, alpha));
}
static inline void dgemm_kernel_arm64_4x4_m2n1(
const FLOAT *sa, const FLOAT *sb, FLOAT *c,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = vdupq_n_f64(0);
for (; K > 3; K -= 4) {
float64x2_t b12 = vld1q_f64(sb), b34 = vld1q_f64(sb + 2); sb += 4;
c1 = vfmaq_laneq_f64(c1, vld1q_f64(sa), b12, 0);
c2 = vfmaq_laneq_f64(c2, vld1q_f64(sa + 2), b12, 1);
c3 = vfmaq_laneq_f64(c3, vld1q_f64(sa + 4), b34, 0);
c4 = vfmaq_laneq_f64(c4, vld1q_f64(sa + 6), b34, 1);
sa += 8;
}
c1 = vaddq_f64(c1, c2);
c3 = vaddq_f64(c3, c4);
c1 = vaddq_f64(c1, c3);
for (; K; K--) {
c1 = vfmaq_n_f64(c1, vld1q_f64(sa), *sb++);
sa += 2;
}
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1, alpha));
}
static inline void dgemm_store_m1n2(double *C, float64x2_t vc,
double alpha, BLASLONG LDC) {
double c0 = vgetq_lane_f64(vc, 0);
double c1 = vgetq_lane_f64(vc, 1);
C[0] += c0 * alpha;
C[LDC] += c1 * alpha;
}
static inline void dgemm_kernel_arm64_4x4_m1n12(
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c1, c2, c3, c4, c5, c6;
c1 = c2 = c3 = c4 = c5 = c6 = vdupq_n_f64(0);
const double *b1_ = sb;
const double *b2_ = sb + 4 * K;
const double *b3_ = b2_ + 4 * K;
for (; K; K--) {
const double a1 = *sa++;
c1 = vfmaq_n_f64(c1, vld1q_f64(b1_), a1);
c2 = vfmaq_n_f64(c2, vld1q_f64(b1_ + 2), a1); b1_ += 4;
c3 = vfmaq_n_f64(c3, vld1q_f64(b2_), a1);
c4 = vfmaq_n_f64(c4, vld1q_f64(b2_ + 2), a1); b2_ += 4;
c5 = vfmaq_n_f64(c5, vld1q_f64(b3_), a1);
c6 = vfmaq_n_f64(c6, vld1q_f64(b3_ + 2), a1); b3_ += 4;
}
dgemm_store_m1n2(C, c1, alpha, LDC); C += LDC * 2;
dgemm_store_m1n2(C, c2, alpha, LDC); C += LDC * 2;
dgemm_store_m1n2(C, c3, alpha, LDC); C += LDC * 2;
dgemm_store_m1n2(C, c4, alpha, LDC); C += LDC * 2;
dgemm_store_m1n2(C, c5, alpha, LDC); C += LDC * 2;
dgemm_store_m1n2(C, c6, alpha, LDC);
}
static inline void dgemm_kernel_arm64_4x4_m1n8(
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = vdupq_n_f64(0);
const double *b1_ = sb;
const double *b2_ = sb + 4 * K;
for (; K; K--) {
const double a1 = *sa++;
c1 = vfmaq_n_f64(c1, vld1q_f64(b1_), a1);
c2 = vfmaq_n_f64(c2, vld1q_f64(b1_ + 2), a1); b1_ += 4;
c3 = vfmaq_n_f64(c3, vld1q_f64(b2_), a1);
c4 = vfmaq_n_f64(c4, vld1q_f64(b2_ + 2), a1); b2_ += 4;
}
dgemm_store_m1n2(C, c1, alpha, LDC); C += LDC * 2;
dgemm_store_m1n2(C, c2, alpha, LDC); C += LDC * 2;
dgemm_store_m1n2(C, c3, alpha, LDC); C += LDC * 2;
dgemm_store_m1n2(C, c4, alpha, LDC);
}
static inline void dgemm_kernel_arm64_4x4_m1n4(
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c1_1, c1_2, c2_1, c2_2;
c1_1 = c1_2 = c2_1 = c2_2 = vdupq_n_f64(0);
for (; K > 1; K -= 2) {
float64x2_t a1 = vld1q_f64(sa); sa += 2;
c1_1 = vfmaq_laneq_f64(c1_1, vld1q_f64(sb), a1, 0);
c2_1 = vfmaq_laneq_f64(c2_1, vld1q_f64(sb + 2), a1, 0);
c1_2 = vfmaq_laneq_f64(c1_2, vld1q_f64(sb + 4), a1, 1);
c2_2 = vfmaq_laneq_f64(c2_2, vld1q_f64(sb + 6), a1, 1); sb += 8;
}
c1_1 = vaddq_f64(c1_1, c1_2);
c2_1 = vaddq_f64(c2_1, c2_2);
if (K) {
double a1 = *sa++;
c1_1 = vfmaq_n_f64(c1_1, vld1q_f64(sb), a1);
c2_1 = vfmaq_n_f64(c2_1, vld1q_f64(sb + 2), a1);
sb += 4;
}
dgemm_store_m1n2(C, c1_1, alpha, LDC); C += LDC * 2;
dgemm_store_m1n2(C, c2_1, alpha, LDC);
}
static inline void dgemm_kernel_arm64_4x4_m1n2(
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = vdupq_n_f64(0);
for (; K > 3; K -= 4) {
float64x2_t a12 = vld1q_f64(sa), a34 = vld1q_f64(sa + 2); sa += 4;
c1 = vfmaq_laneq_f64(c1, vld1q_f64(sb), a12, 0);
c2 = vfmaq_laneq_f64(c2, vld1q_f64(sb + 2), a12, 1);
c3 = vfmaq_laneq_f64(c3, vld1q_f64(sb + 4), a34, 0);
c4 = vfmaq_laneq_f64(c4, vld1q_f64(sb + 6), a34, 1); sb += 8;
}
c1 = vaddq_f64(c1, c2);
c3 = vaddq_f64(c3, c4);
c1 = vaddq_f64(c1, c3);
for (; K; K--) {
c1 = vfmaq_n_f64(c1, vld1q_f64(sb), *sa++);
sb += 2;
}
dgemm_store_m1n2(C, c1, alpha, LDC);
}
static inline void dgemm_kernel_arm64_4x4_m1n1(
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
float64x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = vdupq_n_f64(0);
for (; K > 7; K -= 8) {
c1 = vfmaq_f64(c1, vld1q_f64(sb), vld1q_f64(sa));
c2 = vfmaq_f64(c2, vld1q_f64(sb + 2), vld1q_f64(sa + 2));
c3 = vfmaq_f64(c3, vld1q_f64(sb + 4), vld1q_f64(sa + 4));
c4 = vfmaq_f64(c4, vld1q_f64(sb + 6), vld1q_f64(sa + 6));
sa += 8; sb += 8;
}
c1 = vaddq_f64(c1, c2);
c3 = vaddq_f64(c3, c4);
c1 = vaddq_f64(c1, c3);
double cs1 = vpaddd_f64(c1);
for (; K; K--) {
cs1 += (*sa++) * (*sb++);
}
C[0] += cs1 * alpha;
}
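/**********************************************************
 * Function: CNAME (driver)
 * Operation: C[M][N] += alpha * sa[M][K] * sb[K][N]
 * N is consumed in blocks of 12, then 8 or 4, then 2, then 1;
 * within each block, M is covered by the 4-, 2- and 1-row
 * micro-kernels defined above.
 *********************************************************/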
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha,
FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) {
for (; N >= 12; N -= 12) {
BLASLONG m_left = M;
const FLOAT *a_ = sa;
FLOAT *c_ = C;
for (; m_left >= 4; m_left -= 4) {
dgemm_kernel_arm_cortex_a53_4x4_m4n12(a_, sb, c_, K, LDC, alpha);
c_ += 4;
a_ += 4 * K;
}
if (m_left >= 2) {
m_left -= 2;
dgemm_kernel_arm64_4x4_m2n12(a_, sb, c_, K, LDC, alpha);
c_ += 2;
a_ += 2 * K;
}
if (m_left) {
dgemm_kernel_arm64_4x4_m1n12(a_, sb, c_, K, LDC, alpha);
}
sb += 12 * K;
C += 12 * LDC;
}
if (N >= 8) {
N -= 8;
BLASLONG m_left = M;
const FLOAT *a_ = sa;
FLOAT *c_ = C;
for (; m_left >= 4; m_left -= 4) {
dgemm_kernel_arm64_4x4_m4n8(a_, sb, c_, K, LDC, alpha);
c_ += 4;
a_ += 4 * K;
}
if (m_left >= 2) {
m_left -= 2;
dgemm_kernel_arm64_4x4_m2n8(a_, sb, c_, K, LDC, alpha);
c_ += 2;
a_ += 2 * K;
}
if (m_left) {
dgemm_kernel_arm64_4x4_m1n8(a_, sb, c_, K, LDC, alpha);
}
sb += 8 * K;
C += 8 * LDC;
} else if (N >= 4) {
N -= 4;
BLASLONG m_left = M;
const FLOAT *a_ = sa;
FLOAT *c_ = C;
for (; m_left >= 4; m_left -= 4) {
dgemm_kernel_arm64_4x4_m4n4(a_, sb, c_, K, LDC, alpha);
c_ += 4;
a_ += 4 * K;
}
if (m_left >= 2) {
m_left -= 2;
dgemm_kernel_arm64_4x4_m2n4(a_, sb, c_, K, LDC, alpha);
c_ += 2;
a_ += 2 * K;
}
if (m_left) {
dgemm_kernel_arm64_4x4_m1n4(a_, sb, c_, K, LDC, alpha);
}
sb += 4 * K;
C += 4 * LDC;
}
if (N >= 2) {
N -= 2;
BLASLONG m_left = M;
const FLOAT *a_ = sa;
FLOAT *c_ = C;
for (; m_left >= 4; m_left -= 4) {
dgemm_kernel_arm64_4x4_m4n2(a_, sb, c_, K, LDC, alpha);
c_ += 4;
a_ += 4 * K;
}
if (m_left >= 2) {
m_left -= 2;
dgemm_kernel_arm64_4x4_m2n2(a_, sb, c_, K, LDC, alpha);
c_ += 2;
a_ += 2 * K;
}
if (m_left) {
dgemm_kernel_arm64_4x4_m1n2(a_, sb, c_, K, LDC, alpha);
}
sb += 2 * K;
C += 2 * LDC;
}
if (N) {
BLASLONG m_left = M;
const FLOAT *a_ = sa;
FLOAT *c_ = C;
for (; m_left >= 4; m_left -= 4) {
dgemm_kernel_arm64_4x4_m4n1(a_, sb, c_, K, LDC, alpha);
c_ += 4;
a_ += 4 * K;
}
if (m_left >= 2) {
m_left -= 2;
dgemm_kernel_arm64_4x4_m2n1(a_, sb, c_, K, LDC, alpha);
c_ += 2;
a_ += 2 * K;
}
if (m_left) {
dgemm_kernel_arm64_4x4_m1n1(a_, sb, c_, K, LDC, alpha);
}
}
return 0;
}

View File

@ -0,0 +1,874 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
/* X0 X1 X2 d0 X3 x4 x5 x6 */
/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/
#define origM x0
#define origN x1
#define origK x2
#define origPA x3
#define origPB x4
#define pC x5
#define LDC x6
#define temp x7
#define counterL x8
#define counterI x9
#define counterJ x10
#define pB x11
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define lanes x15
#define pA x16
#define alpha x17
#define alpha0 d10
#define alphaZ z2.d
#define A_PRE_SIZE 1536
#define B_PRE_SIZE 512
#define C_PRE_SIZE 128
// 00 origM
// 01 origN
// 02 origK
// 03 origPA
// 04 origPB
// 05 pC
// 06 origLDC -> LDC
// 07 temp
// 08 counterL
// 09 counterI
// 10 counterJ
// 11 pB
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 lanes
// 16 pA
// 17
// 18 must save
// 19 must save
// 20 must save
// 21 must save
// 22 must save
// 23 must save
// 24 must save
// 25 must save
// 26 must save
// 27 must save
// 28 must save
// 29 frame
// 30 link
// 31 sp
//v00 ALPHA -> pA0_0
//v01 pA0_1
//v02 ALPHA0
//v03
//v04
//v05
//v06
//v07
//v08 must save pB0_0
//v09 must save pB0_1
//v10 must save pB0_2
//v11 must save pB0_3
//v12 must save pB0_4
//v13 must save pB0_5
//v14 must save pB0_6
//v15 must save pB0_7
//v16 must save C0
//v17 must save C1
//v18 must save C2
//v19 must save C3
//v20 must save C4
//v21 must save C5
//v22 must save C6
//v23 must save C7
/*******************************************************************************
* Macro definitions
*******************************************************************************/
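/* The v1x8 micro-kernel is software pipelined: KERNELv1x8_I preloads A and B
   and starts the first FMAs, KERNELv1x8_M1/_M2 alternate between the two A
   registers (z0/z1) while reloading B and prefetching, and KERNELv1x8_E drains
   the remaining accumulations without issuing further loads. */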
.macro INITv1x8
dup z16.d, #0
dup z17.d, #0
dup z18.d, #0
dup z19.d, #0
dup z20.d, #0
dup z21.d, #0
dup z22.d, #0
dup z23.d, #0
.endm
.macro KERNELv1x8_I
ld1d z0.d, p1/z, [pA]
ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one
add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8
ld1rd z8.d, p0/z, [pB]
ld1rd z9.d, p0/z, [pB, 8]
ld1rd z10.d, p0/z, [pB, 16]
ld1rd z11.d, p0/z, [pB, 24]
ld1rd z12.d, p0/z, [pB, 32]
ld1rd z13.d, p0/z, [pB, 40]
ld1rd z14.d, p0/z, [pB, 48]
ld1rd z15.d, p0/z, [pB, 56]
add pB, pB, 64
fmla z16.d, p1/m, z0.d, z8.d
ld1rd z8.d, p0/z, [pB]
fmla z17.d, p1/m, z0.d, z9.d
ld1rd z9.d, p0/z, [pB, 8]
fmla z18.d, p1/m, z0.d, z10.d
ld1rd z10.d, p0/z, [pB, 16]
fmla z19.d, p1/m, z0.d, z11.d
ld1rd z11.d, p0/z, [pB, 24]
fmla z20.d, p1/m, z0.d, z12.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
ld1rd z12.d, p0/z, [pB, 32]
fmla z21.d, p1/m, z0.d, z13.d
ld1rd z13.d, p0/z, [pB, 40]
fmla z22.d, p1/m, z0.d, z14.d
ld1rd z14.d, p0/z, [pB, 48]
fmla z23.d, p1/m, z0.d, z15.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
ld1rd z15.d, p0/z, [pB, 56]
add pB, pB, 64
.endm
.macro KERNELv1x8_M1
ld1d z1.d, p1/z, [pA]
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
fmla z16.d, p1/m, z0.d, z8.d
ld1rd z8.d, p0/z, [pB]
fmla z17.d, p1/m, z0.d, z9.d
ld1rd z9.d, p0/z, [pB, 8]
fmla z18.d, p1/m, z0.d, z10.d
ld1rd z10.d, p0/z, [pB, 16]
fmla z19.d, p1/m, z0.d, z11.d
ld1rd z11.d, p0/z, [pB, 24]
fmla z20.d, p1/m, z0.d, z12.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
ld1rd z12.d, p0/z, [pB, 32]
fmla z21.d, p1/m, z0.d, z13.d
ld1rd z13.d, p0/z, [pB, 40]
fmla z22.d, p1/m, z0.d, z14.d
ld1rd z14.d, p0/z, [pB, 48]
fmla z23.d, p1/m, z0.d, z15.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
ld1rd z15.d, p0/z, [pB, 56]
add pB, pB, 64
.endm
.macro KERNELv1x8_M2
ld1d z0.d, p1/z, [pA]
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
fmla z16.d, p1/m, z1.d, z8.d
ld1rd z8.d, p0/z, [pB]
fmla z17.d, p1/m, z1.d, z9.d
ld1rd z9.d, p0/z, [pB, 8]
fmla z18.d, p1/m, z1.d, z10.d
ld1rd z10.d, p0/z, [pB, 16]
fmla z19.d, p1/m, z1.d, z11.d
ld1rd z11.d, p0/z, [pB, 24]
fmla z20.d, p1/m, z1.d, z12.d
ld1rd z12.d, p0/z, [pB, 32]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla z21.d, p1/m, z1.d, z13.d
ld1rd z13.d, p0/z, [pB, 40]
fmla z22.d, p1/m, z1.d, z14.d
ld1rd z14.d, p0/z, [pB, 48]
fmla z23.d, p1/m, z1.d, z15.d
ld1rd z15.d, p0/z, [pB, 56]
add pB, pB, 64
.endm
.macro KERNELv1x8_E
fmla z16.d, p1/m, z1.d, z8.d
fmla z17.d, p1/m, z1.d, z9.d
fmla z18.d, p1/m, z1.d, z10.d
fmla z19.d, p1/m, z1.d, z11.d
fmla z20.d, p1/m, z1.d, z12.d
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla z21.d, p1/m, z1.d, z13.d
fmla z22.d, p1/m, z1.d, z14.d
fmla z23.d, p1/m, z1.d, z15.d
.endm
.macro KERNELv1x8_SUB
ld1d z0.d, p1/z, [pA]
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
ld1rd z8.d, p0/z, [pB]
ld1rd z9.d, p0/z, [pB, 8]
ld1rd z10.d, p0/z, [pB, 16]
ld1rd z11.d, p0/z, [pB, 24]
ld1rd z12.d, p0/z, [pB, 32]
ld1rd z13.d, p0/z, [pB, 40]
ld1rd z14.d, p0/z, [pB, 48]
ld1rd z15.d, p0/z, [pB, 56]
add pB, pB, 64
fmla z16.d, p1/m, z0.d, z8.d
fmla z17.d, p1/m, z0.d, z9.d
fmla z18.d, p1/m, z0.d, z10.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla z19.d, p1/m, z0.d, z11.d
fmla z20.d, p1/m, z0.d, z12.d
fmla z21.d, p1/m, z0.d, z13.d
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla z22.d, p1/m, z0.d, z14.d
fmla z23.d, p1/m, z0.d, z15.d
.endm
.macro SAVEv1x8
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow1, pCRow0, LDC
ld1d z24.d, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaZ
st1d z24.d, p1, [pCRow0]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1d z25.d, p1/z, [pCRow1]
fmla z25.d, p1/m, z17.d, alphaZ
st1d z25.d, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC
ld1d z26.d, p1/z, [pCRow2]
fmla z26.d, p1/m, z18.d, alphaZ
st1d z26.d, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1d z27.d, p1/z, [pCRow1]
fmla z27.d, p1/m, z19.d, alphaZ
st1d z27.d, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC
ld1d z28.d, p1/z, [pCRow2]
fmla z28.d, p1/m, z20.d, alphaZ
st1d z28.d, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1d z29.d, p1/z, [pCRow1]
fmla z29.d, p1/m, z21.d, alphaZ
st1d z29.d, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC
ld1d z30.d, p1/z, [pCRow2]
fmla z30.d, p1/m, z22.d, alphaZ
st1d z30.d, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld1d z31.d, p1/z, [pCRow1]
fmla z31.d, p1/m, z23.d, alphaZ
st1d z31.d, p1, [pCRow1]
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
.endm
/******************************************************************************/
.macro INITv1x4
dup z16.d, #0
dup z17.d, #0
dup z18.d, #0
dup z19.d, #0
.endm
.macro KERNELv1x4_SUB
ld1d z0.d, p1/z, [pA]
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
ld1rd z8.d, p0/z, [pB]
ld1rd z9.d, p0/z, [pB, 8]
ld1rd z10.d, p0/z, [pB, 16]
ld1rd z11.d, p0/z, [pB, 24]
add pB, pB, 32
fmla z16.d, p1/m, z0.d, z8.d
fmla z17.d, p1/m, z0.d, z9.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla z18.d, p1/m, z0.d, z10.d
fmla z19.d, p1/m, z0.d, z11.d
.endm
.macro SAVEv1x4
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow1, pCRow0, LDC
ld1d z24.d, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaZ
st1d z24.d, p1, [pCRow0]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1d z25.d, p1/z, [pCRow1]
fmla z25.d, p1/m, z17.d, alphaZ
st1d z25.d, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC
ld1d z26.d, p1/z, [pCRow2]
fmla z26.d, p1/m, z18.d, alphaZ
st1d z26.d, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld1d z27.d, p1/z, [pCRow1]
fmla z27.d, p1/m, z19.d, alphaZ
st1d z27.d, p1, [pCRow1]
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
.endm
/******************************************************************************/
.macro INITv1x2
dup z16.d, #0
dup z17.d, #0
.endm
.macro KERNELv1x2_SUB
ld1d z0.d, p1/z, [pA]
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
ld1rd z8.d, p0/z, [pB]
ld1rd z9.d, p0/z, [pB, 8]
add pB, pB, 16
fmla z16.d, p1/m, z0.d, z8.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla z17.d, p1/m, z0.d, z9.d
.endm
.macro SAVEv1x2
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow1, pCRow0, LDC
ld1d z24.d, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaZ
st1d z24.d, p1, [pCRow0]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld1d z25.d, p1/z, [pCRow1]
fmla z25.d, p1/m, z17.d, alphaZ
st1d z25.d, p1, [pCRow1]
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
.endm
/******************************************************************************/
.macro INITv1x1
dup z16.d, #0
.endm
.macro KERNELv1x1_SUB
ld1d z0.d, p1/z, [pA]
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
ld1rd z8.d, p0/z, [pB]
add pB, pB, 8
fmla z16.d, p1/m, z0.d, z8.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
.endm
.macro SAVEv1x1
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld1d z24.d, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaZ
st1d z24.d, p1, [pCRow0]
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
.align 5
add sp, sp, #-(11 * 16)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alpha, d0
dup alphaZ, alpha
lsl LDC, LDC, #3 // ldc = ldc * 8
ptrue p0.d // create true predicate
mov pB, origPB
// Loop over N
mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0
ble .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/
/* Repeat this as long as there are 8 left in N */
.align 5
.Ldgemm_kernel_L8_BEGIN:
mov pCRow0, pC
add pC, pC, LDC, lsl #3 // add 8 x LDC
mov pA, origPA // pA = start of A array
.Ldgemm_kernel_L8_Mv1_BEGIN:
/* The loop over M is done in an SVE fashion: the final M % SVE_LEN iterations are handled in a single predicated sweep */
mov counterI, #0
whilelt p1.d, counterI, origM
cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension
.align 5
.Ldgemm_kernel_L8_Mv1_20:
mov pB, origPB
INITv1x8 // fill with zeros
asr counterL , origK, #3 // L = K / 8
cmp counterL , #2 // are there at least 2 blocks of 8 to do?
blt .Ldgemm_kernel_L8_Mv1_32
KERNELv1x8_I
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
subs counterL, counterL, #2 // subtract 2
ble .Ldgemm_kernel_L8_Mv1_22a
.align 5
.Ldgemm_kernel_L8_Mv1_22:
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L8_Mv1_22
.align 5
.Ldgemm_kernel_L8_Mv1_22a:
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_E
b .Ldgemm_kernel_L8_Mv1_44
.align 5
.Ldgemm_kernel_L8_Mv1_32:
tst counterL, #1
ble .Ldgemm_kernel_L8_Mv1_40
KERNELv1x8_I
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_E
b .Ldgemm_kernel_L8_Mv1_44
.Ldgemm_kernel_L8_Mv1_40:
INITv1x8
.Ldgemm_kernel_L8_Mv1_44:
ands counterL , origK, #7
ble .Ldgemm_kernel_L8_Mv1_100
.align 5
.Ldgemm_kernel_L8_Mv1_46:
KERNELv1x8_SUB
subs counterL, counterL, #1
bne .Ldgemm_kernel_L8_Mv1_46
.Ldgemm_kernel_L8_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x8
.Ldgemm_kernel_L8_Mv1_END:
incd counterI
whilelt p1.d, counterI, origM //SVE instruction
cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension
b.any .Ldgemm_kernel_L8_Mv1_20
.Ldgemm_kernel_L8_END:
lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 8 * 8
subs counterJ, counterJ , #1 // j--
bgt .Ldgemm_kernel_L8_BEGIN
/******************************************************************************/
/* Repeat the same thing if 4 left in N */
.align 5
.Ldgemm_kernel_L4_BEGIN:
mov counterJ , origN
tst counterJ , #4
ble .Ldgemm_kernel_L2_BEGIN
mov pCRow0, pC
add pC, pC, LDC, lsl #2 // add 4 x LDC
mov pA, origPA // pA = start of A array
.Ldgemm_kernel_L4_Mv1_BEGIN:
mov counterI, #0
whilelt p1.d, counterI, origM //SVE instruction
cntp lanes, p0, p1.d
.align 5
.Ldgemm_kernel_L4_Mv1_20:
mov pB, origPB
INITv1x4 // fill with zeros
asr counterL , origK, #3 // L = K / 8
cmp counterL , #0 // is there at least one block of 8 to do?
ble .Ldgemm_kernel_L4_Mv1_44
.align 5
.Ldgemm_kernel_L4_Mv1_22:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L4_Mv1_22
.Ldgemm_kernel_L4_Mv1_44:
ands counterL , origK, #7
ble .Ldgemm_kernel_L4_Mv1_100
.align 5
.Ldgemm_kernel_L4_Mv1_46:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
subs counterL, counterL, #1
bne .Ldgemm_kernel_L4_Mv1_46
.Ldgemm_kernel_L4_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x4
.Ldgemm_kernel_L4_Mv1_END:
incd counterI
whilelt p1.d, counterI, origM //SVE instruction
cntp lanes, p0, p1.d
b.any .Ldgemm_kernel_L4_Mv1_20
.Ldgemm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
/******************************************************************************/
/* Repeat the same thing if 2 left in N */
.align 5
.Ldgemm_kernel_L2_BEGIN:
mov counterJ , origN
tst counterJ , #2
ble .Ldgemm_kernel_L1_BEGIN
mov pCRow0, pC
add pC, pC, LDC, lsl #1 // add 2 x LDC
mov pA, origPA // pA = start of A array
.Ldgemm_kernel_L2_Mv1_BEGIN:
mov counterI, #0
whilelt p1.d, counterI, origM //SVE instruction
cntp lanes, p0, p1.d
.align 5
.Ldgemm_kernel_L2_Mv1_20:
mov pB, origPB
INITv1x2 // fill with zeros
asr counterL , origK, #3 // L = K / 8
cmp counterL , #0 // is there at least one block of 8 to do?
ble .Ldgemm_kernel_L2_Mv1_44
.align 5
.Ldgemm_kernel_L2_Mv1_22:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L2_Mv1_22
.Ldgemm_kernel_L2_Mv1_44:
ands counterL , origK, #7
ble .Ldgemm_kernel_L2_Mv1_100
.align 5
.Ldgemm_kernel_L2_Mv1_46:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x2_SUB
subs counterL, counterL, #1
bne .Ldgemm_kernel_L2_Mv1_46
.Ldgemm_kernel_L2_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x2
.Ldgemm_kernel_L2_Mv1_END:
incd counterI
whilelt p1.d, counterI, origM //SVE instruction
cntp lanes, p0, p1.d
b.any .Ldgemm_kernel_L2_Mv1_20
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
/* Repeat the same thing if 1 left in N */
.align 5
.Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC
add pC, pC, LDC // add 1 x LDC
mov pA, origPA // pA = start of A array
.Ldgemm_kernel_L1_Mv1_BEGIN:
mov counterI, #0
whilelt p1.d, counterI, origM //SVE instruction
cntp lanes, p0, p1.d
.align 5
.Ldgemm_kernel_L1_Mv1_20:
mov pB, origPB
INITv1x1 // fill with zeros
asr counterL , origK, #3 // L = K / 8
cmp counterL , #0 // is there at least 8 to do?
ble .Ldgemm_kernel_L1_Mv1_44
.align 5
.Ldgemm_kernel_L1_Mv1_22:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L1_Mv1_22
.Ldgemm_kernel_L1_Mv1_44:
ands counterL , origK, #7
ble .Ldgemm_kernel_L1_Mv1_100
.align 5
.Ldgemm_kernel_L1_Mv1_46:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x1_SUB
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L1_Mv1_46
.Ldgemm_kernel_L1_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x1
.Ldgemm_kernel_L1_Mv1_END:
incd counterI
whilelt p1.d, counterI, origM //SVE instruction
cntp lanes, p0, p1.d
b.any .Ldgemm_kernel_L1_Mv1_20
.Ldgemm_kernel_L1_END:
/******************************************************************************/
.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
ldp d12, d13, [sp, #(2 * 16)]
ldp d14, d15, [sp, #(3 * 16)]
ldp d16, d17, [sp, #(4 * 16)]
ldp x18, x19, [sp, #(5 * 16)]
ldp x20, x21, [sp, #(6 * 16)]
ldp x22, x23, [sp, #(7 * 16)]
ldp x24, x25, [sp, #(8 * 16)]
ldp x26, x27, [sp, #(9 * 16)]
ldr x28, [sp, #(10 * 16)]
add sp, sp, #(11*16)
ret
EPILOGUE

File diff suppressed because it is too large

View File

@ -0,0 +1,79 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
// TODO: write in assembly with proper unrolling of inner loop
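// Packs the source matrix into contiguous storage: each SVE gather load reads
// up to svcntd() doubles strided by lda, the inner loop steps through m
// consecutive source elements, and `active` doubles are stored to b per step.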
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG j;
IFLOAT *aoffset, *aoffset1, *boffset;
svint64_t lda_vec = svindex_s64(0LL, lda);
uint64_t sve_size = svcntd();
aoffset = a;
boffset = b;
j = 0;
svbool_t pg = svwhilelt_b64(j, n);
uint64_t active = svcntp_b64(svptrue_b64(), pg);
do {
aoffset1 = aoffset;
uint64_t i_cnt = m;
while (i_cnt--) {
svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec);
svst1_f64(pg, (double *) boffset, a_vec);
aoffset1++;
boffset += active;
}
aoffset += sve_size * lda;
j += svcntd();
pg = svwhilelt_b64(j, n);
active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg));
return 0;
}

View File

@ -0,0 +1,77 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
// TODO: write in assembly with proper unrolling of inner loop
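// Packs the source matrix into contiguous storage: each step loads up to
// svcntd() consecutive doubles, advances the source pointer by lda, and
// stores `active` doubles to b; the outer loop moves on by svcntd() elements.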
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG j;
IFLOAT *aoffset, *aoffset1, *boffset;
uint64_t sve_size = svcntd();
aoffset = a;
boffset = b;
j = 0;
svbool_t pg = svwhilelt_b64(j, n);
uint64_t active = svcntp_b64(svptrue_b64(), pg);
do {
aoffset1 = aoffset;
uint64_t i_cnt = m;
while (i_cnt--) {
svfloat64_t a_vec = svld1(pg, (double *)aoffset1);
svst1_f64(pg, (double *) boffset, a_vec);
aoffset1 += lda;
boffset += active;
}
aoffset += sve_size;
j += svcntd();
pg = svwhilelt_b64(j, n);
active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg));
return 0;
}

File diff suppressed because it is too large

View File

@ -0,0 +1,874 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
/* X0 X1 X2 s0 X3 x4 x5 x6 */
/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/
#define origM x0
#define origN x1
#define origK x2
#define origPA x3
#define origPB x4
#define pC x5
#define LDC x6
#define temp x7
#define counterL x8
#define counterI x9
#define counterJ x10
#define pB x11
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define lanes x15
#define pA x16
#define alpha w17
#define alpha0 s10
#define alphaZ z2.s
#define A_PRE_SIZE 1536
#define B_PRE_SIZE 512
#define C_PRE_SIZE 128
// 00 origM
// 01 origN
// 02 origK
// 03 origPA
// 04 origPB
// 05 pC
// 06 origLDC -> LDC
// 07 temp
// 08 counterL
// 09 counterI
// 10 counterJ
// 11 pB
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 lanes
// 16 pA
// 17
// 18 must save
// 19 must save
// 20 must save
// 21 must save
// 22 must save
// 23 must save
// 24 must save
// 25 must save
// 26 must save
// 27 must save
// 28 must save
// 29 frame
// 30 link
// 31 sp
//v00 ALPHA -> pA0_0
//v01 pA0_1
//v02 ALPHA0
//v03
//v04
//v05
//v06
//v07
//v08 must save pB0_0
//v09 must save pB0_1
//v10 must save pB0_2
//v11 must save pB0_3
//v12 must save pB0_4
//v13 must save pB0_5
//v14 must save pB0_6
//v15 must save pB0_7
//v16 must save C0
//v17 must save C1
//v18 must save C2
//v19 must save C3
//v20 must save C4
//v21 must save C5
//v22 must save C6
//v23 must save C7
/*******************************************************************************
* Macro definitions
*******************************************************************************/
.macro INITv1x8
dup z16.s, #0
dup z17.s, #0
dup z18.s, #0
dup z19.s, #0
dup z20.s, #0
dup z21.s, #0
dup z22.s, #0
dup z23.s, #0
.endm
.macro KERNELv1x8_I
ld1w z0.s, p1/z, [pA]
ld1w z1.s, p1/z, [pA, lanes, lsl #2] // next one
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4
ld1rw z8.s, p0/z, [pB]
ld1rw z9.s, p0/z, [pB, 4]
ld1rw z10.s, p0/z, [pB, 8]
ld1rw z11.s, p0/z, [pB, 12]
ld1rw z12.s, p0/z, [pB, 16]
ld1rw z13.s, p0/z, [pB, 20]
ld1rw z14.s, p0/z, [pB, 24]
ld1rw z15.s, p0/z, [pB, 28]
add pB, pB, 32
fmla z16.s, p1/m, z0.s, z8.s
ld1rw z8.s, p0/z, [pB]
fmla z17.s, p1/m, z0.s, z9.s
ld1rw z9.s, p0/z, [pB, 4]
fmla z18.s, p1/m, z0.s, z10.s
ld1rw z10.s, p0/z, [pB, 8]
fmla z19.s, p1/m, z0.s, z11.s
ld1rw z11.s, p0/z, [pB, 12]
fmla z20.s, p1/m, z0.s, z12.s
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
ld1rw z12.s, p0/z, [pB, 16]
fmla z21.s, p1/m, z0.s, z13.s
ld1rw z13.s, p0/z, [pB, 20]
fmla z22.s, p1/m, z0.s, z14.s
ld1rw z14.s, p0/z, [pB, 24]
fmla z23.s, p1/m, z0.s, z15.s
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
ld1rw z15.s, p0/z, [pB, 28]
add pB, pB, 32
.endm
.macro KERNELv1x8_M1
ld1w z1.s, p1/z, [pA]
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
fmla z16.s, p1/m, z0.s, z8.s
ld1rw z8.s, p0/z, [pB]
fmla z17.s, p1/m, z0.s, z9.s
ld1rw z9.s, p0/z, [pB, 4]
fmla z18.s, p1/m, z0.s, z10.s
ld1rw z10.s, p0/z, [pB, 8]
fmla z19.s, p1/m, z0.s, z11.s
ld1rw z11.s, p0/z, [pB, 12]
fmla z20.s, p1/m, z0.s, z12.s
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
ld1rw z12.s, p0/z, [pB, 16]
fmla z21.s, p1/m, z0.s, z13.s
ld1rw z13.s, p0/z, [pB, 20]
fmla z22.s, p1/m, z0.s, z14.s
ld1rw z14.s, p0/z, [pB, 24]
fmla z23.s, p1/m, z0.s, z15.s
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
ld1rw z15.s, p0/z, [pB, 28]
add pB, pB, 32
.endm
.macro KERNELv1x8_M2
ld1w z0.s, p1/z, [pA]
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
fmla z16.s, p1/m, z1.s, z8.s
ld1rw z8.s, p0/z, [pB]
fmla z17.s, p1/m, z1.s, z9.s
ld1rw z9.s, p0/z, [pB, 4]
fmla z18.s, p1/m, z1.s, z10.s
ld1rw z10.s, p0/z, [pB, 8]
fmla z19.s, p1/m, z1.s, z11.s
ld1rw z11.s, p0/z, [pB, 12]
fmla z20.s, p1/m, z1.s, z12.s
ld1rw z12.s, p0/z, [pB, 16]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla z21.s, p1/m, z1.s, z13.s
ld1rw z13.s, p0/z, [pB, 20]
fmla z22.s, p1/m, z1.s, z14.s
ld1rw z14.s, p0/z, [pB, 24]
fmla z23.s, p1/m, z1.s, z15.s
ld1rw z15.s, p0/z, [pB, 28]
add pB, pB, 32
.endm
.macro KERNELv1x8_E
fmla z16.s, p1/m, z1.s, z8.s
fmla z17.s, p1/m, z1.s, z9.s
fmla z18.s, p1/m, z1.s, z10.s
fmla z19.s, p1/m, z1.s, z11.s
fmla z20.s, p1/m, z1.s, z12.s
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla z21.s, p1/m, z1.s, z13.s
fmla z22.s, p1/m, z1.s, z14.s
fmla z23.s, p1/m, z1.s, z15.s
.endm
.macro KERNELv1x8_SUB
ld1w z0.s, p1/z, [pA]
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
ld1rw z8.s, p0/z, [pB]
ld1rw z9.s, p0/z, [pB, 4]
ld1rw z10.s, p0/z, [pB, 8]
ld1rw z11.s, p0/z, [pB, 12]
ld1rw z12.s, p0/z, [pB, 16]
ld1rw z13.s, p0/z, [pB, 20]
ld1rw z14.s, p0/z, [pB, 24]
ld1rw z15.s, p0/z, [pB, 28]
add pB, pB, 32
fmla z16.s, p1/m, z0.s, z8.s
fmla z17.s, p1/m, z0.s, z9.s
fmla z18.s, p1/m, z0.s, z10.s
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla z19.s, p1/m, z0.s, z11.s
fmla z20.s, p1/m, z0.s, z12.s
fmla z21.s, p1/m, z0.s, z13.s
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla z22.s, p1/m, z0.s, z14.s
fmla z23.s, p1/m, z0.s, z15.s
.endm
.macro SAVEv1x8
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow1, pCRow0, LDC
ld1w z24.s, p1/z, [pCRow0]
fmla z24.s, p1/m, z16.s, alphaZ
st1w z24.s, p1, [pCRow0]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1w z25.s, p1/z, [pCRow1]
fmla z25.s, p1/m, z17.s, alphaZ
st1w z25.s, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC
ld1w z26.s, p1/z, [pCRow2]
fmla z26.s, p1/m, z18.s, alphaZ
st1w z26.s, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1w z27.s, p1/z, [pCRow1]
fmla z27.s, p1/m, z19.s, alphaZ
st1w z27.s, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC
ld1w z28.s, p1/z, [pCRow2]
fmla z28.s, p1/m, z20.s, alphaZ
st1w z28.s, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1w z29.s, p1/z, [pCRow1]
fmla z29.s, p1/m, z21.s, alphaZ
st1w z29.s, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC
ld1w z30.s, p1/z, [pCRow2]
fmla z30.s, p1/m, z22.s, alphaZ
st1w z30.s, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld1w z31.s, p1/z, [pCRow1]
fmla z31.s, p1/m, z23.s, alphaZ
st1w z31.s, p1, [pCRow1]
add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4
.endm
/******************************************************************************/
.macro INITv1x4
dup z16.s, #0
dup z17.s, #0
dup z18.s, #0
dup z19.s, #0
.endm
.macro KERNELv1x4_SUB
ld1w z0.s, p1/z, [pA]
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
ld1rw z8.s, p0/z, [pB]
ld1rw z9.s, p0/z, [pB, 4]
ld1rw z10.s, p0/z, [pB, 8]
ld1rw z11.s, p0/z, [pB, 12]
add pB, pB, 16
fmla z16.s, p1/m, z0.s, z8.s
fmla z17.s, p1/m, z0.s, z9.s
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla z18.s, p1/m, z0.s, z10.s
fmla z19.s, p1/m, z0.s, z11.s
.endm
.macro SAVEv1x4
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow1, pCRow0, LDC
ld1w z24.s, p1/z, [pCRow0]
fmla z24.s, p1/m, z16.s, alphaZ
st1w z24.s, p1, [pCRow0]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1w z25.s, p1/z, [pCRow1]
fmla z25.s, p1/m, z17.s, alphaZ
st1w z25.s, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC
ld1w z26.s, p1/z, [pCRow2]
fmla z26.s, p1/m, z18.s, alphaZ
st1w z26.s, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld1w z27.s, p1/z, [pCRow1]
fmla z27.s, p1/m, z19.s, alphaZ
st1w z27.s, p1, [pCRow1]
add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4
.endm
/******************************************************************************/
.macro INITv1x2
dup z16.s, #0
dup z17.s, #0
.endm
.macro KERNELv1x2_SUB
ld1w z0.s, p1/z, [pA]
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
ld1rw z8.s, p0/z, [pB]
ld1rw z9.s, p0/z, [pB, 4]
add pB, pB, 8
fmla z16.s, p1/m, z0.s, z8.s
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla z17.s, p1/m, z0.s, z9.s
.endm
.macro SAVEv1x2
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow1, pCRow0, LDC
ld1w z24.s, p1/z, [pCRow0]
fmla z24.s, p1/m, z16.s, alphaZ
st1w z24.s, p1, [pCRow0]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld1w z25.s, p1/z, [pCRow1]
fmla z25.s, p1/m, z17.s, alphaZ
st1w z25.s, p1, [pCRow1]
add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4
.endm
/******************************************************************************/
.macro INITv1x1
dup z16.s, #0
.endm
.macro KERNELv1x1_SUB
ld1w z0.s, p1/z, [pA]
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
ld1rw z8.s, p0/z, [pB]
add pB, pB, 4
fmla z16.s, p1/m, z0.s, z8.s
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
.endm
.macro SAVEv1x1
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld1w z24.s, p1/z, [pCRow0]
fmla z24.s, p1/m, z16.s, alphaZ
st1w z24.s, p1, [pCRow0]
add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
.align 5
add sp, sp, #-(11 * 16)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alpha, s0
dup alphaZ, alpha
lsl LDC, LDC, #2 // ldc = ldc * 4
ptrue p0.s // create true predicate
mov pB, origPB
// Loop over N
mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0
ble .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/
/* Repeat this as long as there are 8 left in N */
.align 5
.Ldgemm_kernel_L8_BEGIN:
mov pCRow0, pC
add pC, pC, LDC, lsl #3 // add 8 x LDC
mov pA, origPA // pA = start of A array
.Ldgemm_kernel_L8_Mv1_BEGIN:
/* The loop over M is done in an SVE fashion: the final M % SVE_LEN iterations are handled in a single predicated sweep */
mov counterI, #0
whilelt p1.s, counterI, origM
cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension
.align 5
.Ldgemm_kernel_L8_Mv1_20:
mov pB, origPB
INITv1x8 // fill with zeros
asr counterL , origK, #3 // L = K / 8
cmp counterL , #2 // are there at least 2 blocks of 8 to do?
blt .Ldgemm_kernel_L8_Mv1_32
KERNELv1x8_I
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
subs counterL, counterL, #2 // subtract 2
ble .Ldgemm_kernel_L8_Mv1_22a
.align 5
.Ldgemm_kernel_L8_Mv1_22:
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L8_Mv1_22
.align 5
.Ldgemm_kernel_L8_Mv1_22a:
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_E
b .Ldgemm_kernel_L8_Mv1_44
.align 5
.Ldgemm_kernel_L8_Mv1_32:
tst counterL, #1
ble .Ldgemm_kernel_L8_Mv1_40
KERNELv1x8_I
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_E
b .Ldgemm_kernel_L8_Mv1_44
.Ldgemm_kernel_L8_Mv1_40:
INITv1x8
.Ldgemm_kernel_L8_Mv1_44:
ands counterL , origK, #7
ble .Ldgemm_kernel_L8_Mv1_100
.align 5
.Ldgemm_kernel_L8_Mv1_46:
KERNELv1x8_SUB
subs counterL, counterL, #1
bne .Ldgemm_kernel_L8_Mv1_46
.Ldgemm_kernel_L8_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x8
.Ldgemm_kernel_L8_Mv1_END:
incw counterI
whilelt p1.s, counterI, origM //SVE instruction
cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension
b.any .Ldgemm_kernel_L8_Mv1_20
.Ldgemm_kernel_L8_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 8 * 4
subs counterJ, counterJ , #1 // j--
bgt .Ldgemm_kernel_L8_BEGIN
/******************************************************************************/
/* Repeat the same thing if 4 left in N */
.align 5
.Ldgemm_kernel_L4_BEGIN:
mov counterJ , origN
tst counterJ , #4
ble .Ldgemm_kernel_L2_BEGIN
mov pCRow0, pC
add pC, pC, LDC, lsl #2 // add 4 x LDC
mov pA, origPA // pA = start of A array
.Ldgemm_kernel_L4_Mv1_BEGIN:
mov counterI, #0
whilelt p1.s, counterI, origM //SVE instruction
cntp lanes, p0, p1.s
.align 5
.Ldgemm_kernel_L4_Mv1_20:
mov pB, origPB
INITv1x4 // fill with zeros
asr counterL , origK, #3 // L = K / 8
cmp counterL , #0 // is there at least one block of 8 to do?
ble .Ldgemm_kernel_L4_Mv1_44
.align 5
.Ldgemm_kernel_L4_Mv1_22:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L4_Mv1_22
.Ldgemm_kernel_L4_Mv1_44:
ands counterL , origK, #7
ble .Ldgemm_kernel_L4_Mv1_100
.align 5
.Ldgemm_kernel_L4_Mv1_46:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
subs counterL, counterL, #1
bne .Ldgemm_kernel_L4_Mv1_46
.Ldgemm_kernel_L4_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x4
.Ldgemm_kernel_L4_Mv1_END:
incw counterI
whilelt p1.s, counterI, origM //SVE instruction
cntp lanes, p0, p1.s
b.any .Ldgemm_kernel_L4_Mv1_20
.Ldgemm_kernel_L4_END:
lsl temp, origK, #4
add origPB, origPB, temp // B = B + K * 4 * 4
/******************************************************************************/
/* Repeat the same thing if 2 left in N */
.align 5
.Ldgemm_kernel_L2_BEGIN:
mov counterJ , origN
tst counterJ , #2
ble .Ldgemm_kernel_L1_BEGIN
mov pCRow0, pC
add pC, pC, LDC, lsl #1 // add 2 x LDC
mov pA, origPA // pA = start of A array
.Ldgemm_kernel_L2_Mv1_BEGIN:
mov counterI, #0
whilelt p1.s, counterI, origM //SVE instruction
cntp lanes, p0, p1.s
.align 5
.Ldgemm_kernel_L2_Mv1_20:
mov pB, origPB
INITv1x2 // fill with zeros
asr counterL , origK, #3 // L = K / 8
cmp counterL , #0 // is there at least one block of 8 to do?
ble .Ldgemm_kernel_L2_Mv1_44
.align 5
.Ldgemm_kernel_L2_Mv1_22:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L2_Mv1_22
.Ldgemm_kernel_L2_Mv1_44:
ands counterL , origK, #7
ble .Ldgemm_kernel_L2_Mv1_100
.align 5
.Ldgemm_kernel_L2_Mv1_46:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x2_SUB
subs counterL, counterL, #1
bne .Ldgemm_kernel_L2_Mv1_46
.Ldgemm_kernel_L2_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x2
.Ldgemm_kernel_L2_Mv1_END:
incw counterI
whilelt p1.s, counterI, origM //SVE instruction
cntp lanes, p0, p1.s
b.any .Ldgemm_kernel_L2_Mv1_20
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
/******************************************************************************/
/* Repeat the same thing if 1 left in N */
.align 5
.Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC
add pC, pC, LDC // add 1 x LDC
mov pA, origPA // pA = start of A array
.Ldgemm_kernel_L1_Mv1_BEGIN:
mov counterI, #0
whilelt p1.s, counterI, origM //SVE instruction
cntp lanes, p0, p1.s
.align 5
.Ldgemm_kernel_L1_Mv1_20:
mov pB, origPB
INITv1x1 // fill with zeros
asr counterL , origK, #3 // L = K / 8
cmp counterL , #0 // is there at least 8 to do?
ble .Ldgemm_kernel_L1_Mv1_44
.align 5
.Ldgemm_kernel_L1_Mv1_22:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L1_Mv1_22
.Ldgemm_kernel_L1_Mv1_44:
ands counterL , origK, #7
ble .Ldgemm_kernel_L1_Mv1_100
.align 5
.Ldgemm_kernel_L1_Mv1_46:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x1_SUB
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L1_Mv1_46
.Ldgemm_kernel_L1_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x1
.Ldgemm_kernel_L1_Mv1_END:
incw counterI
whilelt p1.s, counterI, origM //SVE instruction
cntp lanes, p0, p1.s
b.any .Ldgemm_kernel_L1_Mv1_20
.Ldgemm_kernel_L1_END:
/******************************************************************************/
.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
ldp d12, d13, [sp, #(2 * 16)]
ldp d14, d15, [sp, #(3 * 16)]
ldp d16, d17, [sp, #(4 * 16)]
ldp x18, x19, [sp, #(5 * 16)]
ldp x20, x21, [sp, #(6 * 16)]
ldp x22, x23, [sp, #(7 * 16)]
ldp x24, x25, [sp, #(8 * 16)]
ldp x26, x27, [sp, #(9 * 16)]
ldr x28, [sp, #(10 * 16)]
add sp, sp, #(11*16)
ret
EPILOGUE

File diff suppressed because it is too large

View File

@ -0,0 +1,78 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
// TODO: write in assembly with proper unrolling of inner loop
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG j;
IFLOAT *aoffset, *aoffset1, *boffset;
svint32_t lda_vec = svindex_s32(0LL, lda);
uint32_t sve_size = svcntw();
aoffset = a;
boffset = b;
j = 0;
svbool_t pg = svwhilelt_b32(j, n);
uint32_t active = svcntp_b32(svptrue_b32(), pg);
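/* For each of the m steps below, one element is gathered from each of the (up
   to) svcntw() columns of the current panel (stride lda apart) and the active
   elements are stored contiguously into b; the predicate pg narrows the last
   panel to the remaining n columns. */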
do {
aoffset1 = aoffset;
uint32_t i_cnt = m;
while (i_cnt--) {
svfloat32_t a_vec = svld1_gather_index(pg, (float *) aoffset1, lda_vec);
svst1_f32(pg, (float *) boffset, a_vec);
aoffset1++;
boffset += active;
}
aoffset += sve_size * lda;
j += svcntw();
pg = svwhilelt_b32(j, n);
active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg));
return 0;
}

View File

@ -0,0 +1,77 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
// TODO: write in assembly with proper unrolling of inner loop
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG j;
IFLOAT *aoffset, *aoffset1, *boffset;
uint32_t sve_size = svcntw();
aoffset = a;
boffset = b;
j = 0;
svbool_t pg = svwhilelt_b32(j, n);
uint32_t active = svcntp_b32(svptrue_b32(), pg);
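/* Here the panel elements are contiguous in memory, so a plain predicated load
   replaces the gather: each of the m steps below copies (up to) svcntw()
   consecutive elements of the current column into b, then advances by lda to
   the next column. */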
do {
aoffset1 = aoffset;
uint32_t i_cnt = m;
while (i_cnt--) {
svfloat32_t a_vec = svld1(pg, (float *) aoffset1);
svst1_f32(pg, (float *) boffset, a_vec);
aoffset1 += lda;
boffset += active;
}
aoffset += sve_size;
j += svcntw();
pg = svwhilelt_b32(j, n);
active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg));
return 0;
}

File diff suppressed because it is too large

View File

@ -0,0 +1,143 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
BLASLONG i, offset;
#if defined(DOUBLE)
uint64_t sve_size = svcntd();
svint64_t posY_vec = svdup_s64(posY);
svint64_t posX_vec = svdup_s64(posX);
svint64_t lda_vec = svdup_s64(lda);
svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n);
int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL);
do {
offset = posX - posY;
svint64_t vec_off = svdup_s64(offset);
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
svint64_t temp = svadd_z(pg, posX_vec, index);
svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec);
svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda);
svint64_t gat_ind = svsel(cmp, temp1, temp2);
i = m;
while (i>0) {
svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind);
gat_ind = svadd_m(cmp, gat_ind, lda_vec);
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, one_vec);
svst1(pg, b, data_vec);
b += active;
offset --;
vec_off = svsub_z(pg, vec_off, one_vec);
cmp = svcmpgt(pg, vec_off, index_neg);
i--;
}
posX += sve_size;
posX_vec = svdup_s64(posX);
j += sve_size;
pg = svwhilelt_b64(j, n);
active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg));
#else
uint32_t sve_size = svcntw();
svint32_t posY_vec = svdup_s32(posY);
svint32_t posX_vec = svdup_s32(posX);
svint32_t lda_vec = svdup_s32(lda);
svint32_t one_vec = svdup_s32(1);
int32_t N = n;
int32_t j = 0;
svbool_t pg = svwhilelt_b32(j, N);
int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1);
do {
offset = posX - posY;
svint32_t vec_off = svdup_s32(offset);
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
svint32_t temp = svadd_z(pg, posX_vec, index);
svint32_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec);
svint32_t temp2 = svmla_z(pg, posY_vec, temp, lda);
svint32_t gat_ind = svsel(cmp, temp1, temp2);
i = m;
while (i>0) {
svfloat32_t data_vec = svld1_gather_index(pg, a, gat_ind);
gat_ind = svadd_m(cmp, gat_ind, lda_vec);
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, one_vec);
svst1(pg, b, data_vec);
b += active;
offset --;
vec_off = svsub_z(pg, vec_off, one_vec);
cmp = svcmpgt(pg, vec_off, index_neg);
i--;
}
posX += sve_size;
posX_vec = svdup_s32(posX);
j += sve_size;
pg = svwhilelt_b32(j, N);
active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg));
#endif
return 0;
}

View File

@ -0,0 +1,143 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
BLASLONG i, offset;
#if defined(DOUBLE)
uint64_t sve_size = svcntd();
svint64_t posY_vec = svdup_s64(posY);
svint64_t posX_vec = svdup_s64(posX);
svint64_t lda_vec = svdup_s64(lda);
svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n);
int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL);
do {
offset = posX - posY;
svint64_t vec_off = svdup_s64(offset);
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
svint64_t temp = svadd_z(pg, posX_vec, index);
svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec);
svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda);
svint64_t gat_ind = svsel(cmp, temp2, temp1);
i = m;
while (i>0) {
svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind);
gat_ind = svadd_m(cmp, gat_ind, one_vec);
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
svst1(pg, b, data_vec);
b += active;
offset --;
vec_off = svsub_z(pg, vec_off, one_vec);
cmp = svcmpgt(pg, vec_off, index_neg);
i--;
}
posX += sve_size;
posX_vec = svdup_s64(posX);
j += sve_size;
pg = svwhilelt_b64(j, n);
active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg));
#else
uint32_t sve_size = svcntw();
svint32_t posY_vec = svdup_s32(posY);
svint32_t posX_vec = svdup_s32(posX);
svint32_t lda_vec = svdup_s32(lda);
svint32_t one_vec = svdup_s32(1);
int32_t N = n;
int32_t j = 0;
svbool_t pg = svwhilelt_b32(j, N);
int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1);
do {
offset = posX - posY;
svint32_t vec_off = svdup_s32(offset);
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
svint32_t temp = svadd_z(pg, posX_vec, index);
svint32_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec);
svint32_t temp2 = svmla_z(pg, posY_vec, temp, lda);
svint32_t gat_ind = svsel(cmp, temp2, temp1);
i = m;
while (i>0) {
svfloat32_t data_vec = svld1_gather_index(pg, a, gat_ind);
gat_ind = svadd_m(cmp, gat_ind, one_vec);
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
svst1(pg, b, data_vec);
b += active;
offset --;
vec_off = svsub_z(pg, vec_off, one_vec);
cmp = svcmpgt(pg, vec_off, index_neg);
i--;
}
posX += sve_size;
posX_vec = svdup_s32(posX);
j += sve_size;
pg = svwhilelt_b32(j, N);
active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg));
#endif
return 0;
}

View File

@ -0,0 +1,136 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
BLASLONG i, js;
BLASLONG X;
js = 0;
FLOAT *ao;
#ifdef DOUBLE
svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
{
X = posX;
if (posX <= posY) {
ao = a + posY + posX * lda;
} else {
ao = a + posX + posY * lda;
}
i = 0;
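/* Three regimes while walking down the panel:
   X > posY : gather one element per active column (stride lda apart) and
              store n_active values into b;
   X < posY : nothing is stored; a and b are simply advanced;
   X == posY: the diagonal block is copied with the scalar loops below
              (ONE is substituted on the diagonal when UNIT is defined). */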
do
{
if (X > posY) {
#ifdef DOUBLE
svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
#else
svfloat32_t aj_vec = svld1_gather_index(pn, ao, index);
#endif
svst1(pn, b, aj_vec);
ao ++;
b += n_active;
X ++;
i ++;
} else
if (X < posY) {
ao += lda;
b += n_active;
X ++;
i ++;
} else {
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
#ifdef UNIT
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k < j; k++) {
b[temp++] = *(ao+k*lda+j);
}
b[temp++] = ONE;
for (int k = j+1; k < n_active; k++) {
b[temp++] = ZERO;
}
}
#else
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k <= j; k++) {
b[temp++] = *(ao+k*lda+j);
}
for (int k = j+1; k < n_active; k++) {
b[temp++] = ZERO;
}
}
#endif
ao += n_active;
b += n_active*n_active;
X += n_active;
i += n_active;
}
} while (i < m);
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif
return 0;
}

View File

@ -0,0 +1,136 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
BLASLONG i, js;
BLASLONG X;
FLOAT *ao;
js = 0;
#ifdef DOUBLE
svbool_t pn = svwhilelt_b64(js, n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svbool_t pn = svwhilelt_b32(js, n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
{
X = posX;
if (posX <= posY) {
ao = a + posY + posX * lda;
} else {
ao = a + posX + posY * lda;
}
i = 0;
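/* Three regimes while walking down the panel:
   X > posY : nothing is stored; a and b are simply advanced;
   X < posY : one contiguous, predicated vector of n_active elements is
              copied from a into b;
   X == posY: the diagonal block is expanded with the scalar loops below
              (ZERO is written for the entries preceding the diagonal, and
              ONE on the diagonal when UNIT is defined). */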
do
{
if (X > posY) {
ao ++;
b += n_active;
X ++;
i ++;
} else
if (X < posY) {
#ifdef DOUBLE
svfloat64_t aj_vec = svld1(pn, ao);
#else
svfloat32_t aj_vec = svld1(pn, ao);
#endif
svst1(pn, b, aj_vec);
ao += lda;
b += n_active;
X ++;
i ++;
} else {
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
#ifdef UNIT
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k < j; k++) {
b[temp++] = ZERO;
}
b[temp++] = ONE;
for (int k = j+1; k < n_active; k++) {
b[temp++] = *(ao+j*lda+k);
}
}
#else
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k < j; k++) {
b[temp++] = ZERO;
}
for (int k = j; k < n_active; k++) {
b[temp++] = *(ao+j*lda+k);
}
}
#endif
ao += n_active * lda;
b += n_active*n_active;
X += n_active;
i += n_active;
}
} while (i < m);
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif
return 0;
}

View File

@ -0,0 +1,136 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
BLASLONG i, js;
BLASLONG X;
js = 0;
FLOAT *ao;
#ifdef DOUBLE
svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
{
X = posX;
if (posX <= posY) {
ao = a + posX + posY * lda;
} else {
ao = a + posY + posX * lda;
}
i = 0;
do
{
if (X < posY) {
#ifdef DOUBLE
svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
#else
svfloat32_t aj_vec = svld1_gather_index(pn, ao, index);
#endif
svst1(pn, b, aj_vec);
ao ++;
b += n_active;
X ++;
i ++;
} else
if (X > posY) {
ao += lda;
b += n_active;
X ++;
i ++;
} else {
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
#ifdef UNIT
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k < j; k++) {
b[temp++] = ZERO;
}
b[temp++] = ONE;
for (int k = j+1; k < n_active; k++) {
b[temp++] = *(ao+k*lda+j);
}
}
#else
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k < j; k++) {
b[temp++] = ZERO;
}
for (int k = j; k < n_active; k++) {
b[temp++] = *(ao+k*lda+j);
}
}
#endif
ao += n_active;
b += n_active*n_active;
X += n_active;
i += n_active;
}
} while (i < m);
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif
return 0;
}

View File

@ -0,0 +1,134 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
BLASLONG i, js;
BLASLONG X;
FLOAT *ao;
js = 0;
#ifdef DOUBLE
svbool_t pn = svwhilelt_b64(js, n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svbool_t pn = svwhilelt_b32(js, n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
{
X = posX;
if (posX <= posY) {
ao = a + posX + posY * lda;
} else {
ao = a + posY + posX * lda;
}
i = 0;
do
{
if (X < posY) {
ao ++;
b += n_active;
X ++;
i ++;
} else
if (X > posY) {
#ifdef DOUBLE
svfloat64_t aj_vec = svld1(pn, ao);
#else
svfloat32_t aj_vec = svld1(pn, ao);
#endif
svst1(pn, b, aj_vec);
ao += lda;
b += n_active;
X ++;
i ++;
} else {
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
#ifdef UNIT
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k < j; k++) {
b[temp++] = *(ao+j*lda+k);
}
b[temp++] = ONE;
for (int k = j+1; k < n_active; k++) {
b[temp++] = ZERO;
}
}
#else
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k <= j; k++) {
b[temp++] = *(ao+j*lda+k);
}
for (int k = j+1; k < n_active; k++) {
b[temp++] = ZERO;
}
}
#endif
ao += n_active * lda;
b += n_active*n_active;
X += n_active;
i += n_active;
}
} while (i < m);
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif
return 0;
}

View File

@ -0,0 +1,736 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <arm_neon.h>
/*******************************************************************************
The complex GEMM kernels in OpenBLAS use static configuration of conjugation
modes via specific macros:
MACRO_NAME | conjugation on matrix A | conjugation on matrix B |
---------- | ----------------------- | ----------------------- |
NN/NT/TN/TT | No | No |
NR/NC/TR/TC | No | Yes |
RN/RT/CN/CT | Yes | No |
RR/RC/CR/CC | Yes | Yes |
"conjugation on matrix A" means the complex conjugates of elements from
matrix A are used for matmul (rather than the original elements). "conjugation
on matrix B" means the complex conjugate of each element from matrix B is taken
for matrix multiplication, respectively.
Complex numbers in arrays or matrices are usually packed together as an
array of struct (without padding):
struct complex_number {
FLOAT real_part;
FLOAT imag_part;
};
For a double complex array ARR[] which is usually DEFINED AS AN ARRAY OF
DOUBLE, the real part of its Kth complex number can be accessed as
ARR[K * 2], the imaginary part of the Kth complex number is ARR[2 * K + 1].
This file uses 2 ways to vectorize matrix multiplication of complex numbers:
(1) Expanded-form
    During accumulation along direction K, each group of v rows of C
    (v == sizeof(vector) / sizeof(complex)) keeps two accumulator vectors
    per column n:
      VECTOR I  : vec_a multiplied by the broadcast b[k][n].real and
                  accumulated over k, i.e. the interleaved lanes
                  { Σk(a[i][k].real * b[k][n].real),
                    Σk(a[i][k].imag * b[k][n].real) } for i = 0 .. v-1;
      VECTOR II : vec_a multiplied by the broadcast b[k][n].imag and
                  accumulated over k, i.e. the interleaved lanes
                  { Σk(a[i][k].real * b[k][n].imag),
                    Σk(a[i][k].imag * b[k][n].imag) } for i = 0 .. v-1.
    After accumulation, prior to storage, the lanes of VECTOR II are permuted
    (real/imag swapped) and multiplied by { -1, 1, ..., -1, 1 } to get
      { -Σk(a[i][k].imag * b[k][n].imag),
         Σk(a[i][k].real * b[k][n].imag) } for i = 0 .. v-1,
    then added with VECTOR I to get the result vector of elements of C.
    2 vector registers are needed for every v elements of C, with
    v == sizeof(vector) / sizeof(complex)
(2) Contracted-form
    During accumulation along direction K (the K coordinate is not shown,
    since the operation is identical for each k), two vectors holding 2v
    consecutive complex elements of A are loaded from memory and unzipped
    (VLD2 in arm neon) into
      { a[0].real ... a[2v-1].real }  and  { a[0].imag ... a[2v-1].imag }.
    These feed two accumulators, using the broadcast b[i].real and b[i].imag
    (signs shown for the no-conjugation case):
      VECTOR_REAL += a.real * b[i].real - a.imag * b[i].imag
      VECTOR_IMAG += a.real * b[i].imag + a.imag * b[i].real
    so that after the loop VECTOR_REAL holds c[0].real ... c[2v-1].real and
    VECTOR_IMAG holds c[0].imag ... c[2v-1].imag.
    After accumulation, VECTOR_REAL and VECTOR_IMAG are zipped (interleaved)
    then stored to matrix C directly.
    For 2v elements of C, only 2 vector registers are needed, while
    4 registers are required for expanded-form.
    (v == sizeof(vector) / sizeof(complex))
For AArch64 zgemm, the 4x4 kernel would need 32 128-bit NEON registers to
store the elements of C with expanded-form calculation, which would cause
register spilling. The contracted-form operation is therefore selected for
the 4x4 kernel. For all other combinations of unroll parameters
(2x4, 4x2, 2x2, and so on), expanded-form mode is used to bring more
NEON registers into play and hide the latency of the multiply-add instructions.
******************************************************************************/
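/* Illustrative only: a minimal scalar sketch (not used by the kernels below;
   all names here are invented) of the expanded-form bookkeeping described
   above, for a single element of C in the NN case with alpha == 1. */
static inline void expanded_form_scalar_sketch(const double *a, const double *b,
                                               double *c, BLASLONG K) {
  double rr = 0, ir = 0, ri = 0, ii = 0;      /* VECTOR I = {rr, ir}, VECTOR II = {ri, ii} */
  for (BLASLONG k = 0; k < K; k++) {
    double ar = a[2 * k], ai = a[2 * k + 1];  /* a[k].real, a[k].imag */
    double br = b[2 * k], bi = b[2 * k + 1];  /* b[k].real, b[k].imag */
    rr += ar * br;  ir += ai * br;            /* accumulate with b.real broadcast */
    ri += ar * bi;  ii += ai * bi;            /* accumulate with b.imag broadcast */
  }
  /* permute VECTOR II, multiply by {-1, 1}, then add to VECTOR I */
  c[0] += rr - ii;
  c[1] += ir + ri;
}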
static inline float64x2_t set_f64x2(double lo, double hi) {
float64x2_t ret = vdupq_n_f64(0);
ret = vsetq_lane_f64(lo, ret, 0);
ret = vsetq_lane_f64(hi, ret, 1);
return ret;
}
static inline float64x2x2_t expand_alpha(double alpha_r, double alpha_i) {
float64x2x2_t ret = {{ set_f64x2(alpha_r, alpha_i), set_f64x2(-alpha_i, alpha_r) }};
return ret;
}
/*****************************************************************
* operation: *c += alpha * c_value //complex multiplication
* expanded_alpha: { { alpha_r, alpha_i }, { -alpha_i, alpha_r } }
* expanded_c: {{ arbr, aibr }, { arbi, aibi }}
****************************************************************/
static inline void store_1c(double *c, float64x2x2_t expanded_c,
float64x2x2_t expanded_alpha) {
float64x2_t ld = vld1q_f64(c);
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
double real = vgetq_lane_f64(expanded_c.val[0], 0) - vgetq_lane_f64(expanded_c.val[1], 1);
double imag = vgetq_lane_f64(expanded_c.val[0], 1) + vgetq_lane_f64(expanded_c.val[1], 0);
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
double real = vgetq_lane_f64(expanded_c.val[0], 0) + vgetq_lane_f64(expanded_c.val[1], 1);
double imag = vgetq_lane_f64(expanded_c.val[0], 1) - vgetq_lane_f64(expanded_c.val[1], 0);
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
double real = vgetq_lane_f64(expanded_c.val[0], 0) + vgetq_lane_f64(expanded_c.val[1], 1);
double imag = -vgetq_lane_f64(expanded_c.val[0], 1) + vgetq_lane_f64(expanded_c.val[1], 0);
#else
double real = vgetq_lane_f64(expanded_c.val[0], 0) - vgetq_lane_f64(expanded_c.val[1], 1);
double imag = -vgetq_lane_f64(expanded_c.val[0], 1) - vgetq_lane_f64(expanded_c.val[1], 0);
#endif
ld = vfmaq_n_f64(ld, expanded_alpha.val[0], real);
vst1q_f64(c, vfmaq_n_f64(ld, expanded_alpha.val[1], imag));
}
static inline void pref_c_4(const double *c) {
__asm__ __volatile__("prfm pstl1keep,[%0]; prfm pstl1keep,[%0,#56]\n\t"::"r"(c):);
}
static inline float64x2x2_t add_ec(float64x2x2_t ec1, float64x2x2_t ec2) {
float64x2x2_t ret = {{ vaddq_f64(ec1.val[0], ec2.val[0]),
vaddq_f64(ec1.val[1], ec2.val[1]) }};
return ret;
}
static inline float64x2x2_t update_ec(float64x2x2_t ec, float64x2_t a, float64x2_t b) {
float64x2x2_t ret = {{ vfmaq_laneq_f64(ec.val[0], a, b, 0), vfmaq_laneq_f64(ec.val[1], a, b, 1) }};
return ret;
}
static inline float64x2x2_t init() {
float64x2x2_t ret = {{ vdupq_n_f64(0), vdupq_n_f64(0) }};
return ret;
}
static inline void kernel_1x1(const double *sa, const double *sb, double *C,
BLASLONG K, double alphar, double alphai) {
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
float64x2x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init();
for (; K > 3; K -= 4) {
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2),
a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8;
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2),
b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8;
c1 = update_ec(c1, a1, b1);
c2 = update_ec(c2, a2, b2);
c3 = update_ec(c3, a3, b3);
c4 = update_ec(c4, a4, b4);
}
c1 = add_ec(c1, c2);
c3 = add_ec(c3, c4);
c1 = add_ec(c1, c3);
for (; K; K--) {
c1 = update_ec(c1, vld1q_f64(sa), vld1q_f64(sb)); sa += 2; sb += 2;
}
store_1c(C, c1, expanded_alpha);
}
static inline void kernel_2x1(const double *sa, const double *sb, double *C,
BLASLONG K, double alphar, double alphai) {
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
float64x2x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init();
for (; K > 1; K -= 2) {
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2),
a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8;
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4;
c1 = update_ec(c1, a1, b1);
c2 = update_ec(c2, a2, b1);
c3 = update_ec(c3, a3, b2);
c4 = update_ec(c4, a4, b2);
}
c1 = add_ec(c1, c3);
c2 = add_ec(c2, c4);
if (K) {
float64x2_t b1 = vld1q_f64(sb);
c1 = update_ec(c1, vld1q_f64(sa), b1);
c2 = update_ec(c2, vld1q_f64(sa + 2), b1);
}
store_1c(C, c1, expanded_alpha);
store_1c(C + 2, c2, expanded_alpha);
}
static inline void kernel_1x2(const double *sa, const double *sb, double *C,
BLASLONG LDC, BLASLONG K, double alphar, double alphai) {
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
float64x2x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init();
for (; K > 1; K -= 2) {
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4;
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2),
b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8;
c1 = update_ec(c1, a1, b1);
c2 = update_ec(c2, a1, b2);
c3 = update_ec(c3, a2, b3);
c4 = update_ec(c4, a2, b4);
}
c1 = add_ec(c1, c3);
c2 = add_ec(c2, c4);
if (K) {
float64x2_t a1 = vld1q_f64(sa);
c1 = update_ec(c1, a1, vld1q_f64(sb));
c2 = update_ec(c2, a1, vld1q_f64(sb + 2));
}
store_1c(C, c1, expanded_alpha);
store_1c(C + LDC * 2, c2, expanded_alpha);
}
static inline void kernel_2x2(const double *sa, const double *sb, double *C,
BLASLONG LDC, BLASLONG K, double alphar, double alphai) {
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
float64x2x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init();
for (; K; K--) {
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4;
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4;
c1 = update_ec(c1, a1, b1);
c2 = update_ec(c2, a2, b1);
c3 = update_ec(c3, a1, b2);
c4 = update_ec(c4, a2, b2);
}
store_1c(C, c1, expanded_alpha);
store_1c(C + 2, c2, expanded_alpha); C += LDC * 2;
store_1c(C, c3, expanded_alpha);
store_1c(C + 2, c4, expanded_alpha);
}
static inline void kernel_4x1(const double *sa, const double *sb, double *C,
BLASLONG K, double alphar, double alphai) {
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
float64x2x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init();
pref_c_4(C);
for (; K; K--) {
float64x2_t b1 = vld1q_f64(sb); sb += 2;
c1 = update_ec(c1, vld1q_f64(sa), b1);
c2 = update_ec(c2, vld1q_f64(sa + 2), b1);
c3 = update_ec(c3, vld1q_f64(sa + 4), b1);
c4 = update_ec(c4, vld1q_f64(sa + 6), b1);
sa += 8;
}
store_1c(C, c1, expanded_alpha);
store_1c(C + 2, c2, expanded_alpha);
store_1c(C + 4, c3, expanded_alpha);
store_1c(C + 6, c4, expanded_alpha);
}
static inline void kernel_4x2(const double *sa, const double *sb, double *C,
BLASLONG LDC, BLASLONG K, double alphar, double alphai) {
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
float64x2x2_t c1, c2, c3, c4, c5, c6, c7, c8;
c1 = c2 = c3 = c4 = c5 = c6 = c7 = c8 = init();
pref_c_4(C);
pref_c_4(C + LDC * 2);
for (; K; K--) {
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4;
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2),
a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8;
c1 = update_ec(c1, a1, b1);
c2 = update_ec(c2, a2, b1);
c3 = update_ec(c3, a3, b1);
c4 = update_ec(c4, a4, b1);
c5 = update_ec(c5, a1, b2);
c6 = update_ec(c6, a2, b2);
c7 = update_ec(c7, a3, b2);
c8 = update_ec(c8, a4, b2);
}
store_1c(C, c1, expanded_alpha);
store_1c(C + 2, c2, expanded_alpha);
store_1c(C + 4, c3, expanded_alpha);
store_1c(C + 6, c4, expanded_alpha); C += LDC * 2;
store_1c(C, c5, expanded_alpha);
store_1c(C + 2, c6, expanded_alpha);
store_1c(C + 4, c7, expanded_alpha);
store_1c(C + 6, c8, expanded_alpha);
}
static inline void kernel_1x4(const double *sa, const double *sb, double *C,
BLASLONG LDC, BLASLONG K, double alphar, double alphai) {
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
float64x2x2_t c1, c2, c3, c4;
c1 = c2 = c3 = c4 = init();
for (; K; K--) {
float64x2_t a1 = vld1q_f64(sa); sa += 2;
c1 = update_ec(c1, a1, vld1q_f64(sb));
c2 = update_ec(c2, a1, vld1q_f64(sb + 2));
c3 = update_ec(c3, a1, vld1q_f64(sb + 4));
c4 = update_ec(c4, a1, vld1q_f64(sb + 6));
sb += 8;
}
store_1c(C, c1, expanded_alpha); C += LDC * 2;
store_1c(C, c2, expanded_alpha); C += LDC * 2;
store_1c(C, c3, expanded_alpha); C += LDC * 2;
store_1c(C, c4, expanded_alpha);
}
static inline void kernel_2x4(const double *sa, const double *sb, double *C,
BLASLONG LDC, BLASLONG K, double alphar, double alphai) {
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
float64x2x2_t c1, c2, c3, c4, c5, c6, c7, c8;
c1 = c2 = c3 = c4 = c5 = c6 = c7 = c8 = init();
for (; K; K--) {
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4;
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2),
b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8;
c1 = update_ec(c1, a1, b1);
c2 = update_ec(c2, a2, b1);
c3 = update_ec(c3, a1, b2);
c4 = update_ec(c4, a2, b2);
c5 = update_ec(c5, a1, b3);
c6 = update_ec(c6, a2, b3);
c7 = update_ec(c7, a1, b4);
c8 = update_ec(c8, a2, b4);
}
store_1c(C, c1, expanded_alpha);
store_1c(C + 2, c2, expanded_alpha); C += LDC * 2;
store_1c(C, c3, expanded_alpha);
store_1c(C + 2, c4, expanded_alpha); C += LDC * 2;
store_1c(C, c5, expanded_alpha);
store_1c(C + 2, c6, expanded_alpha); C += LDC * 2;
store_1c(C, c7, expanded_alpha);
store_1c(C + 2, c8, expanded_alpha);
}
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define FMLA_RI "fmla "
#define FMLA_IR "fmla "
#define FMLA_II "fmls "
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define FMLA_RI "fmls "
#define FMLA_IR "fmla "
#define FMLA_II "fmla "
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define FMLA_RI "fmla "
#define FMLA_IR "fmls "
#define FMLA_II "fmla "
#else
#define FMLA_RI "fmls "
#define FMLA_IR "fmls "
#define FMLA_II "fmls "
#endif
#define FMLA_RR "fmla "
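/* Naming convention of the macros above: FMLA_xy is the multiply-accumulate
   used for the product of the x part of A with the y part of B (R = real,
   I = imaginary). FMLA_RR and FMLA_II feed the real accumulators, FMLA_RI
   and FMLA_IR feed the imaginary ones; selecting fmla vs fmls per conjugation
   mode applies the required sign to each of the four partial products. */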
static inline void store_4c(double *C, float64x2_t up_r, float64x2_t up_i,
float64x2_t lo_r, float64x2_t lo_i, double alphar, double alphai) {
float64x2x2_t up = vld2q_f64(C), lo = vld2q_f64(C + 4);
up.val[0] = vfmaq_n_f64(up.val[0], up_r, alphar);
up.val[1] = vfmaq_n_f64(up.val[1], up_r, alphai);
lo.val[0] = vfmaq_n_f64(lo.val[0], lo_r, alphar);
lo.val[1] = vfmaq_n_f64(lo.val[1], lo_r, alphai);
up.val[0] = vfmsq_n_f64(up.val[0], up_i, alphai);
up.val[1] = vfmaq_n_f64(up.val[1], up_i, alphar);
lo.val[0] = vfmsq_n_f64(lo.val[0], lo_i, alphai);
lo.val[1] = vfmaq_n_f64(lo.val[1], lo_i, alphar);
vst2q_f64(C, up);
vst2q_f64(C + 4, lo);
}
static inline void kernel_4x4(const double *sa, const double *sb, double *C,
BLASLONG LDC, BLASLONG K, double alphar, double alphai) {
float64x2_t c1r, c1i, c2r, c2i;
float64x2_t c3r, c3i, c4r, c4i;
float64x2_t c5r, c5i, c6r, c6i;
float64x2_t c7r, c7i, c8r, c8i;
const double *pref_ = C;
pref_c_4(pref_); pref_ += LDC * 2;
pref_c_4(pref_); pref_ += LDC * 2;
pref_c_4(pref_); pref_ += LDC * 2;
pref_c_4(pref_);
__asm__ __volatile__(
"cmp %[K],#0\n\t"
"movi %[c1r].16b,#0; movi %[c1i].16b,#0; movi %[c2r].16b,#0; movi %[c2i].16b,#0\n\t"
"movi %[c3r].16b,#0; movi %[c3i].16b,#0; movi %[c4r].16b,#0; movi %[c4i].16b,#0\n\t"
"movi %[c5r].16b,#0; movi %[c5i].16b,#0; movi %[c6r].16b,#0; movi %[c6i].16b,#0\n\t"
"movi %[c7r].16b,#0; movi %[c7i].16b,#0; movi %[c8r].16b,#0; movi %[c8i].16b,#0\n\t"
"beq 4f; cmp %[K],#2\n\t"
"ld2 {v0.2d,v1.2d},[%[sa]],#32; ldp q4,q5,[%[sb]],#32\n\t"
"ld2 {v2.2d,v3.2d},[%[sa]],#32; ldr q6,[%[sb]]; ldr d7,[%[sb],#16]\n\t"
"ldr x0,[%[sb],#24]; add %[sb],%[sb],#32\n\t"
"beq 2f; blt 3f\n\t"
"1:\n\t"
"fmov v7.d[1],x0; ldr d8,[%[sa]]\n\t"
FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]; ldr x0,[%[sa],#16]\n\t"
FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t"
FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t"
"fmov v8.d[1],x0; ldr d9,[%[sa],#8]\n\t"
FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]; ldr x0,[%[sa],#24]\n\t"
FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t"
FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t"
"fmov v9.d[1],x0; ldr d10,[%[sa],#32]\n\t"
FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]; ldr x0,[%[sa],#48]\n\t"
FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t"
FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t"
"fmov v10.d[1],x0; ldr d11,[%[sa],#40]\n\t"
FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#56]\n\t"
FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t"
FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t"
"fmov v11.d[1],x0; ldr d12,[%[sb]]\n\t"
FMLA_II "%[c3r].2d,v1.2d,v5.d[1]; ldr x0,[%[sb],#8]\n\t"
FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t"
FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t"
"fmov v12.d[1],x0; ldr d13,[%[sb],#16]\n\t"
FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]; ldr x0,[%[sb],#24]\n\t"
FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t"
FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t"
"fmov v13.d[1],x0; ldr d14,[%[sb],#32]\n\t"
FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]; ldr x0,[%[sb],#40]\n\t"
FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t"
FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t"
"fmov v14.d[1],x0; ldr d15,[%[sb],#48]\n\t"
FMLA_II "%[c6r].2d,v3.2d,v6.d[1]; ldr x0,[%[sb],#56]\n\t"
FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t"
FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t"
"fmov v15.d[1],x0; ldr d4,[%[sb],#64]\n\t"
FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]; ldr x0,[%[sb],#72]\n\t"
FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t"
FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t"
"fmov v4.d[1],x0; ldr d5,[%[sb],#80]\n\t"
FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]; ldr x0,[%[sb],#88]\n\t"
FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t"
FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t"
"fmov v5.d[1],x0; ldr d0,[%[sa],#64]\n\t"
FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]; ldr x0,[%[sa],#80]\n\t"
FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]\n\t"
FMLA_RR "%[c1r].2d,v8.2d,v12.d[0]\n\t"
"fmov v0.d[1],x0; ldr d1,[%[sa],#72]\n\t"
FMLA_RR "%[c2r].2d,v10.2d,v12.d[0]; ldr x0,[%[sa],#88]\n\t"
FMLA_RI "%[c1i].2d,v8.2d,v12.d[1]\n\t"
FMLA_RI "%[c2i].2d,v10.2d,v12.d[1]\n\t"
"fmov v1.d[1],x0; ldr d2,[%[sa],#96]\n\t"
FMLA_II "%[c1r].2d,v9.2d,v12.d[1]; ldr x0,[%[sa],#112]\n\t"
FMLA_II "%[c2r].2d,v11.2d,v12.d[1]\n\t"
FMLA_IR "%[c1i].2d,v9.2d,v12.d[0]\n\t"
"fmov v2.d[1],x0; ldr d3,[%[sa],#104]\n\t"
FMLA_IR "%[c2i].2d,v11.2d,v12.d[0]; ldr x0,[%[sa],#120]\n\t"
FMLA_RR "%[c3r].2d,v8.2d,v13.d[0]\n\t"
FMLA_RR "%[c4r].2d,v10.2d,v13.d[0]\n\t"
"fmov v3.d[1],x0; ldr d6,[%[sb],#96]\n\t"
FMLA_RI "%[c3i].2d,v8.2d,v13.d[1]; ldr x0,[%[sb],#104]\n\t"
FMLA_RI "%[c4i].2d,v10.2d,v13.d[1]\n\t"
FMLA_II "%[c3r].2d,v9.2d,v13.d[1]\n\t"
"fmov v6.d[1],x0; ldr d7,[%[sb],#112]\n\t"
FMLA_II "%[c4r].2d,v11.2d,v13.d[1]; ldr x0,[%[sb],#120]\n\t"
FMLA_IR "%[c3i].2d,v9.2d,v13.d[0]\n\t"
FMLA_IR "%[c4i].2d,v11.2d,v13.d[0]; prfm pldl1keep,[%[sa],#256]\n\t"
FMLA_RR "%[c5r].2d,v8.2d,v14.d[0]\n\t"
FMLA_RR "%[c6r].2d,v10.2d,v14.d[0]; prfm pldl1keep,[%[sa],#320]\n\t"
FMLA_RI "%[c5i].2d,v8.2d,v14.d[1]\n\t"
FMLA_RI "%[c6i].2d,v10.2d,v14.d[1]; prfm pldl1keep,[%[sb],#256]\n\t"
FMLA_II "%[c5r].2d,v9.2d,v14.d[1]\n\t"
FMLA_II "%[c6r].2d,v11.2d,v14.d[1]; prfm pldl1keep,[%[sb],#320]\n\t"
FMLA_IR "%[c5i].2d,v9.2d,v14.d[0]\n\t"
FMLA_IR "%[c6i].2d,v11.2d,v14.d[0]; add %[sa],%[sa],#128\n\t"
FMLA_RR "%[c7r].2d,v8.2d,v15.d[0]\n\t"
FMLA_RR "%[c8r].2d,v10.2d,v15.d[0]; add %[sb],%[sb],#128\n\t"
FMLA_RI "%[c7i].2d,v8.2d,v15.d[1]\n\t"
FMLA_RI "%[c8i].2d,v10.2d,v15.d[1]; sub %[K],%[K],#2\n\t"
FMLA_II "%[c7r].2d,v9.2d,v15.d[1]\n\t"
FMLA_II "%[c8r].2d,v11.2d,v15.d[1]; cmp %[K],#2\n\t"
FMLA_IR "%[c7i].2d,v9.2d,v15.d[0]\n\t"
FMLA_IR "%[c8i].2d,v11.2d,v15.d[0]; bgt 1b; blt 3f\n\t"
"2:\n\t"
"fmov v7.d[1],x0; ldr d8,[%[sa]]\n\t"
FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]; ldr x0,[%[sa],#16]\n\t"
FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t"
FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t"
"fmov v8.d[1],x0; ldr d9,[%[sa],#8]\n\t"
FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]; ldr x0,[%[sa],#24]\n\t"
FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t"
FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t"
"fmov v9.d[1],x0; ldr d10,[%[sa],#32]\n\t"
FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]; ldr x0,[%[sa],#48]\n\t"
FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t"
FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t"
"fmov v10.d[1],x0; ldr d11,[%[sa],#40]\n\t"
FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#56]\n\t"
FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t"
FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t"
"fmov v11.d[1],x0; ldr d12,[%[sb]]\n\t"
FMLA_II "%[c3r].2d,v1.2d,v5.d[1]; ldr x0,[%[sb],#8]\n\t"
FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t"
FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t"
"fmov v12.d[1],x0; ldr d13,[%[sb],#16]\n\t"
FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]; ldr x0,[%[sb],#24]\n\t"
FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t"
FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t"
"fmov v13.d[1],x0; ldr d14,[%[sb],#32]\n\t"
FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]; ldr x0,[%[sb],#40]\n\t"
FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t"
FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t"
"fmov v14.d[1],x0; ldr d15,[%[sb],#48]\n\t"
FMLA_II "%[c6r].2d,v3.2d,v6.d[1]; ldr x0,[%[sb],#56]\n\t"
FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t"
FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t"
"fmov v15.d[1],x0\n\t"
FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]\n\t"
FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t"
FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t"
FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]\n\t"
FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t"
FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t"
FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]\n\t"
FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]\n\t"
FMLA_RR "%[c1r].2d,v8.2d,v12.d[0]\n\t"
FMLA_RR "%[c2r].2d,v10.2d,v12.d[0]\n\t"
FMLA_RI "%[c1i].2d,v8.2d,v12.d[1]\n\t"
FMLA_RI "%[c2i].2d,v10.2d,v12.d[1]\n\t"
FMLA_II "%[c1r].2d,v9.2d,v12.d[1]\n\t"
FMLA_II "%[c2r].2d,v11.2d,v12.d[1]\n\t"
FMLA_IR "%[c1i].2d,v9.2d,v12.d[0]\n\t"
FMLA_IR "%[c2i].2d,v11.2d,v12.d[0]\n\t"
FMLA_RR "%[c3r].2d,v8.2d,v13.d[0]\n\t"
FMLA_RR "%[c4r].2d,v10.2d,v13.d[0]\n\t"
FMLA_RI "%[c3i].2d,v8.2d,v13.d[1]\n\t"
FMLA_RI "%[c4i].2d,v10.2d,v13.d[1]\n\t"
FMLA_II "%[c3r].2d,v9.2d,v13.d[1]\n\t"
FMLA_II "%[c4r].2d,v11.2d,v13.d[1]\n\t"
FMLA_IR "%[c3i].2d,v9.2d,v13.d[0]\n\t"
FMLA_IR "%[c4i].2d,v11.2d,v13.d[0]\n\t"
FMLA_RR "%[c5r].2d,v8.2d,v14.d[0]\n\t"
FMLA_RR "%[c6r].2d,v10.2d,v14.d[0]\n\t"
FMLA_RI "%[c5i].2d,v8.2d,v14.d[1]\n\t"
FMLA_RI "%[c6i].2d,v10.2d,v14.d[1]\n\t"
FMLA_II "%[c5r].2d,v9.2d,v14.d[1]\n\t"
FMLA_II "%[c6r].2d,v11.2d,v14.d[1]\n\t"
FMLA_IR "%[c5i].2d,v9.2d,v14.d[0]\n\t"
FMLA_IR "%[c6i].2d,v11.2d,v14.d[0]; add %[sa],%[sa],#64\n\t"
FMLA_RR "%[c7r].2d,v8.2d,v15.d[0]\n\t"
FMLA_RR "%[c8r].2d,v10.2d,v15.d[0]; add %[sb],%[sb],#64\n\t"
FMLA_RI "%[c7i].2d,v8.2d,v15.d[1]\n\t"
FMLA_RI "%[c8i].2d,v10.2d,v15.d[1]; sub %[K],%[K],#2\n\t"
FMLA_II "%[c7r].2d,v9.2d,v15.d[1]\n\t"
FMLA_II "%[c8r].2d,v11.2d,v15.d[1]\n\t"
FMLA_IR "%[c7i].2d,v9.2d,v15.d[0]\n\t"
FMLA_IR "%[c8i].2d,v11.2d,v15.d[0]; b 4f\n\t"
"3:\n\t"
"fmov v7.d[1],x0\n\t"
FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]\n\t"
FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t"
FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t"
FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]\n\t"
FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t"
FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t"
FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]\n\t"
FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t"
FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t"
FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]\n\t"
FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t"
FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t"
FMLA_II "%[c3r].2d,v1.2d,v5.d[1]\n\t"
FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t"
FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t"
FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]\n\t"
FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t"
FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t"
FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]\n\t"
FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t"
FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t"
FMLA_II "%[c6r].2d,v3.2d,v6.d[1]\n\t"
FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t"
FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t"
FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]\n\t"
FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t"
FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t"
FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]\n\t"
FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t"
FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t"
FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]\n\t"
FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]; sub %[K],%[K],#1\n\t"
"4:\n\t"
:[c1r]"=w"(c1r), [c1i]"=w"(c1i), [c2r]"=w"(c2r), [c2i]"=w"(c2i),
[c3r]"=w"(c3r), [c3i]"=w"(c3i), [c4r]"=w"(c4r), [c4i]"=w"(c4i),
[c5r]"=w"(c5r), [c5i]"=w"(c5i), [c6r]"=w"(c6r), [c6i]"=w"(c6i),
[c7r]"=w"(c7r), [c7i]"=w"(c7i), [c8r]"=w"(c8r), [c8i]"=w"(c8i),
[K]"+r"(K), [sa]"+r"(sa), [sb]"+r"(sb)
::"cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
store_4c(C, c1r, c1i, c2r, c2i, alphar, alphai); C += LDC * 2;
store_4c(C, c3r, c3i, c4r, c4i, alphar, alphai); C += LDC * 2;
store_4c(C, c5r, c5i, c6r, c6i, alphar, alphai); C += LDC * 2;
store_4c(C, c7r, c7i, c8r, c8i, alphar, alphai);
}
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai,
FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) {
BLASLONG n_left = N;
for (; n_left >= 4; n_left -= 4) {
const FLOAT *a_ = sa;
FLOAT *c_ = C;
BLASLONG m_left = M;
for (; m_left >= 4; m_left -= 4) {
kernel_4x4(a_, sb, c_, LDC, K, alphar, alphai);
a_ += 8 * K;
c_ += 8;
}
if (m_left >= 2) {
m_left -= 2;
kernel_2x4(a_, sb, c_, LDC, K, alphar, alphai);
a_ += 4 * K;
c_ += 4;
}
if (m_left) {
kernel_1x4(a_, sb, c_, LDC, K, alphar, alphai);
}
sb += 8 * K;
C += 8 * LDC;
}
if (n_left >= 2) {
n_left -= 2;
const FLOAT *a_ = sa;
FLOAT *c_ = C;
BLASLONG m_left = M;
for (; m_left >= 4; m_left -= 4) {
kernel_4x2(a_, sb, c_, LDC, K, alphar, alphai);
a_ += 8 * K;
c_ += 8;
}
if (m_left >= 2) {
m_left -= 2;
kernel_2x2(a_, sb, c_, LDC, K, alphar, alphai);
a_ += 4 * K;
c_ += 4;
}
if (m_left) {
kernel_1x2(a_, sb, c_, LDC, K, alphar, alphai);
}
sb += 4 * K;
C += 4 * LDC;
}
if (n_left) {
const FLOAT *a_ = sa;
FLOAT *c_ = C;
BLASLONG m_left = M;
for (; m_left >= 4; m_left -= 4) {
kernel_4x1(a_, sb, c_, K, alphar, alphai);
a_ += 8 * K;
c_ += 8;
}
if (m_left >= 2) {
m_left -= 2;
kernel_2x1(a_, sb, c_, K, alphar, alphai);
a_ += 4 * K;
c_ += 4;
}
if (m_left) {
kernel_1x1(a_, sb, c_, K, alphar, alphai);
}
}
return 0;
}
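The driver above dispatches edge cases by tiling N into column blocks of 4/2/1 and M into row blocks of 4/2/1. A minimal C sketch of that dispatch pattern (illustrative only; the kernel call is a placeholder, not a function from this file):

/* Hypothetical outline of the m/n tiling used by CNAME above. */
static void tile_dispatch(long M, long N)
{
    for (long n = N; n > 0; ) {
        long nb = (n >= 4) ? 4 : (n >= 2) ? 2 : 1;      /* column block width */
        for (long m = M; m > 0; ) {
            long mb = (m >= 4) ? 4 : (m >= 2) ? 2 : 1;  /* row block height   */
            /* kernel_<mb>x<nb>() would process one mb-by-nb tile here */
            m -= mb;
        }
        n -= nb;
    }
}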

160
kernel/mips/KERNEL.generic Normal file
View File

@ -0,0 +1,160 @@
SGEMM_BETA = ../generic/gemm_beta.c
DGEMM_BETA = ../generic/gemm_beta.c
CGEMM_BETA = ../generic/zgemm_beta.c
ZGEMM_BETA = ../generic/zgemm_beta.c
STRMMKERNEL = ../generic/trmmkernel_2x2.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
#Pure C for other kernels
SAMAXKERNEL = ../mips/amax.c
DAMAXKERNEL = ../mips/amax.c
CAMAXKERNEL = ../mips/zamax.c
ZAMAXKERNEL = ../mips/zamax.c
SAMINKERNEL = ../mips/amin.c
DAMINKERNEL = ../mips/amin.c
CAMINKERNEL = ../mips/zamin.c
ZAMINKERNEL = ../mips/zamin.c
SMAXKERNEL = ../mips/max.c
DMAXKERNEL = ../mips/max.c
SMINKERNEL = ../mips/min.c
DMINKERNEL = ../mips/min.c
ISAMAXKERNEL = ../mips/iamax.c
IDAMAXKERNEL = ../mips/iamax.c
ICAMAXKERNEL = ../mips/izamax.c
IZAMAXKERNEL = ../mips/izamax.c
ISAMINKERNEL = ../mips/iamin.c
IDAMINKERNEL = ../mips/iamin.c
ICAMINKERNEL = ../mips/izamin.c
IZAMINKERNEL = ../mips/izamin.c
ISMAXKERNEL = ../mips/imax.c
IDMAXKERNEL = ../mips/imax.c
ISMINKERNEL = ../mips/imin.c
IDMINKERNEL = ../mips/imin.c
SASUMKERNEL = ../mips/asum.c
DASUMKERNEL = ../mips/asum.c
CASUMKERNEL = ../mips/zasum.c
ZASUMKERNEL = ../mips/zasum.c
SSUMKERNEL = ../mips/sum.c
DSUMKERNEL = ../mips/sum.c
CSUMKERNEL = ../mips/zsum.c
ZSUMKERNEL = ../mips/zsum.c
SAXPYKERNEL = ../mips/axpy.c
DAXPYKERNEL = ../mips/axpy.c
CAXPYKERNEL = ../mips/zaxpy.c
ZAXPYKERNEL = ../mips/zaxpy.c
SCOPYKERNEL = ../mips/copy.c
DCOPYKERNEL = ../mips/copy.c
CCOPYKERNEL = ../mips/zcopy.c
ZCOPYKERNEL = ../mips/zcopy.c
SDOTKERNEL = ../mips/dot.c
DDOTKERNEL = ../mips/dot.c
CDOTKERNEL = ../mips/zdot.c
ZDOTKERNEL = ../mips/zdot.c
SNRM2KERNEL = ../mips/nrm2.c
DNRM2KERNEL = ../mips/nrm2.c
CNRM2KERNEL = ../mips/znrm2.c
ZNRM2KERNEL = ../mips/znrm2.c
SROTKERNEL = ../mips/rot.c
DROTKERNEL = ../mips/rot.c
CROTKERNEL = ../mips/zrot.c
ZROTKERNEL = ../mips/zrot.c
SSCALKERNEL = ../mips/scal.c
DSCALKERNEL = ../mips/scal.c
CSCALKERNEL = ../mips/zscal.c
ZSCALKERNEL = ../mips/zscal.c
SSWAPKERNEL = ../mips/swap.c
DSWAPKERNEL = ../mips/swap.c
CSWAPKERNEL = ../mips/zswap.c
ZSWAPKERNEL = ../mips/zswap.c
SGEMVNKERNEL = ../mips/gemv_n.c
DGEMVNKERNEL = ../mips/gemv_n.c
CGEMVNKERNEL = ../mips/zgemv_n.c
ZGEMVNKERNEL = ../mips/zgemv_n.c
SGEMVTKERNEL = ../mips/gemv_t.c
DGEMVTKERNEL = ../mips/gemv_t.c
CGEMVTKERNEL = ../mips/zgemv_t.c
ZGEMVTKERNEL = ../mips/zgemv_t.c
SSYMV_U_KERNEL = ../generic/symv_k.c
SSYMV_L_KERNEL = ../generic/symv_k.c
DSYMV_U_KERNEL = ../generic/symv_k.c
DSYMV_L_KERNEL = ../generic/symv_k.c
QSYMV_U_KERNEL = ../generic/symv_k.c
QSYMV_L_KERNEL = ../generic/symv_k.c
CSYMV_U_KERNEL = ../generic/zsymv_k.c
CSYMV_L_KERNEL = ../generic/zsymv_k.c
ZSYMV_U_KERNEL = ../generic/zsymv_k.c
ZSYMV_L_KERNEL = ../generic/zsymv_k.c
XSYMV_U_KERNEL = ../generic/zsymv_k.c
XSYMV_L_KERNEL = ../generic/zsymv_k.c
ZHEMV_U_KERNEL = ../generic/zhemv_k.c
ZHEMV_L_KERNEL = ../generic/zhemv_k.c
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c

View File

@ -1,7 +1,6 @@
ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) ifeq ($(HAVE_GAS), 1)
include $(KERNELDIR)/KERNEL.POWER8 include $(KERNELDIR)/KERNEL.POWER8
else else
#SGEMM_BETA = ../generic/gemm_beta.c #SGEMM_BETA = ../generic/gemm_beta.c
#DGEMM_BETA = ../generic/gemm_beta.c #DGEMM_BETA = ../generic/gemm_beta.c
#CGEMM_BETA = ../generic/zgemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c
@ -33,6 +32,16 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
SGEMM_SMALL_M_PERMIT = gemm_small_kernel_permit_power10.c
SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_power10.c
SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_nn_power10.c
SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_power10.c
SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_nt_power10.c
SGEMM_SMALL_K_TN = sgemm_small_kernel_tn_power10.c
SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_tn_power10.c
SGEMM_SMALL_K_TT = sgemm_small_kernel_tt_power10.c
SGEMM_SMALL_K_B0_TT = sgemm_small_kernel_tt_power10.c
DGEMMKERNEL = dgemm_kernel_power10.c DGEMMKERNEL = dgemm_kernel_power10.c
DGEMMINCOPY = DGEMMINCOPY =
DGEMMITCOPY = DGEMMITCOPY =
@ -43,7 +52,18 @@ DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMM_SMALL_M_PERMIT = gemm_small_kernel_permit_power10.c
DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_power10.c
DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_nt_power10.c
DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_power10.c
DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_nn_power10.c
DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_power10.c
DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c
DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c
DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c
CGEMMKERNEL = cgemm_kernel_power10.S CGEMMKERNEL = cgemm_kernel_power10.S
#CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
@ -218,5 +238,4 @@ QCABS_KERNEL = ../generic/cabs.c
#Dump kernel #Dump kernel
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
endif endif

View File

@ -36,9 +36,12 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
#endif #endif
const float *mvecp = mvec; const float *mvecp = mvec;
/* We have to load reverse mask for big endian. */ /* We have to load reverse mask for big endian. */
/* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */ #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
__vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
#else
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
#endif
long ytmp; long ytmp;
__asm__ __asm__
@ -112,6 +115,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
"xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 38, 58, 33 \n\t"
"xvmaddasp 39, 59, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 48, 0(%4) \n\t"
"stxv 49, 16(%4) \n\t"
"stxv 50, 32(%4) \n\t"
"stxv 51, 48(%4) \n\t"
"stxv 34, 64(%4) \n\t"
"stxv 35, 80(%4) \n\t"
"stxv 38, 96(%4) \n\t"
"stxv 39, 112(%4) \n\t"
#else
"stxv 49, 0(%4) \n\t" "stxv 49, 0(%4) \n\t"
"stxv 48, 16(%4) \n\t" "stxv 48, 16(%4) \n\t"
"stxv 51, 32(%4) \n\t" "stxv 51, 32(%4) \n\t"
@ -120,6 +133,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
"stxv 34, 80(%4) \n\t" "stxv 34, 80(%4) \n\t"
"stxv 39, 96(%4) \n\t" "stxv 39, 96(%4) \n\t"
"stxv 38, 112(%4) \n\t" "stxv 38, 112(%4) \n\t"
#endif
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"
"xxperm 52, 40, %x10 \n\t" // exchange real and imag part "xxperm 52, 40, %x10 \n\t" // exchange real and imag part
@ -163,6 +177,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
"xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 38, 58, 33 \n\t"
"xvmaddasp 39, 59, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 48, 0(%4) \n\t"
"stxv 49, 16(%4) \n\t"
"stxv 50, 32(%4) \n\t"
"stxv 51, 48(%4) \n\t"
"stxv 34, 64(%4) \n\t"
"stxv 35, 80(%4) \n\t"
"stxv 38, 96(%4) \n\t"
"stxv 39, 112(%4) \n\t"
#else
"stxv 49, 0(%4) \n\t" "stxv 49, 0(%4) \n\t"
"stxv 48, 16(%4) \n\t" "stxv 48, 16(%4) \n\t"
"stxv 51, 32(%4) \n\t" "stxv 51, 32(%4) \n\t"
@ -171,6 +195,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
"stxv 34, 80(%4) \n\t" "stxv 34, 80(%4) \n\t"
"stxv 39, 96(%4) \n\t" "stxv 39, 96(%4) \n\t"
"stxv 38, 112(%4) \n\t" "stxv 38, 112(%4) \n\t"
#endif
"#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n" "#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n"
: :
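The endian-dependent mask above feeds xxperm to exchange the real and imaginary float of every complex element before the alpha_i multiply. A plain-C stand-in for that byte permute, reusing the big-endian mask value shown above (sketch only, not VSX code):

#include <string.h>
/* Swap the two 4-byte halves of each 8-byte complex float in a 16-byte vector. */
static void swap_ri_bytes(unsigned char v[16])
{
    static const unsigned char perm[16] =
        { 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
    unsigned char t[16];
    for (int i = 0; i < 16; i++) t[i] = v[perm[i]];
    memcpy(v, t, 16);
}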

View File

@ -46,7 +46,16 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
".align 5 \n" ".align 5 \n"
"one%=: \n\t" "one%=: \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 32, 0(%3) \n\t"
"stxv 33, 16(%3) \n\t"
"stxv 34, 32(%3) \n\t"
"stxv 35, 48(%3) \n\t"
"stxv 36, 64(%3) \n\t"
"stxv 37, 80(%3) \n\t"
"stxv 38, 96(%3) \n\t"
"stxv 39, 112(%3) \n\t"
#else
"stxv 33, 0(%3) \n\t" "stxv 33, 0(%3) \n\t"
"stxv 32, 16(%3) \n\t" "stxv 32, 16(%3) \n\t"
"stxv 35, 32(%3) \n\t" "stxv 35, 32(%3) \n\t"
@ -55,11 +64,21 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
"stxv 36, 80(%3) \n\t" "stxv 36, 80(%3) \n\t"
"stxv 39, 96(%3) \n\t" "stxv 39, 96(%3) \n\t"
"stxv 38, 112(%3) \n\t" "stxv 38, 112(%3) \n\t"
#endif
"lxvp 32, 0(%2) \n\t" "lxvp 32, 0(%2) \n\t"
"lxvp 34, 32(%2) \n\t" "lxvp 34, 32(%2) \n\t"
"lxvp 36, 64(%2) \n\t" "lxvp 36, 64(%2) \n\t"
"lxvp 38, 96(%2) \n\t" "lxvp 38, 96(%2) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 40, 128(%3) \n\t"
"stxv 41, 144(%3) \n\t"
"stxv 42, 160(%3) \n\t"
"stxv 43, 176(%3) \n\t"
"stxv 44, 192(%3) \n\t"
"stxv 45, 208(%3) \n\t"
"stxv 46, 224(%3) \n\t"
"stxv 47, 240(%3) \n\t"
#else
"stxv 41, 128(%3) \n\t" "stxv 41, 128(%3) \n\t"
"stxv 40, 144(%3) \n\t" "stxv 40, 144(%3) \n\t"
"stxv 43, 160(%3) \n\t" "stxv 43, 160(%3) \n\t"
@ -68,6 +87,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
"stxv 44, 208(%3) \n\t" "stxv 44, 208(%3) \n\t"
"stxv 47, 224(%3) \n\t" "stxv 47, 224(%3) \n\t"
"stxv 46, 240(%3) \n\t" "stxv 46, 240(%3) \n\t"
#endif
"lxvp 40, 128(%2) \n\t" "lxvp 40, 128(%2) \n\t"
"lxvp 42, 160(%2) \n\t" "lxvp 42, 160(%2) \n\t"
"lxvp 44, 192(%2) \n\t" "lxvp 44, 192(%2) \n\t"
@ -81,7 +101,24 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
"bgt one%= \n" "bgt one%= \n"
"two%=: \n\t" "two%=: \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 32, 0(%3) \n\t"
"stxv 33, 16(%3) \n\t"
"stxv 34, 32(%3) \n\t"
"stxv 35, 48(%3) \n\t"
"stxv 36, 64(%3) \n\t"
"stxv 37, 80(%3) \n\t"
"stxv 38, 96(%3) \n\t"
"stxv 39, 112(%3) \n\t"
"stxv 40, 128(%3) \n\t"
"stxv 41, 144(%3) \n\t"
"stxv 42, 160(%3) \n\t"
"stxv 43, 176(%3) \n\t"
"stxv 44, 192(%3) \n\t"
"stxv 45, 208(%3) \n\t"
"stxv 46, 224(%3) \n\t"
"stxv 47, 240(%3) \n\t"
#else
"stxv 33, 0(%3) \n\t" "stxv 33, 0(%3) \n\t"
"stxv 32, 16(%3) \n\t" "stxv 32, 16(%3) \n\t"
"stxv 35, 32(%3) \n\t" "stxv 35, 32(%3) \n\t"
@ -98,7 +135,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
"stxv 44, 208(%3) \n\t" "stxv 44, 208(%3) \n\t"
"stxv 47, 224(%3) \n\t" "stxv 47, 224(%3) \n\t"
"stxv 46, 240(%3) \n\t" "stxv 46, 240(%3) \n\t"
#endif
"#n=%1 x=%4=%2 y=%0=%3" "#n=%1 x=%4=%2 y=%0=%3"
: :
"=m" (*y), "=m" (*y),

View File

@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else #else
#include "common.h" #include "common.h"
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #if defined(POWER10)
#include "cdot_microk_power10.c" #include "cdot_microk_power10.c"
#else #else
#ifndef HAVE_KERNEL_8 #ifndef HAVE_KERNEL_8
@ -120,7 +120,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
if ((inc_x == 1) && (inc_y == 1)) { if ((inc_x == 1) && (inc_y == 1)) {
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #if defined(POWER10)
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
#else #else
BLASLONG n1 = n & -8; BLASLONG n1 = n & -8;

View File

@ -29,7 +29,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static void cdot_kernel_8 (long n, float *x, float *y, float *dot) static void cdot_kernel_8 (long n, float *x, float *y, float *dot)
{ {
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
__vector unsigned char mask = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
#else
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
#endif
__asm__ __asm__
( (
"dcbt 0, %2 \n\t" "dcbt 0, %2 \n\t"
@ -153,7 +157,11 @@ static void cdot_kernel_8 (long n, float *x, float *y, float *dot)
"xxswapd 33, 34 \n\t" "xxswapd 33, 34 \n\t"
"xvaddsp 35, 35, 32 \n\t" "xvaddsp 35, 35, 32 \n\t"
"xvaddsp 34, 34, 33 \n\t" "xvaddsp 34, 34, 33 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xxpermdi 34, 35, 34, 0 \n\t"
#else
"xxpermdi 34, 34, 35, 2 \n\t" "xxpermdi 34, 34, 35, 2 \n\t"
#endif
"stxv 34, 0(%6) \n\t" "stxv 34, 0(%6) \n\t"
"#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6" "#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6"

View File

@ -76,11 +76,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cgemm_macros_power10.S" #include "cgemm_macros_power10.S"
#if (_AIX)
.set perm_const1, 0x0405060700010203
.set perm_const2, 0x0c0d0e0f08090a0b
.set save_permute_12, 0x1011121300010203
.set save_permute_11, 0x18191a1b08090a0b
#else
.equ perm_const1, 0x0405060700010203 .equ perm_const1, 0x0405060700010203
.equ perm_const2, 0x0c0d0e0f08090a0b .equ perm_const2, 0x0c0d0e0f08090a0b
.equ save_permute_12, 0x0c0d0e0f1c1d1e1f .equ save_permute_12, 0x0c0d0e0f1c1d1e1f
.equ save_permute_11, 0x0405060714151617 .equ save_permute_11, 0x0405060714151617
#endif
#ifndef NEEDPARAM #ifndef NEEDPARAM
@ -172,24 +178,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*load reverse permute mask for big endian /*load reverse permute mask for big endian
uint128 = 0xc0d0e0f08090a0b0405060700010203 uint128 = 0xc0d0e0f08090a0b0405060700010203
*/ */
#if (_AIX)
lis T2, (perm_const2>>48 & 0xFFFF)
lis T1, (perm_const1>>48 & 0xFFFF)
lis T3, (save_permute_12>>48 & 0xFFFF)
lis T4, (save_permute_11>>48 & 0xFFFF)
ori T2, T2, (perm_const2>>32 & 0xFFFF)
ori T1, T1, (perm_const1>>32 & 0xFFFF)
ori T3, T3, (save_permute_12>>32 & 0xFFFF)
ori T4, T4, (save_permute_11>>32 & 0xFFFF)
#else
lis T2, perm_const2@highest lis T2, perm_const2@highest
lis T1, perm_const1@highest lis T1, perm_const1@highest
lis T3, save_permute_12@highest lis T3, save_permute_12@highest
lis T4, save_permute_11@highest lis T4, save_permute_11@highest
ori T2, T2, perm_const2@higher ori T2, T2, perm_const2@higher
ori T1, T1, perm_const1@higher ori T1, T1, perm_const1@higher
ori T3, T3, save_permute_12@higher ori T3, T3, save_permute_12@higher
ori T4, T4, save_permute_11@higher ori T4, T4, save_permute_11@higher
#endif
rldicr T2, T2, 32, 31 rldicr T2, T2, 32, 31
rldicr T1, T1, 32, 31 rldicr T1, T1, 32, 31
rldicr T3, T3, 32, 31 rldicr T3, T3, 32, 31
rldicr T4, T4, 32, 31 rldicr T4, T4, 32, 31
#if (_AIX)
oris T2, T2, (perm_const2>>16 & 0xFFFF)
oris T1, T1, (perm_const1>>16 & 0xFFFF)
oris T3, T3, (save_permute_12>>16 & 0xFFFF)
oris T4, T4, (save_permute_11>>16 & 0xFFFF)
ori T2, T2, (perm_const2 & 0xFFFF)
ori T1, T1, (perm_const1 & 0xFFFF)
ori T3, T3, (save_permute_12 & 0xFFFF)
ori T4, T4, (save_permute_11 & 0xFFFF)
#else
oris T2, T2, perm_const2@h oris T2, T2, perm_const2@h
oris T1, T1, perm_const1@h oris T1, T1, perm_const1@h
oris T3, T3, save_permute_12@h oris T3, T3, save_permute_12@h
@ -200,7 +226,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ori T1, T1, perm_const1@l ori T1, T1, perm_const1@l
ori T3, T3, save_permute_12@l ori T3, T3, save_permute_12@l
ori T4, T4, save_permute_11@l ori T4, T4, save_permute_11@l
#endif
li r0,0 li r0,0
li PRE,512 li PRE,512

View File

@ -218,6 +218,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.if \OffsetA != 0 .if \OffsetA != 0
addi \AREG, \AREG, \OffsetA addi \AREG, \AREG, \OffsetA
.endif .endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 3, 36, 34
xvf32gerpp 2, 37, 34
xvf32gerpp 1, 32, 34
xvf32gerpp 0, 33, 34
xvf32gerpp 7, 36, 35
xvf32gerpp 6, 37, 35
xvf32gerpp 5, 32, 35
xvf32gerpp 4, 33, 35
#else
xvf32gerpp 3, 36, 35 xvf32gerpp 3, 36, 35
xvf32gerpp 2, 37, 35 xvf32gerpp 2, 37, 35
xvf32gerpp 1, 32, 35 xvf32gerpp 1, 32, 35
@ -226,6 +236,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf32gerpp 6, 37, 34 xvf32gerpp 6, 37, 34
xvf32gerpp 5, 32, 34 xvf32gerpp 5, 32, 34
xvf32gerpp 4, 33, 34 xvf32gerpp 4, 33, 34
#endif
.endm .endm
.macro LOAD4x8_2 .macro LOAD4x8_2
@ -255,6 +266,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete .macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
xvf32gerpp 3, 36, 34
xvf32gerpp 2, 37, 34
xvf32gerpp 1, 32, 34
xvf32gerpp 0, 33, 34
xvf32gerpp 7, 36, 35
xvf32gerpp 6, 37, 35
xvf32gerpp 5, 32, 35
xvf32gerpp 4, 33, 35
#else
xvf32gerpp 3, 36, 35 xvf32gerpp 3, 36, 35
xvf32gerpp 2, 37, 35 xvf32gerpp 2, 37, 35
xvf32gerpp 1, 32, 35 xvf32gerpp 1, 32, 35
@ -263,11 +284,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf32gerpp 6, 37, 34 xvf32gerpp 6, 37, 34
xvf32gerpp 5, 32, 34 xvf32gerpp 5, 32, 34
xvf32gerpp 4, 33, 34 xvf32gerpp 4, 33, 34
#endif
.if \Complete==0 .if \Complete==0
lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
.endif .endif
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
xvf32gerpp 3, 42, 38
xvf32gerpp 2, 43, 38
xvf32gerpp 1, 40, 38
xvf32gerpp 0, 41, 38
xvf32gerpp 7, 42, 39
xvf32gerpp 6, 43, 39
xvf32gerpp 5, 40, 39
xvf32gerpp 4, 41, 39
#else
xvf32gerpp 3, 42, 39 xvf32gerpp 3, 42, 39
xvf32gerpp 2, 43, 39 xvf32gerpp 2, 43, 39
xvf32gerpp 1, 40, 39 xvf32gerpp 1, 40, 39
@ -276,6 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf32gerpp 6, 43, 38 xvf32gerpp 6, 43, 38
xvf32gerpp 5, 40, 38 xvf32gerpp 5, 40, 38
xvf32gerpp 4, 41, 38 xvf32gerpp 4, 41, 38
#endif
.if \Complete==0 .if \Complete==0
lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG) lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG)
lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
@ -393,22 +426,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR2 RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL #ifndef TRMMKERNEL
/* add */ /* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 1
xxpermdi vs3, vs2, vs10, 1
xxpermdi vs5, vs4, vs12, 1
xxpermdi vs7, vs6, vs14, 1
xxpermdi vs9, vs8, vs0, 1
xxpermdi vs11, vs10, vs2, 1
#else
xxpermdi vs1, vs8, vs0, 2 xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2 xxpermdi vs3, vs10, vs2, 2
xxpermdi vs5, vs12, vs4, 2 xxpermdi vs5, vs12, vs4, 2
xxpermdi vs7, vs14, vs6, 2 xxpermdi vs7, vs14, vs6, 2
xxpermdi vs9, vs0, vs8, 2 xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2 xxpermdi vs11, vs2, vs10, 2
#endif
xvaddsp vs24, vs24, vs3 xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1 xvaddsp vs25, vs25, vs1
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs13, vs12, vs4, 1
xxpermdi vs15, vs14, vs6, 1
#else
xxpermdi vs13, vs4, vs12, 2 xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2 xxpermdi vs15, vs6, vs14, 2
#endif
xvaddsp vs26, vs26, vs7 xvaddsp vs26, vs26, vs7
xvaddsp vs27, vs27, vs5 xvaddsp vs27, vs27, vs5
xvaddsp vs28, vs28, vs11 xvaddsp vs28, vs28, vs11
xvaddsp vs29, vs29, vs9 xvaddsp vs29, vs29, vs9
xvaddsp vs30, vs30, vs15 xvaddsp vs30, vs30, vs15
xvaddsp vs31, vs31, vs13 xvaddsp vs31, vs31, vs13
#else
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
xxpermdi vs25, vs0, vs8, 1
xxpermdi vs24, vs2, vs10, 1
xxpermdi vs27, vs4, vs12, 1
xxpermdi vs26, vs6, vs14, 1
xxpermdi vs29, vs8, vs0, 1
xxpermdi vs28, vs10, vs2, 1
xxpermdi vs31, vs12, vs4, 1
xxpermdi vs30, vs14, vs6, 1
#else #else
xxpermdi vs25, vs8, vs0, 2 xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2 xxpermdi vs24, vs10, vs2, 2
@ -418,6 +475,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs28, vs2, vs10, 2 xxpermdi vs28, vs2, vs10, 2
xxpermdi vs31, vs4, vs12, 2 xxpermdi vs31, vs4, vs12, 2
xxpermdi vs30, vs6, vs14, 2 xxpermdi vs30, vs6, vs14, 2
#endif
#endif #endif
stxvp vs24, 0(CO) stxvp vs24, 0(CO)
MULT_APLHA_PART1 vs48, vs56, vs0, vs1 MULT_APLHA_PART1 vs48, vs56, vs0, vs1
@ -443,22 +501,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR2 RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL #ifndef TRMMKERNEL
/* add */ /* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 1
xxpermdi vs3, vs2, vs10, 1
xxpermdi vs5, vs4, vs12, 1
xxpermdi vs7, vs6, vs14, 1
xxpermdi vs9, vs8, vs0, 1
xxpermdi vs11, vs10, vs2, 1
#else
xxpermdi vs1, vs8, vs0, 2 xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2 xxpermdi vs3, vs10, vs2, 2
xxpermdi vs5, vs12, vs4, 2 xxpermdi vs5, vs12, vs4, 2
xxpermdi vs7, vs14, vs6, 2 xxpermdi vs7, vs14, vs6, 2
xxpermdi vs9, vs0, vs8, 2 xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2 xxpermdi vs11, vs2, vs10, 2
#endif
xvaddsp vs32, vs32, vs3 xvaddsp vs32, vs32, vs3
xvaddsp vs33, vs33, vs1 xvaddsp vs33, vs33, vs1
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs13, vs12, vs4, 1
xxpermdi vs15, vs14, vs6, 1
#else
xxpermdi vs13, vs4, vs12, 2 xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2 xxpermdi vs15, vs6, vs14, 2
#endif
xvaddsp vs40, vs40, vs7 xvaddsp vs40, vs40, vs7
xvaddsp vs41, vs41, vs5 xvaddsp vs41, vs41, vs5
xvaddsp vs34, vs34, vs11 xvaddsp vs34, vs34, vs11
xvaddsp vs35, vs35, vs9 xvaddsp vs35, vs35, vs9
xvaddsp vs42, vs42, vs15 xvaddsp vs42, vs42, vs15
xvaddsp vs43, vs43, vs13 xvaddsp vs43, vs43, vs13
#else
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
xxpermdi vs33, vs0, vs8, 1
xxpermdi vs32, vs2, vs10, 1
xxpermdi vs41, vs4, vs12, 1
xxpermdi vs40, vs6, vs14, 1
xxpermdi vs35, vs8, vs0, 1
xxpermdi vs34, vs10, vs2, 1
xxpermdi vs43, vs12, vs4, 1
xxpermdi vs42, vs14, vs6, 1
#else #else
xxpermdi vs33, vs8, vs0, 2 xxpermdi vs33, vs8, vs0, 2
xxpermdi vs32, vs10, vs2, 2 xxpermdi vs32, vs10, vs2, 2
@ -468,6 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs34, vs2, vs10, 2 xxpermdi vs34, vs2, vs10, 2
xxpermdi vs43, vs4, vs12, 2 xxpermdi vs43, vs4, vs12, 2
xxpermdi vs42, vs6, vs14, 2 xxpermdi vs42, vs6, vs14, 2
#endif
#endif #endif
stxvp vs32, 0(T2) stxvp vs32, 0(T2)
stxvp vs40, 32(T2) stxvp vs40, 32(T2)
@ -510,10 +593,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.if \OffsetA != 0 .if \OffsetA != 0
addi \AREG, \AREG, \OffsetA addi \AREG, \AREG, \OffsetA
.endif .endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 3, 32, 35
xvf32gerpp 2, 33, 35
xvf32gerpp 1, 32, 34
xvf32gerpp 0, 33, 34
#else
xvf32gerpp 3, 32, 34 xvf32gerpp 3, 32, 34
xvf32gerpp 2, 33, 34 xvf32gerpp 2, 33, 34
xvf32gerpp 1, 32, 35 xvf32gerpp 1, 32, 35
xvf32gerpp 0, 33, 35 xvf32gerpp 0, 33, 35
#endif
.endm .endm
.macro LOAD4x4_2 .macro LOAD4x4_2
@ -541,18 +631,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete .macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 3, 32, 35
xvf32gerpp 2, 33, 35
xvf32gerpp 1, 32, 34
xvf32gerpp 0, 33, 34
#else
xvf32gerpp 3, 32, 34 xvf32gerpp 3, 32, 34
xvf32gerpp 2, 33, 34 xvf32gerpp 2, 33, 34
xvf32gerpp 1, 32, 35 xvf32gerpp 1, 32, 35
xvf32gerpp 0, 33, 35 xvf32gerpp 0, 33, 35
#endif
.if \Complete==0 .if \Complete==0
lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
.endif .endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 3, 36, 39
xvf32gerpp 2, 37, 39
xvf32gerpp 1, 36, 38
xvf32gerpp 0, 37, 38
#else
xvf32gerpp 3, 36, 38 xvf32gerpp 3, 36, 38
xvf32gerpp 2, 37, 38 xvf32gerpp 2, 37, 38
xvf32gerpp 1, 36, 39 xvf32gerpp 1, 36, 39
xvf32gerpp 0, 37, 39 xvf32gerpp 0, 37, 39
#endif
.if \Complete==0 .if \Complete==0
lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
@ -606,6 +710,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR2 RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL #ifndef TRMMKERNEL
/* add */ /* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 1
xxpermdi vs3, vs2, vs10, 1
xxpermdi vs9, vs8, vs0, 1
xxpermdi vs11, vs10, vs2, 1
xxpermdi vs5, vs4, vs12, 1
xxpermdi vs7, vs6, vs14, 1
xxpermdi vs13, vs12, vs4, 1
xxpermdi vs15, vs14, vs6, 1
#else
xxpermdi vs1, vs8, vs0, 2 xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2 xxpermdi vs3, vs10, vs2, 2
xxpermdi vs9, vs0, vs8, 2 xxpermdi vs9, vs0, vs8, 2
@ -614,6 +728,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs7, vs14, vs6, 2 xxpermdi vs7, vs14, vs6, 2
xxpermdi vs13, vs4, vs12, 2 xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2 xxpermdi vs15, vs6, vs14, 2
#endif
xvaddsp vs24, vs24, vs3 xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1 xvaddsp vs25, vs25, vs1
xvaddsp vs26, vs26, vs11 xvaddsp vs26, vs26, vs11
@ -622,6 +737,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvaddsp vs29, vs29, vs5 xvaddsp vs29, vs29, vs5
xvaddsp vs30, vs30, vs15 xvaddsp vs30, vs30, vs15
xvaddsp vs31, vs31, vs13 xvaddsp vs31, vs31, vs13
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs25, vs0, vs8, 1
xxpermdi vs24, vs2, vs10, 1
xxpermdi vs27, vs8, vs0, 1
xxpermdi vs26, vs10, vs2, 1
xxpermdi vs29, vs4, vs12, 1
xxpermdi vs28, vs6, vs14, 1
xxpermdi vs31, vs12, vs4, 1
xxpermdi vs30, vs14, vs6, 1
#else #else
xxpermdi vs25, vs8, vs0, 2 xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2 xxpermdi vs24, vs10, vs2, 2
@ -631,6 +756,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs28, vs14, vs6, 2 xxpermdi vs28, vs14, vs6, 2
xxpermdi vs31, vs4, vs12, 2 xxpermdi vs31, vs4, vs12, 2
xxpermdi vs30, vs6, vs14, 2 xxpermdi vs30, vs6, vs14, 2
#endif
#endif #endif
stxvp vs24, 0(CO) stxvp vs24, 0(CO)
stxvp vs26, 0(T1) stxvp vs26, 0(T1)
@ -672,8 +798,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.if \OffsetA != 0 .if \OffsetA != 0
addi \AREG, \AREG, \OffsetA addi \AREG, \AREG, \OffsetA
.endif .endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 1, 35, 32
xvf32gerpp 0, 34, 32
#else
xvf32gerpp 1, 34, 32 xvf32gerpp 1, 34, 32
xvf32gerpp 0, 35, 32 xvf32gerpp 0, 35, 32
#endif
.endm .endm
.macro LOAD4x2_2 .macro LOAD4x2_2
@ -700,13 +831,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete .macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 1, 35, 32
xvf32gerpp 0, 34, 32
#else
xvf32gerpp 1, 34, 33 xvf32gerpp 1, 34, 33
xvf32gerpp 0, 35, 33 xvf32gerpp 0, 35, 33
#endif
.if \Complete==0 .if \Complete==0
lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
.endif .endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 1, 37, 33
xvf32gerpp 0, 36, 33
#else
xvf32gerpp 1, 36, 32 xvf32gerpp 1, 36, 32
xvf32gerpp 0, 37, 32 xvf32gerpp 0, 37, 32
#endif
.if \Complete==0 .if \Complete==0
lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) lxvp vs32, DISP4(\Index, \OffsetA)(\AREG)
lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
@ -757,19 +898,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR1 RECONSTRUCT_PAIR1
#ifndef TRMMKERNEL #ifndef TRMMKERNEL
/* add */ /* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 0
xxpermdi vs9, vs2, vs10, 0
xxpermdi vs3, vs8, vs0, 3
xxpermdi vs11, vs10, vs2, 3
#else
xxpermdi vs1, vs8, vs0, 0 xxpermdi vs1, vs8, vs0, 0
xxpermdi vs9, vs10, vs2, 0 xxpermdi vs9, vs10, vs2, 0
xxpermdi vs3, vs0, vs8, 3 xxpermdi vs3, vs0, vs8, 3
xxpermdi vs11, vs2, vs10, 3 xxpermdi vs11, vs2, vs10, 3
#endif
xvaddsp vs24, vs24, vs1 xvaddsp vs24, vs24, vs1
xvaddsp vs26, vs26, vs9 xvaddsp vs26, vs26, vs9
xvaddsp vs25, vs25, vs3 xvaddsp vs25, vs25, vs3
xvaddsp vs27, vs27, vs11 xvaddsp vs27, vs27, vs11
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs24, vs0, vs8, 0
xxpermdi vs26, vs2, vs10, 0
xxpermdi vs25, vs8, vs0, 3
xxpermdi vs27, vs10, vs2, 3
#else #else
xxpermdi vs24, vs8, vs0, 0 xxpermdi vs24, vs8, vs0, 0
xxpermdi vs26, vs10, vs2, 0 xxpermdi vs26, vs10, vs2, 0
xxpermdi vs25, vs0, vs8, 3 xxpermdi vs25, vs0, vs8, 3
xxpermdi vs27, vs2, vs10, 3 xxpermdi vs27, vs2, vs10, 3
#endif
#endif #endif
stxv vs24, 0(CO) stxv vs24, 0(CO)
stxv vs25, 0(T1) stxv vs25, 0(T1)
@ -811,8 +966,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.if \OffsetA != 0 .if \OffsetA != 0
addi \AREG, \AREG, \OffsetA addi \AREG, \AREG, \OffsetA
.endif .endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 34, 32
xvf32gerpp 1, 35, 32
#else
xvf32gerpp 0, 35, 32 xvf32gerpp 0, 35, 32
xvf32gerpp 1, 34, 32 xvf32gerpp 1, 34, 32
#endif
.endm .endm
.macro LOAD4x1_2 .macro LOAD4x1_2
@ -822,8 +982,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD4x1_2O OffsetA, OffsetB .macro LOAD4x1_2O OffsetA, OffsetB
lxv vs32, (\OffsetA)(AO) lxv vs32, (\OffsetA)(AO)
vspltisb v6, 0 vspltisb v6, 0
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs33, vs32, vs38, 2
xxpermdi vs32, vs32, vs38, 0
#else
xxpermdi vs33, vs32, vs38, 0 xxpermdi vs33, vs32, vs38, 0
xxpermdi vs32, vs32, vs38, 2 xxpermdi vs32, vs32, vs38, 2
#endif
lxvp vs34, (0+\OffsetB)(BO) lxvp vs34, (0+\OffsetB)(BO)
lxvp vs36, (32+\OffsetB)(BO) lxvp vs36, (32+\OffsetB)(BO)
.endm .endm
@ -842,18 +1007,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete .macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 34, 32
xvf32gerpp 1, 35, 32
#else
xvf32gerpp 0, 35, 32 xvf32gerpp 0, 35, 32
xvf32gerpp 1, 34, 32 xvf32gerpp 1, 34, 32
#endif
.if \Complete==0 .if \Complete==0
lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
.endif .endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 36, 33
xvf32gerpp 1, 37, 33
#else
xvf32gerpp 0, 37, 33 xvf32gerpp 0, 37, 33
xvf32gerpp 1, 36, 33 xvf32gerpp 1, 36, 33
#endif
.if \Complete==0 .if \Complete==0
lxv vs32, DISP2(\Index, \OffsetA)(\AREG) lxv vs32, DISP2(\Index, \OffsetA)(\AREG)
lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs33, vs32, vs38, 2
xxpermdi vs32, vs32, vs38, 0
#else
xxpermdi vs33, vs32, vs38, 0 xxpermdi vs33, vs32, vs38, 0
xxpermdi vs32, vs32, vs38, 2 xxpermdi vs32, vs32, vs38, 2
#endif
.endif .endif
.if \IsLast==1 .if \IsLast==1
.if \Complete==1 .if \Complete==1
@ -1001,19 +1181,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete .macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 2, 37, 34
xvf32gerpp 3, 36, 34
xvf32gerpp 0, 33, 34
xvf32gerpp 1, 32, 34
#else
xvf32gerpp 2, 37, 35 xvf32gerpp 2, 37, 35
xvf32gerpp 3, 36, 35 xvf32gerpp 3, 36, 35
xvf32gerpp 0, 33, 35 xvf32gerpp 0, 33, 35
xvf32gerpp 1, 32, 35 xvf32gerpp 1, 32, 35
#endif
.if \Complete==0 .if \Complete==0
lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
.endif .endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 2, 41, 35
xvf32gerpp 3, 40, 35
xvf32gerpp 0, 39, 35
xvf32gerpp 1, 38, 35
#else
xvf32gerpp 2, 41, 34 xvf32gerpp 2, 41, 34
xvf32gerpp 3, 40, 34 xvf32gerpp 3, 40, 34
xvf32gerpp 0, 39, 34 xvf32gerpp 0, 39, 34
xvf32gerpp 1, 38, 34 xvf32gerpp 1, 38, 34
#endif
.if \Complete==0 .if \Complete==0
lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) lxvp vs34, DISP4(\Index, \OffsetB)(\BREG)
@ -1068,22 +1262,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR2 RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL #ifndef TRMMKERNEL
/* add */ /* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 1
xxpermdi vs3, vs2, vs10, 1
xxpermdi vs5, vs4, vs12, 1
xxpermdi vs7, vs6, vs14, 1
xxpermdi vs9, vs8, vs0, 1
xxpermdi vs11, vs10, vs2, 1
#else
xxpermdi vs1, vs8, vs0, 2 xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2 xxpermdi vs3, vs10, vs2, 2
xxpermdi vs5, vs12, vs4, 2 xxpermdi vs5, vs12, vs4, 2
xxpermdi vs7, vs14, vs6, 2 xxpermdi vs7, vs14, vs6, 2
xxpermdi vs9, vs0, vs8, 2 xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2 xxpermdi vs11, vs2, vs10, 2
#endif
xvaddsp vs24, vs24, vs3 xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1 xvaddsp vs25, vs25, vs1
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs13, vs12, vs4, 1
xxpermdi vs15, vs14, vs6, 1
#else
xxpermdi vs13, vs4, vs12, 2 xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2 xxpermdi vs15, vs6, vs14, 2
#endif
xvaddsp vs26, vs26, vs7 xvaddsp vs26, vs26, vs7
xvaddsp vs27, vs27, vs5 xvaddsp vs27, vs27, vs5
xvaddsp vs28, vs28, vs11 xvaddsp vs28, vs28, vs11
xvaddsp vs29, vs29, vs9 xvaddsp vs29, vs29, vs9
xvaddsp vs30, vs30, vs15 xvaddsp vs30, vs30, vs15
xvaddsp vs31, vs31, vs13 xvaddsp vs31, vs31, vs13
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs25, vs0, vs8, 1
xxpermdi vs24, vs2, vs10, 1
xxpermdi vs27, vs4, vs12, 1
xxpermdi vs26, vs6, vs14, 1
xxpermdi vs29, vs8, vs0, 1
xxpermdi vs28, vs10, vs2, 1
xxpermdi vs31, vs12, vs4, 1
xxpermdi vs30, vs14, vs6, 1
#else #else
xxpermdi vs25, vs8, vs0, 2 xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2 xxpermdi vs24, vs10, vs2, 2
@ -1093,6 +1311,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs28, vs2, vs10, 2 xxpermdi vs28, vs2, vs10, 2
xxpermdi vs31, vs4, vs12, 2 xxpermdi vs31, vs4, vs12, 2
xxpermdi vs30, vs6, vs14, 2 xxpermdi vs30, vs6, vs14, 2
#endif
#endif #endif
stxvp vs24, 0(CO) stxvp vs24, 0(CO)
stxvp vs26, 32(CO) stxvp vs26, 32(CO)
@ -1161,13 +1380,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete .macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 33, 34
xvf32gerpp 1, 32, 34
#else
xvf32gerpp 0, 33, 35 xvf32gerpp 0, 33, 35
xvf32gerpp 1, 32, 35 xvf32gerpp 1, 32, 35
#endif
.if \Complete==0 .if \Complete==0
lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
.endif .endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 37, 35
xvf32gerpp 1, 36, 35
#else
xvf32gerpp 0, 37, 34 xvf32gerpp 0, 37, 34
xvf32gerpp 1, 36, 34 xvf32gerpp 1, 36, 34
#endif
.if \Complete==0 .if \Complete==0
lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) lxvp vs34, DISP4(\Index, \OffsetB)(\BREG)
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
@ -1206,19 +1436,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR1 RECONSTRUCT_PAIR1
#ifndef TRMMKERNEL #ifndef TRMMKERNEL
/* add */ /* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 1
xxpermdi vs3, vs2, vs10, 1
xxpermdi vs9, vs8, vs0, 1
xxpermdi vs11, vs10, vs2, 1
#else
xxpermdi vs1, vs8, vs0, 2 xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2 xxpermdi vs3, vs10, vs2, 2
xxpermdi vs9, vs0, vs8, 2 xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2 xxpermdi vs11, vs2, vs10, 2
#endif
xvaddsp vs24, vs24, vs3 xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1 xvaddsp vs25, vs25, vs1
xvaddsp vs26, vs26, vs11 xvaddsp vs26, vs26, vs11
xvaddsp vs27, vs27, vs9 xvaddsp vs27, vs27, vs9
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs25, vs0, vs8, 1
xxpermdi vs24, vs2, vs10, 1
xxpermdi vs27, vs8, vs0, 1
xxpermdi vs26, vs10, vs2, 1
#else #else
xxpermdi vs25, vs8, vs0, 2 xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2 xxpermdi vs24, vs10, vs2, 2
xxpermdi vs27, vs0, vs8, 2 xxpermdi vs27, vs0, vs8, 2
xxpermdi vs26, vs2, vs10, 2 xxpermdi vs26, vs2, vs10, 2
#endif
#endif #endif
stxvp vs24, 0(CO) stxvp vs24, 0(CO)
stxvp vs26, 0(T1) stxvp vs26, 0(T1)
@ -1330,13 +1574,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxperm vs8, vs9, save_permute_1 xxperm vs8, vs9, save_permute_1
#ifndef TRMMKERNEL #ifndef TRMMKERNEL
/* add */ /* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 0
xxpermdi vs9, vs8, vs0, 3
#else
xxpermdi vs1, vs8, vs0, 0 xxpermdi vs1, vs8, vs0, 0
xxpermdi vs9, vs0, vs8, 3 xxpermdi vs9, vs0, vs8, 3
#endif
xvaddsp vs24, vs24, vs1 xvaddsp vs24, vs24, vs1
xvaddsp vs26, vs26, vs9 xvaddsp vs26, vs26, vs9
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs24, vs0, vs8, 0
xxpermdi vs26, vs8, vs0, 3
#else #else
xxpermdi vs24, vs8, vs0, 0 xxpermdi vs24, vs8, vs0, 0
xxpermdi vs26, vs0, vs8, 3 xxpermdi vs26, vs0, vs8, 3
#endif
#endif #endif
stxv vs24, 0(CO) stxv vs24, 0(CO)
stxv vs26, 0(T1) stxv vs26, 0(T1)
@ -1528,8 +1782,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvp vs32, (0+\OffsetA)(AO) lxvp vs32, (0+\OffsetA)(AO)
lxvp vs36, (32+\OffsetA)(AO) lxvp vs36, (32+\OffsetA)(AO)
vspltisb v10, 0 vspltisb v10, 0
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs42, 2
xxpermdi vs34, vs34, vs42, 0
#else
xxpermdi vs35, vs34, vs42, 0 xxpermdi vs35, vs34, vs42, 0
xxpermdi vs34, vs34, vs42, 2 xxpermdi vs34, vs34, vs42, 2
#endif
lxvp vs38, (64+\OffsetA)(AO) lxvp vs38, (64+\OffsetA)(AO)
lxvp vs40, (64+32+\OffsetA)(AO) lxvp vs40, (64+32+\OffsetA)(AO)
.endm .endm
@ -1567,8 +1826,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf32gerpp 3, 35, 40 xvf32gerpp 3, 35, 40
.if \Complete==0 .if \Complete==0
lxv vs34, DISP2(\Index, \OffsetB)(\BREG) lxv vs34, DISP2(\Index, \OffsetB)(\BREG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs42, 2
xxpermdi vs34, vs34, vs42, 0
#else
xxpermdi vs35, vs34, vs42, 0 xxpermdi vs35, vs34, vs42, 0
xxpermdi vs34, vs34, vs42, 2 xxpermdi vs34, vs34, vs42, 2
#endif
lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG) lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG)
.endif .endif
.if \IsLast==1 .if \IsLast==1
@ -1634,10 +1898,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MULT_APLHA_PART2 vs34, vs42, vs4, vs5 MULT_APLHA_PART2 vs34, vs42, vs4, vs5
MULT_APLHA_PART2 vs35, vs43, vs6, vs7 MULT_APLHA_PART2 vs35, vs43, vs6, vs7
/* reconstruct r, i pairs*/ /* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs0, vs1, save_permute_1
xxperm vs2, vs3, save_permute_1
xxperm vs4, vs5, save_permute_1
xxperm vs6, vs7, save_permute_1
#else
xxperm vs0, vs1, vs28 xxperm vs0, vs1, vs28
xxperm vs2, vs3, vs28 xxperm vs2, vs3, vs28
xxperm vs4, vs5, vs28 xxperm vs4, vs5, vs28
xxperm vs6, vs7, vs28 xxperm vs6, vs7, vs28
#endif
#ifndef TRMMKERNEL #ifndef TRMMKERNEL
/* add */ /* add */
xvaddsp vs24, vs24, vs2 xvaddsp vs24, vs24, vs2
@ -1648,10 +1919,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvp vs26, 32(CO) stxvp vs26, 32(CO)
#else #else
/* reconstruct r, i pairs*/ /* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
stxv vs2, 0(CO)
stxv vs0, 16(CO)
stxv vs6, 32(CO)
stxv vs4, 48(CO)
#else
stxv vs0, 0(CO) stxv vs0, 0(CO)
stxv vs2, 16(CO) stxv vs2, 16(CO)
stxv vs4, 32(CO) stxv vs4, 32(CO)
stxv vs6, 48(CO) stxv vs6, 48(CO)
#endif
#endif #endif
addi CO, CO, 64 addi CO, CO, 64
.endm .endm
@ -1701,8 +1979,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxv vs34, (\OffsetB)(BO) lxv vs34, (\OffsetB)(BO)
lxvp vs32, (0+\OffsetA)(AO) lxvp vs32, (0+\OffsetA)(AO)
vspltisb v6, 0 vspltisb v6, 0
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs38, 2
xxpermdi vs34, vs34, vs38, 0
#else
xxpermdi vs35, vs34, vs38, 0 xxpermdi vs35, vs34, vs38, 0
xxpermdi vs34, vs34, vs38, 2 xxpermdi vs34, vs34, vs38, 2
#endif
lxvp vs36, (32+\OffsetA)(AO) lxvp vs36, (32+\OffsetA)(AO)
.endm .endm
@ -1729,8 +2012,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf32gerpp 1, 35, 36 xvf32gerpp 1, 35, 36
.if \Complete==0 .if \Complete==0
lxv vs34, DISP2(\Index, \OffsetB)(\BREG) lxv vs34, DISP2(\Index, \OffsetB)(\BREG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs38, 2
xxpermdi vs34, vs34, vs38, 0
#else
xxpermdi vs35, vs34, vs38, 0 xxpermdi vs35, vs34, vs38, 0
xxpermdi vs34, vs34, vs38, 2 xxpermdi vs34, vs34, vs38, 2
#endif
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
.endif .endif
.if \IsLast==1 .if \IsLast==1
@ -1775,8 +2063,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MULT_APLHA_PART2 vs32, vs40, vs0, vs1 MULT_APLHA_PART2 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs33, vs41, vs2, vs3 MULT_APLHA_PART2 vs33, vs41, vs2, vs3
/* reconstruct r, i pairs*/ /* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs0, vs1, save_permute_1
xxperm vs2, vs3, save_permute_1
#else
xxperm vs0, vs1, vs28 xxperm vs0, vs1, vs28
xxperm vs2, vs3, vs28 xxperm vs2, vs3, vs28
#endif
#ifndef TRMMKERNEL #ifndef TRMMKERNEL
/* add */ /* add */
xvaddsp vs24, vs24, vs2 xvaddsp vs24, vs24, vs2
@ -1784,8 +2077,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvp vs24, 0(CO) stxvp vs24, 0(CO)
#else #else
/* reconstruct r, i pairs*/ /* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
stxv vs2, 0(CO)
stxv vs0, 16(CO)
#else
stxv vs0, 0(CO) stxv vs0, 0(CO)
stxv vs2, 16(CO) stxv vs2, 16(CO)
#endif
#endif #endif
addi CO, CO, 32 addi CO, CO, 32
.endm .endm
@ -1904,7 +2202,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MULT_APLHA_PART1 vs32, vs40, vs0, vs1 MULT_APLHA_PART1 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs32, vs40, vs0, vs1 MULT_APLHA_PART2 vs32, vs40, vs0, vs1
/* reconstruct r, i pairs*/ /* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs0, vs1, save_permute_1
#else
xxperm vs0, vs1, vs28 xxperm vs0, vs1, vs28
#endif
#ifndef TRMMKERNEL #ifndef TRMMKERNEL
/* add */ /* add */
xvaddsp vs24, vs24, vs0 xvaddsp vs24, vs24, vs0
@ -2018,7 +2320,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MULT_APLHA_PART1 vs32, vs40, vs37, vs1 MULT_APLHA_PART1 vs32, vs40, vs37, vs1
MULT_APLHA_PART2 vs32, vs40, vs37, vs1 MULT_APLHA_PART2 vs32, vs40, vs37, vs1
/* reconstruct r, i pairs*/ /* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs37, vs1, save_permute_1
#else
xxperm vs37, vs1, vs28 xxperm vs37, vs1, vs28
#endif
#ifndef TRMMKERNEL #ifndef TRMMKERNEL
/* add */ /* add */
xvaddsp vs36, vs36, vs37 xvaddsp vs36, vs36, vs37
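Most of the endian conditionals above only change the immediate passed to xxpermdi, since doubleword numbering inside a VSX register follows the big-endian view. A rough C model of xxpermdi as used here (my reading of the instruction; illustrative only):

#include <stdint.h>
typedef struct { uint64_t dw[2]; } vsr_t;   /* dw[0] = leftmost doubleword */
static vsr_t xxpermdi_model(vsr_t a, vsr_t b, int dm)
{
    vsr_t r;
    r.dw[0] = a.dw[(dm >> 1) & 1];   /* high bit of DM selects the half of a */
    r.dw[1] = b.dw[dm & 1];          /* low bit of DM selects the half of b  */
    return r;
}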

View File

@ -30,7 +30,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i) static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i)
{ {
__vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i}; __vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i};
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
__vector unsigned char mask = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
#else
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
#endif
__asm__ __asm__
( (
"dcbt 0, %2 \n\t" "dcbt 0, %2 \n\t"

View File

@ -39,10 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9) #if defined(POWER8) || defined(POWER9)
#include "cswap_microk_power8.c" #include "cswap_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "cswap_microk_power10.c"
#elif defined(POWER10) #elif defined(POWER10)
#include "cswap_microk_power8.c" #include "cswap_microk_power10.c"
#endif #endif
#endif #endif

View File

@ -49,14 +49,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9) #if defined(POWER8) || defined(POWER9)
#include "dasum_microk_power8.c" #include "dasum_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "dasum_microk_power10.c"
#elif defined(POWER10) #elif defined(POWER10)
#include "dasum_microk_power8.c" #include "dasum_microk_power10.c"
#endif #endif
#endif #endif
#ifndef HAVE_KERNEL_16 #ifndef HAVE_KERNEL_16
static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
@ -114,7 +111,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( inc_x == 1 ) if ( inc_x == 1 )
{ {
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #if defined(POWER10)
if ( n >= 32) if ( n >= 32)
{ {
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
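The align expression above counts how many leading elements must be handled by scalar code before x reaches 32-byte alignment for the POWER10 vector loop; it is at most 3 because four 8-byte doubles fill 32 bytes. The same formula as a standalone sketch:

#include <stdint.h>
static long align_peel(const double *x)
{
    /* doubles to peel so that x becomes 32-byte aligned (0..3) */
    return ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
}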

View File

@ -0,0 +1,923 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <altivec.h>
typedef __vector unsigned char vec_t;
#if !__has_builtin(__builtin_vsx_assemble_pair)
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
#endif
#if !defined(B0)
#define SAVE_4x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
rc0 = vec_xl(0, C+(N+2)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[2] = vec_madd(result[2], valpha, rc0); \
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
rc0 = vec_xl(0, C+(N+3)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[3] = vec_madd(result[3], valpha, rc0); \
vec_xst(result[3], 0, C+(N+3)*ldc+M);
#define SAVE_4x1_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \
rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[2] = vec_madd(result[2], valpha, rc0); \
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \
rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[3] = vec_madd(result[3], valpha, rc0); \
vec_xst_len(result[3], C+(N+3)*ldc+M, 8);
#define SAVE_2x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst(result[1], 0, C+(N+1)*ldc+M);
#define SAVE_2x1_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst_len(result[1], C+(N+1)*ldc+M, 8);
#define SAVE_1x4_VSR(result, N, M) \
rc0 = vec_xl(0, C+((N)*ldc)+M); \
rc0 = vec_mul(rc0, vbeta); \
result = vec_madd(result, valpha, rc0); \
vec_xst(result, 0, C+((N)*ldc)+M);
#else
#define SAVE_4x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
result[1] = vec_mul(result[1], valpha); \
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
result[2] = vec_mul(result[2], valpha); \
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
result[3] = vec_mul(result[3], valpha); \
vec_xst(result[3], 0, C+(N+3)*ldc+M);
#define SAVE_4x1_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
result[1] = vec_mul(result[1], valpha); \
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \
result[2] = vec_mul(result[2], valpha); \
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \
result[3] = vec_mul(result[3], valpha); \
vec_xst_len(result[3], C+(N+3)*ldc+M, 8);
#define SAVE_2x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
result[1] = vec_mul(result[1], valpha); \
vec_xst(result[1], 0, C+(N+1)*ldc+M);
#define SAVE_2x1_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
result[1] = vec_mul(result[1], valpha); \
vec_xst_len(result[1], C+(N+1)*ldc+M, 8);
#define SAVE_1x4_VSR(result, N, M) \
result = vec_mul(result, valpha); \
vec_xst(result, 0, C+((N)*ldc)+M);
#endif
#define INIT_8ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1); \
__builtin_mma_xxsetaccz(&acc2); \
__builtin_mma_xxsetaccz(&acc3); \
__builtin_mma_xxsetaccz(&acc4); \
__builtin_mma_xxsetaccz(&acc5); \
__builtin_mma_xxsetaccz(&acc6); \
__builtin_mma_xxsetaccz(&acc7);
#define INIT_4ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1); \
__builtin_mma_xxsetaccz(&acc2); \
__builtin_mma_xxsetaccz(&acc3);
#define INIT_2ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1);
#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
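/* LOAD_PAIR builds a __vector_pair from two vectors. GCC 10 only provides
 * __builtin_vsx_assemble_pair, whose operand order differs between AIX and
 * other targets (hence the swapped arguments); newer compilers use
 * __builtin_vsx_build_pair directly. */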
#if (defined(__GNUC__) && (__GNUC__ == 10))
#if defined(_AIX)
#define LOAD_PAIR(pair, v0, v1) \
__builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1);
#else
#define LOAD_PAIR(pair, v0, v1) \
__builtin_vsx_assemble_pair(&pair, (vec_t)v1, (vec_t)v0);
#endif
#else
#define LOAD_PAIR(pair, v0, v1) \
__builtin_vsx_build_pair(&pair, (vec_t)v0, (vec_t)v1);
#endif
#define LOAD_A_1x8(K, M) \
ra0 = vec_xl(0, A+((K)*lda)+M+0); \
ra1 = vec_xl(0, A+((K)*lda)+M+2); \
ra2 = vec_xl(0, A+((K)*lda)+M+4); \
ra3 = vec_xl(0, A+((K)*lda)+M+6);
#define LOAD_A_1x4(K, M) \
ra0 = vec_xl(0, A+((K)*lda)+M+0); \
ra1 = vec_xl(0, A+((K)*lda)+M+2);
#define LOAD_A_1x2(K, M) \
ra0 = vec_xl(0, A+((K)*lda)+M+0);
#define LOAD_A_1x1(K, M) \
ra0 = vec_splats(A[((K)*lda)+M+0]);
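/* LOAD_BTP_*: load one or two k-columns from 8/4/2 rows of B and transpose
 * them with vec_mergeh/vec_mergel into __vector_pair operands for xvf64gerpp. */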
#define LOAD_BTP_8x2(N, K) \
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \
rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \
rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \
t0 = vec_mergeh(rb0, rb1); \
t1 = vec_mergeh(rb2, rb3); \
LOAD_PAIR(pb0, t0, t1); \
t0 = vec_mergel(rb0, rb1); \
t1 = vec_mergel(rb2, rb3); \
LOAD_PAIR(pb2, t0, t1); \
rb4 = vec_xl(0, B+(N+4)*ldb+K+0); \
rb5 = vec_xl(0, B+(N+5)*ldb+K+0); \
rb6 = vec_xl(0, B+(N+6)*ldb+K+0); \
rb7 = vec_xl(0, B+(N+7)*ldb+K+0); \
t0 = vec_mergeh(rb4, rb5); \
t1 = vec_mergeh(rb6, rb7); \
LOAD_PAIR(pb1, t0, t1); \
t0 = vec_mergel(rb4, rb5); \
t1 = vec_mergel(rb6, rb7); \
LOAD_PAIR(pb3, t0, t1);
#define LOAD_BTP_8x1(N, K) \
rb0 = vec_xor(rb0, rb0); \
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \
rb1 = vec_xor(rb1, rb1); \
rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \
rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \
LOAD_PAIR(pb0, rb0, rb1); \
rb2 = vec_xor(rb2, rb2); \
rb2 = vec_insert(B[(N+4)*ldb+K], rb2, 0); \
rb2 = vec_insert(B[(N+5)*ldb+K], rb2, 1); \
rb3 = vec_xor(rb3, rb3); \
rb3 = vec_insert(B[(N+6)*ldb+K], rb3, 0); \
rb3 = vec_insert(B[(N+7)*ldb+K], rb3, 1); \
LOAD_PAIR(pb1, rb2, rb3);
#define LOAD_BTP_4x2(N, K) \
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \
rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \
rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \
t0 = vec_mergeh(rb0, rb1); \
t1 = vec_mergeh(rb2, rb3); \
LOAD_PAIR(pb0, t0, t1); \
t0 = vec_mergel(rb0, rb1); \
t1 = vec_mergel(rb2, rb3); \
LOAD_PAIR(pb1, t0, t1);
#define LOAD_BTP_4x1(N, K) \
rb0 = vec_xor(rb0, rb0); \
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \
rb1 = vec_xor(rb1, rb1); \
rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \
rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \
LOAD_PAIR(pb0, rb0, rb1);
#define LOAD_BTP_2x2(N, K) \
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \
t0 = vec_mergeh(rb0, rb1); \
__builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); \
t1 = vec_mergel(rb0, rb1); \
__builtin_vsx_assemble_pair(&pb1, (vec_t)t1, (vec_t)t1);
#define LOAD_BTP_2x1(N, K) \
rb0 = vec_xor(rb0, rb0); \
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \
__builtin_vsx_assemble_pair(&pb0, (vec_t)rb0, (vec_t)rb0);
#define LOAD_B_1x1(N, K) \
rb0 = vec_splats(B[((N)*ldb)+K]);
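/* KERNEL_MMA_*: one xvf64gerpp (FP64 outer-product accumulate) per accumulator,
 * combining a __vector_pair of B values with a vector of A values.
 * KERNEL_VMADD_* are the plain vec_madd fallbacks for the single-column tails. */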
#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \
a0, a1, a2, a3, a4, a5, a6, a7) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \
__builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \
__builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \
__builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \
__builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7);
#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3);
#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1);
#define KERNEL_MMA_1ACC(b0, a0) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0);
#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \
result = vec_madd(a0, b0, result); \
result1 = vec_madd(a1, b1, result1); \
result2 = vec_madd(a2, b2, result2); \
result3 = vec_madd(a3, b3, result3);
#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \
result = vec_madd(a0, b0, result); \
result1 = vec_madd(a1, b1, result1);
#define KERNEL_VMADD_1VSR(a0, b0) \
result = vec_madd(a0, b0, result);
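/* PACK_B/LOAD_PACKED_B store the transposed B pairs into the packB buffer on
 * the first m iteration of an 8-wide n block and reload them for the remaining
 * m iterations, so the B transpose is performed only once per n block. */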
#define PACK_B(pb0, pb1, offset) \
*((__vector_pair *)(void *)(packB+(k*8)+0+offset)) = pb0; \
*((__vector_pair *)(void *)(packB+(k*8)+4+offset)) = pb1;
#define LOAD_PACKED_B(pb0, pb1, offset) \
pb0 = *((__vector_pair *)((void *)(packB+(k*8)+0+offset))); \
pb1 = *((__vector_pair *)((void *)(packB+(k*8)+4+offset)));
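/* GEMM kernel (double precision, POWER10 MMA): computes
 *   C[n*ldc+m] = alpha * sum_k A[k*lda+m] * B[n*ldb+k]   (+ beta*C unless B0)
 * with n blocked by 8/4/2/1 and m blocked by 8/4/2/1 inside each n block. */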
#ifdef B0
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
#else
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
#endif
{
BLASLONG m, n, k;
BLASLONG m8 = M & ~7;
BLASLONG m4 = M & ~3;
BLASLONG m2 = M & ~1;
BLASLONG n8 = N & ~7;
BLASLONG n4 = N & ~3;
BLASLONG n2 = N & ~1;
BLASLONG k2 = K & ~1;
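/* Pack the transposed B panel only for sufficiently large problems;
 * packing is currently enabled for GCC builds only. */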
#if defined(__GNUC__) && !defined(__clang__)
int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 1 : 0;
#else
int has_packing = 0;
#endif
double *packB;
if (has_packing) packB = (double *)malloc(K*8*sizeof(double));
vector double valpha = vec_splats(alpha);
#if !defined(B0)
vector double vbeta = vec_splats(beta);
#endif
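/* Main loop nest: n in blocks of 8, then 4, 2, 1; m in blocks of 8, 4, 2, 1
 * inside each n block; k is unrolled by 2 with a remainder iteration. */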
for (n = 0; n < n8; n += 8) {
for (m = 0; m < m8; m += 8) {
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
INIT_8ACCS();
register vector double ra0, ra1, ra2, ra3;
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
register vector double t0, t1;
__vector_pair pb0, pb1, pb2, pb3;
if (has_packing) {
if (m == 0) {
for (k = 0; k < k2; k += 2) {
LOAD_A_1x8(k, m);
LOAD_BTP_8x2(n, k);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
PACK_B(pb0, pb1, 0);
LOAD_A_1x8(k+1, m);
KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
PACK_B(pb2, pb3, 8);
}
for (; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_BTP_8x1(n, k);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
PACK_B(pb0, pb1, 0);
}
} else {
for (k = 0; k < k2; k += 2) {
LOAD_A_1x8(k, m);
LOAD_PACKED_B(pb0, pb1, 0);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
LOAD_A_1x8(k+1, m);
LOAD_PACKED_B(pb2, pb3, 8);
KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
}
for (; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_PACKED_B(pb0, pb1, 0);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
}
}
} else {
for (k = 0; k < k2; k += 2) {
LOAD_A_1x8(k, m);
LOAD_BTP_8x2(n, k);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
LOAD_A_1x8(k+1, m);
KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
}
for (; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_BTP_8x1(n, k);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
}
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc2, n+0, m+2);
SAVE_4x2_ACC(&acc4, n+0, m+4);
SAVE_4x2_ACC(&acc6, n+0, m+6);
SAVE_4x2_ACC(&acc1, n+4, m+0);
SAVE_4x2_ACC(&acc3, n+4, m+2);
SAVE_4x2_ACC(&acc5, n+4, m+4);
SAVE_4x2_ACC(&acc7, n+4, m+6);
}
for (; m < m4; m += 4) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1;
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
register vector double t0, t1;
__vector_pair pb0, pb1, pb2, pb3;
if (!has_packing) {
for (k = 0; k < k2; k += 2) {
LOAD_A_1x4(k, m);
LOAD_BTP_8x2(n, k);
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1);
LOAD_A_1x4(k+1, m);
KERNEL_MMA_4ACC(pb2, pb3, pb2, pb3, ra0, ra0, ra1, ra1);
}
for (; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_BTP_8x1(n, k);
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1);
}
} else {
for (k = 0; k < k2; k += 2) {
LOAD_A_1x4(k, m);
LOAD_PACKED_B(pb0, pb1, 0);
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1);
LOAD_A_1x4(k+1, m);
LOAD_PACKED_B(pb2, pb3, 8);
KERNEL_MMA_4ACC(pb2, pb3, pb2, pb3, ra0, ra0, ra1, ra1);
}
for (; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_PACKED_B(pb0, pb1, 0);
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1);
}
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc2, n+0, m+2);
SAVE_4x2_ACC(&acc1, n+4, m+0);
SAVE_4x2_ACC(&acc3, n+4, m+2);
}
for (; m < m2; m += 2) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0;
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
register vector double t0, t1;
__vector_pair pb0, pb1, pb2, pb3;
if (!has_packing) {
for (k = 0; k < k2; k += 2) {
LOAD_A_1x2(k, m);
LOAD_BTP_8x2(n, k);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
LOAD_A_1x2(k+1, m);
KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0);
}
for (; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_BTP_8x1(n, k);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
}
} else {
for (k = 0; k < k2; k += 2) {
LOAD_A_1x2(k, m);
LOAD_PACKED_B(pb0, pb1, 0);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
LOAD_A_1x2(k+1, m);
LOAD_PACKED_B(pb2, pb3, 8);
KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0);
}
for (; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_PACKED_B(pb0, pb1, 0);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
}
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+4, m+0);
}
for (; m < M; m++) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0;
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
register vector double t0, t1;
__vector_pair pb0, pb1, pb2, pb3;
if (!has_packing) {
for (k = 0; k < k2; k += 2) {
LOAD_A_1x1(k, m);
LOAD_BTP_8x2(n, k);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
LOAD_A_1x1(k+1, m);
KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0);
}
for (; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_BTP_8x1(n, k);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
}
} else {
for (k = 0; k < k2; k += 2) {
LOAD_A_1x1(k, m);
LOAD_PACKED_B(pb0, pb1, 0);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
LOAD_A_1x1(k+1, m);
LOAD_PACKED_B(pb2, pb3, 8);
KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0);
}
for (; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_PACKED_B(pb0, pb1, 0);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
}
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x1_ACC(&acc0, n+0, m+0);
SAVE_4x1_ACC(&acc1, n+4, m+0);
}
}
for (; n < n4; n += 4) {
for (m = 0; m < m8; m += 8) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1, ra2, ra3;
register vector double rb0, rb1, rb2, rb3;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_A_1x8(k, m);
LOAD_BTP_4x2(n, k);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
LOAD_A_1x8(k+1, m);
KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra0, ra1, ra2, ra3);
}
for (; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_BTP_4x1(n, k);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+0, m+2);
SAVE_4x2_ACC(&acc2, n+0, m+4);
SAVE_4x2_ACC(&acc3, n+0, m+6);
}
for (; m < m4; m += 4) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0, ra1;
register vector double rb0, rb1, rb2, rb3;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_A_1x4(k, m);
LOAD_BTP_4x2(n, k);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
LOAD_A_1x4(k+1, m);
KERNEL_MMA_2ACC(pb1, pb1, ra0, ra1);
}
for (; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_BTP_4x1(n, k);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+0, m+2);
}
for (; m < m2; m += 2) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0;
register vector double rb0, rb1, rb2, rb3;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_A_1x2(k, m);
LOAD_BTP_4x2(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
LOAD_A_1x2(k+1, m);
KERNEL_MMA_1ACC(pb1, ra0);
}
for (; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_BTP_4x1(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n, m);
}
for (; m < M; m++) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0;
register vector double rb0, rb1, rb2, rb3;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_A_1x1(k, m);
LOAD_BTP_4x2(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
LOAD_A_1x1(k+1, m);
KERNEL_MMA_1ACC(pb1, ra0);
}
for (; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_BTP_4x1(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x1_ACC(&acc0, n, m);
}
}
for (; n < n2; n += 2) {
for (m = 0; m < m8; m += 8) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1, ra2, ra3;
register vector double rb0, rb1;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_A_1x8(k, m);
LOAD_BTP_2x2(n, k);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
LOAD_A_1x8(k+1, m);
KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra0, ra1, ra2, ra3);
}
for (; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_BTP_2x1(n, k);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n+0, m+0);
SAVE_2x2_ACC(&acc1, n+0, m+2);
SAVE_2x2_ACC(&acc2, n+0, m+4);
SAVE_2x2_ACC(&acc3, n+0, m+6);
}
for (; m < m4; m += 4) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0, ra1;
register vector double rb0, rb1;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_A_1x4(k, m);
LOAD_BTP_2x2(n, k);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
LOAD_A_1x4(k+1, m);
KERNEL_MMA_2ACC(pb1, pb1, ra0, ra1);
}
for (; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_BTP_2x1(n, k);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n+0, m+0);
SAVE_2x2_ACC(&acc1, n+0, m+2);
}
for (; m < m2; m += 2) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0;
register vector double rb0, rb1;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_A_1x2(k, m);
LOAD_BTP_2x2(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
LOAD_A_1x2(k+1, m);
KERNEL_MMA_1ACC(pb1, ra0);
}
for (; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_BTP_2x1(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n+0, m+0);
}
for (; m < M; m++) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0;
register vector double rb0, rb1;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_A_1x1(k, m);
LOAD_BTP_2x2(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
LOAD_A_1x1(k+1, m);
KERNEL_MMA_1ACC(pb1, ra0);
}
for (; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_BTP_2x1(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x1_ACC(&acc0, n+0, m+0);
}
}
for (; n < N; n++) {
for (m = 0; m < m8; m += 8) {
vector double result = ((vector double){0.,0.});
vector double result1 = ((vector double){0.,0.});
vector double result2 = ((vector double){0.,0.});
vector double result3 = ((vector double){0.,0.});
register vector double ra0, ra1, ra2, ra3;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_B_1x1(n, k);
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
SAVE_1x4_VSR(result1, n, m+2);
SAVE_1x4_VSR(result2, n, m+4);
SAVE_1x4_VSR(result3, n, m+6);
}
for (; m < m4; m += 4) {
vector double result = ((vector double){0.,0.});
vector double result1 = ((vector double){0.,0.});
register vector double ra0, ra1;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_B_1x1(n, k);
KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0);
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
SAVE_1x4_VSR(result1, n, m+2);
}
for (; m < m2; m += 2) {
vector double result = ((vector double){0.,0.});
register vector double ra0;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_B_1x1(n, k);
KERNEL_VMADD_1VSR(ra0, rb0);
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
}
for (; m < M; m++) {
FLOAT result = 0.0;
for (k = 0; k < K; k++) {
result += A[m+k*lda] * B[n*ldb+k];
}
result = result * alpha;
#if !defined(B0)
C[n*ldc+m] = (C[n*ldc+m] * beta) + result;
#else
C[n*ldc+m] = result;
#endif
}
}
if (has_packing) free(packB);
return 0;
}

View File

@ -0,0 +1,581 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <altivec.h>
typedef __vector unsigned char vec_t;
#if !__has_builtin(__builtin_vsx_assemble_pair)
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
#endif
#if !defined(B0)
#define SAVE_4x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
rc0 = vec_xl(0, C+(N+2)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[2] = vec_madd(result[2], valpha, rc0); \
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
rc0 = vec_xl(0, C+(N+3)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[3] = vec_madd(result[3], valpha, rc0); \
vec_xst(result[3], 0, C+(N+3)*ldc+M);
#define SAVE_2x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst(result[1], 0, C+(N+1)*ldc+M);
#define SAVE_1x4_VSR(result, N, M) \
rc0 = vec_xl(0, C+((N)*ldc)+M); \
rc0 = vec_mul(rc0, vbeta); \
result = vec_madd(result, valpha, rc0); \
vec_xst(result, 0, C+((N)*ldc)+M);
#define SAVE_4x1_VSR(result, N, M) \
result = vec_mul(result, valpha); \
C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \
C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1];
#else
#define SAVE_4x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
result[1] = vec_mul(result[1], valpha); \
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
result[2] = vec_mul(result[2], valpha); \
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
result[3] = vec_mul(result[3], valpha); \
vec_xst(result[3], 0, C+(N+3)*ldc+M);
#define SAVE_2x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
result[1] = vec_mul(result[1], valpha); \
vec_xst(result[1], 0, C+(N+1)*ldc+M);
#define SAVE_1x4_VSR(result, N, M) \
result = vec_mul(result, valpha); \
vec_xst(result, 0, C+((N)*ldc)+M);
#define SAVE_4x1_VSR(result, N, M) \
result = vec_mul(result, valpha); \
C[(N+0)*ldc+M] = result[0]; \
C[(N+1)*ldc+M] = result[1];
#endif
#define INIT_8ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1); \
__builtin_mma_xxsetaccz(&acc2); \
__builtin_mma_xxsetaccz(&acc3); \
__builtin_mma_xxsetaccz(&acc4); \
__builtin_mma_xxsetaccz(&acc5); \
__builtin_mma_xxsetaccz(&acc6); \
__builtin_mma_xxsetaccz(&acc7);
#define INIT_4ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1); \
__builtin_mma_xxsetaccz(&acc2); \
__builtin_mma_xxsetaccz(&acc3);
#define INIT_2ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1);
#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
#define LOAD_A_1x8(K, M) \
ra0 = vec_xl(0, A+(K*lda)+M+0); \
ra1 = vec_xl(0, A+(K*lda)+M+2); \
ra2 = vec_xl(0, A+(K*lda)+M+4); \
ra3 = vec_xl(0, A+(K*lda)+M+6);
#define LOAD_A_1x4(K, M) \
ra0 = vec_xl(0, A+(K*lda)+M+0); \
ra1 = vec_xl(0, A+(K*lda)+M+2);
#define LOAD_A_1x2(K, M) ra0 = vec_xl(0, A+(K*lda)+M);
#define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M]);
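/* In this kernel B is contiguous along n, so a __vector_pair can be loaded
 * straight from memory (LOAD_BP_*) instead of being assembled from rows. */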
#define LOAD_BP_1x8(K, N) \
pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \
pb1 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+4]));
#define LOAD_BP_1x4(K, N) \
pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0]));
#define LOAD_BP_1x2(K, N) \
t0 = vec_xl(0, B+(K*ldb)+N); \
__builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0);
#define LOAD_B_1x8(K, N) \
rb0 = vec_xl(0, B+(K*ldb)+N+0); \
rb1 = vec_xl(0, B+(K*ldb)+N+2); \
rb2 = vec_xl(0, B+(K*ldb)+N+4); \
rb3 = vec_xl(0, B+(K*ldb)+N+6);
#define LOAD_B_1x4(K, N) \
rb0 = vec_xl(0, B+(K*ldb)+N+0); \
rb1 = vec_xl(0, B+(K*ldb)+N+2);
#define LOAD_B_1x2(K, N) \
rb0 = vec_xl(0, B+(K*ldb)+N+0);
#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[K*ldb+N]);
#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \
a0, a1, a2, a3, a4, a5, a6, a7) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \
__builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \
__builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \
__builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \
__builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7);
#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3);
#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1);
#define KERNEL_MMA_1ACC(b0, a0) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0);
#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \
result = vec_madd(a0, b0, result); \
result1 = vec_madd(a1, b1, result1); \
result2 = vec_madd(a2, b2, result2); \
result3 = vec_madd(a3, b3, result3);
#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \
result = vec_madd(a0, b0, result); \
result1 = vec_madd(a1, b1, result1);
#define KERNEL_VMADD_1VSR(a0, b0) \
result = vec_madd(a0, b0, result);
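/* GEMM kernel (double precision, POWER10 MMA): computes
 *   C[n*ldc+m] = alpha * sum_k A[k*lda+m] * B[k*ldb+n]   (+ beta*C unless B0)
 * with m blocked by 8/4/2/1 in the outer loop and n by 8/4/2/1 inside. */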
#ifdef B0
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
#else
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
#endif
{
BLASLONG m, n, k;
BLASLONG m8 = M & ~7;
BLASLONG m4 = M & ~3;
BLASLONG m2 = M & ~1;
BLASLONG n8 = N & ~7;
BLASLONG n4 = N & ~3;
BLASLONG n2 = N & ~1;
vector double valpha = vec_splats(alpha);
#if !defined(B0)
vector double vbeta = vec_splats(beta);
#endif
for (m = 0; m < m8; m += 8) {
for (n = 0; n < n8; n += 8) {
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
INIT_8ACCS();
register vector double ra0, ra1, ra2, ra3;
__vector_pair pb0, pb1;
for (k = 0; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_BP_1x8(k, n);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc2, n+0, m+2);
SAVE_4x2_ACC(&acc4, n+0, m+4);
SAVE_4x2_ACC(&acc6, n+0, m+6);
SAVE_4x2_ACC(&acc1, n+4, m+0);
SAVE_4x2_ACC(&acc3, n+4, m+2);
SAVE_4x2_ACC(&acc5, n+4, m+4);
SAVE_4x2_ACC(&acc7, n+4, m+6);
}
for (; n < n4; n += 4) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1, ra2, ra3;
__vector_pair pb0;
for (k = 0; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_BP_1x4(k, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+0, m+2);
SAVE_4x2_ACC(&acc2, n+0, m+4);
SAVE_4x2_ACC(&acc3, n+0, m+6);
}
for (; n < n2; n += 2) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1, ra2, ra3;
register vector double t0;
__vector_pair pb0;
for (k = 0; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_BP_1x2(k, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n+0, m+0);
SAVE_2x2_ACC(&acc1, n+0, m+2);
SAVE_2x2_ACC(&acc2, n+0, m+4);
SAVE_2x2_ACC(&acc3, n+0, m+6);
}
for (; n < N; n++) {
register vector double result = ((vector double){0.,0.});
register vector double result1 = ((vector double){0.,0.});
register vector double result2 = ((vector double){0.,0.});
register vector double result3 = ((vector double){0.,0.});
register vector double ra0, ra1, ra2, ra3;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_B_1x1(k, n);
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
SAVE_1x4_VSR(result1, n, m+2);
SAVE_1x4_VSR(result2, n, m+4);
SAVE_1x4_VSR(result3, n, m+6);
}
}
for (; m < m4; m += 4) {
for (n = 0; n < n8; n += 8) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1;
__vector_pair pb0, pb1;
for (k = 0; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_BP_1x8(k, n);
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc2, n+0, m+2);
SAVE_4x2_ACC(&acc1, n+4, m+0);
SAVE_4x2_ACC(&acc3, n+4, m+2);
}
for (; n < n4; n += 4) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0, ra1;
__vector_pair pb0;
for (k = 0; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_BP_1x4(k, n);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+0, m+2);
}
for (; n < n2; n += 2) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0, ra1;
register vector double t0;
__vector_pair pb0;
for (k = 0; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_BP_1x2(k, n);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n+0, m+0);
SAVE_2x2_ACC(&acc1, n+0, m+2);
}
for (; n < N; n++) {
register vector double result = ((vector double){0.,0.});
register vector double result1 = ((vector double){0.,0.});
register vector double ra0, ra1;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_B_1x1(k, n);
KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0);
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
SAVE_1x4_VSR(result1, n, m+2);
}
}
for (; m < m2; m += 2) {
for (n = 0; n < n8; n += 8) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0;
__vector_pair pb0, pb1;
for (k = 0; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_BP_1x8(k, n);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+4, m+0);
}
for (; n < n4; n += 4) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0;
__vector_pair pb0;
for (k = 0; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_BP_1x4(k, n);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n, m);
}
for (; n < n2; n += 2) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0;
register vector double t0;
__vector_pair pb0;
for (k = 0; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_BP_1x2(k, n);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n, m);
}
for (; n < N; n++) {
register vector double result = ((vector double){0.,0.});
register vector double ra0;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_B_1x1(k, n);
KERNEL_VMADD_1VSR(ra0, rb0);
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
}
}
for (; m < M; m++) {
for (n = 0; n < n8; n += 8) {
register vector double result = ((vector double){0.,0.});
register vector double result1 = ((vector double){0.,0.});
register vector double result2 = ((vector double){0.,0.});
register vector double result3 = ((vector double){0.,0.});
register vector double ra0;
register vector double rb0, rb1, rb2, rb3;
for (k = 0; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_B_1x8(k, n);
KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3);
}
SAVE_4x1_VSR(result, n, m);
SAVE_4x1_VSR(result1, n+2, m);
SAVE_4x1_VSR(result2, n+4, m);
SAVE_4x1_VSR(result3, n+6, m);
}
for (; n < n4; n += 4) {
register vector double result = ((vector double){0.,0.});
register vector double result1 = ((vector double){0.,0.});
register vector double ra0;
register vector double rb0, rb1;
for (k = 0; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_B_1x4(k, n);
KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1);
}
SAVE_4x1_VSR(result, n, m);
SAVE_4x1_VSR(result1, n+2, m);
}
for (; n < n2; n += 2) {
register vector double result = ((vector double){0.,0.});
register vector double ra0;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_B_1x2(k, n);
KERNEL_VMADD_1VSR(ra0, rb0);
}
SAVE_4x1_VSR(result, n, m);
}
for (; n < N; n++) {
FLOAT result = 0.0;
for (k = 0; k < K; k++) {
result += A[k*lda+m] * B[k*ldb+n];
}
result = result * alpha;
#if !defined(B0)
C[n*ldc+m] = (C[n*ldc+m] * beta) + result;
#else
C[n*ldc+m] = result;
#endif
}
}
return 0;
}

View File

@ -0,0 +1,882 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <altivec.h>
typedef __vector unsigned char vec_t;
#if !__has_builtin(__builtin_vsx_assemble_pair)
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
#endif
#if !defined(B0)
#define SAVE_4x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
rc0 = vec_xl(0, C+(N+2)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[2] = vec_madd(result[2], valpha, rc0); \
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
rc0 = vec_xl(0, C+(N+3)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[3] = vec_madd(result[3], valpha, rc0); \
vec_xst(result[3], 0, C+(N+3)*ldc+M);
#define SAVE_4x1_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \
rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[2] = vec_madd(result[2], valpha, rc0); \
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \
rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[3] = vec_madd(result[3], valpha, rc0); \
vec_xst_len(result[3], C+(N+3)*ldc+M, 8);
#define SAVE_2x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst(result[1], 0, C+(N+1)*ldc+M);
#define SAVE_2x1_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst_len(result[1], C+(N+1)*ldc+M, 8);
#define SAVE_1x4_VSR(result, N, M) \
rc0 = vec_xl(0, C+((N)*ldc)+M); \
rc0 = vec_mul(rc0, vbeta); \
result = vec_madd(result, valpha, rc0); \
vec_xst(result, 0, C+((N)*ldc)+M);
#else
#define SAVE_4x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
result[1] = vec_mul(result[1], valpha); \
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
result[2] = vec_mul(result[2], valpha); \
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
result[3] = vec_mul(result[3], valpha); \
vec_xst(result[3], 0, C+(N+3)*ldc+M);
#define SAVE_4x1_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
result[1] = vec_mul(result[1], valpha); \
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \
result[2] = vec_mul(result[2], valpha); \
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \
result[3] = vec_mul(result[3], valpha); \
vec_xst_len(result[3], C+(N+3)*ldc+M, 8);
#define SAVE_2x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
result[1] = vec_mul(result[1], valpha); \
vec_xst(result[1], 0, C+(N+1)*ldc+M);
#define SAVE_2x1_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
result[1] = vec_mul(result[1], valpha); \
vec_xst_len(result[1], C+(N+1)*ldc+M, 8);
#define SAVE_1x4_VSR(result, N, M) \
result = vec_mul(result, valpha); \
vec_xst(result, 0, C+((N)*ldc)+M);
#endif
#define INIT_8ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1); \
__builtin_mma_xxsetaccz(&acc2); \
__builtin_mma_xxsetaccz(&acc3); \
__builtin_mma_xxsetaccz(&acc4); \
__builtin_mma_xxsetaccz(&acc5); \
__builtin_mma_xxsetaccz(&acc6); \
__builtin_mma_xxsetaccz(&acc7);
#define INIT_4ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1); \
__builtin_mma_xxsetaccz(&acc2); \
__builtin_mma_xxsetaccz(&acc3);
#define INIT_2ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1);
#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
#if (defined(__GNUC__) && (__GNUC__ == 10))
#if defined(_AIX)
#define LOAD_PAIR(pair, v0, v1) \
__builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1);
#else
#define LOAD_PAIR(pair, v0, v1) \
__builtin_vsx_assemble_pair(&pair, (vec_t)v1, (vec_t)v0);
#endif
#else
#define LOAD_PAIR(pair, v0, v1) \
__builtin_vsx_build_pair(&pair, (vec_t)v0, (vec_t)v1);
#endif
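/* LOAD_AT_*: load 8x2, 4x2 or 2x2 tiles of A (accessed with m as the slow
 * index) and transpose them with vec_mergeh/vec_mergel so that each vector
 * holds two consecutive m values for one k. */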
#define LOAD_AT_8x2(M, K) \
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \
t0 = vec_mergeh(ra0, ra1); \
t1 = vec_mergel(ra0, ra1); \
ra0 = t0; \
ra1 = t1; \
ra2 = vec_xl(0, A+(M+2)*lda+K+0); \
ra3 = vec_xl(0, A+(M+3)*lda+K+0); \
t0 = vec_mergeh(ra2, ra3); \
t1 = vec_mergel(ra2, ra3); \
ra2 = t0; \
ra3 = t1; \
ra4 = vec_xl(0, A+(M+4)*lda+K+0); \
ra5 = vec_xl(0, A+(M+5)*lda+K+0); \
t0 = vec_mergeh(ra4, ra5); \
t1 = vec_mergel(ra4, ra5); \
ra4 = t0; \
ra5 = t1; \
ra6 = vec_xl(0, A+(M+6)*lda+K+0); \
ra7 = vec_xl(0, A+(M+7)*lda+K+0); \
t0 = vec_mergeh(ra6, ra7); \
t1 = vec_mergel(ra6, ra7); \
ra6 = t0; \
ra7 = t1;
#define LOAD_AT_8x1(M, K) \
ra0 = vec_xor(ra0, ra0); \
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \
ra1 = vec_xor(ra1, ra1); \
ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \
ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \
ra2 = vec_xor(ra2, ra2); \
ra2 = vec_insert(A[(M+4)*lda+K], ra2, 0); \
ra2 = vec_insert(A[(M+5)*lda+K], ra2, 1); \
ra3 = vec_xor(ra3, ra3); \
ra3 = vec_insert(A[(M+6)*lda+K], ra3, 0); \
ra3 = vec_insert(A[(M+7)*lda+K], ra3, 1);
#define LOAD_AT_4x2(M, K) \
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \
ra2 = vec_xl(0, A+(M+2)*lda+K+0); \
ra3 = vec_xl(0, A+(M+3)*lda+K+0); \
t0 = vec_mergeh(ra0, ra1); \
t1 = vec_mergeh(ra2, ra3); \
t2 = vec_mergel(ra0, ra1); \
t3 = vec_mergel(ra2, ra3); \
ra0 = t0; \
ra1 = t2; \
ra2 = t1; \
ra3 = t3;
#define LOAD_AT_4x1(M, K) \
ra0 = vec_xor(ra0, ra0); \
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \
ra1 = vec_xor(ra1, ra1); \
ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \
ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1);
#define LOAD_AT_2x2(M, K) \
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \
t0 = vec_mergeh(ra0, ra1); \
t1 = vec_mergel(ra0, ra1); \
ra0 = t0; \
ra1 = t1;
#define LOAD_AT_2x1(M, K) \
ra0 = vec_xor(ra0, ra0); \
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1);
#define LOAD_A_1x1(K, M) \
ra0 = vec_splats(A[((M+0)*lda)+K+0]);
#define LOAD_BTP_8x2(N, K) \
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \
rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \
rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \
t0 = vec_mergeh(rb0, rb1); \
t1 = vec_mergeh(rb2, rb3); \
LOAD_PAIR(pb0, t0, t1); \
t0 = vec_mergel(rb0, rb1); \
t1 = vec_mergel(rb2, rb3); \
LOAD_PAIR(pb2, t0, t1); \
rb4 = vec_xl(0, B+(N+4)*ldb+K+0); \
rb5 = vec_xl(0, B+(N+5)*ldb+K+0); \
rb6 = vec_xl(0, B+(N+6)*ldb+K+0); \
rb7 = vec_xl(0, B+(N+7)*ldb+K+0); \
t0 = vec_mergeh(rb4, rb5); \
t1 = vec_mergeh(rb6, rb7); \
LOAD_PAIR(pb1, t0, t1); \
t0 = vec_mergel(rb4, rb5); \
t1 = vec_mergel(rb6, rb7); \
LOAD_PAIR(pb3, t0, t1);
#define LOAD_BTP_8x1(N, K) \
rb0 = vec_xor(rb0, rb0); \
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \
rb1 = vec_xor(rb1, rb1); \
rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \
rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \
LOAD_PAIR(pb0, rb0, rb1); \
rb0 = vec_xor(rb0, rb0); \
rb0 = vec_insert(B[(N+4)*ldb+K], rb0, 0); \
rb0 = vec_insert(B[(N+5)*ldb+K], rb0, 1); \
rb1 = vec_xor(rb1, rb1); \
rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 0); \
rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 1); \
LOAD_PAIR(pb1, rb0, rb1);
#define LOAD_BTP_4x2(N, K) \
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \
rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \
rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \
t0 = vec_mergeh(rb0, rb1); \
t1 = vec_mergeh(rb2, rb3); \
LOAD_PAIR(pb0, t0, t1); \
t0 = vec_mergel(rb0, rb1); \
t1 = vec_mergel(rb2, rb3); \
LOAD_PAIR(pb1, t0, t1);
#define LOAD_BTP_4x1(N, K) \
rb0 = vec_xor(rb0, rb0); \
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \
rb1 = vec_xor(rb1, rb1); \
rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \
rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \
LOAD_PAIR(pb0, rb0, rb1);
#define LOAD_BTP_2x2(N, K) \
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \
t0 = vec_mergeh(rb0, rb1); \
__builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); \
t1 = vec_mergel(rb0, rb1); \
__builtin_vsx_assemble_pair(&pb1, (vec_t)t1, (vec_t)t1);
#define LOAD_BTP_2x1(N, K) \
rb0 = vec_xor(rb0, rb0); \
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \
__builtin_vsx_assemble_pair(&pb0, (vec_t)rb0, (vec_t)rb0);
#define LOAD_B_1x1(N, K) rb0 = vec_splats(B[((N)*ldb)+K]);
#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \
a0, a1, a2, a3, a4, a5, a6, a7) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \
__builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \
__builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \
__builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \
__builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7);
#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3);
#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1);
#define KERNEL_MMA_1ACC(b0, a0) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0);
#define KERNEL_MMA_1ACC_(acc, b0, a0) \
__builtin_mma_xvf64gerpp(&acc, b0, (vec_t)a0);
#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \
result = vec_madd(a0, b0, result); \
result1 = vec_madd(a1, b1, result1); \
result2 = vec_madd(a2, b2, result2); \
result3 = vec_madd(a3, b3, result3);
#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \
result = vec_madd(a0, b0, result); \
result1 = vec_madd(a1, b1, result1);
#define KERNEL_VMADD_1VSR(a0, b0) \
result = vec_madd(a0, b0, result);
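/* GEMM kernel (double precision, POWER10 MMA): computes
 *   C[n*ldc+m] = alpha * sum_k A[m*lda+k] * B[n*ldb+k]   (+ beta*C unless B0)
 * i.e. both A and B are traversed along k and their tiles are transposed on
 * the fly into MMA operands; m is blocked by 8/4/2/1, n by 8/4/2/1. */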
#ifdef B0
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
#else
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
#endif
{
BLASLONG m, n, k;
BLASLONG m8 = M & ~7;
BLASLONG m4 = M & ~3;
BLASLONG m2 = M & ~1;
BLASLONG n8 = N & ~7;
BLASLONG n4 = N & ~3;
BLASLONG n2 = N & ~1;
BLASLONG k2 = K & ~1;
vector double valpha = vec_splats(alpha);
#if !defined(B0)
vector double vbeta = vec_splats(beta);
#endif
for (m = 0; m < m8; m += 8) {
for (n = 0; n < n8; n += 8) {
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
INIT_8ACCS();
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
register vector double t0, t1;
__vector_pair pb0, pb1, pb2, pb3;
for (k = 0; k < k2; k += 2) {
LOAD_AT_8x2(m, k);
LOAD_BTP_8x2(n, k);
KERNEL_MMA_8ACC(pb0, pb0, pb0, pb0, pb1, pb1, pb1, pb1,
ra0, ra2, ra4, ra6, ra0, ra2, ra4, ra6);
KERNEL_MMA_8ACC(pb2, pb2, pb2, pb2, pb3, pb3, pb3, pb3,
ra1, ra3, ra5, ra7, ra1, ra3, ra5, ra7);
}
// workaround to avoid register spilling
for (; k < K; k++) {
LOAD_AT_4x1(m, k);
LOAD_BTP_4x1(n, k);
KERNEL_MMA_1ACC_(acc0, pb0, ra0);
KERNEL_MMA_1ACC_(acc1, pb0, ra1);
LOAD_AT_4x1(m+4, k);
KERNEL_MMA_1ACC_(acc2, pb0, ra0);
KERNEL_MMA_1ACC_(acc3, pb0, ra1);
LOAD_AT_4x1(m, k);
LOAD_BTP_4x1(n+4, k);
KERNEL_MMA_1ACC_(acc4, pb0, ra0);
KERNEL_MMA_1ACC_(acc5, pb0, ra1);
LOAD_AT_4x1(m+4, k);
KERNEL_MMA_1ACC_(acc6, pb0, ra0);
KERNEL_MMA_1ACC_(acc7, pb0, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc2, n+0, m+4);
SAVE_4x2_ACC(&acc4, n+4, m+0);
SAVE_4x2_ACC(&acc6, n+4, m+4);
SAVE_4x2_ACC(&acc1, n+0, m+2);
SAVE_4x2_ACC(&acc3, n+0, m+6);
SAVE_4x2_ACC(&acc5, n+4, m+2);
SAVE_4x2_ACC(&acc7, n+4, m+6);
}
for (; n < n4; n += 4) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
register vector double rb0, rb1, rb2, rb3;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_AT_8x2(m, k);
LOAD_BTP_4x2(n, k);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6);
KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra1, ra3, ra5, ra7);
}
for (; k < K; k++) {
LOAD_AT_8x1(m, k);
LOAD_BTP_4x1(n, k);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc2, n+0, m+4);
SAVE_4x2_ACC(&acc1, n+0, m+2);
SAVE_4x2_ACC(&acc3, n+0, m+6);
}
for (; n < n2; n += 2) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
register vector double rb0, rb1;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_AT_8x2(m, k);
LOAD_BTP_2x2(n, k);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6);
KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra1, ra3, ra5, ra7);
}
for (; k < K; k++) {
LOAD_AT_8x1(m, k);
LOAD_BTP_2x1(n, k);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n+0, m+0);
SAVE_2x2_ACC(&acc2, n+0, m+4);
SAVE_2x2_ACC(&acc1, n+0, m+2);
SAVE_2x2_ACC(&acc3, n+0, m+6);
}
for (; n < N; n++) {
register vector double result = ((vector double){0.,0.});
register vector double result1 = ((vector double){0.,0.});
register vector double result2 = ((vector double){0.,0.});
register vector double result3 = ((vector double){0.,0.});
register vector double ra0, ra1, ra2, ra3;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_AT_8x1(m, k);
LOAD_B_1x1(n, k);
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
SAVE_1x4_VSR(result1, n, m+2);
SAVE_1x4_VSR(result2, n, m+4);
SAVE_1x4_VSR(result3, n, m+6);
}
}
for (; m < m4; m += 4) {
for (n = 0; n < n8; n += 8) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1, ra2, ra3;
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
register vector double t0, t1, t2, t3;
__vector_pair pb0, pb1, pb2, pb3;
for (k = 0; k < k2; k += 2) {
LOAD_AT_4x2(m, k);
LOAD_BTP_8x2(n, k);
KERNEL_MMA_4ACC(pb0, pb0, pb1, pb1, ra0, ra2, ra0, ra2);
KERNEL_MMA_4ACC(pb2, pb2, pb3, pb3, ra1, ra3, ra1, ra3);
}
for (; k < K; k++) {
LOAD_AT_4x1(m, k);
LOAD_BTP_8x1(n, k);
KERNEL_MMA_4ACC(pb0, pb0, pb1, pb1, ra0, ra1, ra0, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+0, m+2);
SAVE_4x2_ACC(&acc2, n+4, m+0);
SAVE_4x2_ACC(&acc3, n+4, m+2);
}
for (; n < n4; n += 4) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0, ra1, ra2, ra3;
register vector double rb0, rb1, rb2, rb3;
register vector double t0, t1, t2, t3;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_AT_4x2(m, k);
LOAD_BTP_4x2(n, k);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2);
KERNEL_MMA_2ACC(pb1, pb1, ra1, ra3);
}
for (; k < K; k++) {
LOAD_AT_4x1(m, k);
LOAD_BTP_4x1(n, k);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+0, m+2);
}
for (; n < n2; n += 2) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0, ra1, ra2, ra3;
register vector double rb0, rb1;
register vector double t0, t1, t2, t3;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_AT_4x2(m, k);
LOAD_BTP_2x2(n, k);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2);
KERNEL_MMA_2ACC(pb1, pb1, ra1, ra3);
}
for (; k < K; k++) {
LOAD_AT_4x1(m, k);
LOAD_BTP_2x1(n, k);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n+0, m+0);
SAVE_2x2_ACC(&acc1, n+0, m+2);
}
for (; n < N; n++) {
register vector double result = ((vector double){0.,0.});
register vector double result1 = ((vector double){0.,0.});
register vector double ra0, ra1;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_AT_4x1(m, k);
LOAD_B_1x1(n, k);
KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0);
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
SAVE_1x4_VSR(result1, n, m+2);
}
}
for (; m < m2; m += 2) {
for (n = 0; n < n8; n += 8) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0, ra1;
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
register vector double t0, t1;
__vector_pair pb0, pb1, pb2, pb3;
for (k = 0; k < k2; k += 2) {
LOAD_AT_2x2(m, k);
LOAD_BTP_8x2(n, k);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
KERNEL_MMA_2ACC(pb2, pb3, ra1, ra1);
}
for (; k < K; k++) {
LOAD_AT_2x1(m, k);
LOAD_BTP_8x1(n, k);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+4, m+0);
}
for (; n < n4; n += 4) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0, ra1;
register vector double rb0, rb1, rb2, rb3;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_AT_2x2(m, k);
LOAD_BTP_4x2(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
KERNEL_MMA_1ACC(pb1, ra1);
}
for (; k < K; k++) {
LOAD_AT_2x1(m, k);
LOAD_BTP_4x1(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n, m);
}
for (; n < n2; n += 2) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0, ra1;
register vector double rb0, rb1;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_AT_2x2(m, k);
LOAD_BTP_2x2(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
KERNEL_MMA_1ACC(pb1, ra1);
}
for (; k < K; k++) {
LOAD_AT_2x1(m, k);
LOAD_BTP_2x1(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n, m);
}
for (; n < N; n++) {
register vector double result = ((vector double){0.,0.});
register vector double ra0, ra1;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_AT_2x1(m, k);
LOAD_B_1x1(n, k);
KERNEL_VMADD_1VSR(ra0, rb0);
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
}
}
for (; m < M; m++) {
for (n = 0; n < n8; n += 8) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0;
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
register vector double t0, t1;
__vector_pair pb0, pb1, pb2, pb3;
for (k = 0; k < k2; k += 2) {
LOAD_A_1x1(k, m);
LOAD_BTP_8x2(n, k);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
LOAD_A_1x1(k+1, m);
KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0);
}
for (; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_BTP_8x1(n, k);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x1_ACC(&acc0, n+0, m+0);
SAVE_4x1_ACC(&acc1, n+4, m+0);
}
for (; n < n4; n += 4) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0;
register vector double rb0, rb1, rb2, rb3;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_A_1x1(k, m);
LOAD_BTP_4x2(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
LOAD_A_1x1(k+1, m);
KERNEL_MMA_1ACC(pb1, ra0);
}
for (; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_BTP_4x1(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x1_ACC(&acc0, n, m);
}
for (; n < n2; n += 2) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0;
register vector double rb0, rb1;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_A_1x1(k, m);
LOAD_BTP_2x2(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
LOAD_A_1x1(k+1, m);
KERNEL_MMA_1ACC(pb1, ra0);
}
for (; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_BTP_2x1(n, k);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x1_ACC(&acc0, n+0, m+0);
}
for (; n < N; n++) {
FLOAT result = 0.0;
for (k = 0; k < K; k++) {
result += A[m*lda+k] * B[n*ldb+k];
}
result = result * alpha;
#if !defined(B0)
C[n*ldc+m] = (C[n*ldc+m] * beta) + result;
#else
C[n*ldc+m] = result;
#endif
}
}
return 0;
}

View File

@ -0,0 +1,829 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <altivec.h>
typedef __vector unsigned char vec_t;
#if !__has_builtin(__builtin_vsx_assemble_pair)
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
#endif
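/* This kernel comes in two flavors: with B0 defined it computes
   C = alpha*A*B (the beta == 0 case, so C is simply overwritten); otherwise it
   computes C = alpha*A*B + beta*C. The SAVE_* macros below differ only in
   whether they read and scale the existing contents of C. */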
#if !defined(B0)
#define SAVE_4x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
rc0 = vec_xl(0, C+(N+2)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[2] = vec_madd(result[2], valpha, rc0); \
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
rc0 = vec_xl(0, C+(N+3)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[3] = vec_madd(result[3], valpha, rc0); \
vec_xst(result[3], 0, C+(N+3)*ldc+M);
#define SAVE_2x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst(result[1], 0, C+(N+1)*ldc+M);
#define SAVE_1x4_VSR(result, N, M) \
rc0 = vec_xl(0, C+((N)*ldc)+M); \
rc0 = vec_mul(rc0, vbeta); \
result = vec_madd(result, valpha, rc0); \
vec_xst(result, 0, C+((N)*ldc)+M);
#define SAVE_4x1_VSR(result, N, M) \
result = vec_mul(result, valpha); \
C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \
C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1];
#else
#define SAVE_4x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
result[1] = vec_mul(result[1], valpha); \
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
result[2] = vec_mul(result[2], valpha); \
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
result[3] = vec_mul(result[3], valpha); \
vec_xst(result[3], 0, C+(N+3)*ldc+M);
#define SAVE_2x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
result[1] = vec_mul(result[1], valpha); \
vec_xst(result[1], 0, C+(N+1)*ldc+M);
#define SAVE_1x4_VSR(result, N, M) \
result = vec_mul(result, valpha); \
vec_xst(result, 0, C+((N)*ldc)+M);
#define SAVE_4x1_VSR(result, N, M) \
result = vec_mul(result, valpha); \
C[(N+0)*ldc+M] = result[0]; \
C[(N+1)*ldc+M] = result[1];
#endif
#define INIT_8ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1); \
__builtin_mma_xxsetaccz(&acc2); \
__builtin_mma_xxsetaccz(&acc3); \
__builtin_mma_xxsetaccz(&acc4); \
__builtin_mma_xxsetaccz(&acc5); \
__builtin_mma_xxsetaccz(&acc6); \
__builtin_mma_xxsetaccz(&acc7);
#define INIT_4ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1); \
__builtin_mma_xxsetaccz(&acc2); \
__builtin_mma_xxsetaccz(&acc3);
#define INIT_2ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1);
#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
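/* LOAD_AT_<rows>x<cols>: load a <rows> x <cols> tile of the transposed A panel
   (stored as A[m*lda + k]) and interleave it with vec_mergeh/vec_mergel so that
   each VSR holds two consecutive rows of the same k column, the layout expected
   by the xvf64gerpp updates below. */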
#define LOAD_AT_8x2(M, K) \
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \
ra2 = vec_xl(0, A+(M+2)*lda+K+0); \
ra3 = vec_xl(0, A+(M+3)*lda+K+0); \
t0 = vec_mergeh(ra0, ra1); \
t1 = vec_mergeh(ra2, ra3); \
t2 = vec_mergel(ra0, ra1); \
t3 = vec_mergel(ra2, ra3); \
ra0 = t0; \
ra1 = t2; \
ra2 = t1; \
ra3 = t3; \
ra4 = vec_xl(0, A+(M+4)*lda+K+0); \
ra5 = vec_xl(0, A+(M+5)*lda+K+0); \
ra6 = vec_xl(0, A+(M+6)*lda+K+0); \
ra7 = vec_xl(0, A+(M+7)*lda+K+0); \
t0 = vec_mergeh(ra4, ra5); \
t1 = vec_mergeh(ra6, ra7); \
t2 = vec_mergel(ra4, ra5); \
t3 = vec_mergel(ra6, ra7); \
ra4 = t0; \
ra5 = t2; \
ra6 = t1; \
ra7 = t3;
#define LOAD_AT_8x1(M, K) \
ra0 = vec_xor(ra0, ra0); \
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \
ra1 = vec_xor(ra1, ra1); \
ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \
ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \
ra2 = vec_xor(ra2, ra2); \
ra2 = vec_insert(A[(M+4)*lda+K], ra2, 0); \
ra2 = vec_insert(A[(M+5)*lda+K], ra2, 1); \
ra3 = vec_xor(ra3, ra3); \
ra3 = vec_insert(A[(M+6)*lda+K], ra3, 0); \
ra3 = vec_insert(A[(M+7)*lda+K], ra3, 1);
#define LOAD_AT_4x2(M, K) \
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \
ra2 = vec_xl(0, A+(M+2)*lda+K+0); \
ra3 = vec_xl(0, A+(M+3)*lda+K+0); \
t0 = vec_mergeh(ra0, ra1); \
t1 = vec_mergeh(ra2, ra3); \
t2 = vec_mergel(ra0, ra1); \
t3 = vec_mergel(ra2, ra3); \
ra0 = t0; \
ra1 = t2; \
ra2 = t1; \
ra3 = t3;
#define LOAD_AT_4x1(M, K) \
ra0 = vec_xor(ra0, ra0); \
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \
ra1 = vec_xor(ra1, ra1); \
ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \
ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1);
#define LOAD_AT_2x2(M, K) \
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \
t0 = vec_mergeh(ra0, ra1); \
t1 = vec_mergel(ra0, ra1); \
ra0 = t0; \
ra1 = t1;
#define LOAD_AT_2x1(M, K) \
ra0 = vec_xor(ra0, ra0); \
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1);
#define LOAD_A_1x1(M, K) ra0 = vec_splats(A[(M)*lda+K]);
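/* LOAD_BP_1xN / LOAD_B_1xN: load N consecutive elements of one row of B
   (B[k*ldb + n]). The BP variants place them in __vector_pair registers (four
   doubles), the B operand format required by the MMA xvf64ger instructions;
   LOAD_BP_1x2 duplicates its two elements to fill the pair. */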
#define LOAD_BP_1x8(K, N) \
pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \
pb1 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+4]));
#define LOAD_BP_1x4(K, N) \
pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0]));
#define LOAD_BP_1x2(K, N) \
t0 = vec_xl(0, B+((K)*ldb)+N); \
__builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0);
#define LOAD_B_1x8(K, N) \
rb0 = vec_xl(0, B+(K*ldb)+N+0); \
rb1 = vec_xl(0, B+(K*ldb)+N+2); \
rb2 = vec_xl(0, B+(K*ldb)+N+4); \
rb3 = vec_xl(0, B+(K*ldb)+N+6);
#define LOAD_B_1x4(K, N) \
rb0 = vec_xl(0, B+(K*ldb)+N+0); \
rb1 = vec_xl(0, B+(K*ldb)+N+2);
#define LOAD_B_1x2(K, N) \
rb0 = vec_xl(0, B+(K*ldb)+N+0);
#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[(K)*ldb+N]);
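/* KERNEL_MMA_<n>ACC: each xvf64gerpp performs a rank-1 outer-product update of
   a 4x2 double accumulator: acc += b (vector pair, 4 doubles along N) *
   a (VSR, 2 doubles along M). The variants simply fan one set of A/B loads out
   over 1, 2, 4 or 8 accumulators. */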
#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \
a0, a1, a2, a3, a4, a5, a6, a7) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \
__builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \
__builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \
__builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \
__builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7);
#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3);
#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1);
#define KERNEL_MMA_1ACC(b0, a0) \
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0);
#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \
result = vec_madd(a0, b0, result); \
result1 = vec_madd(a1, b1, result1); \
result2 = vec_madd(a2, b2, result2); \
result3 = vec_madd(a3, b3, result3);
#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \
result = vec_madd(a0, b0, result); \
result1 = vec_madd(a1, b1, result1);
#define KERNEL_VMADD_1VSR(a0, b0) \
result = vec_madd(a0, b0, result);
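/* On-demand packing: while the first column block (n == 0) is being computed,
   the current 8-row slice of A (already transposed to k-major order) is stored
   contiguously into packA, 8 doubles per k; the remaining n blocks then reload
   it with LOAD_PACKED_A instead of re-gathering and transposing from A. */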
#define PACK_A(ra0, ra1, ra2, ra3, offset) \
vec_xst(ra0, 0, packA+(k*8)+0+offset); \
vec_xst(ra1, 0, packA+(k*8)+2+offset); \
vec_xst(ra2, 0, packA+(k*8)+4+offset); \
vec_xst(ra3, 0, packA+(k*8)+6+offset);
#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \
ra0 = vec_xl(0, packA+(k*8)+0+offset); \
ra1 = vec_xl(0, packA+(k*8)+2+offset); \
ra2 = vec_xl(0, packA+(k*8)+4+offset); \
ra3 = vec_xl(0, packA+(k*8)+6+offset);
#ifdef B0
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
#else
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
#endif
{
BLASLONG m, n, k;
BLASLONG m8 = M & ~7;
BLASLONG m4 = M & ~3;
BLASLONG m2 = M & ~1;
BLASLONG n8 = N & ~7;
BLASLONG n4 = N & ~3;
BLASLONG n2 = N & ~1;
BLASLONG k2 = K & ~1;
#if defined(__GNUC__) && !defined(__clang__)
int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 1 : 0;
#else
int has_packing = 0;
#endif
double *packA;
if (has_packing) packA = (double *)malloc(K*8*sizeof(double));
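/* Packing is only attempted for reasonably large problems (M, N, K >= 32) and
   only with GCC; for other compilers has_packing stays 0 and the A tile is
   re-read and transposed from memory on every column block. */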
vector double valpha = vec_splats(alpha);
#if !defined(B0)
vector double vbeta = vec_splats(beta);
#endif
for (m = 0; m < m8; m += 8) {
for (n = 0; n < n8; n += 8) {
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
INIT_8ACCS();
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
register vector double t0, t1, t2, t3;
__vector_pair pb0, pb1;
if (has_packing) {
if (n == 0) {
for (k = 0; k < k2; k += 2) {
LOAD_AT_8x2(m, k);
LOAD_BP_1x8(k, n);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6);
PACK_A(ra0, ra2, ra4, ra6, 0);
LOAD_BP_1x8(k+1, n);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7);
PACK_A(ra1, ra3, ra5, ra7, 8);
}
for (; k < K; k++) {
LOAD_AT_8x1(m, k);
LOAD_BP_1x8(k, n);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
PACK_A(ra0, ra1, ra2, ra3, 0);
}
} else {
for (k = 0; k < k2; k += 2) {
LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0);
LOAD_BP_1x8(k, n);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6);
LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8);
LOAD_BP_1x8(k+1, n);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7);
}
for (; k < K; k++) {
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
LOAD_BP_1x8(k, n);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
}
}
} else {
for (k = 0; k < k2; k += 2) {
LOAD_AT_8x2(m, k);
LOAD_BP_1x8(k, n);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6);
LOAD_BP_1x8(k+1, n);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7);
}
for (; k < K; k++) {
LOAD_AT_8x1(m, k);
LOAD_BP_1x8(k, n);
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
}
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc2, n+0, m+2);
SAVE_4x2_ACC(&acc4, n+0, m+4);
SAVE_4x2_ACC(&acc6, n+0, m+6);
SAVE_4x2_ACC(&acc1, n+4, m+0);
SAVE_4x2_ACC(&acc3, n+4, m+2);
SAVE_4x2_ACC(&acc5, n+4, m+4);
SAVE_4x2_ACC(&acc7, n+4, m+6);
}
for (; n < n4; n += 4) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
register vector double t0, t1, t2, t3;
__vector_pair pb0;
if (!has_packing) {
for (k = 0; k < k2; k += 2) {
LOAD_AT_8x2(m, k);
LOAD_BP_1x4(k, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6);
LOAD_BP_1x4(k+1, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7);
}
for (; k < K; k++) {
LOAD_AT_8x1(m, k);
LOAD_BP_1x4(k, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
}
} else {
for (k = 0; k < k2; k += 2) {
LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0);
LOAD_BP_1x4(k, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6);
LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8);
LOAD_BP_1x4(k+1, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7);
}
for (; k < K; k++) {
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
LOAD_BP_1x4(k, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
}
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+0, m+2);
SAVE_4x2_ACC(&acc2, n+0, m+4);
SAVE_4x2_ACC(&acc3, n+0, m+6);
}
for (; n < n2; n += 2) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
register vector double t0, t1, t2, t3;
__vector_pair pb0;
if (!has_packing) {
for (k = 0; k < k2; k += 2) {
LOAD_AT_8x2(m, k);
LOAD_BP_1x2(k, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6);
LOAD_BP_1x2(k+1, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7);
}
for (; k < K; k++) {
LOAD_AT_8x1(m, k);
LOAD_BP_1x2(k, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
}
} else {
for (k = 0; k < k2; k += 2) {
LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0);
LOAD_BP_1x2(k, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6);
LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8);
LOAD_BP_1x2(k+1, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7);
}
for (; k < K; k++) {
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
LOAD_BP_1x2(k, n);
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
}
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n+0, m+0);
SAVE_2x2_ACC(&acc1, n+0, m+2);
SAVE_2x2_ACC(&acc2, n+0, m+4);
SAVE_2x2_ACC(&acc3, n+0, m+6);
}
for (; n < N; n++) {
register vector double result = ((vector double){0.,0.});
register vector double result1 = ((vector double){0.,0.});
register vector double result2 = ((vector double){0.,0.});
register vector double result3 = ((vector double){0.,0.});
register vector double ra0, ra1, ra2, ra3;
register vector double rb0;
if (!has_packing) {
for (k = 0; k < K; k++) {
LOAD_AT_8x1(m, k);
LOAD_B_1x1(k, n);
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
}
} else {
for (k = 0; k < K; k++) {
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
LOAD_B_1x1(k, n);
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
}
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
SAVE_1x4_VSR(result1, n, m+2);
SAVE_1x4_VSR(result2, n, m+4);
SAVE_1x4_VSR(result3, n, m+6);
}
}
for (; m < m4; m += 4) {
for (n = 0; n < n8; n += 8) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector double ra0, ra1, ra2, ra3;
register vector double t0, t1, t2, t3;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_AT_4x2(m, k);
LOAD_BP_1x8(k, n);
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra2, ra2);
LOAD_BP_1x8(k+1, n);
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra1, ra1, ra3, ra3);
}
for (; k < K; k++) {
LOAD_AT_4x1(m, k);
LOAD_BP_1x8(k, n);
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc2, n+0, m+2);
SAVE_4x2_ACC(&acc1, n+4, m+0);
SAVE_4x2_ACC(&acc3, n+4, m+2);
}
for (; n < n4; n += 4) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0, ra1, ra2, ra3;
register vector double t0, t1, t2, t3;
__vector_pair pb0;
for (k = 0; k < k2; k += 2) {
LOAD_AT_4x2(m, k);
LOAD_BP_1x4(k, n);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2);
LOAD_BP_1x4(k+1, n);
KERNEL_MMA_2ACC(pb0, pb0, ra1, ra3);
}
for (; k < K; k++) {
LOAD_AT_4x1(m, k);
LOAD_BP_1x4(k, n);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+0, m+2);
}
for (; n < n2; n += 2) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0, ra1, ra2, ra3;
register vector double t0, t1, t2, t3;
__vector_pair pb0;
for (k = 0; k < k2; k += 2) {
LOAD_AT_4x2(m, k);
LOAD_BP_1x2(k, n);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2);
LOAD_BP_1x2(k+1, n);
KERNEL_MMA_2ACC(pb0, pb0, ra1, ra3);
}
for (; k < K; k++) {
LOAD_AT_4x1(m, k);
LOAD_BP_1x2(k, n);
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n+0, m+0);
SAVE_2x2_ACC(&acc1, n+0, m+2);
}
for (; n < N; n++) {
register vector double result = ((vector double){0.,0.});
register vector double result1 = ((vector double){0.,0.});
register vector double ra0, ra1;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_AT_4x1(m, k);
LOAD_B_1x1(k, n);
KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0);
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
SAVE_1x4_VSR(result1, n, m+2);
}
}
for (; m < m2; m += 2) {
for (n = 0; n < n8; n += 8) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector double ra0, ra1;
register vector double t0, t1;
__vector_pair pb0, pb1;
for (k = 0; k < k2; k += 2) {
LOAD_AT_2x2(m, k);
LOAD_BP_1x8(k, n);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
LOAD_BP_1x8(k+1, n);
KERNEL_MMA_2ACC(pb0, pb1, ra1, ra1);
}
for (; k < K; k++) {
LOAD_AT_2x1(m, k);
LOAD_BP_1x8(k, n);
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+4, m+0);
}
for (; n < n4; n += 4) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0, ra1;
register vector double t0, t1;
__vector_pair pb0;
for (k = 0; k < k2; k += 2) {
LOAD_AT_2x2(m, k);
LOAD_BP_1x4(k, n);
KERNEL_MMA_1ACC(pb0, ra0);
LOAD_BP_1x4(k+1, n);
KERNEL_MMA_1ACC(pb0, ra1);
}
for (; k < K; k++) {
LOAD_AT_2x1(m, k);
LOAD_BP_1x4(k, n);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_4x2_ACC(&acc0, n, m);
}
for (; n < n2; n += 2) {
__vector_quad acc0;
INIT_1ACC();
register vector double ra0, ra1;
register vector double t0, t1;
__vector_pair pb0;
for (k = 0; k < k2; k += 2) {
LOAD_AT_2x2(m, k);
LOAD_BP_1x2(k, n);
KERNEL_MMA_1ACC(pb0, ra0);
LOAD_BP_1x2(k+1, n);
KERNEL_MMA_1ACC(pb0, ra1);
}
for (; k < K; k++) {
LOAD_AT_2x1(m, k);
LOAD_BP_1x2(k, n);
KERNEL_MMA_1ACC(pb0, ra0);
}
#if !defined(B0)
register vector double rc0;
#endif
vector double result[4];
SAVE_2x2_ACC(&acc0, n, m);
}
for (; n < N; n++) {
register vector double result = ((vector double){0.,0.});
register vector double ra0;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_AT_2x1(m, k);
LOAD_B_1x1(k, n);
KERNEL_VMADD_1VSR(ra0, rb0);
}
#if !defined(B0)
register vector double rc0;
#endif
SAVE_1x4_VSR(result, n, m+0);
}
}
for (; m < M; m++) {
for (n = 0; n < n8; n += 8) {
register vector double result = ((vector double){0.,0.});
register vector double result1 = ((vector double){0.,0.});
register vector double result2 = ((vector double){0.,0.});
register vector double result3 = ((vector double){0.,0.});
register vector double ra0;
register vector double rb0, rb1, rb2, rb3;
for (k = 0; k < K; k++) {
LOAD_A_1x1(m, k);
LOAD_B_1x8(k, n);
KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3);
}
SAVE_4x1_VSR(result, n, m);
SAVE_4x1_VSR(result1, n+2, m);
SAVE_4x1_VSR(result2, n+4, m);
SAVE_4x1_VSR(result3, n+6, m);
}
for (; n < n4; n += 4) {
register vector double result = ((vector double){0.,0.});
register vector double result1 = ((vector double){0.,0.});
register vector double ra0;
register vector double rb0, rb1;
for (k = 0; k < K; k++) {
LOAD_A_1x1(m, k);
LOAD_B_1x4(k, n);
KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1);
}
SAVE_4x1_VSR(result, n, m);
SAVE_4x1_VSR(result1, n+2, m);
}
for (; n < n2; n += 2) {
register vector double result = ((vector double){0.,0.});
register vector double ra0;
register vector double rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x1(m, k);
LOAD_B_1x2(k, n);
KERNEL_VMADD_1VSR(ra0, rb0);
}
SAVE_4x1_VSR(result, n, m);
}
for (; n < N; n++) {
FLOAT result = 0.0;
for (k = 0; k < K; k++) {
result += A[m*lda+k] * B[k*ldb+n];
}
result = result * alpha;
#if !defined(B0)
C[n*ldc+m] = (C[n*ldc+m] * beta) + result;
#else
C[n*ldc+m] = result;
#endif
}
}
if(has_packing) free(packA);
return 0;
}

View File

@ -40,18 +40,27 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
XXSPLTD_S(32,%x9,0) // alpha, alpha XXSPLTD_S(32,%x9,0) // alpha, alpha
"sldi %6, %13, 3 \n\t" // lda * sizeof (double) "sldi %6, %13, 3 \n\t" // lda * sizeof (double)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmuldp 34, 40, 32 \n\t" // x0 * alpha, x1 * alpha
"xvmuldp 35, 41, 32 \n\t" // x2 * alpha, x3 * alpha
#else
"xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha "xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha
"xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha "xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha
#endif
"add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda "add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda
"add %6, %6, %6 \n\t" // 2 * lda "add %6, %6, %6 \n\t" // 2 * lda
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha
XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha
#else
XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha
XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha
#endif
"add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda
"add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda
@ -286,6 +295,16 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
"add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda "add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda
"add %10, %10, %10 \n\t" // 2 * lda "add %10, %10, %10 \n\t" // 2 * lda
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha
XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha
XXSPLTD_S(48,39,0) // x6 * alpha, x6 * alpha
XXSPLTD_S(49,39,1) // x7 * alpha, x7 * alpha
XXSPLTD_S(39,38,1) // x5 * alpha, x5 * alpha
XXSPLTD_S(38,38,0) // x4 * alpha, x4 * alpha
#else
XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha
@ -294,6 +313,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha
XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha
XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha
#endif
"add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda "add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda
"add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda "add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda
@ -319,30 +339,69 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
"one%=: \n\t" "one%=: \n\t"
"lxvp 36, 0( %2) \n\t" // y0, y1 "lxvp 36, 0( %2) \n\t" // y0, y1
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 40, 32 \n\t"
"xvmaddadp 37, 41, 32 \n\t"
#else
"xvmaddadp 36, 40, 34 \n\t" "xvmaddadp 36, 40, 34 \n\t"
"xvmaddadp 37, 41, 34 \n\t" "xvmaddadp 37, 41, 34 \n\t"
#endif
"lxvpx 40, %3, %11 \n\t" // a0[0], a0[1] "lxvpx 40, %3, %11 \n\t" // a0[0], a0[1]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 42, 33 \n\t"
"xvmaddadp 37, 43, 33 \n\t"
#else
"xvmaddadp 36, 42, 35 \n\t" "xvmaddadp 36, 42, 35 \n\t"
"xvmaddadp 37, 43, 35 \n\t" "xvmaddadp 37, 43, 35 \n\t"
#endif
"lxvpx 42, %4, %11 \n\t" // a1[0], a1[1] "lxvpx 42, %4, %11 \n\t" // a1[0], a1[1]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 44, 34 \n\t"
"xvmaddadp 37, 45, 34 \n\t"
#else
"xvmaddadp 36, 44, 32 \n\t" "xvmaddadp 36, 44, 32 \n\t"
"xvmaddadp 37, 45, 32 \n\t" "xvmaddadp 37, 45, 32 \n\t"
#endif
"lxvpx 44, %5, %11 \n\t" // a2[0], a2[1] "lxvpx 44, %5, %11 \n\t" // a2[0], a2[1]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 46, 35 \n\t"
"xvmaddadp 37, 47, 35 \n\t"
#else
"xvmaddadp 36, 46, 33 \n\t" "xvmaddadp 36, 46, 33 \n\t"
"xvmaddadp 37, 47, 33 \n\t" "xvmaddadp 37, 47, 33 \n\t"
#endif
"lxvpx 46, %6, %11 \n\t" // a3[0], a3[1] "lxvpx 46, %6, %11 \n\t" // a3[0], a3[1]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 50, 38 \n\t"
"xvmaddadp 37, 51, 38 \n\t"
#else
"xvmaddadp 36, 50, 48 \n\t" "xvmaddadp 36, 50, 48 \n\t"
"xvmaddadp 37, 51, 48 \n\t" "xvmaddadp 37, 51, 48 \n\t"
#endif
"lxvpx 50, %7, %11 \n\t" // a4[0] "lxvpx 50, %7, %11 \n\t" // a4[0]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 52, 39 \n\t"
"xvmaddadp 37, 53, 39 \n\t"
#else
"xvmaddadp 36, 52, 49 \n\t" "xvmaddadp 36, 52, 49 \n\t"
"xvmaddadp 37, 53, 49 \n\t" "xvmaddadp 37, 53, 49 \n\t"
#endif
"lxvpx 52, %8, %11 \n\t" // a5[0] "lxvpx 52, %8, %11 \n\t" // a5[0]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 54, 48 \n\t"
"xvmaddadp 37, 55, 48 \n\t"
#else
"xvmaddadp 36, 54, 38 \n\t" "xvmaddadp 36, 54, 38 \n\t"
"xvmaddadp 37, 55, 38 \n\t" "xvmaddadp 37, 55, 38 \n\t"
#endif
"lxvpx 54, %9, %11 \n\t" // a6[0] "lxvpx 54, %9, %11 \n\t" // a6[0]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 56, 49 \n\t"
"xvmaddadp 37, 57, 49 \n\t"
#else
"xvmaddadp 36, 56, 39 \n\t" "xvmaddadp 36, 56, 39 \n\t"
"xvmaddadp 37, 57, 39 \n\t" "xvmaddadp 37, 57, 39 \n\t"
#endif
"lxvpx 56, %10, %11 \n\t" // a7[0] "lxvpx 56, %10, %11 \n\t" // a7[0]
"addi %11, %11, 32 \n\t" "addi %11, %11, 32 \n\t"
@ -355,6 +414,24 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
"two%=: \n\t" "two%=: \n\t"
"lxvp 36, 0( %2) \n\t" // y0, y1 "lxvp 36, 0( %2) \n\t" // y0, y1
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 40, 32 \n\t"
"xvmaddadp 37, 41, 32 \n\t"
"xvmaddadp 36, 42, 33 \n\t"
"xvmaddadp 37, 43, 33 \n\t"
"xvmaddadp 36, 44, 34 \n\t"
"xvmaddadp 37, 45, 34 \n\t"
"xvmaddadp 36, 46, 35 \n\t"
"xvmaddadp 37, 47, 35 \n\t"
"xvmaddadp 36, 50, 38 \n\t"
"xvmaddadp 37, 51, 38 \n\t"
"xvmaddadp 36, 52, 39 \n\t"
"xvmaddadp 37, 53, 39 \n\t"
"xvmaddadp 36, 54, 48 \n\t"
"xvmaddadp 37, 55, 48 \n\t"
"xvmaddadp 36, 56, 49 \n\t"
"xvmaddadp 37, 57, 49 \n\t"
#else
"xvmaddadp 36, 40, 34 \n\t" "xvmaddadp 36, 40, 34 \n\t"
"xvmaddadp 37, 41, 34 \n\t" "xvmaddadp 37, 41, 34 \n\t"
"xvmaddadp 36, 42, 35 \n\t" "xvmaddadp 36, 42, 35 \n\t"
@ -371,6 +448,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
"xvmaddadp 37, 55, 38 \n\t" "xvmaddadp 37, 55, 38 \n\t"
"xvmaddadp 36, 56, 39 \n\t" "xvmaddadp 36, 56, 39 \n\t"
"xvmaddadp 37, 57, 39 \n\t" "xvmaddadp 37, 57, 39 \n\t"
#endif
"stxvp 36, 0( %2) \n\t" // y0, y1 "stxvp 36, 0( %2) \n\t" // y0, y1
: :

View File

@ -279,34 +279,58 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"lxvp 40, 32(%[y]) \n\t" "lxvp 40, 32(%[y]) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXMRGHD_S(42,34,35)
XXMRGLD_S(43,34,35)
XXMRGHD_S(44,4,5)
XXMRGLD_S(45,4,5)
#else
XXMRGLD_S(42,35,34) XXMRGLD_S(42,35,34)
XXMRGHD_S(43,35,34) XXMRGHD_S(43,35,34)
XXMRGLD_S(44,5,4) XXMRGLD_S(44,5,4)
XXMRGHD_S(45,5,4) XXMRGHD_S(45,5,4)
#endif
"xvadddp 42,42,43 \n\t" "xvadddp 42,42,43 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXMRGHD_S(46,6,7)
XXMRGLD_S(47,6,7)
#else
XXMRGLD_S(46,7,6) XXMRGLD_S(46,7,6)
XXMRGHD_S(47,7,6) XXMRGHD_S(47,7,6)
#endif
"xvadddp 44,44,45 \n\t" "xvadddp 44,44,45 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXMRGHD_S(48,8,9)
XXMRGLD_S(49,8,9)
#else
XXMRGLD_S(48,9,8) XXMRGLD_S(48,9,8)
XXMRGHD_S(49,9,8) XXMRGHD_S(49,9,8)
#endif
"xvadddp 46,46,47 \n\t" "xvadddp 46,46,47 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 38,42,36 \n\t"
"xvmaddadp 39,44,36 \n\t"
#else
"xvmaddadp 39,42,36 \n\t" "xvmaddadp 39,42,36 \n\t"
"xvmaddadp 38,44,36 \n\t" "xvmaddadp 38,44,36 \n\t"
#endif
"xvadddp 48,48,49 \n\t" "xvadddp 48,48,49 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 41,48,36 \n\t"
#else
"xvmaddadp 41,46,36 \n\t" "xvmaddadp 41,46,36 \n\t"
#endif
"stxvp 38, 0(%[y]) \n\t" "stxvp 38, 0(%[y]) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 40,46,36 \n\t"
#else
"xvmaddadp 40,48,36 \n\t" "xvmaddadp 40,48,36 \n\t"
#endif
"stxvp 40, 32(%[y]) \n\t" "stxvp 40, 32(%[y]) \n\t"
: [memy] "+m" (*(double (*)[8])y), : [memy] "+m" (*(double (*)[8])y),

View File

@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9) #if defined(POWER8) || defined(POWER9)
#include "drot_microk_power8.c" #include "drot_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "drot_microk_power10.c"
#elif defined(POWER10) #elif defined(POWER10)
#include "drot_microk_power8.c" #include "drot_microk_power10.c"
#endif #endif
#endif #endif
@ -117,7 +115,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
if ( (inc_x == 1) && (inc_y == 1) ) if ( (inc_x == 1) && (inc_y == 1) )
{ {
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #if defined(POWER10)
if ( n >= 16 ) if ( n >= 16 )
{ {
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;

View File

@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9) #if defined(POWER8) || defined(POWER9)
#include "dscal_microk_power8.c" #include "dscal_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "dscal_microk_power10.c"
#elif defined(POWER10) #elif defined(POWER10)
#include "dscal_microk_power8.c" #include "dscal_microk_power10.c"
#endif #endif
#endif #endif
@ -104,7 +102,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( da == 0.0 ) if ( da == 0.0 )
{ {
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #if defined(POWER10)
if ( n >= 16 ) if ( n >= 16 )
{ {
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
@ -138,7 +136,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
else else
{ {
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #if defined(POWER10)
if ( n >= 16 ) if ( n >= 16 )
{ {
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;

View File

@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9) #if defined(POWER8) || defined(POWER9)
#include "dswap_microk_power8.c" #include "dswap_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "swap_microk_power10.c"
#elif defined(POWER10) #elif defined(POWER10)
#include "dswap_microk_power8.c" #include "swap_microk_power10.c"
#endif #endif
#endif #endif
@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
if ( (inc_x == 1) && (inc_y == 1 )) if ( (inc_x == 1) && (inc_y == 1 ))
{ {
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #if defined(POWER10)
if ( n >= 32 ) if ( n >= 32 )
{ {
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;

View File

@ -0,0 +1,84 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta)
{
double MNK = (double) M * (double) N * (double) K;
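// Heuristic gate for the small-matrix kernels: return 1 to dispatch this call
// to the small kernel, 0 to fall back to the regular GEMM path. MNK is the
// multiply-add count of the problem, e.g. the 54.0*54.0*54.0 threshold below
// corresponds to roughly 157k fused multiply-adds.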
#if defined(DOUBLE) // dgemm
// GCC 11 (minor release <= 2) has an issue when multiple assemble_pair
// builtins are used; it affects both dgemm_nn and dgemm_tn.
#if (defined(__GNUC__) && (__GNUC__ == 11 && __GNUC_MINOR__ <= 2))
if (!transb)
return 0;
#endif
if (MNK <= 54.0*54.0*54.0)
return 1;
#else // sgemm
#if defined(__GNUC__) && defined(__clang__)
// clang generates code with register spilling in the packing region, so the
// packing optimization is disabled for clang. Since on-demand packing is one
// of the reasons the small kernels outperform the normal flow as MNK grows,
// the MNK range permitted for clang-generated code is reduced accordingly.
if (MNK > 84.0*84.0*84.0)
return 0;
if (transa && !transb) {
// sgemm_tn works better when packing on-demand is used
if (MNK <= 64.0*64.0*64.0 && K >= 4)
return 1;
else
return 0;
}
#else // gcc
if (MNK > 100.0*100.0*100.0)
return 0;
#endif
// With more than one CPU available, multi-threaded execution outperforms (or
// at least approaches) the small kernel, so only permit it for small MNK.
if (num_cpu_avail(3) > 1) {
if (MNK <= 64.0*64.0*64.0)
return 1;
} else {
return 1;
}
#endif
return 0;
}

View File

@ -49,10 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9) #if defined(POWER8) || defined(POWER9)
#include "sasum_microk_power8.c" #include "sasum_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "sasum_microk_power10.c"
#elif defined(POWER10) #elif defined(POWER10)
#include "sasum_microk_power8.c" #include "sasum_microk_power10.c"
#endif #endif
#endif #endif
@ -114,7 +112,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( inc_x == 1 ) if ( inc_x == 1 )
{ {
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #if defined(POWER10)
if ( n >= 32 ) if ( n >= 32 )
{ {
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;

File diff suppressed because it is too large

View File

@ -0,0 +1,887 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <altivec.h>
typedef __vector unsigned char vec_t;
#if !defined(B0)
#define SAVE_4x4_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
rc0 = vec_xl(0, C+(N+2)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[2] = vec_madd(result[2], valpha, rc0); \
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
rc0 = vec_xl(0, C+(N+3)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[3] = vec_madd(result[3], valpha, rc0); \
vec_xst(result[3], 0, C+(N+3)*ldc+M);
#define SAVE_4x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \
rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[2] = vec_madd(result[2], valpha, rc0); \
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \
rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result[3] = vec_madd(result[3], valpha, rc0); \
vec_xst_len(result[3], C+(N+3)*ldc+M, 8);
#define SAVE_2x4_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[0] = vec_madd(result[0], valpha, rc0); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
rc0 = vec_mul(rc0, vbeta); \
result[1] = vec_madd(result[1], valpha, rc0); \
vec_xst(result[1], 0, C+(N+1)*ldc+M);
#define SAVE_1x4_VSR(result, N, M) \
rc0 = vec_xl(0, C+((N)*ldc)+M); \
rc0 = vec_mul(rc0, vbeta); \
result = vec_madd(result, valpha, rc0); \
vec_xst(result, 0, C+((N)*ldc)+M);
#define SAVE_2x2_VSR(result, N, M) \
rc0 = vec_xl_len(C+(N*ldc)+M, 8); \
rc0 = vec_insert(C[(N+1)*ldc+M+0], rc0, 2); \
rc0 = vec_insert(C[(N+1)*ldc+M+1], rc0, 3); \
rc0 = vec_mul(rc0, vbeta); \
result = vec_madd(result, valpha, rc0); \
vec_xst_len(result, C+(N*ldc)+M, 8); \
C[(N+1)*ldc+M+0] = result[2]; \
C[(N+1)*ldc+M+1] = result[3];
#define SAVE_1x2_VSR(result, N, M) \
rc0 = vec_xl_len(C+(N*ldc)+M, 8); \
rc0 = vec_mul(rc0, vbeta); \
result = vec_madd(result, valpha, rc0); \
vec_xst_len(result, C+(N*ldc)+M, 8);
#define SAVE_4x1_VSR(result, N, M) \
result = vec_mul(result, valpha); \
C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \
C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; \
C[(N+2)*ldc+M] = (C[(N+2)*ldc+M] * beta) + result[2]; \
C[(N+3)*ldc+M] = (C[(N+3)*ldc+M] * beta) + result[3];
#define SAVE_2x1_VSR(result, N, M) \
result = vec_mul(result, valpha); \
C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \
C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1];
#else
#define SAVE_4x4_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
result[1] = vec_mul(result[1], valpha); \
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
result[2] = vec_mul(result[2], valpha); \
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
result[3] = vec_mul(result[3], valpha); \
vec_xst(result[3], 0, C+(N+3)*ldc+M);
#define SAVE_4x2_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
result[1] = vec_mul(result[1], valpha); \
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \
result[2] = vec_mul(result[2], valpha); \
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \
result[3] = vec_mul(result[3], valpha); \
vec_xst_len(result[3], C+(N+3)*ldc+M, 8);
#define SAVE_2x4_ACC(ACC, N, M) \
__builtin_mma_disassemble_acc((void *)result, ACC); \
result[0] = vec_mul(result[0], valpha); \
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
result[1] = vec_mul(result[1], valpha); \
vec_xst(result[1], 0, C+(N+1)*ldc+M);
#define SAVE_1x4_VSR(result, N, M) \
result = vec_mul(result, valpha); \
vec_xst(result, 0, C+((N)*ldc)+M);
#define SAVE_2x2_VSR(result, N, M) \
result = vec_mul(result, valpha); \
vec_xst_len(result, C+(N*ldc)+M, 8); \
C[(N+1)*ldc+M+0] = result[2]; \
C[(N+1)*ldc+M+1] = result[3];
#define SAVE_1x2_VSR(result, N, M) \
result = vec_mul(result, valpha); \
vec_xst_len(result, C+(N*ldc)+M, 8);
#define SAVE_4x1_VSR(result, N, M) \
result = vec_mul(result, valpha); \
C[(N+0)*ldc+M] = result[0]; \
C[(N+1)*ldc+M] = result[1]; \
C[(N+2)*ldc+M] = result[2]; \
C[(N+3)*ldc+M] = result[3];
#define SAVE_2x1_VSR(result, N, M) \
result = vec_mul(result, valpha); \
C[(N+0)*ldc+M] = result[0]; \
C[(N+1)*ldc+M] = result[1];
#endif
#define INIT_8ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1); \
__builtin_mma_xxsetaccz(&acc2); \
__builtin_mma_xxsetaccz(&acc3); \
__builtin_mma_xxsetaccz(&acc4); \
__builtin_mma_xxsetaccz(&acc5); \
__builtin_mma_xxsetaccz(&acc6); \
__builtin_mma_xxsetaccz(&acc7);
#define INIT_4ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1); \
__builtin_mma_xxsetaccz(&acc2); \
__builtin_mma_xxsetaccz(&acc3);
#define INIT_2ACCS() \
__builtin_mma_xxsetaccz(&acc0); \
__builtin_mma_xxsetaccz(&acc1);
#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
#define LOAD_A_1x16(K, M) \
ra0 = vec_xl(0, A+(K*lda)+M+0); \
ra1 = vec_xl(0, A+(K*lda)+M+4); \
ra2 = vec_xl(0, A+(K*lda)+M+8); \
ra3 = vec_xl(0, A+(K*lda)+M+12);
#define LOAD_A_1x8(K, M) \
ra0 = vec_xl(0, A+(K*lda)+M+0); \
ra1 = vec_xl(0, A+(K*lda)+M+4);
#define LOAD_A_1x4(K, M) ra0 = vec_xl(0, A+(K*lda)+M);
#define LOAD_A_2x2(K, M) \
ra0 = vec_splats(A[K*lda+M+0]); \
ra0 = vec_insert(A[K*lda+M+1], ra0, 1); \
ra0 = vec_insert(A[K*lda+M+1], ra0, 3);
#define LOAD_A_1x2(K, M) ra0 = vec_xl_len(A+(K*lda)+M, 8);
#define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M+0]);
#define LOAD_B_1x16(K, N) \
rb0 = vec_xl(0, B+(K*ldb)+N+0); \
rb1 = vec_xl(0, B+(K*ldb)+N+4); \
rb2 = vec_xl(0, B+(K*ldb)+N+8); \
rb3 = vec_xl(0, B+(K*ldb)+N+12);
#define LOAD_B_1x8(K, N) \
rb0 = vec_xl(0, B+(K*ldb)+N+0); \
rb1 = vec_xl(0, B+(K*ldb)+N+4);
#define LOAD_B_1x4(K, N) rb0 = vec_xl(0, B+(K*ldb)+N);
#define LOAD_B_2x2(K, N) \
rb0 = vec_splats(B[K*ldb+N]); \
rb0 = vec_insert(B[K*ldb+N+1], rb0, 2); \
rb0 = vec_insert(B[K*ldb+N+1], rb0, 3);
#define LOAD_B_1x2(K, N) rb0 = vec_xl_len(B+(K*ldb)+N, 8);
#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[K*ldb+N]);
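/* Single-precision MMA: xvf32gerpp performs a rank-1 update of a 4x4 float
   accumulator, acc += b (4 floats along N) * a (4 floats along M), so plain
   VSRs are used for both operands and no __vector_pair loads are needed. */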
#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \
a0, a1, a2, a3, a4, a5, a6, a7) \
__builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \
__builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \
__builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \
__builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); \
__builtin_mma_xvf32gerpp(&acc4, (vec_t)b4, (vec_t)a4); \
__builtin_mma_xvf32gerpp(&acc5, (vec_t)b5, (vec_t)a5); \
__builtin_mma_xvf32gerpp(&acc6, (vec_t)b6, (vec_t)a6); \
__builtin_mma_xvf32gerpp(&acc7, (vec_t)b7, (vec_t)a7);
#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \
__builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \
__builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \
__builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \
__builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3);
#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \
__builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \
__builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1);
#define KERNEL_MMA_1ACC(b0, a0) \
__builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0);
#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \
result = vec_madd(a0, b0, result); \
result1 = vec_madd(a1, b1, result1); \
result2 = vec_madd(a2, b2, result2); \
result3 = vec_madd(a3, b3, result3);
#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \
result = vec_madd(a0, b0, result); \
result1 = vec_madd(a1, b1, result1);
#define KERNEL_VMADD_1VSR(a0, b0) \
result = vec_madd(a0, b0, result);
#define PACK_A(ra0, ra1, ra2, ra3, offset) \
vec_xst(ra0, 0, packA+(k*16)+0+offset); \
vec_xst(ra1, 0, packA+(k*16)+4+offset); \
vec_xst(ra2, 0, packA+(k*16)+8+offset); \
vec_xst(ra3, 0, packA+(k*16)+12+offset);
#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \
ra0 = vec_xl(0, packA+(k*16)+0+offset); \
ra1 = vec_xl(0, packA+(k*16)+4+offset); \
ra2 = vec_xl(0, packA+(k*16)+8+offset); \
ra3 = vec_xl(0, packA+(k*16)+12+offset);
#ifdef B0
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
#else
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
#endif
{
BLASLONG m, n, k;
BLASLONG m16 = M & ~15;
BLASLONG m8 = M & ~7;
BLASLONG m4 = M & ~3;
BLASLONG m2 = M & ~1;
BLASLONG n16 = N & ~15;
BLASLONG n8 = N & ~7;
BLASLONG n4 = N & ~3;
BLASLONG n2 = N & ~1;
vector float valpha = vec_splats(alpha);
#if !defined(B0)
vector float vbeta = vec_splats(beta);
#endif
#if defined(__GNUC__) && !defined(__clang__)
int has_packing = (M >= 40 && N >= 40 && K >= 40) ? 1 : 0;
#else
int has_packing = 0;
#endif
float *packA;
if (has_packing) packA = (float *)malloc(K*16*sizeof(float));
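/* As in the double-precision kernel, A is packed on demand: the current 16-row
   slice of A is written to packA (16 floats per k) while the first column block
   is processed, then reused for the remaining n blocks. */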
for (m = 0; m < m16; m += 16) {
for (n = 0; n < n8; n += 8) {
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
INIT_8ACCS();
register vector float ra0, ra1, ra2, ra3;
register vector float rb0, rb1;
if (has_packing) {
if (n == 0) {
for (k = 0; k < K; k++) {
LOAD_A_1x16(k, m);
LOAD_B_1x8(k, n);
KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
PACK_A(ra0, ra1, ra2, ra3, 0);
}
} else {
for (k = 0; k < K; k++) {
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
LOAD_B_1x8(k, n);
KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
}
}
} else {
for (k = 0; k < K; k++) {
LOAD_A_1x16(k, m);
LOAD_B_1x8(k, n);
KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1,
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
}
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_4x4_ACC(&acc0, n+0, m+0);
SAVE_4x4_ACC(&acc2, n+0, m+4);
SAVE_4x4_ACC(&acc4, n+0, m+8);
SAVE_4x4_ACC(&acc6, n+0, m+12);
SAVE_4x4_ACC(&acc1, n+4, m+0);
SAVE_4x4_ACC(&acc3, n+4, m+4);
SAVE_4x4_ACC(&acc5, n+4, m+8);
SAVE_4x4_ACC(&acc7, n+4, m+12);
}
for (; n < n4; n += 4) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector float ra0, ra1, ra2, ra3;
register vector float rb0;
if (!has_packing) {
for (k = 0; k < K; k++) {
LOAD_A_1x16(k, m);
LOAD_B_1x4(k, n);
KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3);
}
} else {
for (k = 0; k < K; k++) {
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
LOAD_B_1x4(k, n);
KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3);
}
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_4x4_ACC(&acc0, n+0, m+0);
SAVE_4x4_ACC(&acc1, n+0, m+4);
SAVE_4x4_ACC(&acc2, n+0, m+8);
SAVE_4x4_ACC(&acc3, n+0, m+12);
}
for (; n < n2; n += 2) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector float ra0, ra1, ra2, ra3;
register vector float rb0;
if (!has_packing) {
for (k = 0; k < K; k++) {
LOAD_A_1x16(k, m);
LOAD_B_1x2(k, n);
KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3);
}
} else {
for (k = 0; k < K; k++) {
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
LOAD_B_1x2(k, n);
KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3);
}
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_2x4_ACC(&acc0, n, m+0);
SAVE_2x4_ACC(&acc1, n, m+4);
SAVE_2x4_ACC(&acc2, n, m+8);
SAVE_2x4_ACC(&acc3, n, m+12);
}
for (; n < N; n++) {
vector float result = ((vector float){0., 0., 0., 0.});
vector float result1 = ((vector float){0., 0., 0., 0.});
vector float result2 = ((vector float){0., 0., 0., 0.});
vector float result3 = ((vector float){0., 0., 0., 0.});
register vector float ra0, ra1, ra2, ra3;
register vector float rb0;
if (!has_packing) {
for (k = 0; k < K; k++) {
LOAD_A_1x16(k, m);
LOAD_B_1x1(k, n);
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
}
} else {
for (k = 0; k < K; k++) {
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
LOAD_B_1x1(k, n);
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
}
}
#if !defined(B0)
register vector float rc0;
#endif
SAVE_1x4_VSR(result, n, m);
SAVE_1x4_VSR(result1, n, m+4);
SAVE_1x4_VSR(result2, n, m+8);
SAVE_1x4_VSR(result3, n, m+12);
}
}
for (; m < m8; m += 8) {
for (n = 0; n < n16; n += 16) {
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
INIT_8ACCS();
register vector float ra0, ra1;
register vector float rb0, rb1, rb2, rb3;
for (k = 0; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_B_1x16(k, n);
KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3,
ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1);
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_4x4_ACC(&acc0, n+0, m+0);
SAVE_4x4_ACC(&acc4, n+0, m+4);
SAVE_4x4_ACC(&acc1, n+4, m+0);
SAVE_4x4_ACC(&acc5, n+4, m+4);
SAVE_4x4_ACC(&acc2, n+8, m+0);
SAVE_4x4_ACC(&acc6, n+8, m+4);
SAVE_4x4_ACC(&acc3, n+12, m+0);
SAVE_4x4_ACC(&acc7, n+12, m+4);
}
for (; n < n8; n += 8) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector float ra0, ra1;
register vector float rb0, rb1;
for (k = 0; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_B_1x8(k, n);
KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra1, ra1);
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_4x4_ACC(&acc0, n+0, m+0);
SAVE_4x4_ACC(&acc2, n+0, m+4);
SAVE_4x4_ACC(&acc1, n+4, m+0);
SAVE_4x4_ACC(&acc3, n+4, m+4);
}
for (; n < n4; n += 4) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector float ra0, ra1;
register vector float rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_B_1x4(k, n);
KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1);
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_4x4_ACC(&acc0, n+0, m+0);
SAVE_4x4_ACC(&acc1, n+0, m+4);
}
for (; n < n2; n += 2) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector float ra0, ra1;
register vector float rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_B_1x2(k, n);
KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1);
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_2x4_ACC(&acc0, n, m+0);
SAVE_2x4_ACC(&acc1, n, m+4);
}
for (; n < N; n++) {
vector float result = ((vector float){0.,0.,0.,0.});
vector float result1 = ((vector float){0.,0.,0.,0.});
register vector float ra0, ra1;
register vector float rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x8(k, m);
LOAD_B_1x1(k, n);
KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0);
}
#if !defined(B0)
register vector float rc0;
#endif
SAVE_1x4_VSR(result, n, m);
SAVE_1x4_VSR(result1, n, m+4);
}
}
for (; m < m4; m += 4) {
for (n = 0; n < n16; n += 16) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector float ra0;
register vector float rb0, rb1, rb2, rb3;
for (k = 0; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_B_1x16(k, n);
KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0);
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_4x4_ACC(&acc0, n+0, m+0);
SAVE_4x4_ACC(&acc1, n+4, m+0);
SAVE_4x4_ACC(&acc2, n+8, m+0);
SAVE_4x4_ACC(&acc3, n+12, m+0);
}
for (; n < n8; n += 8) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector float ra0;
register vector float rb0, rb1;
for (k = 0; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_B_1x8(k, n);
KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0);
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_4x4_ACC(&acc0, n+0, m+0);
SAVE_4x4_ACC(&acc1, n+4, m+0);
}
for (; n < n4; n += 4) {
__vector_quad acc0;
INIT_1ACC();
register vector float ra0;
register vector float rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_B_1x4(k, n);
KERNEL_MMA_1ACC(rb0, ra0);
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_4x4_ACC(&acc0, n+0, m+0);
}
for (; n < n2; n += 2) {
__vector_quad acc0;
INIT_1ACC();
register vector float ra0;
register vector float rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_B_1x2(k, n);
KERNEL_MMA_1ACC(rb0, ra0);
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_2x4_ACC(&acc0, n, m);
}
for (; n < N; n++) {
vector float result = ((vector float){0.,0.,0.,0.});
register vector float ra0;
register vector float rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x4(k, m);
LOAD_B_1x1(k, n);
KERNEL_VMADD_1VSR(ra0, rb0);
}
#if !defined(B0)
register vector float rc0;
#endif
SAVE_1x4_VSR(result, n, m);
}
}
for (; m < m2; m += 2) {
for (n = 0; n < n16; n += 16) {
__vector_quad acc0, acc1, acc2, acc3;
INIT_4ACCS();
register vector float ra0;
register vector float rb0, rb1, rb2, rb3;
for (k = 0; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_B_1x16(k, n);
KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0);
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+4, m+0);
SAVE_4x2_ACC(&acc2, n+8, m+0);
SAVE_4x2_ACC(&acc3, n+12, m+0);
}
for (; n < n8; n += 8) {
__vector_quad acc0, acc1;
INIT_2ACCS();
register vector float ra0;
register vector float rb0, rb1;
for (k = 0; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_B_1x8(k, n);
KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0);
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
SAVE_4x2_ACC(&acc1, n+4, m+0);
}
for (; n < n4; n += 4) {
__vector_quad acc0;
INIT_1ACC();
register vector float ra0;
register vector float rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_B_1x4(k, n);
KERNEL_MMA_1ACC(rb0, ra0);
}
#if !defined(B0)
register vector float rc0;
#endif
vector float result[4];
SAVE_4x2_ACC(&acc0, n+0, m+0);
}
for (; n < n2; n += 2) {
vector float result = ((vector float){0.,0.,0.,0.});
register vector float ra0;
register vector float rb0;
for (k = 0; k < K; k++) {
LOAD_A_2x2(k, m);
LOAD_B_2x2(k, n);
KERNEL_VMADD_1VSR(ra0, rb0);
}
#if !defined(B0)
register vector float rc0;
#endif
SAVE_2x2_VSR(result, n, m);
}
for (; n < N; n++) {
vector float result = ((vector float){0.,0.,0.,0.});
register vector float ra0;
register vector float rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x2(k, m);
LOAD_B_1x1(k, n);
KERNEL_VMADD_1VSR(ra0, rb0);
}
#if !defined(B0)
register vector float rc0;
#endif
SAVE_1x2_VSR(result, n, m);
}
}
for (; m < M; m++) {
for (n = 0; n < n16; n += 16) {
vector float result = ((vector float){0.,0.,0.,0.});
vector float result1 = ((vector float){0.,0.,0.,0.});
vector float result2 = ((vector float){0.,0.,0.,0.});
vector float result3 = ((vector float){0.,0.,0.,0.});
register vector float ra0;
register vector float rb0, rb1, rb2, rb3;
for (k = 0; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_B_1x16(k, n);
KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3);
}
SAVE_4x1_VSR(result, n+0, m);
SAVE_4x1_VSR(result1, n+4, m);
SAVE_4x1_VSR(result2, n+8, m);
SAVE_4x1_VSR(result3, n+12, m);
}
for (; n < n8; n += 8) {
vector float result = ((vector float){0.,0.,0.,0.});
vector float result1 = ((vector float){0.,0.,0.,0.});
register vector float ra0;
register vector float rb0, rb1;
for (k = 0; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_B_1x8(k, n);
KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1);
}
SAVE_4x1_VSR(result, n+0, m);
SAVE_4x1_VSR(result1, n+4, m);
}
for (; n < n4; n += 4) {
vector float result = ((vector float){0.,0.,0.,0.});
register vector float ra0;
register vector float rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_B_1x4(k, n);
KERNEL_VMADD_1VSR(ra0, rb0);
}
SAVE_4x1_VSR(result, n+0, m);
}
for (; n < n2; n += 2) {
vector float result = ((vector float){0.,0.,0.,0.});
register vector float ra0;
register vector float rb0;
for (k = 0; k < K; k++) {
LOAD_A_1x1(k, m);
LOAD_B_1x2(k, n);
KERNEL_VMADD_1VSR(ra0, rb0);
}
SAVE_2x1_VSR(result, n+0, m);
}
for (; n < N; n++) {
FLOAT result = 0.0f;
for (k = 0; k < K; k++) {
result += A[k*lda+m] * B[k*ldb+n];
}
result = result * alpha;
#if !defined(B0)
C[n*ldc+m] = (C[n*ldc+m] * beta) + result;
#else
C[n*ldc+m] = result;
#endif
}
}
if (has_packing) free (packA);
return 0;
}
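For reference, the scalar remainder loop above spells out the computation that every vectorized path in this kernel performs: C[n*ldc+m] is updated with alpha times the dot product of row m of A and column n of B, plus beta*C[n*ldc+m] when B0 is not defined. A minimal self-contained sketch of that reference computation (function and driver names are illustrative, not part of the kernel; it uses the same A[k*lda+m], B[k*ldb+n], C[n*ldc+m] indexing as the code above):

#include <stdio.h>

/* Scalar reference for the small-matrix GEMM path above:
 * C[n*ldc+m] = beta*C[n*ldc+m] + alpha * sum_k A[k*lda+m] * B[k*ldb+n]
 * (the beta term is dropped when B0 is defined). */
static void sgemm_small_ref(int M, int N, int K, float alpha,
                            const float *A, int lda,
                            const float *B, int ldb,
                            float beta, float *C, int ldc)
{
    for (int m = 0; m < M; m++) {
        for (int n = 0; n < N; n++) {
            float result = 0.0f;
            for (int k = 0; k < K; k++)
                result += A[k*lda + m] * B[k*ldb + n];
            C[n*ldc + m] = C[n*ldc + m] * beta + alpha * result;
        }
    }
}

int main(void)
{
    /* 2x2 example: A is the identity, so C should come out equal to B. */
    float A[4] = {1, 0, 0, 1};
    float B[4] = {1, 2, 3, 4};
    float C[4] = {0, 0, 0, 0};
    sgemm_small_ref(2, 2, 2, 1.0f, A, 2, B, 2, 0.0f, C, 2);
    printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);
    return 0;
}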

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "srot_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
-#include "srot_microk_power10.c"
 #elif defined(POWER10)
-#include "srot_microk_power8.c"
+#include "srot_microk_power10.c"
 #endif
 #endif
@@ -119,7 +117,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
 if ( (inc_x == 1) && (inc_y == 1) )
 {
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
 if ( n >= 16 )
 {
 BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
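This change drops the little-endian-only qualifier, so srot_microk_power10.c and the n >= 16 fast path are used on big-endian POWER10 builds as well. For context, SROT applies a plane (Givens) rotation to the two input vectors; a scalar sketch of that operation, assuming standard BLAS semantics and illustrative names:

#include <stdio.h>

/* Scalar sketch of SROT for inc_x == inc_y == 1:
 *   x[i] <- c*x[i] + s*y[i]
 *   y[i] <- c*y[i] - s*x[i]
 * which is what the POWER10 micro-kernel vectorizes. */
static void srot_ref(long n, float *x, float *y, float c, float s)
{
    for (long i = 0; i < n; i++) {
        float temp = c * x[i] + s * y[i];
        y[i] = c * y[i] - s * x[i];
        x[i] = temp;
    }
}

int main(void)
{
    float x[4] = {1, 2, 3, 4}, y[4] = {4, 3, 2, 1};
    srot_ref(4, x, y, 0.6f, 0.8f);  /* c*c + s*s == 1 */
    printf("x[0]=%g y[0]=%g\n", x[0], y[0]);
    return 0;
}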

View File

@@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "sscal_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
-#include "sscal_microk_power10.c"
 #elif defined(POWER10)
-#include "sscal_microk_power8.c"
+#include "sscal_microk_power10.c"
 #endif
 #endif
@@ -106,7 +104,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 if ( da == 0.0 )
 {
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
 if ( n >= 32 )
 {
 BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
@@ -140,7 +138,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 else
 {
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
 if ( n >= 32 )
 {
 BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
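As with srot, only the preprocessor guards change here; the alignment computation itself is untouched. That expression peels off up to seven leading floats so the vector micro-kernel starts on a 32-byte boundary. A hedged sketch of the pattern, with an illustrative function name and a plain scalar loop standing in for sscal_microk_power10.c:

#include <stdint.h>
#include <stdio.h>

/* Sketch of SSCAL's alignment peel: scale 0..7 leading elements with
 * scalar code until x is 32-byte aligned, then hand the rest to the
 * (here simulated) vector kernel that works on aligned chunks. */
static void sscal_sketch(long n, float da, float *x)
{
    long i = 0;
    if (n >= 32) {
        /* same expression as in the hunks above: floats remaining to the
         * next 32-byte boundary, masked to the 0..7 range */
        long align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
        for (; i < align; i++)
            x[i] *= da;             /* scalar peel */
    }
    for (; i < n; i++)
        x[i] *= da;                 /* stands in for the vector micro-kernel */
}

int main(void)
{
    float x[40];
    for (int j = 0; j < 40; j++) x[j] = 1.0f;
    sscal_sketch(40, 2.0f, x);
    printf("%g %g\n", x[0], x[39]);
    return 0;
}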

View File

@@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "sswap_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
-#include "swap_microk_power10.c"
 #elif defined(POWER10)
-#include "sswap_microk_power8.c"
+#include "swap_microk_power10.c"
 #endif
 #endif
@@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
 if ( (inc_x == 1) && (inc_y == 1 ))
 {
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
 if ( n >= 64 )
 {
 BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
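The sswap change follows the same pattern: swap_microk_power10.c is now used regardless of byte order once n >= 64 and both strides are 1. SSWAP simply exchanges the two vectors element by element; a scalar sketch with illustrative names:

#include <stdio.h>

/* Scalar sketch of SSWAP for inc_x == inc_y == 1:
 * exchange x[i] and y[i] for i = 0..n-1. */
static void sswap_ref(long n, float *x, float *y)
{
    for (long i = 0; i < n; i++) {
        float temp = x[i];
        x[i] = y[i];
        y[i] = temp;
    }
}

int main(void)
{
    float x[3] = {1, 2, 3}, y[3] = {9, 8, 7};
    sswap_ref(3, x, y);
    printf("x[0]=%g y[0]=%g\n", x[0], y[0]);
    return 0;
}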

View File

@@ -30,9 +30,17 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
 double alpha_r, double alpha_i)
 {
 #if !defined(CONJ)
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+static const double mvec[2] = { -1.0, 1.0 };
+#else
+static const double mvec[2] = { 1.0, -1.0 };
+#endif
+#else
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
 static const double mvec[2] = { 1.0, -1.0 };
 #else
 static const double mvec[2] = { -1.0, 1.0 };
 #endif
+#endif
 const double *mvecp = mvec;
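Here the mvec sign mask becomes endianness-dependent in both the conjugated and non-conjugated branches, presumably because the real/imaginary lanes of the loaded constant sit in the opposite order on big-endian targets. The mask encodes the signs of the cross terms in the complex multiply of y += alpha*x. A scalar reference for the non-conjugated case, assuming standard BLAS ZAXPY semantics and illustrative names:

#include <stdio.h>

/* Scalar reference for ZAXPY, y += alpha*x, with interleaved (re, im)
 * storage.  The vector kernel's mvec = { +1, -1 } / { -1, +1 } mask
 * encodes the signs of the cross terms below; with CONJ defined those
 * signs flip because conj(x) negates the imaginary part of x. */
static void zaxpy_ref(long n, const double *x, double *y,
                      double alpha_r, double alpha_i)
{
    for (long i = 0; i < n; i++) {
        double xr = x[2*i], xi = x[2*i + 1];
        y[2*i]     += alpha_r * xr - alpha_i * xi;
        y[2*i + 1] += alpha_r * xi + alpha_i * xr;
    }
}

int main(void)
{
    double x[2] = {1.0, 2.0};          /* x = 1 + 2i */
    double y[2] = {0.0, 0.0};
    zaxpy_ref(1, x, y, 3.0, 1.0);      /* alpha = 3 + i */
    printf("y = %g + %gi\n", y[0], y[1]);  /* (3+i)*(1+2i) = 1 + 7i */
    return 0;
}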

Some files were not shown because too many files have changed in this diff