Merge pull request #3488 from xianyi/develop
Update from the develop branch for the 0.3.19 release
commit 488911486a

CMakeLists.txt (341 lines changed)
@@ -3,10 +3,13 @@
##
cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 19)

set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")

# Adhere to GNU filesystem layout conventions
@@ -20,51 +23,68 @@ endif()
#######
if(MSVC)
  option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
endif()

option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)

option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)

option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)

option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)

option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF)

if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
  option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
else()
  set(NO_AFFINITY 1)
endif()

option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF)

option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF)

option(BUILD_STATIC_LIBS "Build static library" OFF)
if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS)
  set(BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE)
endif()
if((BUILD_STATIC_LIBS AND BUILD_SHARED_LIBS) AND MSVC)
  message(WARNING "Could not enable both BUILD_STATIC_LIBS and BUILD_SHARED_LIBS with MSVC, Disable BUILD_SHARED_LIBS")
  set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build static library" FORCE)
endif()

# Add a prefix or suffix to all exported symbol names in the shared library.
# Avoids conflicts with other BLAS libraries, especially when using
# 64 bit integer interfaces in OpenBLAS.

set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in the shared library to avoid conflicts with other BLAS libraries" )

set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" )

#######
if(BUILD_WITHOUT_LAPACK)
  set(NO_LAPACK 1)
  set(NO_LAPACKE 1)
endif()

if(BUILD_WITHOUT_CBLAS)
  set(NO_CBLAS 1)
endif()

#######

if(MSVC AND MSVC_STATIC_CRT)
  set(CompilerFlags
        CMAKE_CXX_FLAGS
        CMAKE_CXX_FLAGS_DEBUG
        CMAKE_CXX_FLAGS_RELEASE
        CMAKE_C_FLAGS
        CMAKE_C_FLAGS_DEBUG
        CMAKE_C_FLAGS_RELEASE
      )
  foreach(CompilerFlag ${CompilerFlags})
    string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
  endforeach()
endif()

message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")
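For reference, a typical out-of-tree configure line exercising the options above might look as follows (a minimal sketch only; flags such as INTERFACE64 and SYMBOLSUFFIX are optional, and the exact set depends on the platform and toolchain rather than on this diff):

    mkdir build && cd build
    # build both library flavours, with runtime CPU dispatch and 64-bit integer symbols suffixed with _64
    cmake -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON -DDYNAMIC_ARCH=ON \
          -DINTERFACE64=1 -DSYMBOLSUFFIX=_64 ..
    cmake --build . --config Release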
@@ -98,7 +118,7 @@ endif ()
# set which float types we want to build for
if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16)
  # if none are defined, build for all
  # set(BUILD_BFLOAT16 true)
  set(BUILD_SINGLE true)
  set(BUILD_DOUBLE true)
  set(BUILD_COMPLEX true)
@@ -143,9 +163,10 @@ endif ()
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
if(MSVC)
  set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug)
  set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release)
endif ()

# get obj vars into format that add_library likes: $<TARGET_OBJS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html)
set(TARGET_OBJS "")
foreach (SUBDIR ${SUBDIRS})
@@ -183,12 +204,61 @@ if (${DYNAMIC_ARCH})
endif ()

# add objects to the openblas lib
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)
if(NOT NO_LAPACK)
  add_library(LAPACK OBJECT ${LA_SOURCES})
  list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACK>")
endif()
if(NOT NO_LAPACKE)
  add_library(LAPACKE OBJECT ${LAPACKE_SOURCES})
  list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACKE>")
endif()
if(BUILD_RELAPACK)
  add_library(RELAPACK OBJECT ${RELA_SOURCES})
  list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:RELAPACK>")
endif()
set(OpenBLAS_LIBS "")
if(BUILD_STATIC_LIBS)
  add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
  target_include_directories(${OpenBLAS_LIBNAME}_static INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)
  list(APPEND OpenBLAS_LIBS ${OpenBLAS_LIBNAME}_static)
endif()
if(BUILD_SHARED_LIBS)
  add_library(${OpenBLAS_LIBNAME}_shared SHARED ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
  target_include_directories(${OpenBLAS_LIBNAME}_shared INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)
  list(APPEND OpenBLAS_LIBS ${OpenBLAS_LIBNAME}_shared)
endif()
if(BUILD_STATIC_LIBS)
  add_library(${OpenBLAS_LIBNAME} ALIAS ${OpenBLAS_LIBNAME}_static)
else()
  add_library(${OpenBLAS_LIBNAME} ALIAS ${OpenBLAS_LIBNAME}_shared)
endif()

set_target_properties(${OpenBLAS_LIBS} PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME})

# Android needs to explicitly link against libm
if(ANDROID)
  target_link_libraries(${OpenBLAS_LIBNAME} m)
  if(BUILD_STATIC_LIBS)
    target_link_libraries(${OpenBLAS_LIBNAME}_static m)
  endif()
  if(BUILD_SHARED_LIBS)
    target_link_libraries(${OpenBLAS_LIBNAME}_shared m)
  endif()
endif()

if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS)
  set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
  if (NOT NOFORTRAN)
    set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
    set (CMAKE_Fortran_CREATE_SHARED_LIBRARY
      "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
      "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
      "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
      "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'")
  else ()
    set (CMAKE_C_CREATE_SHARED_LIBRARY
      "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
      "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
  endif ()
endif()

# Handle MSVC exports
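Because both new flavours share the same OUTPUT_NAME, a consumer still links against plain libopenblas whichever of the _static or _shared targets was built; a rough sketch, with the install prefix as a placeholder rather than anything defined in this diff:

    cc -I<prefix>/include/openblas -L<prefix>/lib -o example example.c -lopenblas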
@@ -197,21 +267,21 @@ if(MSVC AND BUILD_SHARED_LIBS)
  include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
else()
  # Creates verbose .def file (51KB vs 18KB)
  set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true)
  set_target_properties(${OpenBLAS_LIBNAME}_shared PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true)
endif()
endif()

# Set output for libopenblas
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS")
set_target_properties( ${OpenBLAS_LIBS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBS} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
set_target_properties( ${OpenBLAS_LIBS} PROPERTIES EXPORT_NAME "OpenBLAS")

foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
  string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
  set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
  set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
  set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
  set_target_properties( ${OpenBLAS_LIBS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
  set_target_properties( ${OpenBLAS_LIBS} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
  set_target_properties( ${OpenBLAS_LIBS} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
endforeach()

enable_testing()
@@ -220,10 +290,17 @@ if (USE_THREAD)
  # Add threading library to linker
  find_package(Threads)
  if (THREADS_HAVE_PTHREAD_ARG)
    set_property(TARGET ${OpenBLAS_LIBNAME} PROPERTY COMPILE_OPTIONS "-pthread")
    set_property(TARGET ${OpenBLAS_LIBNAME} PROPERTY INTERFACE_COMPILE_OPTIONS "-pthread")
    set_target_properties(${OpenBLAS_LIBS} PROPERTIES
      COMPILE_OPTIONS "-pthread"
      INTERFACE_COMPILE_OPTIONS "-pthread"
    )
  endif()
  if(BUILD_STATIC_LIBS)
    target_link_libraries(${OpenBLAS_LIBNAME}_static ${CMAKE_THREAD_LIBS_INIT})
  endif()
  if(BUILD_SHARED_LIBS)
    target_link_libraries(${OpenBLAS_LIBNAME}_shared ${CMAKE_THREAD_LIBS_INIT})
  endif()
  target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})
endif()

#if (MSVC OR NOT NOFORTRAN)
@ -239,97 +316,109 @@ if (NOT NOFORTRAN)
|
|||
add_subdirectory(ctest)
|
||||
endif()
|
||||
add_subdirectory(lapack-netlib/TESTING)
|
||||
if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
|
||||
add_subdirectory(cpp_thread_test)
|
||||
endif()
|
||||
if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
|
||||
add_subdirectory(cpp_thread_test)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
|
||||
set_target_properties(${OpenBLAS_LIBS} PROPERTIES
|
||||
VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}
|
||||
SOVERSION ${OpenBLAS_MAJOR_VERSION}
|
||||
)
|
||||
|
||||
if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
|
||||
if (NOT MSVC)
|
||||
target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition")
|
||||
target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition")
|
||||
else()
|
||||
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE")
|
||||
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
|
||||
if (NOT DEFINED ARCH)
|
||||
set(ARCH_IN "x86_64")
|
||||
else()
|
||||
set(ARCH_IN ${ARCH})
|
||||
endif()
|
||||
if (NOT DEFINED ARCH)
|
||||
set(ARCH_IN "x86_64")
|
||||
else()
|
||||
set(ARCH_IN ${ARCH})
|
||||
endif()
|
||||
|
||||
if (${CORE} STREQUAL "generic")
|
||||
set(ARCH_IN "GENERIC")
|
||||
endif ()
|
||||
if (${CORE} STREQUAL "generic")
|
||||
set(ARCH_IN "GENERIC")
|
||||
endif ()
|
||||
|
||||
if (NOT DEFINED EXPRECISION)
|
||||
set(EXPRECISION_IN 0)
|
||||
else()
|
||||
set(EXPRECISION_IN ${EXPRECISION})
|
||||
endif()
|
||||
if (NOT DEFINED EXPRECISION)
|
||||
set(EXPRECISION_IN 0)
|
||||
else()
|
||||
set(EXPRECISION_IN ${EXPRECISION})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NO_CBLAS)
|
||||
set(NO_CBLAS_IN 0)
|
||||
else()
|
||||
set(NO_CBLAS_IN ${NO_CBLAS})
|
||||
endif()
|
||||
if (NOT DEFINED NO_CBLAS)
|
||||
set(NO_CBLAS_IN 0)
|
||||
else()
|
||||
set(NO_CBLAS_IN ${NO_CBLAS})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NO_LAPACK)
|
||||
set(NO_LAPACK_IN 0)
|
||||
else()
|
||||
set(NO_LAPACK_IN ${NO_LAPACK})
|
||||
endif()
|
||||
if (NOT DEFINED NO_LAPACK)
|
||||
set(NO_LAPACK_IN 0)
|
||||
else()
|
||||
set(NO_LAPACK_IN ${NO_LAPACK})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NO_LAPACKE)
|
||||
set(NO_LAPACKE_IN 0)
|
||||
else()
|
||||
set(NO_LAPACKE_IN ${NO_LAPACKE})
|
||||
endif()
|
||||
if (NOT DEFINED NO_LAPACKE)
|
||||
set(NO_LAPACKE_IN 0)
|
||||
else()
|
||||
set(NO_LAPACKE_IN ${NO_LAPACKE})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NEED2UNDERSCORES)
|
||||
set(NEED2UNDERSCORES_IN 0)
|
||||
else()
|
||||
set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES})
|
||||
endif()
|
||||
if (NOT DEFINED NEED2UNDERSCORES)
|
||||
set(NEED2UNDERSCORES_IN 0)
|
||||
else()
|
||||
set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED ONLY_CBLAS)
|
||||
set(ONLY_CBLAS_IN 0)
|
||||
else()
|
||||
set(ONLY_CBLAS_IN ${ONLY_CBLAS})
|
||||
endif()
|
||||
if (NOT DEFINED ONLY_CBLAS)
|
||||
set(ONLY_CBLAS_IN 0)
|
||||
else()
|
||||
set(ONLY_CBLAS_IN ${ONLY_CBLAS})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED BU)
|
||||
set(BU _)
|
||||
endif()
|
||||
if (NOT DEFINED BU)
|
||||
set(BU _)
|
||||
endif()
|
||||
|
||||
if (NOT ${SYMBOLPREFIX} STREQUAL "")
|
||||
message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
|
||||
endif()
|
||||
if (NOT ${SYMBOLSUFFIX} STREQUAL "")
|
||||
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
|
||||
endif()
|
||||
add_custom_command(TARGET ${OpenBLAS_LIBNAME} POST_BUILD
|
||||
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
||||
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
|
||||
COMMENT "renaming symbols"
|
||||
)
|
||||
if (NOT ${SYMBOLPREFIX} STREQUAL "")
|
||||
message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
|
||||
endif()
|
||||
if (NOT ${SYMBOLSUFFIX} STREQUAL "")
|
||||
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
|
||||
endif()
|
||||
|
||||
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
|
||||
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
||||
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
|
||||
COMMENT "renaming symbols"
|
||||
)
|
||||
endif()
|
||||
|
||||
|
||||
# Install project
|
||||
|
||||
# Install libraries
|
||||
install(TARGETS ${OpenBLAS_LIBNAME}
|
||||
EXPORT "OpenBLAS${SUFFIX64}Targets"
|
||||
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
|
||||
if(BUILD_SHARED_LIBS AND BUILD_STATIC_LIBS)
|
||||
install(TARGETS ${OpenBLAS_LIBNAME}_shared
|
||||
EXPORT "OpenBLAS${SUFFIX64}Targets"
|
||||
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
|
||||
install(TARGETS ${OpenBLAS_LIBNAME}_static
|
||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
|
||||
else()
|
||||
install(TARGETS ${OpenBLAS_LIBS}
|
||||
EXPORT "OpenBLAS${SUFFIX64}Targets"
|
||||
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
|
||||
endif()
|
||||
|
||||
# Install headers
|
||||
set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
|
||||
|
|
@ -365,36 +454,41 @@ if(NOT NOFORTRAN)
|
|||
endif()
|
||||
|
||||
if(NOT NO_CBLAS)
|
||||
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h)
|
||||
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
|
||||
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
if (NOT ${SYMBOLPREFIX} STREQUAL "")
|
||||
string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
|
||||
string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
|
||||
string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
endif()
|
||||
if (NOT ${SYMBOLSUFFIX} STREQUAL "")
|
||||
string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
|
||||
string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
|
||||
string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
endif()
|
||||
file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}")
|
||||
install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h)
|
||||
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
|
||||
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
if (NOT ${SYMBOLPREFIX} STREQUAL "")
|
||||
string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
|
||||
string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
|
||||
string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
endif()
|
||||
if (NOT ${SYMBOLSUFFIX} STREQUAL "")
|
||||
string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
|
||||
string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
|
||||
string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
endif()
|
||||
file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}")
|
||||
install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
endif()
|
||||
|
||||
if(NOT NO_LAPACKE)
|
||||
message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
add_dependencies( ${OpenBLAS_LIBNAME} genlapacke)
|
||||
FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h")
|
||||
install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
if(BUILD_STATIC_LIBS)
|
||||
add_dependencies( ${OpenBLAS_LIBNAME}_static genlapacke)
|
||||
endif()
|
||||
if(BUILD_SHARED_LIBS)
|
||||
add_dependencies( ${OpenBLAS_LIBNAME}_shared genlapacke)
|
||||
endif()
|
||||
FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h")
|
||||
install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
|
||||
ADD_CUSTOM_TARGET(genlapacke
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
|
||||
)
|
||||
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
|
||||
ADD_CUSTOM_TARGET(genlapacke
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
|
||||
)
|
||||
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
|
||||
endif()
|
||||
|
||||
# Install pkg-config files
|
||||
|
|
@ -419,4 +513,3 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
|
|||
install(EXPORT "${PN}${SUFFIX64}Targets"
|
||||
NAMESPACE "${PN}${SUFFIX64}::"
|
||||
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
|
||||
|
||||
|
|
|
|||
|
|
@@ -197,3 +197,7 @@ In chronological order:

* River Dillon <oss@outerpassage.net>
  * [2021-07-10] fix compilation with musl libc

* Bine Brank <https://github.com/binebrank>
  * [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE
  * [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM
@@ -1,4 +1,51 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.19
19-Dec-2021

general:
- reverted unsafe TRSV/ZRSV optimizations introduced in 0.3.16
- fixed a potential thread race in the thread buffer reallocation routines
  that were introduced in 0.3.18
- fixed miscounting of thread pool size on Linux with OMP_PROC_BIND=TRUE
- fixed CBLAS interfaces for CSROT/ZSROT and CROTG/ZROTG
- made automatic library suffix for CMAKE builds with INTERFACE64 available
  to CBLAS-only builds

x86_64:
- DYNAMIC_ARCH builds now fall back to the cpu with most similar capabilities
  when an unknown CPUID is encountered, instead of defaulting to Prescott
- added cpu detection for Intel Alder Lake
- added cpu detection for Intel Sapphire Rapids
- added an optimized SBGEMM kernel for Sapphire Rapids
- fixed DYNAMIC_ARCH builds on OSX with CMAKE
- worked around DYNAMIC_ARCH builds made on Sandybridge failing on SkylakeX
- fixed missing thread initialization for static builds on Windows/MSVC
- fixed an excessive read in ZSYMV

POWER:
- added support for POWER10 in big-endian mode
- added support for building with CMAKE
- added optimized SGEMM and DGEMM kernels for small matrix sizes

ARMV8:
- added basic support and cputype detection for Fujitsu A64FX
- added a generic ARMV8SVE target
- added SVE-enabled SGEMM and DGEMM kernels for ARMV8SVE and A64FX
- added optimized CGEMM and ZGEMM kernels for Cortex A53 and A55 cpus
- fixed cpuid detection for Apple M1 and improved performance
- improved compiler flag setting in CMAKE builds

RISCV64:
- fixed improper initialization in CSCAL/ZSCAL for strided access patterns

MIPS:
- added a GENERIC target for MIPS32
- added support for cross-compiling to MIPS32 on x86_64 using CMAKE

MIPS64:
- fixed misdetection of MSA capability
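The new CPU support listed for 0.3.19 maps directly onto build-time TARGET values; as a rough sketch (each needs a sufficiently recent compiler, as checked by the Makefile and CMake changes elsewhere in this commit):

    make TARGET=SAPPHIRERAPIDS      # x86_64; falls back to skylake-avx512 code generation on GCC < 11
    make TARGET=A64FX               # Fujitsu A64FX, ARMv8.2-A + SVE
    make TARGET=ARMV8SVE            # generic ARMv8 target with SVE
    make TARGET=POWER10             # POWER10, now also buildable with CMake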
====================================================================
Version 0.3.18
02-Oct-2021
Makefile (2 lines changed)
@@ -32,7 +32,7 @@ export NOFORTRAN
export NO_LAPACK
endif

LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))

SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
@ -1,6 +1,9 @@
|
|||
ifneq ($(C_COMPILER), PGI)
|
||||
|
||||
ifneq ($(GCCVERSIONGT4), 1)
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
ISCLANG=1
|
||||
endif
|
||||
ifneq (1, $(filter 1,$(GCCVERSIONGT4) $(ISCLANG)))
|
||||
CCOMMON_OPT += -march=armv8-a
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a
|
||||
|
|
@ -17,6 +20,13 @@ FCOMMON_OPT += -march=armv8-a
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ARMV8SVE)
|
||||
CCOMMON_OPT += -march=armv8-a+sve
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a+sve
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA53)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
|
|
@ -48,7 +58,7 @@ endif
|
|||
# Use a72 tunings because Neoverse-N1 is only available
|
||||
# in GCC>=9
|
||||
ifeq ($(CORE), NEOVERSEN1)
|
||||
ifeq ($(GCCVERSIONGTEQ7), 1)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
|
|
@ -70,7 +80,7 @@ endif
|
|||
|
||||
# Use a53 tunings because a55 is only available in GCC>=8.1
|
||||
ifeq ($(CORE), CORTEXA55)
|
||||
ifeq ($(GCCVERSIONGTEQ7), 1)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||
ifeq ($(GCCVERSIONGTEQ8), 1)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
|
|
@ -132,7 +142,7 @@ FCOMMON_OPT += -march=armv8.3-a
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
|
||||
ifeq ($(CORE), TSV110)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
|
|
@ -150,6 +160,15 @@ endif
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
|
||||
ifeq ($(CORE), A64FX)
|
||||
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
endif
|
||||
endif
|
||||
|
||||
endif
|
||||
|
|
|
|||
|
|
@@ -3,7 +3,7 @@
#

# This library's version
VERSION = 0.3.18
VERSION = 0.3.18.dev

# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
|||
|
|
@@ -9,11 +9,10 @@ ifndef TOPDIR
TOPDIR = .
endif

# If ARCH is not set, we use the host system's architecture for getarch compile options.
ifndef ARCH
# we need to use the host system's architecture for getarch compile options even especially when cross-compiling
HOSTARCH := $(shell uname -m)
else
HOSTARCH = $(ARCH)
ifeq ($(HOSTARCH), amd64)
HOSTARCH=x86_64
endif

# Catch conflicting usage of ARCH in some BSD environments
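When cross-compiling, ARCH/TARGET describe the machine being built for, while HOSTCC is used to build the getarch helper on the build machine. A minimal sketch (the cross-toolchain names are placeholders, not part of this diff):

    make TARGET=ARMV8 HOSTCC=gcc CC=aarch64-linux-gnu-gcc FC=aarch64-linux-gnu-gfortran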
@@ -102,7 +101,7 @@ GETARCH_FLAGS += -DUSER_TARGET
ifeq ($(TARGET), GENERIC)
ifeq ($(DYNAMIC_ARCH), 1)
override NO_EXPRECISION=1
export NO_EXPRECiSION
export NO_EXPRECISION
endif
endif
endif
@ -119,6 +118,9 @@ endif
|
|||
ifeq ($(TARGET), COOPERLAKE)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
ifeq ($(TARGET), SAPPHIRERAPIDS)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
ifeq ($(TARGET), SANDYBRIDGE)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
|
|
@ -143,8 +145,13 @@ endif
|
|||
ifeq ($(TARGET), POWER8)
|
||||
GETARCH_FLAGS := -DFORCE_POWER6
|
||||
endif
|
||||
ifeq ($(TARGET), POWER9)
|
||||
GETARCH_FLAGS := -DFORCE_POWER6
|
||||
endif
|
||||
ifeq ($(TARGET), POWER10)
|
||||
GETARCH_FLAGS := -DFORCE_POWER6
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
#TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
|
||||
#
|
||||
|
|
@ -164,6 +171,9 @@ endif
|
|||
ifeq ($(TARGET_CORE), COOPERLAKE)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
ifeq ($(TARGET_CORE), SANDYBRIDGE)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
|
|
@@ -251,6 +261,8 @@ endif
#For small matrix optimization
ifeq ($(ARCH), x86_64)
SMALL_MATRIX_OPT = 1
else ifeq ($(CORE), POWER10)
SMALL_MATRIX_OPT = 1
endif
ifeq ($(SMALL_MATRIX_OPT), 1)
CCOMMON_OPT += -DSMALL_MATRIX_OPT
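Because SMALL_MATRIX_OPT is a plain make variable, it can also be set explicitly on the command line, e.g. `make SMALL_MATRIX_OPT=1` (a sketch only; whether a given target actually provides dedicated small-matrix kernels is outside this diff).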
@ -260,6 +272,10 @@ endif
|
|||
ifndef GOTOBLAS_MAKEFILE
|
||||
export GOTOBLAS_MAKEFILE = 1
|
||||
|
||||
# Determine if the assembler is GNU Assembler
|
||||
HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?)
|
||||
GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS)
|
||||
|
||||
# Generating Makefile.conf and config.h
|
||||
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
|
||||
|
||||
|
|
@ -307,7 +323,7 @@ else
|
|||
SMP = 1
|
||||
endif
|
||||
else
|
||||
ifeq ($(NUM_THREAD), 1)
|
||||
ifeq ($(NUM_THREADS), 1)
|
||||
SMP =
|
||||
else
|
||||
SMP = 1
|
||||
|
|
@ -892,15 +908,25 @@ endif
|
|||
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20)
|
||||
PGCVERSIONGTEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \>= 20)
|
||||
PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -c 4-5` == 11)
|
||||
PGCVERSIONEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 20)
|
||||
PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |cut -d "-" -f 1 |sed -e "s/[^0-9.]//g" |cut -c 4-5` \>= 11)
|
||||
PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONEQ20)$(PGCMINORVERSIONGE11)
|
||||
ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 110 111 011))
|
||||
ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 100 101 011))
|
||||
NEWPGI := 1
|
||||
PGCVERSIONGT21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 21)
|
||||
PGCVERSIONEQ21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 21)
|
||||
PGCVERSIONCHECK2 := $(PGCVERSIONGT21)$(PGCVERSIONEQ21)$(PGCMINORVERSIONGE11)
|
||||
ifeq ($(PGCVERSIONCHECK2), $(filter $(PGCVERSIONCHECK2), 100 101 011))
|
||||
NEWPGI2 := 1
|
||||
endif
|
||||
endif
|
||||
ifdef BINARY64
|
||||
ifeq ($(ARCH), x86_64)
|
||||
ifneq ($(NEWPGI2),1)
|
||||
CCOMMON_OPT += -tp p7-64
|
||||
else
|
||||
CCOMMON_OPT += -tp px
|
||||
endif
|
||||
ifneq ($(NEWPGI),1)
|
||||
CCOMMON_OPT += -D__MMX__ -Mnollvm
|
||||
endif
|
||||
|
|
@ -915,7 +941,11 @@ endif
|
|||
endif
|
||||
endif
|
||||
else
|
||||
ifneq ($(NEWPGI2),1)
|
||||
CCOMMON_OPT += -tp p7
|
||||
else
|
||||
CCOMMON_OPT += -tp px
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
|
@ -1092,8 +1122,12 @@ FCOMMON_OPT += -i8
|
|||
endif
|
||||
endif
|
||||
ifeq ($(ARCH), x86_64)
|
||||
ifneq ($(NEWPGI2),1)
|
||||
FCOMMON_OPT += -tp p7-64
|
||||
else
|
||||
FCOMMON_OPT += -tp px
|
||||
endif
|
||||
else
|
||||
ifeq ($(ARCH), power)
|
||||
ifeq ($(CORE), POWER6)
|
||||
$(warning NVIDIA HPC compilers do not support POWER6.)
|
||||
|
|
@ -1643,8 +1677,10 @@ export HAVE_VFP
|
|||
export HAVE_VFPV3
|
||||
export HAVE_VFPV4
|
||||
export HAVE_NEON
|
||||
export HAVE_MSA
|
||||
export MSA_FLAGS
|
||||
ifndef NO_MSA
|
||||
export HAVE_MSA
|
||||
export MSA_FLAGS
|
||||
endif
|
||||
export KERNELDIR
|
||||
export FUNCTION_PROFILE
|
||||
export TARGET_CORE
|
||||
|
|
|
|||
|
|
@ -81,6 +81,40 @@ CCOMMON_OPT += -march=cooperlake
|
|||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=cooperlake
|
||||
endif
|
||||
else # gcc not support, fallback to avx512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
FCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
FCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), SAPPHIRERAPIDS)
|
||||
ifndef NO_AVX512
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
# sapphire rapids support was added in 11
|
||||
ifeq ($(GCCVERSIONGTEQ11), 1)
|
||||
CCOMMON_OPT += -march=sapphirerapids
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=sapphirerapids
|
||||
endif
|
||||
else # gcc not support, fallback to avx512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ HASWELL
|
|||
SKYLAKEX
|
||||
ATOM
|
||||
COOPERLAKE
|
||||
SAPPHIRERAPIDS
|
||||
|
||||
b)AMD CPU:
|
||||
ATHLON
|
||||
|
|
|
|||
23
appveyor.yml
23
appveyor.yml
|
|
@ -29,15 +29,15 @@ environment:
|
|||
global:
|
||||
CONDA_INSTALL_LOCN: C:\\Miniconda36-x64
|
||||
matrix:
|
||||
- COMPILER: clang-cl
|
||||
WITH_FORTRAN: ON
|
||||
- COMPILER: clang-cl
|
||||
DYNAMIC_ARCH: ON
|
||||
WITH_FORTRAN: OFF
|
||||
- COMPILER: cl
|
||||
- COMPILER: MinGW64-gcc-7.2.0-mingw
|
||||
DYNAMIC_ARCH: OFF
|
||||
WITH_FORTRAN: ignore
|
||||
# - COMPILER: clang-cl
|
||||
# WITH_FORTRAN: ON
|
||||
# - COMPILER: clang-cl
|
||||
# DYNAMIC_ARCH: ON
|
||||
# WITH_FORTRAN: OFF
|
||||
# - COMPILER: cl
|
||||
# - COMPILER: MinGW64-gcc-7.2.0-mingw
|
||||
# DYNAMIC_ARCH: OFF
|
||||
# WITH_FORTRAN: ignore
|
||||
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
|
||||
COMPILER: MinGW-gcc-6.3.0-32
|
||||
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
|
||||
|
|
@ -46,6 +46,7 @@ environment:
|
|||
|
||||
install:
|
||||
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
|
||||
- if [%COMPILER%]==[clang-cl] conda update --yes -n base conda
|
||||
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
|
||||
- if [%COMPILER%]==[clang-cl] conda config --set auto_update_conda false
|
||||
- if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1
|
||||
|
|
@ -64,8 +65,8 @@ before_build:
|
|||
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
|
||||
- if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
||||
- if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DMSVC_STATIC_CRT=ON ..
|
||||
- if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
||||
- if [%USE_OPENMP%]==[ON] cmake -DUSE_OPENMP=ON ..
|
||||
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
|
||||
|
||||
|
|
|
|||
|
|
@ -75,7 +75,50 @@ jobs:
|
|||
cd utest
|
||||
dir
|
||||
openblas_utest.exe
|
||||
|
||||
|
||||
- job: Windows_mingw_gmake
|
||||
pool:
|
||||
vmImage: 'windows-latest'
|
||||
steps:
|
||||
- script: |
|
||||
mingw32-make CC=gcc FC=gfortran DYNAMIC_ARCH=1 DYNAMIC_LIST="NEHALEM SANDYBRIDGE HASWELL"
|
||||
|
||||
- job: Windows_clang_cmake
|
||||
pool:
|
||||
vmImage: 'windows-latest'
|
||||
steps:
|
||||
- script: |
|
||||
set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%"
|
||||
set "LIB=C:\Miniconda\Library\lib;%LIB%"
|
||||
set "CPATH=C:\Miniconda\Library\include;%CPATH%
|
||||
conda config --add channels conda-forge --force
|
||||
conda config --set auto_update_conda false
|
||||
conda install --yes ninja
|
||||
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DNOFORTRAN=1 -DMSVC_STATIC_CRT=ON ..
|
||||
cmake --build . --config Release
|
||||
ctest
|
||||
|
||||
- job: Windows_flang_clang
|
||||
pool:
|
||||
vmImage: 'windows-latest'
|
||||
steps:
|
||||
- script: |
|
||||
set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%"
|
||||
set "LIB=C:\Miniconda\Library\lib;%LIB%"
|
||||
set "CPATH=C:\Miniconda\Library\include;%CPATH%"
|
||||
conda config --add channels conda-forge --force
|
||||
conda config --set auto_update_conda false
|
||||
conda install --yes --quiet ninja flang
|
||||
mkdir build
|
||||
cd build
|
||||
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
||||
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
|
||||
cmake --build . --config Release
|
||||
ctest
|
||||
|
||||
- job: OSX_OpenMP
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
|
|
@ -122,7 +165,7 @@ jobs:
|
|||
make
|
||||
ctest
|
||||
|
||||
- job: OSX_OpenMP_Clang_gf_cmake
|
||||
- job: OSX_dynarch_cmake
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
variables:
|
||||
|
|
@ -130,14 +173,12 @@ jobs:
|
|||
LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
brew install llvm libomp
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNO_AVX512=1 ..
|
||||
make
|
||||
cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON ..
|
||||
cmake --build .
|
||||
ctest
|
||||
|
||||
|
||||
- job: OSX_Ifort_Clang
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
|
|
@ -179,7 +220,7 @@ jobs:
|
|||
brew update
|
||||
brew install --cask android-ndk
|
||||
export ANDROID_NDK_HOME=/usr/local/share/android-ndk
|
||||
make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4
|
||||
make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/llvm-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4
|
||||
|
||||
- job: OSX_IOS_ARMV8
|
||||
pool:
|
||||
|
|
@ -206,9 +247,9 @@ jobs:
|
|||
vmImage: 'ubuntu-latest'
|
||||
steps:
|
||||
- script: |
|
||||
wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.1/alpine-chroot-install \
|
||||
&& echo '7c7e3fa378e69aecc7f5f01bbc759e5f0a9d9b74 alpine-chroot-install' | sha1sum -c \
|
||||
|| exit 1
|
||||
wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \
|
||||
&& echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc alpine-chroot-install' | sha1sum -c \
|
||||
|| exit 1
|
||||
alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
|
||||
sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo'
|
||||
alpine make DYNAMIC_ARCH=1 BINARY=64
|
||||
|
|
|
|||
|
|
@@ -125,7 +125,7 @@ int main(int argc, char *argv[]){
  fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
  for(j = 0; j < m; j++){
    for(i = 0; i < n * COMPSIZE; i++){
      a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    }
  }

@@ -162,7 +162,7 @@ int main(int argc, char *argv[]){
  fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
  for(j = 0; j < m; j++){
    for(i = 0; i < n * COMPSIZE; i++){
      a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    }
  }
|
|
|||
|
|
@@ -109,7 +109,7 @@ if (${ARCH} STREQUAL "ia64")
  endif ()
endif ()

if (MIPS64)
if (MIPS32 OR MIPS64)
  set(NO_BINARY_MODE 1)
endif ()
|
|
|
|||
|
|
@@ -15,6 +15,11 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS
if (NO_BINARY_MODE)

  if (MIPS32)
    set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=32")
    set(BINARY_DEFINED 1)
  endif ()

  if (MIPS64)
    if (BINARY64)
      set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=64")
||||
|
|
@ -126,6 +131,65 @@ if (${CORE} STREQUAL COOPERLAKE)
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL SAPPHIRERAPIDS)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (NOT NO_AVX512)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=sapphirerapids")
|
||||
else ()
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512")
|
||||
endif()
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL A64FX)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx")
|
||||
else ()
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
|
||||
endif()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL ARMV8SVE)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL POWER10)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math")
|
||||
else ()
|
||||
message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10." )
|
||||
endif()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL POWER9)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math")
|
||||
else ()
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
|
||||
message(WARNING "Compiler GCC.${GCC_VERSION} does not fully support Power9.")
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL POWER8)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (HAVE_AVX2)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2")
|
||||
|
|
|
|||
|
|
@@ -3,11 +3,6 @@
## Description: Ported from portion of OpenBLAS/Makefile.system
## Sets Fortran related variables.

if (INTERFACE64)
  set(SUFFIX64 64)
  set(SUFFIX64_UNDERSCORE _64)
endif()

if (${F_COMPILER} STREQUAL "FLANG")
  set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
  if (BINARY64 AND INTERFACE64)
||||
|
|
|
|||
|
|
@@ -1,214 +1,218 @@
# helper functions for the kernel CMakeLists.txt

function(SetFallback KERNEL SOURCE_PATH)
  if (NOT (DEFINED ${KERNEL}))
    set(${KERNEL} ${SOURCE_PATH} PARENT_SCOPE)
  endif ()
endfunction()
||||
|
||||
# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file.
|
||||
macro(SetDefaultL1)
|
||||
set(SAMAXKERNEL amax.S)
|
||||
set(DAMAXKERNEL amax.S)
|
||||
set(QAMAXKERNEL amax.S)
|
||||
set(CAMAXKERNEL zamax.S)
|
||||
set(ZAMAXKERNEL zamax.S)
|
||||
set(XAMAXKERNEL zamax.S)
|
||||
set(SAMINKERNEL amin.S)
|
||||
set(DAMINKERNEL amin.S)
|
||||
set(QAMINKERNEL amin.S)
|
||||
set(CAMINKERNEL zamin.S)
|
||||
set(ZAMINKERNEL zamin.S)
|
||||
set(XAMINKERNEL zamin.S)
|
||||
set(SMAXKERNEL max.S)
|
||||
set(DMAXKERNEL max.S)
|
||||
set(QMAXKERNEL max.S)
|
||||
set(SMINKERNEL min.S)
|
||||
set(DMINKERNEL min.S)
|
||||
set(QMINKERNEL min.S)
|
||||
set(ISAMAXKERNEL iamax.S)
|
||||
set(IDAMAXKERNEL iamax.S)
|
||||
set(IQAMAXKERNEL iamax.S)
|
||||
set(ICAMAXKERNEL izamax.S)
|
||||
set(IZAMAXKERNEL izamax.S)
|
||||
set(IXAMAXKERNEL izamax.S)
|
||||
set(ISAMINKERNEL iamin.S)
|
||||
set(IDAMINKERNEL iamin.S)
|
||||
set(IQAMINKERNEL iamin.S)
|
||||
set(ICAMINKERNEL izamin.S)
|
||||
set(IZAMINKERNEL izamin.S)
|
||||
set(IXAMINKERNEL izamin.S)
|
||||
set(ISMAXKERNEL iamax.S)
|
||||
set(IDMAXKERNEL iamax.S)
|
||||
set(IQMAXKERNEL iamax.S)
|
||||
set(ISMINKERNEL iamin.S)
|
||||
set(IDMINKERNEL iamin.S)
|
||||
set(IQMINKERNEL iamin.S)
|
||||
set(SASUMKERNEL asum.S)
|
||||
set(DASUMKERNEL asum.S)
|
||||
set(CASUMKERNEL zasum.S)
|
||||
set(ZASUMKERNEL zasum.S)
|
||||
set(QASUMKERNEL asum.S)
|
||||
set(XASUMKERNEL zasum.S)
|
||||
set(SAXPYKERNEL axpy.S)
|
||||
set(DAXPYKERNEL axpy.S)
|
||||
set(CAXPYKERNEL zaxpy.S)
|
||||
set(ZAXPYKERNEL zaxpy.S)
|
||||
set(QAXPYKERNEL axpy.S)
|
||||
set(XAXPYKERNEL zaxpy.S)
|
||||
set(SCOPYKERNEL copy.S)
|
||||
set(DCOPYKERNEL copy.S)
|
||||
set(CCOPYKERNEL zcopy.S)
|
||||
set(ZCOPYKERNEL zcopy.S)
|
||||
set(QCOPYKERNEL copy.S)
|
||||
set(XCOPYKERNEL zcopy.S)
|
||||
set(SDOTKERNEL dot.S)
|
||||
set(DDOTKERNEL dot.S)
|
||||
set(CDOTKERNEL zdot.S)
|
||||
set(ZDOTKERNEL zdot.S)
|
||||
set(QDOTKERNEL dot.S)
|
||||
set(XDOTKERNEL zdot.S)
|
||||
set(SNRM2KERNEL nrm2.S)
|
||||
set(DNRM2KERNEL nrm2.S)
|
||||
set(QNRM2KERNEL nrm2.S)
|
||||
set(CNRM2KERNEL znrm2.S)
|
||||
set(ZNRM2KERNEL znrm2.S)
|
||||
set(XNRM2KERNEL znrm2.S)
|
||||
set(SROTKERNEL rot.S)
|
||||
set(DROTKERNEL rot.S)
|
||||
set(QROTKERNEL rot.S)
|
||||
set(CROTKERNEL zrot.S)
|
||||
set(ZROTKERNEL zrot.S)
|
||||
set(XROTKERNEL zrot.S)
|
||||
set(SSCALKERNEL scal.S)
|
||||
set(DSCALKERNEL scal.S)
|
||||
set(CSCALKERNEL zscal.S)
|
||||
set(ZSCALKERNEL zscal.S)
|
||||
set(QSCALKERNEL scal.S)
|
||||
set(XSCALKERNEL zscal.S)
|
||||
set(SSWAPKERNEL swap.S)
|
||||
set(DSWAPKERNEL swap.S)
|
||||
set(CSWAPKERNEL zswap.S)
|
||||
set(ZSWAPKERNEL zswap.S)
|
||||
set(QSWAPKERNEL swap.S)
|
||||
set(XSWAPKERNEL zswap.S)
|
||||
set(SGEMVNKERNEL gemv_n.S)
|
||||
set(SGEMVTKERNEL gemv_t.S)
|
||||
set(DGEMVNKERNEL gemv_n.S)
|
||||
set(DGEMVTKERNEL gemv_t.S)
|
||||
set(CGEMVNKERNEL zgemv_n.S)
|
||||
set(CGEMVTKERNEL zgemv_t.S)
|
||||
set(ZGEMVNKERNEL zgemv_n.S)
|
||||
set(ZGEMVTKERNEL zgemv_t.S)
|
||||
set(QGEMVNKERNEL gemv_n.S)
|
||||
set(QGEMVTKERNEL gemv_t.S)
|
||||
set(XGEMVNKERNEL zgemv_n.S)
|
||||
set(XGEMVTKERNEL zgemv_t.S)
|
||||
set(SCABS_KERNEL ../generic/cabs.c)
|
||||
set(DCABS_KERNEL ../generic/cabs.c)
|
||||
set(QCABS_KERNEL ../generic/cabs.c)
|
||||
set(LSAME_KERNEL ../generic/lsame.c)
|
||||
set(SAXPBYKERNEL ../arm/axpby.c)
|
||||
set(DAXPBYKERNEL ../arm/axpby.c)
|
||||
set(CAXPBYKERNEL ../arm/zaxpby.c)
|
||||
set(ZAXPBYKERNEL ../arm/zaxpby.c)
|
||||
set(SSUMKERNEL sum.S)
|
||||
set(DSUMKERNEL sum.S)
|
||||
set(CSUMKERNEL zsum.S)
|
||||
set(ZSUMKERNEL zsum.S)
|
||||
set(QSUMKERNEL sum.S)
|
||||
set(XSUMKERNEL zsum.S)
|
||||
SetFallback(SAMAXKERNEL amax.S)
|
||||
SetFallback(DAMAXKERNEL amax.S)
|
||||
SetFallback(QAMAXKERNEL amax.S)
|
||||
SetFallback(CAMAXKERNEL zamax.S)
|
||||
SetFallback(ZAMAXKERNEL zamax.S)
|
||||
SetFallback(XAMAXKERNEL zamax.S)
|
||||
SetFallback(SAMINKERNEL amin.S)
|
||||
SetFallback(DAMINKERNEL amin.S)
|
||||
SetFallback(QAMINKERNEL amin.S)
|
||||
SetFallback(CAMINKERNEL zamin.S)
|
||||
SetFallback(ZAMINKERNEL zamin.S)
|
||||
SetFallback(XAMINKERNEL zamin.S)
|
||||
SetFallback(SMAXKERNEL max.S)
|
||||
SetFallback(DMAXKERNEL max.S)
|
||||
SetFallback(QMAXKERNEL max.S)
|
||||
SetFallback(SMINKERNEL min.S)
|
||||
SetFallback(DMINKERNEL min.S)
|
||||
SetFallback(QMINKERNEL min.S)
|
||||
SetFallback(ISAMAXKERNEL iamax.S)
|
||||
SetFallback(IDAMAXKERNEL iamax.S)
|
||||
SetFallback(IQAMAXKERNEL iamax.S)
|
||||
SetFallback(ICAMAXKERNEL izamax.S)
|
||||
SetFallback(IZAMAXKERNEL izamax.S)
|
||||
SetFallback(IXAMAXKERNEL izamax.S)
|
||||
SetFallback(ISAMINKERNEL iamin.S)
|
||||
SetFallback(IDAMINKERNEL iamin.S)
|
||||
SetFallback(IQAMINKERNEL iamin.S)
|
||||
SetFallback(ICAMINKERNEL izamin.S)
|
||||
SetFallback(IZAMINKERNEL izamin.S)
|
||||
SetFallback(IXAMINKERNEL izamin.S)
|
||||
SetFallback(ISMAXKERNEL iamax.S)
|
||||
SetFallback(IDMAXKERNEL iamax.S)
|
||||
SetFallback(IQMAXKERNEL iamax.S)
|
||||
SetFallback(ISMINKERNEL iamin.S)
|
||||
SetFallback(IDMINKERNEL iamin.S)
|
||||
SetFallback(IQMINKERNEL iamin.S)
|
||||
SetFallback(SASUMKERNEL asum.S)
|
||||
SetFallback(DASUMKERNEL asum.S)
|
||||
SetFallback(CASUMKERNEL zasum.S)
|
||||
SetFallback(ZASUMKERNEL zasum.S)
|
||||
SetFallback(QASUMKERNEL asum.S)
|
||||
SetFallback(XASUMKERNEL zasum.S)
|
||||
SetFallback(SAXPYKERNEL axpy.S)
|
||||
SetFallback(DAXPYKERNEL axpy.S)
|
||||
SetFallback(CAXPYKERNEL zaxpy.S)
|
||||
SetFallback(ZAXPYKERNEL zaxpy.S)
|
||||
SetFallback(QAXPYKERNEL axpy.S)
|
||||
SetFallback(XAXPYKERNEL zaxpy.S)
|
||||
SetFallback(SCOPYKERNEL copy.S)
|
||||
SetFallback(DCOPYKERNEL copy.S)
|
||||
SetFallback(CCOPYKERNEL zcopy.S)
|
||||
SetFallback(ZCOPYKERNEL zcopy.S)
|
||||
SetFallback(QCOPYKERNEL copy.S)
|
||||
SetFallback(XCOPYKERNEL zcopy.S)
|
||||
SetFallback(SDOTKERNEL dot.S)
|
||||
SetFallback(DDOTKERNEL dot.S)
|
||||
SetFallback(CDOTKERNEL zdot.S)
|
||||
SetFallback(ZDOTKERNEL zdot.S)
|
||||
SetFallback(QDOTKERNEL dot.S)
|
||||
SetFallback(XDOTKERNEL zdot.S)
|
||||
SetFallback(SNRM2KERNEL nrm2.S)
|
||||
SetFallback(DNRM2KERNEL nrm2.S)
|
||||
SetFallback(QNRM2KERNEL nrm2.S)
|
||||
SetFallback(CNRM2KERNEL znrm2.S)
|
||||
SetFallback(ZNRM2KERNEL znrm2.S)
|
||||
SetFallback(XNRM2KERNEL znrm2.S)
|
||||
SetFallback(SROTKERNEL rot.S)
|
||||
SetFallback(DROTKERNEL rot.S)
|
||||
SetFallback(QROTKERNEL rot.S)
|
||||
SetFallback(CROTKERNEL zrot.S)
|
||||
SetFallback(ZROTKERNEL zrot.S)
|
||||
SetFallback(XROTKERNEL zrot.S)
|
||||
SetFallback(SSCALKERNEL scal.S)
|
||||
SetFallback(DSCALKERNEL scal.S)
|
||||
SetFallback(CSCALKERNEL zscal.S)
|
||||
SetFallback(ZSCALKERNEL zscal.S)
|
||||
SetFallback(QSCALKERNEL scal.S)
|
||||
SetFallback(XSCALKERNEL zscal.S)
|
||||
SetFallback(SSWAPKERNEL swap.S)
|
||||
SetFallback(DSWAPKERNEL swap.S)
|
||||
SetFallback(CSWAPKERNEL zswap.S)
|
||||
SetFallback(ZSWAPKERNEL zswap.S)
|
||||
SetFallback(QSWAPKERNEL swap.S)
|
||||
SetFallback(XSWAPKERNEL zswap.S)
|
||||
SetFallback(SGEMVNKERNEL gemv_n.S)
|
||||
SetFallback(SGEMVTKERNEL gemv_t.S)
|
||||
SetFallback(DGEMVNKERNEL gemv_n.S)
|
||||
SetFallback(DGEMVTKERNEL gemv_t.S)
|
||||
SetFallback(CGEMVNKERNEL zgemv_n.S)
|
||||
SetFallback(CGEMVTKERNEL zgemv_t.S)
|
||||
SetFallback(ZGEMVNKERNEL zgemv_n.S)
|
||||
SetFallback(ZGEMVTKERNEL zgemv_t.S)
|
||||
SetFallback(QGEMVNKERNEL gemv_n.S)
|
||||
SetFallback(QGEMVTKERNEL gemv_t.S)
|
||||
SetFallback(XGEMVNKERNEL zgemv_n.S)
|
||||
SetFallback(XGEMVTKERNEL zgemv_t.S)
|
||||
SetFallback(SCABS_KERNEL ../generic/cabs.c)
|
||||
SetFallback(DCABS_KERNEL ../generic/cabs.c)
|
||||
SetFallback(QCABS_KERNEL ../generic/cabs.c)
|
||||
SetFallback(LSAME_KERNEL ../generic/lsame.c)
|
||||
SetFallback(SAXPBYKERNEL ../arm/axpby.c)
|
||||
SetFallback(DAXPBYKERNEL ../arm/axpby.c)
|
||||
SetFallback(CAXPBYKERNEL ../arm/zaxpby.c)
|
||||
SetFallback(ZAXPBYKERNEL ../arm/zaxpby.c)
|
||||
SetFallback(SSUMKERNEL sum.S)
|
||||
SetFallback(DSUMKERNEL sum.S)
|
||||
SetFallback(CSUMKERNEL zsum.S)
|
||||
SetFallback(ZSUMKERNEL zsum.S)
|
||||
SetFallback(QSUMKERNEL sum.S)
|
||||
SetFallback(XSUMKERNEL zsum.S)
|
||||
if (BUILD_BFLOAT16)
|
||||
set(SHAMINKERNEL ../arm/amin.c)
|
||||
set(SHAMAXKERNEL ../arm/amax.c)
|
||||
set(SHMAXKERNEL ../arm/max.c)
|
||||
set(SHMINKERNEL ../arm/min.c)
|
||||
set(ISHAMAXKERNEL ../arm/iamax.c)
|
||||
set(ISHAMINKERNEL ../arm/iamin.c)
|
||||
set(ISHMAXKERNEL ../arm/imax.c)
|
||||
set(ISHMINKERNEL ../arm/imin.c)
|
||||
set(SHASUMKERNEL ../arm/asum.c)
|
||||
set(SHAXPYKERNEL ../arm/axpy.c)
|
||||
set(SHAXPBYKERNEL ../arm/axpby.c)
|
||||
set(SHCOPYKERNEL ../arm/copy.c)
|
||||
set(SBDOTKERNEL ../x86_64/sbdot.c)
|
||||
set(SHROTKERNEL ../arm/rot.c)
|
||||
set(SHSCALKERNEL ../arm/scal.c)
|
||||
set(SHNRM2KERNEL ../arm/nrm2.c)
|
||||
set(SHSUMKERNEL ../arm/sum.c)
|
||||
set(SHSWAPKERNEL ../arm/swap.c)
|
||||
set(TOBF16KERNEL ../x86_64/tobf16.c)
|
||||
set(BF16TOKERNEL ../x86_64/bf16to.c)
|
||||
set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c)
|
||||
set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c)
|
||||
SetFallback(SHAMINKERNEL ../arm/amin.c)
|
||||
SetFallback(SHAMAXKERNEL ../arm/amax.c)
|
||||
SetFallback(SHMAXKERNEL ../arm/max.c)
|
||||
SetFallback(SHMINKERNEL ../arm/min.c)
|
||||
SetFallback(ISHAMAXKERNEL ../arm/iamax.c)
|
||||
SetFallback(ISHAMINKERNEL ../arm/iamin.c)
|
||||
SetFallback(ISHMAXKERNEL ../arm/imax.c)
|
||||
SetFallback(ISHMINKERNEL ../arm/imin.c)
|
||||
SetFallback(SHASUMKERNEL ../arm/asum.c)
|
||||
SetFallback(SHAXPYKERNEL ../arm/axpy.c)
|
||||
SetFallback(SHAXPBYKERNEL ../arm/axpby.c)
|
||||
SetFallback(SHCOPYKERNEL ../arm/copy.c)
|
||||
SetFallback(SBDOTKERNEL ../x86_64/sbdot.c)
|
||||
SetFallback(SHROTKERNEL ../arm/rot.c)
|
||||
SetFallback(SHSCALKERNEL ../arm/scal.c)
|
||||
SetFallback(SHNRM2KERNEL ../arm/nrm2.c)
|
||||
SetFallback(SHSUMKERNEL ../arm/sum.c)
|
||||
SetFallback(SHSWAPKERNEL ../arm/swap.c)
|
||||
SetFallback(TOBF16KERNEL ../x86_64/tobf16.c)
|
||||
SetFallback(BF16TOKERNEL ../x86_64/bf16to.c)
|
||||
SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c)
|
||||
SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c)
|
||||
endif ()
|
||||
endmacro ()
|
||||
|
||||
macro(SetDefaultL2)
|
||||
set(SGEMVNKERNEL ../arm/gemv_n.c)
|
||||
set(SGEMVTKERNEL ../arm/gemv_t.c)
|
||||
set(DGEMVNKERNEL gemv_n.S)
|
||||
set(DGEMVTKERNEL gemv_t.S)
|
||||
set(CGEMVNKERNEL zgemv_n.S)
|
||||
set(CGEMVTKERNEL zgemv_t.S)
|
||||
set(ZGEMVNKERNEL zgemv_n.S)
|
||||
set(ZGEMVTKERNEL zgemv_t.S)
|
||||
set(QGEMVNKERNEL gemv_n.S)
|
||||
set(QGEMVTKERNEL gemv_t.S)
|
||||
set(XGEMVNKERNEL zgemv_n.S)
|
||||
set(XGEMVTKERNEL zgemv_t.S)
|
||||
set(SGERKERNEL ../generic/ger.c)
|
||||
set(DGERKERNEL ../generic/ger.c)
|
||||
set(QGERKERNEL ../generic/ger.c)
|
||||
set(CGERUKERNEL ../generic/zger.c)
|
||||
set(CGERCKERNEL ../generic/zger.c)
|
||||
set(ZGERUKERNEL ../generic/zger.c)
|
||||
set(ZGERCKERNEL ../generic/zger.c)
|
||||
set(XGERUKERNEL ../generic/zger.c)
|
||||
set(XGERCKERNEL ../generic/zger.c)
|
||||
set(SSYMV_U_KERNEL ../generic/symv_k.c)
|
||||
set(SSYMV_L_KERNEL ../generic/symv_k.c)
|
||||
set(DSYMV_U_KERNEL ../generic/symv_k.c)
|
||||
set(DSYMV_L_KERNEL ../generic/symv_k.c)
|
||||
set(QSYMV_U_KERNEL ../generic/symv_k.c)
|
||||
set(QSYMV_L_KERNEL ../generic/symv_k.c)
|
||||
set(CSYMV_U_KERNEL ../generic/zsymv_k.c)
|
||||
set(CSYMV_L_KERNEL ../generic/zsymv_k.c)
|
||||
set(ZSYMV_U_KERNEL ../generic/zsymv_k.c)
|
||||
set(ZSYMV_L_KERNEL ../generic/zsymv_k.c)
|
||||
set(XSYMV_U_KERNEL ../generic/zsymv_k.c)
|
||||
set(XSYMV_L_KERNEL ../generic/zsymv_k.c)
|
||||
set(CHEMV_U_KERNEL ../generic/zhemv_k.c)
|
||||
set(CHEMV_L_KERNEL ../generic/zhemv_k.c)
|
||||
set(CHEMV_V_KERNEL ../generic/zhemv_k.c)
|
||||
set(CHEMV_M_KERNEL ../generic/zhemv_k.c)
|
||||
set(ZHEMV_U_KERNEL ../generic/zhemv_k.c)
|
||||
set(ZHEMV_L_KERNEL ../generic/zhemv_k.c)
|
||||
set(ZHEMV_V_KERNEL ../generic/zhemv_k.c)
|
||||
set(ZHEMV_M_KERNEL ../generic/zhemv_k.c)
|
||||
set(XHEMV_U_KERNEL ../generic/zhemv_k.c)
|
||||
set(XHEMV_L_KERNEL ../generic/zhemv_k.c)
|
||||
set(XHEMV_V_KERNEL ../generic/zhemv_k.c)
|
||||
set(XHEMV_M_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(SGEMVNKERNEL ../arm/gemv_n.c)
|
||||
SetFallback(SGEMVTKERNEL ../arm/gemv_t.c)
|
||||
SetFallback(DGEMVNKERNEL gemv_n.S)
|
||||
SetFallback(DGEMVTKERNEL gemv_t.S)
|
||||
SetFallback(CGEMVNKERNEL zgemv_n.S)
|
||||
SetFallback(CGEMVTKERNEL zgemv_t.S)
|
||||
SetFallback(ZGEMVNKERNEL zgemv_n.S)
|
||||
SetFallback(ZGEMVTKERNEL zgemv_t.S)
|
||||
SetFallback(QGEMVNKERNEL gemv_n.S)
|
||||
SetFallback(QGEMVTKERNEL gemv_t.S)
|
||||
SetFallback(XGEMVNKERNEL zgemv_n.S)
|
||||
SetFallback(XGEMVTKERNEL zgemv_t.S)
|
||||
SetFallback(SGERKERNEL ../generic/ger.c)
|
||||
SetFallback(DGERKERNEL ../generic/ger.c)
|
||||
SetFallback(QGERKERNEL ../generic/ger.c)
|
||||
SetFallback(CGERUKERNEL ../generic/zger.c)
|
||||
SetFallback(CGERCKERNEL ../generic/zger.c)
|
||||
SetFallback(ZGERUKERNEL ../generic/zger.c)
|
||||
SetFallback(ZGERCKERNEL ../generic/zger.c)
|
||||
SetFallback(XGERUKERNEL ../generic/zger.c)
|
||||
SetFallback(XGERCKERNEL ../generic/zger.c)
|
||||
SetFallback(SSYMV_U_KERNEL ../generic/symv_k.c)
|
||||
SetFallback(SSYMV_L_KERNEL ../generic/symv_k.c)
|
||||
SetFallback(DSYMV_U_KERNEL ../generic/symv_k.c)
|
||||
SetFallback(DSYMV_L_KERNEL ../generic/symv_k.c)
|
||||
SetFallback(QSYMV_U_KERNEL ../generic/symv_k.c)
|
||||
SetFallback(QSYMV_L_KERNEL ../generic/symv_k.c)
|
||||
SetFallback(CSYMV_U_KERNEL ../generic/zsymv_k.c)
|
||||
SetFallback(CSYMV_L_KERNEL ../generic/zsymv_k.c)
|
||||
SetFallback(ZSYMV_U_KERNEL ../generic/zsymv_k.c)
|
||||
SetFallback(ZSYMV_L_KERNEL ../generic/zsymv_k.c)
|
||||
SetFallback(XSYMV_U_KERNEL ../generic/zsymv_k.c)
|
||||
SetFallback(XSYMV_L_KERNEL ../generic/zsymv_k.c)
|
||||
SetFallback(CHEMV_U_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(CHEMV_L_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(CHEMV_V_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(CHEMV_M_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(ZHEMV_U_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(ZHEMV_L_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(ZHEMV_V_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(ZHEMV_M_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(XHEMV_U_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(XHEMV_L_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(XHEMV_V_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(XHEMV_M_KERNEL ../generic/zhemv_k.c)
|
||||
if (BUILD_BFLOAT16)
|
||||
set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c)
|
||||
set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c)
|
||||
set(SHGERKERNEL ../generic/ger.c)
|
||||
SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c)
|
||||
SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c)
|
||||
SetFallback(SHGERKERNEL ../generic/ger.c)
|
||||
endif ()
|
||||
endmacro ()
|
||||
|
||||
macro(SetDefaultL3)
|
||||
set(SGEADD_KERNEL ../generic/geadd.c)
|
||||
set(DGEADD_KERNEL ../generic/geadd.c)
|
||||
set(CGEADD_KERNEL ../generic/zgeadd.c)
|
||||
set(ZGEADD_KERNEL ../generic/zgeadd.c)
|
||||
SetFallback(SGEADD_KERNEL ../generic/geadd.c)
|
||||
SetFallback(DGEADD_KERNEL ../generic/geadd.c)
|
||||
SetFallback(CGEADD_KERNEL ../generic/zgeadd.c)
|
||||
SetFallback(ZGEADD_KERNEL ../generic/zgeadd.c)
|
||||
if (BUILD_BFLOAT16)
|
||||
set(SHGEADD_KERNEL ../generic/geadd.c)
|
||||
set(SBGEMMKERNEL ../generic/gemmkernel_2x2.c)
|
||||
set(SBGEMM_BETA ../generic/gemm_beta.c)
|
||||
set(SBGEMMINCOPY ../generic/gemm_ncopy_2.c)
|
||||
set(SBGEMMITCOPY ../generic/gemm_tcopy_2.c)
|
||||
set(SBGEMMONCOPY ../generic/gemm_ncopy_2.c)
|
||||
set(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c)
|
||||
set(SBGEMMINCOPYOBJ sbgemm_incopy.o)
|
||||
set(SBGEMMITCOPYOBJ sbgemm_itcopy.o)
|
||||
set(SBGEMMONCOPYOBJ sbgemm_oncopy.o)
|
||||
set(SBGEMMOTCOPYOBJ sbgemm_otcopy.o)
|
||||
SetFallback(SHGEADD_KERNEL ../generic/geadd.c)
|
||||
SetFallback(SBGEMMKERNEL ../generic/gemmkernel_2x2.c)
|
||||
SetFallback(SBGEMM_BETA ../generic/gemm_beta.c)
|
||||
SetFallback(SBGEMMINCOPY ../generic/gemm_ncopy_2.c)
|
||||
SetFallback(SBGEMMITCOPY ../generic/gemm_tcopy_2.c)
|
||||
SetFallback(SBGEMMONCOPY ../generic/gemm_ncopy_2.c)
|
||||
SetFallback(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c)
|
||||
SetFallback(SBGEMMINCOPYOBJ sbgemm_incopy.o)
|
||||
SetFallback(SBGEMMITCOPYOBJ sbgemm_itcopy.o)
|
||||
SetFallback(SBGEMMONCOPYOBJ sbgemm_oncopy.o)
|
||||
SetFallback(SBGEMMOTCOPYOBJ sbgemm_otcopy.o)
|
||||
endif ()
|
||||
|
||||
endmacro ()
|
||||
|
|
|
|||
|
|
@ -416,7 +416,7 @@ endif ()
|
|||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "VORTEX")
|
||||
elseif ("${TCORE}" STREQUAL "VORTEX")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define ARMV8\n"
|
||||
"#define L1_CODE_SIZE\t32768\n"
|
||||
|
|
@ -439,6 +439,34 @@ elseif ("${TCORE}" STREQUAL "VORTEX")
|
|||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "P5600")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L2_SIZE 1048576\n"
|
||||
"#define DTB_SIZE 4096\n"
|
||||
"#define DTB_DEFAULT_ENTRIES 64\n")
|
||||
set(SGEMM_UNROLL_M 2)
|
||||
set(SGEMM_UNROLL_N 2)
|
||||
set(DGEMM_UNROLL_M 2)
|
||||
set(DGEMM_UNROLL_N 2)
|
||||
set(CGEMM_UNROLL_M 2)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" MATCHES "MIPS")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L2_SIZE 262144\n"
|
||||
"#define DTB_SIZE 4096\n"
|
||||
"#define DTB_DEFAULT_ENTRIES 64\n")
|
||||
set(SGEMM_UNROLL_M 2)
|
||||
set(SGEMM_UNROLL_N 2)
|
||||
set(DGEMM_UNROLL_M 2)
|
||||
set(DGEMM_UNROLL_N 2)
|
||||
set(CGEMM_UNROLL_M 2)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "POWER6")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE 32768\n"
|
||||
|
|
|
|||
|
|
@ -33,7 +33,7 @@ endif ()
|
|||
if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
|
||||
message(STATUS "Compiling a ${BINARY}-bit binary.")
|
||||
set(NO_AVX 1)
|
||||
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE")
|
||||
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE" OR ${TARGET} STREQUAL "SAPPHIRERAPIDS")
|
||||
set(TARGET "NEHALEM")
|
||||
endif ()
|
||||
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
|
||||
|
|
@ -42,6 +42,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
|
|||
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53" OR ${TARGET} STREQUAL "CORTEXA55")
|
||||
set(TARGET "ARMV7")
|
||||
endif ()
|
||||
if (${TARGET} STREQUAL "POWER8" OR ${TARGET} STREQUAL "POWER9" OR ${TARGET} STREQUAL "POWER10")
|
||||
set(TARGET "POWER6")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
|
||||
|
|
@ -102,6 +105,18 @@ if (CMAKE_C_COMPILER STREQUAL loongcc)
|
|||
set(GETARCH_FLAGS "${GETARCH_FLAGS} -static")
|
||||
endif ()
|
||||
|
||||
if (POWER)
|
||||
set(NO_WARMUP 1)
|
||||
set(HAVE_GAS 1)
|
||||
if (CMAKE_ASM_COMPILER_ID STREQUAL "GNU")
|
||||
set(HAVE_GAS 0)
|
||||
elseif (CMAKE_ASM_COMPILER_ID STREQUAL "Clang")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -fno-integrated-as")
|
||||
set(HAVE_GAS 0)
|
||||
endif ()
|
||||
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DHAVE_GAS=${HAVE_GAS}")
|
||||
endif ()
|
||||
|
||||
#if don't use Fortran, it will only compile CBLAS.
|
||||
if (ONLY_CBLAS)
|
||||
set(NO_LAPACK 1)
|
||||
|
|
@ -163,6 +178,22 @@ if (DEFINED TARGET)
|
|||
endif()
|
||||
endif()
|
||||
endif()
|
||||
if (${TARGET} STREQUAL SAPPHIRERAPIDS AND NOT NO_AVX512)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 11.0)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids")
|
||||
else()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang")
|
||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 12.0)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids")
|
||||
else()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
|
|
@ -206,6 +237,27 @@ if (DEFINED TARGET)
|
|||
if (DEFINED HAVE_SSE4_1)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1")
|
||||
endif()
|
||||
|
||||
if (${TARGET} STREQUAL POWER10)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math")
|
||||
else ()
|
||||
message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10.")
|
||||
endif()
|
||||
endif()
|
||||
if (${TARGET} STREQUAL POWER9)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math")
|
||||
else ()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
|
||||
message(WARNING "Compiler GCC.${GCC_VERSION} does not support fully Power9.")
|
||||
endif()
|
||||
endif()
|
||||
if (${TARGET} STREQUAL POWER8)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
|
||||
endif()
|
||||
endif()
|
||||
if (DEFINED BINARY)
|
||||
message(STATUS "Compiling a ${BINARY}-bit binary.")
|
||||
|
|
@ -223,6 +275,11 @@ include("${PROJECT_SOURCE_DIR}/cmake/arch.cmake")
|
|||
# C Compiler dependent settings
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
|
||||
|
||||
if (INTERFACE64)
|
||||
set(SUFFIX64 64)
|
||||
set(SUFFIX64_UNDERSCORE _64)
|
||||
endif()
|
||||
|
||||
if (NOT NOFORTRAN)
|
||||
# Fortran Compiler dependent settings
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
|
||||
|
|
@ -258,7 +315,7 @@ if (NEED_PIC)
|
|||
endif()
|
||||
endif ()
|
||||
|
||||
if (X86_64)
|
||||
if (X86_64 OR ${CORE} STREQUAL POWER10)
|
||||
set(SMALL_MATRIX_OPT TRUE)
|
||||
endif ()
|
||||
if (SMALL_MATRIX_OPT)
|
||||
|
|
@ -266,7 +323,7 @@ if (SMALL_MATRIX_OPT)
|
|||
endif ()
|
||||
|
||||
if (DYNAMIC_ARCH)
|
||||
if (X86 OR X86_64 OR ARM64 OR PPC)
|
||||
if (X86 OR X86_64 OR ARM64 OR POWER)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
|
||||
if (DYNAMIC_OLDER)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
|
||||
|
|
|
|||
|
|
@ -20,11 +20,11 @@ endif()
|
|||
|
||||
|
||||
|
||||
if(CMAKE_COMPILER_IS_GNUCC AND WIN32)
|
||||
if(MINGW)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine
|
||||
OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE
|
||||
OUTPUT_VARIABLE OPENBLAS_MINGW_TARGET_MACHINE
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
if(OPENBLAS_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64")
|
||||
if(OPENBLAS_MINGW_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64")
|
||||
set(MINGW64 1)
|
||||
endif()
|
||||
endif()
|
||||
|
|
@ -35,7 +35,7 @@ if(CMAKE_CL_64 OR MINGW64)
|
|||
elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING))
|
||||
set(X86 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
|
||||
set(PPC 1)
|
||||
set(POWER 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
|
||||
set(MIPS64 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*")
|
||||
|
|
@ -73,6 +73,8 @@ elseif (${CMAKE_CROSSCOMPILING})
|
|||
else ()
|
||||
set(X86 1)
|
||||
endif()
|
||||
elseif (${TARGET} STREQUAL "P5600" OR ${TARGET} MATCHES "MIPS.*")
|
||||
set(MIPS32 1)
|
||||
elseif (${TARGET} STREQUAL "ARMV7")
|
||||
set(ARM 1)
|
||||
else()
|
||||
|
|
@ -86,8 +88,12 @@ if (X86_64)
|
|||
set(ARCH "x86_64")
|
||||
elseif(X86)
|
||||
set(ARCH "x86")
|
||||
elseif(PPC)
|
||||
elseif(POWER)
|
||||
set(ARCH "power")
|
||||
elseif(MIPS32)
|
||||
set(ARCH "mips")
|
||||
elseif(MIPS64)
|
||||
set(ARCH "mips64")
|
||||
elseif(ARM)
|
||||
set(ARCH "arm")
|
||||
elseif(ARM64)
|
||||
|
|
@ -97,7 +103,7 @@ else()
|
|||
endif ()
|
||||
|
||||
if (NOT BINARY)
|
||||
if (X86_64 OR ARM64 OR PPC OR MIPS64 OR LOONGARCH64)
|
||||
if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64)
|
||||
set(BINARY 64)
|
||||
else ()
|
||||
set(BINARY 32)
|
||||
|
|
|
|||
|
|
@ -15,35 +15,83 @@ endfunction ()
|
|||
# Reads a Makefile into CMake vars.
|
||||
macro(ParseMakefileVars MAKEFILE_IN)
|
||||
message(STATUS "Reading vars from ${MAKEFILE_IN}...")
|
||||
set (IfElse 0)
|
||||
set (ElseSeen 0)
|
||||
set (C_COMPILER ${CMAKE_C_COMPILER_ID})
|
||||
set (IfElse 0)
|
||||
set (ElseSeen 0)
|
||||
set (SkipIfs 0)
|
||||
set (SkipElse 0)
|
||||
file(STRINGS ${MAKEFILE_IN} makefile_contents)
|
||||
foreach (makefile_line ${makefile_contents})
|
||||
#message(STATUS "parsing ${makefile_line}")
|
||||
if (${IfElse} GREATER 0)
|
||||
#message(STATUS "parsing ${makefile_line}")
|
||||
# Skip the entire scope of the else statement given that the if statement that precedes it has the valid condition.
|
||||
# The variable SkipIfs is used to identify which endif statement closes the scope of the else statement.
|
||||
if (${SkipElse} EQUAL 1)
|
||||
#message(STATUS "skipping ${makefile_line}")
|
||||
string(REGEX MATCH "(ifeq|ifneq|ifdef|ifndef) .*$" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
MATH(EXPR SkipIfs "${SkipIfs}+1")
|
||||
endif ()
|
||||
string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
# message(STATUS "ENDIF ${makefile_line}")
|
||||
set (IfElse 0)
|
||||
set (ElseSeen 0)
|
||||
if (${SkipIfs} EQUAL 0)
|
||||
set (SkipElse 0)
|
||||
else ()
|
||||
MATH(EXPR SkipIfs "${SkipIfs}-1")
|
||||
endif ()
|
||||
endif ()
|
||||
continue ()
|
||||
endif ()
|
||||
# The variable IfElse is greater than 0 if and only if the previously parsed line is an if statement.
|
||||
if (${IfElse} GREATER 0)
|
||||
# If the current scope is the one that has to be skipped, the if/endif/else statements
|
||||
# along with it till the endif that closes the current scope have to be ignored as well.
|
||||
string(REGEX MATCH "(ifeq|ifneq|ifdef|ifndef) .*$" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
if ((${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR (${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1))
|
||||
#message(STATUS "skipping ${makefile_line}")
|
||||
MATH(EXPR SkipIfs "${SkipIfs}+1")
|
||||
continue ()
|
||||
endif ()
|
||||
endif ()
|
||||
string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
if (${SkipIfs} EQUAL 0)
|
||||
#message(STATUS "ENDIF ${makefile_line}")
|
||||
set (IfElse 0)
|
||||
set (ElseSeen 0)
|
||||
else ()
|
||||
#message(STATUS "skipping ${makefile_line}")
|
||||
MATH(EXPR SkipIfs "${SkipIfs}-1")
|
||||
endif ()
|
||||
continue ()
|
||||
endif ()
|
||||
string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
# message(STATUS "ELSE ${makefile_line}")
|
||||
set (ElseSeen 1)
|
||||
continue ()
|
||||
endif()
|
||||
if ( (${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR ( ${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1))
|
||||
# message(STATUS "skipping ${makefile_line}")
|
||||
continue ()
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
if (${SkipIfs} EQUAL 0)
|
||||
#message(STATUS "ELSE ${makefile_line}")
|
||||
set (ElseSeen 1)
|
||||
else ()
|
||||
#message(STATUS "skipping ${makefile_line}")
|
||||
endif ()
|
||||
continue ()
|
||||
endif()
|
||||
# Skip the lines that are not part of the path that has to be taken.
|
||||
if ((${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR (${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1) OR (${SkipIfs} GREATER 0))
|
||||
#message(STATUS "skipping ${makefile_line}")
|
||||
continue ()
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
# Skip commented lines (the ones that start with '#')
|
||||
string(REGEX MATCH "[ \t]*\\#.*$" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
#message(STATUS "skipping ${makefile_line}")
|
||||
continue ()
|
||||
endif ()
|
||||
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
#message(STATUS "match on ${line_match}")
|
||||
#message(STATUS "match on ${line_match}")
|
||||
set(var_name ${CMAKE_MATCH_1})
|
||||
# set(var_value ${CMAKE_MATCH_2})
|
||||
#set(var_value ${CMAKE_MATCH_2})
|
||||
string(STRIP ${CMAKE_MATCH_2} var_value)
|
||||
# check for Makefile variables in the string, e.g. $(TSUFFIX)
|
||||
string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value})
|
||||
|
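The branch-skipping bookkeeping introduced in this hunk (IfElse, ElseSeen, SkipIfs, SkipElse) is easiest to see on a small input. The standalone C sketch below is illustration only, not part of the patch: it models the counters for an invented Makefile fragment and prints which lines the parser would keep or skip.

    /* Schematic model of the macro's skip counters; the input lines and the
       initial "condition is false" decision are invented for the example. */
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        const char *lines[] = {
            "ifdef X",    /* nested conditional inside the skipped branch */
            "VAR = 1",
            "endif",
            "else",       /* the else-branch is the one that is taken     */
            "VAR = 2",
            "endif",
        };
        int IfElse = 2;   /* 2: the preceding ifeq was false, so skip until "else" */
        int ElseSeen = 0, SkipIfs = 0;
        for (size_t i = 0; i < sizeof lines / sizeof *lines; i++) {
            const char *l = lines[i];
            int skipping = (IfElse == 2 && !ElseSeen) ||
                           (IfElse == 1 && ElseSeen) || SkipIfs > 0;
            if (!strncmp(l, "if", 2)) { if (skipping) SkipIfs++; continue; }
            if (!strcmp(l, "endif"))  { if (SkipIfs > 0) SkipIfs--; else IfElse = 0; continue; }
            if (!strcmp(l, "else"))   { if (SkipIfs == 0) ElseSeen = 1; continue; }
            printf("%s  %s\n", skipping ? "skip" : "keep", l);
        }
        return 0;
    }

Running it prints "skip  VAR = 1" and "keep  VAR = 2": the nested ifdef/endif inside the dead branch is counted by SkipIfs so that only the matching outer endif re-enables processing, which is exactly the case the new SkipIfs/SkipElse handling addresses.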
|
@ -54,39 +102,93 @@ macro(ParseMakefileVars MAKEFILE_IN)
|
|||
string(REPLACE "$(${make_var})" "${${make_var}}" var_value ${var_value})
|
||||
endforeach ()
|
||||
set(${var_name} ${var_value})
|
||||
else ()
|
||||
string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
#message(STATUS "match on include ${line_match}")
|
||||
ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1})
|
||||
else ()
|
||||
# message(STATUS "unmatched line ${line_match}")
|
||||
string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
# message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}")
|
||||
if (DEFINED ${${CMAKE_MATCH_1}} AND ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})
|
||||
# message (STATUS "condition is true")
|
||||
set (IfElse 1)
|
||||
else ()
|
||||
set (IfElse 2)
|
||||
endif ()
|
||||
continue ()
|
||||
endif ()
|
||||
# Include a new file to be parsed
|
||||
string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
#message(STATUS "match on include ${line_match}")
|
||||
ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1})
|
||||
continue ()
|
||||
endif ()
|
||||
# The if statement that precedes this else has the path taken
|
||||
# Thus, this else statement has to be skipped.
|
||||
string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
#message(STATUS "skipping ${makefile_line}")
|
||||
set (SkipElse 1)
|
||||
continue()
|
||||
endif()
|
||||
# Example 1: ifdef HAVE_MSA
|
||||
# Example 2: ifndef ZNRM2KERNEL
|
||||
string(REGEX MATCH "(ifdef|ifndef) ([0-9_A-Z]+)" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
#message(STATUS "${CMAKE_MATCH_1} first: ${CMAKE_MATCH_2}")
|
||||
set (ElseSeen 0)
|
||||
if (DEFINED ${CMAKE_MATCH_2})
|
||||
if (${CMAKE_MATCH_1} STREQUAL "ifdef")
|
||||
#message (STATUS "condition is true")
|
||||
set (IfElse 1)
|
||||
else ()
|
||||
string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
# message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}")
|
||||
if ( ${CMAKE_MATCH_1} STREQUAL C_COMPILER)
|
||||
set (CMAKE_MATCH_1 CMAKE_C_COMPILER)
|
||||
endif ()
|
||||
if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}))
|
||||
# message (STATUS "condition is true")
|
||||
set (IfElse 1)
|
||||
else ()
|
||||
set (IfElse 2)
|
||||
endif ()
|
||||
endif ()
|
||||
set (IfElse 2)
|
||||
endif ()
|
||||
else ()
|
||||
if (${CMAKE_MATCH_1} STREQUAL "ifdef")
|
||||
set (IfElse 2)
|
||||
else ()
|
||||
#message (STATUS "condition is true")
|
||||
set (IfElse 1)
|
||||
endif ()
|
||||
endif ()
|
||||
continue ()
|
||||
endif ()
|
||||
# Example 1: ifeq ($(SGEMM_UNROLL_M), 16)
|
||||
# Example 2: ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8)
|
||||
# Example 3: ifeq ($(__BYTE_ORDER__)$(ELF_VERSION),__ORDER_BIG_ENDIAN__2)
|
||||
# Ignore the second group since (?:...) does not work on cmake
|
||||
string(REGEX MATCH "ifeq \\(\\$\\(([0-9_A-Z]+)\\)(([0-9_A-Za-z]*)\\$\\(([0-9_A-Z]+)\\))?,[ \t]*([0-9_A-Za-z]+)\\)" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
#message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_3} third: ${CMAKE_MATCH_4} fourth: ${CMAKE_MATCH_5}")
|
||||
if (DEFINED ${CMAKE_MATCH_1})
|
||||
if (DEFINED ${CMAKE_MATCH_4})
|
||||
set (STR ${${CMAKE_MATCH_1}}${CMAKE_MATCH_3}${${CMAKE_MATCH_4}})
|
||||
else ()
|
||||
set (STR ${${CMAKE_MATCH_1}})
|
||||
endif ()
|
||||
if (${STR} STREQUAL ${CMAKE_MATCH_5})
|
||||
#message (STATUS "condition is true")
|
||||
set (IfElse 1)
|
||||
continue ()
|
||||
endif ()
|
||||
endif ()
|
||||
set (IfElse 2)
|
||||
continue ()
|
||||
endif ()
|
||||
# Example 1 (Group 3): ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
# Example 2 (Group 4): ifneq ($(C_COMPILER), PGI)
|
||||
string(REGEX MATCH "ifneq \\(\\$\\(([0-9_A-Z]+)\\),[ \t]*(\\$\\(([0-9_A-Z]+)\\)|([0-9_A-Z]+))\\)" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
#message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_3} third: ${CMAKE_MATCH_4}")
|
||||
set (ElseSeen 0)
|
||||
set (HasValidGroup 0)
|
||||
if (DEFINED ${CMAKE_MATCH_3})
|
||||
set (HasValidGroup 1)
|
||||
set (STR ${${CMAKE_MATCH_3}})
|
||||
elseif (NOT ${CMAKE_MATCH_4} STREQUAL "")
|
||||
set (HasValidGroup 1)
|
||||
set (STR ${CMAKE_MATCH_4})
|
||||
endif ()
|
||||
if (DEFINED ${CMAKE_MATCH_1} AND ${HasValidGroup} EQUAL 1)
|
||||
if (NOT (${${CMAKE_MATCH_1}} STREQUAL ${STR}))
|
||||
#message (STATUS "condition is true")
|
||||
set (IfElse 1)
|
||||
continue ()
|
||||
endif ()
|
||||
endif ()
|
||||
set (IfElse 2)
|
||||
continue ()
|
||||
endif ()
|
||||
#message(STATUS "unmatched line ${line_match}")
|
||||
endforeach ()
|
||||
endmacro ()
|
||||
|
||||
|
|
|
|||
|
|
@ -1,13 +1,14 @@
|
|||
include ../Makefile.rule
|
||||
TOPDIR = ..
|
||||
include $(TOPDIR)/Makefile.system
|
||||
|
||||
all :: dgemv_tester dgemm_tester
|
||||
|
||||
dgemv_tester :
|
||||
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester
|
||||
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester
|
||||
./dgemv_tester
|
||||
|
||||
dgemm_tester : dgemv_tester
|
||||
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester
|
||||
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester
|
||||
./dgemm_tester
|
||||
|
||||
clean ::
|
||||
|
|
|
|||
3
cpuid.h
|
|
@ -120,6 +120,7 @@
|
|||
#define CORE_SKYLAKEX 28
|
||||
#define CORE_DHYANA 29
|
||||
#define CORE_COOPERLAKE 30
|
||||
#define CORE_SAPPHIRERAPIDS 31
|
||||
|
||||
#define HAVE_SSE (1 << 0)
|
||||
#define HAVE_SSE2 (1 << 1)
|
||||
|
|
@ -145,6 +146,7 @@
|
|||
#define HAVE_AVX512VL (1 << 21)
|
||||
#define HAVE_AVX2 (1 << 22)
|
||||
#define HAVE_AVX512BF16 (1 << 23)
|
||||
#define HAVE_AMXBF16 (1 << 24)
|
||||
|
||||
#define CACHE_INFO_L1_I 1
|
||||
#define CACHE_INFO_L1_D 2
|
||||
|
|
@ -222,6 +224,7 @@ typedef struct {
|
|||
#define CPUTYPE_SKYLAKEX 52
|
||||
#define CPUTYPE_DHYANA 53
|
||||
#define CPUTYPE_COOPERLAKE 54
|
||||
#define CPUTYPE_SAPPHIRERAPIDS 55
|
||||
|
||||
#define CPUTYPE_HYGON_UNKNOWN 99
|
||||
|
||||
|
|
|
|||
305
cpuid_arm64.c
|
|
@ -26,10 +26,12 @@
|
|||
*****************************************************************************/
|
||||
|
||||
#include <string.h>
|
||||
#ifdef OS_DARWIN
|
||||
#ifdef __APPLE__
|
||||
#include <sys/sysctl.h>
|
||||
int32_t value;
|
||||
size_t length=sizeof(value);
|
||||
int64_t value64;
|
||||
size_t length64=sizeof(value64);
|
||||
#endif
|
||||
|
||||
#define CPU_UNKNOWN 0
|
||||
|
|
@ -53,6 +55,8 @@ size_t length=sizeof(value);
|
|||
#define CPU_EMAG8180 10
|
||||
// Apple
|
||||
#define CPU_VORTEX 13
|
||||
// Fujitsu
|
||||
#define CPU_A64FX 15
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKNOWN",
|
||||
|
|
@ -69,7 +73,8 @@ static char *cpuname[] = {
|
|||
"NEOVERSEN1",
|
||||
"THUNDERX3T110",
|
||||
"VORTEX",
|
||||
"CORTEXA55"
|
||||
"CORTEXA55",
|
||||
"A64FX"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
|
|
@ -87,7 +92,8 @@ static char *cpuname_lower[] = {
|
|||
"neoversen1",
|
||||
"thunderx3t110",
|
||||
"vortex",
|
||||
"cortexa55"
|
||||
"cortexa55",
|
||||
"a64fx"
|
||||
};
|
||||
|
||||
int get_feature(char *search)
|
||||
|
|
@ -183,6 +189,9 @@ int detect(void)
|
|||
// Ampere
|
||||
else if (strstr(cpu_implementer, "0x50") && strstr(cpu_part, "0x000"))
|
||||
return CPU_EMAG8180;
|
||||
// Fujitsu
|
||||
else if (strstr(cpu_implementer, "0x46") && strstr(cpu_part, "0x001"))
|
||||
return CPU_A64FX;
|
||||
}
|
||||
|
||||
p = (char *) NULL ;
|
||||
|
|
@ -212,9 +221,9 @@ int detect(void)
|
|||
|
||||
}
|
||||
#else
|
||||
#ifdef DARWIN
|
||||
#ifdef __APPLE__
|
||||
sysctlbyname("hw.cpufamily",&value,&length,NULL,0);
|
||||
if (value ==131287967) return CPU_VORTEX;
|
||||
if (value ==131287967|| value == 458787763 ) return CPU_VORTEX;
|
||||
#endif
|
||||
return CPU_ARMV8;
|
||||
#endif
|
||||
|
|
@ -265,7 +274,7 @@ int n=0;
|
|||
|
||||
printf("#define NUM_CORES %d\n",n);
|
||||
#endif
|
||||
#ifdef DARWIN
|
||||
#ifdef __APPLE__
|
||||
sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0);
|
||||
printf("#define NUM_CORES %d\n",value);
|
||||
#endif
|
||||
|
|
@ -285,154 +294,166 @@ void get_cpuconfig(void)
|
|||
switch (d)
|
||||
{
|
||||
|
||||
case CPU_CORTEXA53:
|
||||
case CPU_CORTEXA55:
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
// Fall-through
|
||||
case CPU_ARMV8:
|
||||
// Minimum parameters for ARMv8 (based on A53)
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 262144\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
case CPU_CORTEXA53:
|
||||
case CPU_CORTEXA55:
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
// Fall-through
|
||||
case CPU_ARMV8:
|
||||
// Minimum parameters for ARMv8 (based on A53)
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 262144\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
break;
|
||||
|
||||
case CPU_CORTEXA57:
|
||||
case CPU_CORTEXA72:
|
||||
case CPU_CORTEXA73:
|
||||
case CPU_CORTEXA57:
|
||||
case CPU_CORTEXA72:
|
||||
case CPU_CORTEXA73:
|
||||
// Common minimum settings for these Arm cores
|
||||
// Can change a lot, but we need to be conservative
|
||||
// TODO: detect info from /sys if possible
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
printf("#define L1_CODE_SIZE 49152\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 3\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 2\n");
|
||||
printf("#define L2_SIZE 524288\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
case CPU_NEOVERSEN1:
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
printf("#define L1_CODE_SIZE 65536\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 4\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 4\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
printf("#define L1_CODE_SIZE 49152\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 3\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 2\n");
|
||||
printf("#define L2_SIZE 524288\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
case CPU_NEOVERSEN1:
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
printf("#define L1_CODE_SIZE 65536\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 4\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 4\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
|
||||
case CPU_FALKOR:
|
||||
printf("#define FALKOR\n");
|
||||
printf("#define L1_CODE_SIZE 65536\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 128\n");
|
||||
printf("#define L2_SIZE 524288\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
break;
|
||||
case CPU_FALKOR:
|
||||
printf("#define FALKOR\n");
|
||||
printf("#define L1_CODE_SIZE 65536\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 128\n");
|
||||
printf("#define L2_SIZE 524288\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
break;
|
||||
|
||||
case CPU_THUNDERX:
|
||||
printf("#define THUNDERX\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 128\n");
|
||||
printf("#define L2_SIZE 16777216\n");
|
||||
printf("#define L2_LINESIZE 128\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
break;
|
||||
case CPU_THUNDERX:
|
||||
printf("#define THUNDERX\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 128\n");
|
||||
printf("#define L2_SIZE 16777216\n");
|
||||
printf("#define L2_LINESIZE 128\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
break;
|
||||
|
||||
case CPU_THUNDERX2T99:
|
||||
printf("#define THUNDERX2T99 \n");
|
||||
printf("#define L1_CODE_SIZE 32768 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
|
||||
printf("#define L1_DATA_SIZE 32768 \n");
|
||||
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
|
||||
printf("#define L2_SIZE 262144 \n");
|
||||
printf("#define L2_LINESIZE 64 \n");
|
||||
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||
printf("#define L3_SIZE 33554432 \n");
|
||||
printf("#define L3_LINESIZE 64 \n");
|
||||
printf("#define L3_ASSOCIATIVE 32 \n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
case CPU_THUNDERX2T99:
|
||||
printf("#define THUNDERX2T99 \n");
|
||||
printf("#define L1_CODE_SIZE 32768 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
|
||||
printf("#define L1_DATA_SIZE 32768 \n");
|
||||
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
|
||||
printf("#define L2_SIZE 262144 \n");
|
||||
printf("#define L2_LINESIZE 64 \n");
|
||||
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||
printf("#define L3_SIZE 33554432 \n");
|
||||
printf("#define L3_LINESIZE 64 \n");
|
||||
printf("#define L3_ASSOCIATIVE 32 \n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
|
||||
case CPU_TSV110:
|
||||
printf("#define TSV110 \n");
|
||||
printf("#define L1_CODE_SIZE 65536 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 4 \n");
|
||||
printf("#define L1_DATA_SIZE 65536 \n");
|
||||
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 4 \n");
|
||||
printf("#define L2_SIZE 524228 \n");
|
||||
printf("#define L2_LINESIZE 64 \n");
|
||||
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
case CPU_TSV110:
|
||||
printf("#define TSV110 \n");
|
||||
printf("#define L1_CODE_SIZE 65536 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 4 \n");
|
||||
printf("#define L1_DATA_SIZE 65536 \n");
|
||||
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 4 \n");
|
||||
printf("#define L2_SIZE 524228 \n");
|
||||
printf("#define L2_LINESIZE 64 \n");
|
||||
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
|
||||
case CPU_EMAG8180:
|
||||
// Minimum parameters for ARMv8 (based on A53)
|
||||
printf("#define EMAG8180\n");
|
||||
printf("#define L1_CODE_SIZE 32768\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 262144\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
case CPU_EMAG8180:
|
||||
// Minimum parameters for ARMv8 (based on A53)
|
||||
printf("#define EMAG8180\n");
|
||||
printf("#define L1_CODE_SIZE 32768\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 262144\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
|
||||
case CPU_THUNDERX3T110:
|
||||
printf("#define THUNDERX3T110 \n");
|
||||
printf("#define L1_CODE_SIZE 65536 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
|
||||
printf("#define L1_DATA_SIZE 32768 \n");
|
||||
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
|
||||
printf("#define L2_SIZE 524288 \n");
|
||||
printf("#define L2_LINESIZE 64 \n");
|
||||
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||
printf("#define L3_SIZE 94371840 \n");
|
||||
printf("#define L3_LINESIZE 64 \n");
|
||||
printf("#define L3_ASSOCIATIVE 32 \n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
#ifdef DARWIN
|
||||
case CPU_VORTEX:
|
||||
printf("#define VORTEX \n");
|
||||
sysctlbyname("hw.l1icachesize",&value,&length,NULL,0);
|
||||
printf("#define L1_CODE_SIZE %d \n",value);
|
||||
sysctlbyname("hw.cachelinesize",&value,&length,NULL,0);
|
||||
printf("#define L1_CODE_LINESIZE %d \n",value);
|
||||
sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0);
|
||||
printf("#define L1_DATA_SIZE %d \n",value);
|
||||
sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0);
|
||||
printf("#define L2_SIZE %d \n",value);
|
||||
break;
|
||||
case CPU_THUNDERX3T110:
|
||||
printf("#define THUNDERX3T110 \n");
|
||||
printf("#define L1_CODE_SIZE 65536 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
|
||||
printf("#define L1_DATA_SIZE 32768 \n");
|
||||
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
|
||||
printf("#define L2_SIZE 524288 \n");
|
||||
printf("#define L2_LINESIZE 64 \n");
|
||||
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||
printf("#define L3_SIZE 94371840 \n");
|
||||
printf("#define L3_LINESIZE 64 \n");
|
||||
printf("#define L3_ASSOCIATIVE 32 \n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
#ifdef __APPLE__
|
||||
case CPU_VORTEX:
|
||||
printf("#define VORTEX \n");
|
||||
sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0);
|
||||
printf("#define L1_CODE_SIZE %lld \n",value64);
|
||||
sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0);
|
||||
printf("#define L1_CODE_LINESIZE %lld \n",value64);
|
||||
sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0);
|
||||
printf("#define L1_DATA_SIZE %lld \n",value64);
|
||||
sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0);
|
||||
printf("#define L2_SIZE %lld \n",value64);
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
#endif
|
||||
case CPU_A64FX:
|
||||
printf("#define A64FX\n");
|
||||
printf("#define L1_CODE_SIZE 65535\n");
|
||||
printf("#define L1_DATA_SIZE 65535\n");
|
||||
printf("#define L1_DATA_LINESIZE 256\n");
|
||||
printf("#define L2_SIZE 8388608\n");
|
||||
printf("#define L2_LINESIZE 256\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
}
|
||||
get_cpucount();
|
||||
}
|
||||
|
|
|
|||
36
cpuid_mips.c
|
|
@ -165,6 +165,7 @@ void get_cpuconfig(void){
|
|||
}else{
|
||||
printf("#define UNKNOWN\n");
|
||||
}
|
||||
if (!get_feature(msa)) printf("#define NO_MSA\n");
|
||||
}
|
||||
|
||||
void get_libname(void){
|
||||
|
|
@ -178,3 +179,38 @@ void get_libname(void){
|
|||
printf("mips\n");
|
||||
}
|
||||
}
|
||||
|
||||
int get_feature(char *search)
|
||||
{
|
||||
|
||||
#ifdef __linux
|
||||
FILE *infile;
|
||||
char buffer[2048], *p,*t;
|
||||
p = (char *) NULL ;
|
||||
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
|
||||
while (fgets(buffer, sizeof(buffer), infile))
|
||||
{
|
||||
|
||||
if (!strncmp("Features", buffer, 8))
|
||||
{
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
|
||||
if( p == NULL ) return 0;
|
||||
|
||||
t = strtok(p," ");
|
||||
while( t = strtok(NULL," "))
|
||||
{
|
||||
if (!strcmp(t, search)) { return(1); }
|
||||
}
|
||||
|
||||
#endif
|
||||
return(0);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -104,17 +104,17 @@ int detect(void){
|
|||
}
|
||||
}
|
||||
fclose(infile);
|
||||
if(p != NULL){
|
||||
if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){
|
||||
return CPU_LOONGSON3R3;
|
||||
}else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){
|
||||
return CPU_LOONGSON3R4;
|
||||
} else{
|
||||
return CPU_SICORTEX;
|
||||
if (p != NULL){
|
||||
if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){
|
||||
return CPU_LOONGSON3R3;
|
||||
} else if (strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){
|
||||
return CPU_LOONGSON3R4;
|
||||
} else{
|
||||
return CPU_SICORTEX;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return CPU_UNKNOWN;
|
||||
}
|
||||
}
|
||||
|
||||
char *get_corename(void){
|
||||
|
|
@ -201,6 +201,7 @@ void get_cpuconfig(void){
|
|||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
}
|
||||
if (!get_feature(msa)) printf("#define NO_MSA\n");
|
||||
}
|
||||
|
||||
void get_libname(void){
|
||||
|
|
@ -218,3 +219,38 @@ void get_libname(void){
|
|||
printf("mips64\n");
|
||||
}
|
||||
}
|
||||
|
||||
int get_feature(char *search)
|
||||
{
|
||||
|
||||
#ifdef __linux
|
||||
FILE *infile;
|
||||
char buffer[2048], *p,*t;
|
||||
p = (char *) NULL ;
|
||||
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
|
||||
while (fgets(buffer, sizeof(buffer), infile))
|
||||
{
|
||||
|
||||
if (!strncmp("Features", buffer, 8))
|
||||
{
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
|
||||
if( p == NULL ) return 0;
|
||||
|
||||
t = strtok(p," ");
|
||||
while( t = strtok(NULL," "))
|
||||
{
|
||||
if (!strcmp(t, search)) { return(1); }
|
||||
}
|
||||
|
||||
#endif
|
||||
return(0);
|
||||
}
|
||||
|
||||
|
|
|
|||
217
cpuid_x86.c
|
|
@ -1,3 +1,4 @@
|
|||
//{
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
|
|
@ -266,6 +267,31 @@ int support_avx512_bf16(){
|
|||
#endif
|
||||
}
|
||||
|
||||
#define BIT_AMX_TILE 0x01000000
|
||||
#define BIT_AMX_BF16 0x00400000
|
||||
#define BIT_AMX_ENBD 0x00060000
|
||||
|
||||
int support_amx_bf16() {
|
||||
#if !defined(NO_AVX) && !defined(NO_AVX512)
|
||||
int eax, ebx, ecx, edx;
|
||||
int ret=0;
|
||||
|
||||
if (!support_avx512())
|
||||
return 0;
|
||||
// CPUID.7.0:EDX indicates AMX support
|
||||
cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
|
||||
if ((edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) {
|
||||
// CPUID.D.0:EAX[17:18] indicates AMX enabled
|
||||
cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
|
||||
if ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD)
|
||||
ret = 1;
|
||||
}
|
||||
return ret;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
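For reference, the new AMX probe can be reproduced outside the library with the compiler's CPUID helpers. The sketch below is illustration only: it assumes GCC/Clang's <cpuid.h> (__get_cpuid_count) instead of OpenBLAS' internal cpuid_count(), reuses the bit masks defined in this patch, and omits the support_avx512() precondition that the patched function checks first.

    /* Illustration only: same leaf-7 / leaf-0xD checks as support_amx_bf16(). */
    #include <stdio.h>
    #include <cpuid.h>

    #define BIT_AMX_TILE 0x01000000
    #define BIT_AMX_BF16 0x00400000
    #define BIT_AMX_ENBD 0x00060000

    int main(void) {
        unsigned int eax, ebx, ecx, edx;
        int usable = 0;
        /* CPUID.(7,0):EDX advertises AMX-TILE and AMX-BF16 */
        if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) &&
            (edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) {
            /* CPUID.(0xD,0):EAX bits 17:18 cover the tile state components */
            if (__get_cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx) &&
                (eax & BIT_AMX_ENBD) == BIT_AMX_ENBD)
                usable = 1;
        }
        printf("AMX-BF16 detected: %d\n", usable);
        return 0;
    }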
|
||||
|
||||
int get_vendor(void){
|
||||
int eax, ebx, ecx, edx;
|
||||
char vendor[13];
|
||||
|
|
@ -353,6 +379,7 @@ int get_cputype(int gettype){
|
|||
if (support_avx2()) feature |= HAVE_AVX2;
|
||||
if (support_avx512()) feature |= HAVE_AVX512VL;
|
||||
if (support_avx512_bf16()) feature |= HAVE_AVX512BF16;
|
||||
if (support_amx_bf16()) feature |= HAVE_AMXBF16;
|
||||
if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3;
|
||||
#endif
|
||||
|
||||
|
|
@ -1429,10 +1456,10 @@ int get_cpuname(void){
|
|||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 9:
|
||||
case 8:
|
||||
switch (model) {
|
||||
case 12: // Tiger Lake
|
||||
case 13: // Tiger Lake (11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz)
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
|
|
@ -1448,30 +1475,70 @@ int get_cpuname(void){
|
|||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
case 10: //family 6 exmodel 10
|
||||
switch (model) {
|
||||
case 5: // Comet Lake H and S
|
||||
case 6: // Comet Lake U
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 7: // Rocket Lake
|
||||
if(support_avx512())
|
||||
case 15: // Sapphire Rapids
|
||||
if(support_avx512_bf16())
|
||||
return CPUTYPE_COOPERLAKE;
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 9:
|
||||
switch (model) {
|
||||
case 7: // Alder Lake desktop
|
||||
case 10: // Alder Lake mobile
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 13: // Ice Lake NNPI
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 14: // Kaby Lake and refreshes
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 10: //family 6 exmodel 10
|
||||
switch (model) {
|
||||
case 5: // Comet Lake H and S
|
||||
case 6: // Comet Lake U
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 7: // Rocket Lake
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case 0x7:
|
||||
return CPUTYPE_ITANIUM;
|
||||
case 0xf:
|
||||
|
|
@ -2042,32 +2109,7 @@ int get_coretype(void){
|
|||
return CORE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 10:
|
||||
switch (model) {
|
||||
case 5: // Comet Lake H and S
|
||||
case 6: // Comet Lake U
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
case 7:// Rocket Lake
|
||||
#ifndef NO_AVX512
|
||||
if(support_avx512())
|
||||
return CORE_SKYLAKEX;
|
||||
#endif
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CORE_HASWELL;
|
||||
#endif
|
||||
if(support_avx())
|
||||
return CORE_SANDYBRIDGE;
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
|
||||
case 5:
|
||||
switch (model) {
|
||||
case 6:
|
||||
|
|
@ -2121,6 +2163,7 @@ int get_coretype(void){
|
|||
return CORE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
|
||||
case 6:
|
||||
if (model == 6)
|
||||
#ifndef NO_AVX512
|
||||
|
|
@ -2135,7 +2178,7 @@ int get_coretype(void){
|
|||
else
|
||||
return CORE_NEHALEM;
|
||||
#endif
|
||||
if (model == 10)
|
||||
if (model == 10 || model == 12)
|
||||
#ifndef NO_AVX512
|
||||
if(support_avx512_bf16())
|
||||
return CORE_COOPERLAKE;
|
||||
|
|
@ -2151,10 +2194,11 @@ int get_coretype(void){
|
|||
return CORE_NEHALEM;
|
||||
#endif
|
||||
break;
|
||||
|
||||
case 7:
|
||||
if (model == 10)
|
||||
return CORE_NEHALEM;
|
||||
if (model == 14)
|
||||
if (model == 13 || model == 14) // Ice Lake
|
||||
#ifndef NO_AVX512
|
||||
return CORE_SKYLAKEX;
|
||||
#else
|
||||
|
|
@ -2168,9 +2212,9 @@ int get_coretype(void){
|
|||
return CORE_NEHALEM;
|
||||
#endif
|
||||
break;
|
||||
case 9:
|
||||
|
||||
case 8:
|
||||
if (model == 12) { // Tiger Lake
|
||||
if (model == 12 || model == 13) { // Tiger Lake
|
||||
if(support_avx512())
|
||||
return CORE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
|
|
@ -2180,7 +2224,7 @@ int get_coretype(void){
|
|||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
if (model == 14) { // Kaby Lake
|
||||
if (model == 14) { // Kaby Lake mobile
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
|
|
@ -2190,12 +2234,82 @@ int get_coretype(void){
|
|||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
}
|
||||
if (model == 15) { // Sapphire Rapids
|
||||
if(support_avx512_bf16())
|
||||
return CPUTYPE_COOPERLAKE;
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
|
||||
case 9:
|
||||
if (model == 7 || model == 10) { // Alder Lake
|
||||
if(support_avx2())
|
||||
return CORE_HASWELL;
|
||||
if(support_avx())
|
||||
return CORE_SANDYBRIDGE;
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
if (model == 13) { // Ice Lake NNPI
|
||||
if(support_avx512())
|
||||
return CORE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CORE_HASWELL;
|
||||
if(support_avx())
|
||||
return CORE_SANDYBRIDGE;
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
if (model == 14) { // Kaby Lake desktop
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
|
||||
case 10:
|
||||
switch (model) {
|
||||
case 5: // Comet Lake H and S
|
||||
case 6: // Comet Lake U
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
case 7:// Rocket Lake
|
||||
#ifndef NO_AVX512
|
||||
if(support_avx512())
|
||||
return CORE_SKYLAKEX;
|
||||
#endif
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CORE_HASWELL;
|
||||
#endif
|
||||
if(support_avx())
|
||||
return CORE_SANDYBRIDGE;
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
|
||||
case 15:
|
||||
if (model <= 0x2) return CORE_NORTHWOOD;
|
||||
else return CORE_PRESCOTT;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -2389,6 +2503,7 @@ void get_cpuconfig(void){
|
|||
if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n");
|
||||
if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n");
|
||||
if (features & HAVE_AVX512BF16 ) printf("#define HAVE_AVX512BF16\n");
|
||||
if (features & HAVE_AMXBF16 ) printf("#define HAVE_AMXBF16\n");
|
||||
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
|
||||
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
|
||||
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
|
||||
|
|
@ -2460,9 +2575,11 @@ void get_sse(void){
|
|||
if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n");
|
||||
if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n");
|
||||
if (features & HAVE_AVX512BF16 ) printf("HAVE_AVX512BF16=1\n");
|
||||
if (features & HAVE_AMXBF16 ) printf("HAVE_AMXBF16=1\n");
|
||||
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
|
||||
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
|
||||
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
|
||||
if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n");
|
||||
|
||||
}
|
||||
//}
|
||||
|
|
@ -27,57 +27,11 @@
|
|||
|
||||
#include <string.h>
|
||||
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_Z13 1
|
||||
#define CPU_Z14 2
|
||||
#define CPU_Z15 3
|
||||
#include "cpuid_zarch.h"
|
||||
|
||||
static char *cpuname[] = {
|
||||
"ZARCH_GENERIC",
|
||||
"Z13",
|
||||
"Z14",
|
||||
"Z15"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"zarch_generic",
|
||||
"z13",
|
||||
"z14",
|
||||
"z15"
|
||||
};
|
||||
|
||||
int detect(void)
|
||||
{
|
||||
FILE *infile;
|
||||
char buffer[512], *p;
|
||||
|
||||
p = (char *)NULL;
|
||||
infile = fopen("/proc/sysinfo", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("Type", buffer, 4)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
#if 0
|
||||
fprintf(stderr, "%s\n", p);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
|
||||
if (strstr(p, "2964")) return CPU_Z13;
|
||||
if (strstr(p, "2965")) return CPU_Z13;
|
||||
if (strstr(p, "3906")) return CPU_Z14;
|
||||
if (strstr(p, "3907")) return CPU_Z14;
|
||||
if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14
|
||||
if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14
|
||||
|
||||
return CPU_GENERIC;
|
||||
}
|
||||
|
||||
void get_libname(void)
|
||||
{
|
||||
|
||||
int d = detect();
|
||||
printf("%s", cpuname_lower[d]);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,101 @@
|
|||
#include <stdlib.h>
|
||||
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_Z13 1
|
||||
#define CPU_Z14 2
|
||||
#define CPU_Z15 3
|
||||
|
||||
static char *cpuname[] = {
|
||||
"ZARCH_GENERIC",
|
||||
"Z13",
|
||||
"Z14",
|
||||
"Z15"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"zarch_generic",
|
||||
"z13",
|
||||
"z14",
|
||||
"z15"
|
||||
};
|
||||
|
||||
// Guard the use of getauxval() on glibc version >= 2.16
|
||||
#ifdef __GLIBC__
|
||||
#include <features.h>
|
||||
#if __GLIBC_PREREQ(2, 16)
|
||||
#include <sys/auxv.h>
|
||||
#define HAVE_GETAUXVAL 1
|
||||
|
||||
static unsigned long get_hwcap(void)
|
||||
{
|
||||
unsigned long hwcap = getauxval(AT_HWCAP);
|
||||
char *maskenv;
|
||||
|
||||
// honor requests for not using specific CPU features in LD_HWCAP_MASK
|
||||
maskenv = getenv("LD_HWCAP_MASK");
|
||||
if (maskenv)
|
||||
hwcap &= strtoul(maskenv, NULL, 0);
|
||||
|
||||
return hwcap;
|
||||
// note that a missing auxval is interpreted as no capabilities
|
||||
// available, which is safe.
|
||||
}
|
||||
|
||||
#else // __GLIBC_PREREQ(2, 16)
|
||||
#warn "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16"
|
||||
|
||||
static unsigned long get_hwcap(void) {
|
||||
// treat missing support for getauxval() as no capabilities available,
|
||||
// which is safe.
|
||||
return 0;
|
||||
}
|
||||
#endif // __GLIBC_PREREQ(2, 16)
|
||||
#endif // __GLIBC
|
||||
|
||||
static int detect(void)
{
  unsigned long hwcap = get_hwcap();

  // Choose the architecture level for optimized kernels based on hardware
  // capability bits (just like glibc chooses optimized implementations).
  //
  // The hardware capability bits that are used here indicate both
  // hardware support for a particular ISA extension and the presence of
  // software support to enable its use. For example, when HWCAP_S390_VX
  // is set then both the CPU can execute SIMD instructions and the Linux
  // kernel can manage applications using the vector registers and SIMD
  // instructions.
  //
  // See glibc's sysdeps/s390/dl-procinfo.h for an overview (also in
  // sysdeps/unix/sysv/linux/s390/bits/hwcap.h) of the defined hardware
  // capability bits. They are derived from the information that the
  // "store facility list (extended)" instructions provide.
  // (https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/s390/dl-procinfo.h;hb=HEAD)
  //
  // currently used:
  // HWCAP_S390_VX  - vector facility for z/Architecture (introduced with
  //                  IBM z13), enables level CPU_Z13 (SIMD)
  // HWCAP_S390_VXE - vector enhancements facility 1 (introduced with IBM
  //                  z14), together with VX enables level CPU_Z14
  //                  (single-precision SIMD instructions)
  //
  // When you add optimized kernels that make use of other ISA extensions
  // (e.g., for exploiting the vector-enhancements facility 2 that was introduced
  // with IBM z15), then add a new architecture level (e.g., CPU_Z15) and gate
  // it on the hwcap that represents it here (e.g., HWCAP_S390_VXRS_EXT2
  // for the z15 vector enhancements).
  //
  // To learn the value of hwcaps on a given system, set the environment
  // variable LD_SHOW_AUXV and let ld.so dump it (e.g., by running
  // LD_SHOW_AUXV=1 /bin/true).
  // Also, the init function for dynamic arch support will print hwcaps
  // when OPENBLAS_VERBOSE is set to 2 or higher.
  if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE))
    return CPU_Z14;

  if (hwcap & HWCAP_S390_VX)
    return CPU_Z13;

  return CPU_GENERIC;
}

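For reference, the following standalone probe mirrors the hwcap mechanism used by detect() above. It is an illustrative sketch only, not part of this patch; it assumes an s390x Linux host with glibc >= 2.16, and the fallback HWCAP_S390_* values are stated here only for the case where the toolchain headers do not already define them.

#include <stdio.h>
#include <sys/auxv.h>

/* fallback bit values, assumed for illustration only */
#ifndef HWCAP_S390_VX
#define HWCAP_S390_VX  2048    /* vector facility (z13) */
#endif
#ifndef HWCAP_S390_VXE
#define HWCAP_S390_VXE 8192    /* vector-enhancements facility 1 (z14) */
#endif

int main(void) {
  unsigned long hwcap = getauxval(AT_HWCAP);
  printf("AT_HWCAP = 0x%lx\n", hwcap);
  printf("VX  (z13 SIMD):       %s\n", (hwcap & HWCAP_S390_VX)  ? "yes" : "no");
  printf("VXE (z14 float SIMD): %s\n", (hwcap & HWCAP_S390_VXE) ? "yes" : "no");
  return 0;
}

The same information can be obtained without any code by running LD_SHOW_AUXV=1 /bin/true, as the comment above notes.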
@@ -333,7 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#else
    for(jjs = js; jjs < js + min_j; jjs += min_jj){
      min_jj = min_j + js - jjs;
#if defined(SKYLAKEX) || defined(COOPERLAKE)
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
      /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */
      if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else

@@ -367,7 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
    /* Split local region of B into parts */
    for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){
      min_jj = MIN(n_to, js + div_n) - jjs;
#if defined(SKYLAKEX) || defined(COOPERLAKE)
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
      /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
      if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else

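To make the clamping above concrete, here is a small sketch (not from the patch) of how the jjs loop partitions a panel of n columns when the chunk width is capped at 6*GEMM_UNROLL_N. GEMM_UNROLL_N = 8 is just an assumed example value, and the further rounding done in the #else branches is omitted.

#include <stdio.h>

#define GEMM_UNROLL_N 8          /* assumed example unroll factor */

int main(void) {
  int n = 200, js = 0, min_j = n, jjs, min_jj;
  for (jjs = js; jjs < js + min_j; jjs += min_jj) {
    min_jj = min_j + js - jjs;                                /* columns left */
    if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;  /* cap the chunk */
    printf("pack/compute columns [%d, %d)\n", jjs, jjs + min_jj);
  }
  return 0;
}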
|||
|
|
@ -138,7 +138,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
@ -215,7 +215,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
@ -320,7 +320,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
@ -399,7 +399,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
|
|||
|
|
@ -122,7 +122,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = 0; jjs < ls - js; jjs += min_jj){
|
||||
min_jj = ls - js - jjs;
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
@ -146,7 +146,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = 0; jjs < min_l; jjs += min_jj){
|
||||
min_jj = min_l - jjs;
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
@ -203,7 +203,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
@ -258,7 +258,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = 0; jjs < min_l; jjs += min_jj){
|
||||
min_jj = min_l - jjs;
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
@ -283,7 +283,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){
|
||||
min_jj = js - ls - min_l - jjs;
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
@ -344,7 +344,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
|
|||
|
|
@@ -49,6 +49,8 @@ GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 ""
if (DYNAMIC_ARCH)
  if (ARM64)
    list(APPEND COMMON_SOURCES dynamic_arm64.c)
  elseif (POWER)
    list(APPEND COMMON_SOURCES dynamic_power.c)
  else ()
    list(APPEND COMMON_SOURCES dynamic.c)
  endif ()

@@ -40,7 +40,7 @@
#include <stdlib.h>
#include "common.h"

#if defined(OS_CYGWIN_NT) && !defined(unlikely)
#if !defined(unlikely)
#ifdef __GNUC__
#define unlikely(x) __builtin_expect(!!(x), 0)
#else

@@ -391,8 +391,9 @@ int blas_thread_init(void){

int exec_blas_async(BLASLONG pos, blas_queue_t *queue){

#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
#if defined(SMP_SERVER)
  // Handle lazy re-init of the thread-pool after a POSIX fork
  // on Cygwin or as delayed init when a static library is used
  if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif

@@ -624,7 +624,7 @@ static gotoblas_t *get_coretype(void){
          return &gotoblas_NEHALEM;
        }
      }
      if (model == 10) {
      if (model == 10 || model == 12){
        // Ice Lake SP
        if(support_avx512_bf16())
          return &gotoblas_COOPERLAKE;

@@ -639,12 +639,12 @@ static gotoblas_t *get_coretype(void){
          openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
          return &gotoblas_NEHALEM;
        }
      }
      }
      return NULL;
    case 7:
      if (model == 10) // Goldmont Plus
        return &gotoblas_NEHALEM;
      if (model == 14) {
      if (model == 13 || model == 14) {
        // Ice Lake
        if (support_avx512())
          return &gotoblas_SKYLAKEX;

@@ -661,9 +661,8 @@ static gotoblas_t *get_coretype(void){
        }
      }
      return NULL;
    case 9:
    case 8:
      if (model == 12) { // Tiger Lake
      if (model == 12 || model == 13) { // Tiger Lake
        if (support_avx512())
          return &gotoblas_SKYLAKEX;
        if(support_avx2()){

@@ -689,6 +688,50 @@ static gotoblas_t *get_coretype(void){
          return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
        }
      }
      if (model == 15){ // Sapphire Rapids
        if(support_avx512_bf16())
          return &gotoblas_COOPERLAKE;
        if (support_avx512())
          return &gotoblas_SKYLAKEX;
        if(support_avx2())
          return &gotoblas_HASWELL;
        if(support_avx()) {
          openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
          return &gotoblas_SANDYBRIDGE;
        } else {
          openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
          return &gotoblas_NEHALEM;
        }
      }
      return NULL;

    case 9:
      if (model == 7 || model == 10) { // Alder Lake
        if(support_avx2()){
          openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
          return &gotoblas_HASWELL;
        }
        if(support_avx()) {
          openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
          return &gotoblas_SANDYBRIDGE;
        } else {
          openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
          return &gotoblas_NEHALEM;
        }
      }
      if (model == 14 ) { // Kaby Lake, Coffee Lake
        if(support_avx2())
          return &gotoblas_HASWELL;
        if(support_avx()) {
          openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
          return &gotoblas_SANDYBRIDGE;
        } else {
          openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
          return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
        }
      }
      return NULL;
    case 10:
      if (model == 5 || model == 6) {
        if(support_avx2())

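The case labels above appear to correspond to the CPUID extended-model field, with model holding the low four model bits; the sketch below is illustrative only (it is not part of the patch and not the helper OpenBLAS itself uses) and shows how those fields are decoded from CPUID leaf 1 with GCC's <cpuid.h>.

#include <stdio.h>
#include <cpuid.h>

int main(void) {
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return 1;  /* CPUID leaf 1 */
  unsigned int model     = (eax >> 4)  & 0xf;  /* low model bits */
  unsigned int family    = (eax >> 8)  & 0xf;  /* 6 for the Intel cores handled above */
  unsigned int ext_model = (eax >> 16) & 0xf;  /* extended model bits */
  printf("family=%u extended_model=%u model=%u\n", family, ext_model, model);
  return 0;
}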
@@ -1018,7 +1061,13 @@ void gotoblas_dynamic_init(void) {
#ifdef ARCH_X86
  if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI;
#else
  if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT;
  if (gotoblas == NULL) {
    if (support_avx512_bf16()) gotoblas = &gotoblas_COOPERLAKE;
    else if (support_avx512()) gotoblas = &gotoblas_SKYLAKEX;
    else if (support_avx2()) gotoblas = &gotoblas_HASWELL;
    else if (support_avx()) gotoblas = &gotoblas_SANDYBRIDGE;
    else gotoblas = &gotoblas_PRESCOTT;
  }
  /* sanity check, if 64bit pointer we can't have a 32 bit cpu */
  if (sizeof(void*) == 8) {
    if (gotoblas == &gotoblas_KATMAI ||

@@ -1,38 +1,7 @@
#include "common.h"
#include "cpuid_zarch.h"
#include <stdbool.h>

// Guard the use of getauxval() on glibc version >= 2.16
#ifdef __GLIBC__
#include <features.h>
#if __GLIBC_PREREQ(2, 16)
#include <sys/auxv.h>
#define HAVE_GETAUXVAL 1

static unsigned long get_hwcap(void)
{
  unsigned long hwcap = getauxval(AT_HWCAP);
  char *maskenv;

  // honor requests for not using specific CPU features in LD_HWCAP_MASK
  maskenv = getenv("LD_HWCAP_MASK");
  if (maskenv)
    hwcap &= strtoul(maskenv, NULL, 0);

  return hwcap;
  // note that a missing auxval is interpreted as no capabilities
  // available, which is safe.
}

#else // __GLIBC_PREREQ(2, 16)
#warn "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16"

static unsigned long get_hwcap(void) {
  // treat missing support for getauxval() as no capabilities available,
  // which is safe.
  return 0;
}
#endif // __GLIBC_PREREQ(2, 16)
#endif // __GLIBC

extern gotoblas_t gotoblas_ZARCH_GENERIC;
#ifdef DYN_Z13

@@ -44,25 +13,19 @@ extern gotoblas_t gotoblas_Z14;

#define NUM_CORETYPES 4

extern int openblas_verbose();
extern void openblas_warning(int verbose, const char* msg);

static char* corename[] = {
  "unknown",
  "Z13",
  "Z14",
  "ZARCH_GENERIC",
};

char* gotoblas_corename(void) {
#ifdef DYN_Z13
  if (gotoblas == &gotoblas_Z13) return corename[1];
  if (gotoblas == &gotoblas_Z13) return cpuname[CPU_Z13];
#endif
#ifdef DYN_Z14
  if (gotoblas == &gotoblas_Z14) return corename[2];
  if (gotoblas == &gotoblas_Z14) return cpuname[CPU_Z14];
#endif
  if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3];
  if (gotoblas == &gotoblas_ZARCH_GENERIC) return cpuname[CPU_GENERIC];

  return corename[0];
  return "unknown";
}

#ifndef HWCAP_S390_VXE

@@ -79,25 +42,28 @@ char* gotoblas_corename(void) {
 */
static gotoblas_t* get_coretype(void) {

  unsigned long hwcap __attribute__((unused)) = get_hwcap();
  int cpu = detect();

#ifdef DYN_Z14
  switch(cpu) {
  // z14 and z15 systems: exploit Vector Facility (SIMD) and
  // Vector-Enhancements Facility 1 (float SIMD instructions), if present.
  if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE))
  case CPU_Z14:
#ifdef DYN_Z14
    return &gotoblas_Z14;
#endif

#ifdef DYN_Z13
  // z13: Vector Facility (SIMD for double)
  if (hwcap & HWCAP_S390_VX)
  case CPU_Z13:
#ifdef DYN_Z13
    return &gotoblas_Z13;
#endif

  default:
    // fallback in case of missing compiler support, systems before z13, or
    // when the OS does not advertise support for the Vector Facility (e.g.,
    // missing support in the OS kernel)
  return &gotoblas_ZARCH_GENERIC;
    return &gotoblas_ZARCH_GENERIC;
  }
}

static gotoblas_t* force_coretype(char* coretype) {

@@ -108,28 +74,28 @@ static gotoblas_t* force_coretype(char* coretype) {

  for (i = 0; i < NUM_CORETYPES; i++)
  {
    if (!strncasecmp(coretype, corename[i], 20))
    if (!strncasecmp(coretype, cpuname[i], 20))
    {
      found = i;
      break;
    }
  }

  if (found == 1) {
  if (found == CPU_Z13) {
#ifdef DYN_Z13
    return &gotoblas_Z13;
#else
    openblas_warning(1, "Z13 support not compiled in");
    return NULL;
#endif
  } else if (found == 2) {
  } else if (found == CPU_Z14) {
#ifdef DYN_Z14
    return &gotoblas_Z14;
#else
    openblas_warning(1, "Z14 support not compiled in");
    return NULL;
#endif
  } else if (found == 3) {
  } else if (found == CPU_GENERIC) {
    return &gotoblas_ZARCH_GENERIC;
  }

@@ -155,6 +121,11 @@ void gotoblas_dynamic_init(void) {
  else
  {
    gotoblas = get_coretype();
    if (openblas_verbose() >= 2) {
      snprintf(coremsg, sizeof(coremsg), "Choosing kernels based on getauxval(AT_HWCAP)=0x%lx\n",
               getauxval(AT_HWCAP));
      openblas_warning(2, coremsg);
    }
  }

  if (gotoblas == NULL)

@@ -165,9 +136,11 @@ void gotoblas_dynamic_init(void) {
  }

  if (gotoblas && gotoblas->init) {
    strncpy(coren, gotoblas_corename(), 20);
    sprintf(coremsg, "Core: %s\n", coren);
    openblas_warning(2, coremsg);
    if (openblas_verbose() >= 2) {
      strncpy(coren, gotoblas_corename(), 20);
      sprintf(coremsg, "Core: %s\n", coren);
      openblas_warning(2, coremsg);
    }
    gotoblas->init();
  }
  else {

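With this change the "Core:" message (and, on zarch, the hwcap line) is only printed when OPENBLAS_VERBOSE is 2 or higher. A quick way to check which kernel set a DYNAMIC_ARCH build selected is the public openblas_get_corename() entry point; the sketch below is illustrative only and assumes the declaration from cblas.h.

#include <stdio.h>

extern char *openblas_get_corename(void);   /* declared in cblas.h */

int main(void) {
  printf("OpenBLAS selected core: %s\n", openblas_get_corename());
  return 0;
}

Running it with OPENBLAS_VERBOSE=2 set in the environment should additionally show the messages emitted by the init path above.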
@@ -246,6 +246,14 @@ int get_num_procs(void) {
#endif

  if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);

#if defined(USE_OPENMP)
#if _OPENMP >= 201511
  nums = omp_get_num_places();
#endif
  return nums;
#endif

#if !defined(OS_LINUX)
  return nums;
#endif

@@ -1806,10 +1814,19 @@ int get_num_procs(void) {
#endif

  if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);

#if defined(USE_OPENMP)
/* if (omp_get_proc_bind() != omp_proc_bind_false) */
#if _OPENMP >= 201511
  nums = omp_get_num_places();
#endif
  return nums;
#endif

#if !defined(OS_LINUX)
  return nums;
#endif


#if !defined(__GLIBC_PREREQ)
  return nums;
#else

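The hunks above make get_num_procs() honor the OpenMP place partition when built with an OpenMP 4.5 compiler (_OPENMP >= 201511), so settings such as OMP_PLACES now bound the reported processor count. A minimal sketch of the API, assuming only a standard OpenMP 4.5 toolchain (file name and values are hypothetical):

#include <stdio.h>
#include <omp.h>

int main(void) {
#if defined(_OPENMP) && _OPENMP >= 201511
  /* e.g. OMP_PLACES=cores ./a.out reports one place per core */
  printf("OpenMP places in the partition: %d\n", omp_get_num_places());
#else
  printf("OpenMP 4.5 places API not available with this compiler\n");
#endif
  return 0;
}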
|
@ -2854,32 +2871,28 @@ void *blas_memory_alloc(int procpos){
|
|||
position ++;
|
||||
|
||||
} while (position < NUM_BUFFERS);
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
|
||||
if (memory_overflowed) {
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
do {
|
||||
RMB;
|
||||
|
||||
do {
|
||||
RMB;
|
||||
#if defined(USE_OPENMP)
|
||||
if (!newmemory[position-NUM_BUFFERS].used) {
|
||||
blas_lock(&newmemory[position-NUM_BUFFERS].lock);
|
||||
if (!newmemory[position-NUM_BUFFERS].used) {
|
||||
blas_lock(&newmemory[position-NUM_BUFFERS].lock);
|
||||
#endif
|
||||
if (!newmemory[position-NUM_BUFFERS].used) goto allocation2;
|
||||
if (!newmemory[position-NUM_BUFFERS].used) goto allocation2;
|
||||
|
||||
#if defined(USE_OPENMP)
|
||||
blas_unlock(&newmemory[position-NUM_BUFFERS].lock);
|
||||
}
|
||||
blas_unlock(&newmemory[position-NUM_BUFFERS].lock);
|
||||
}
|
||||
#endif
|
||||
position ++;
|
||||
position ++;
|
||||
|
||||
} while (position < 512+NUM_BUFFERS);
|
||||
} while (position < 512+NUM_BUFFERS);
|
||||
}
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
}
|
||||
goto error;
|
||||
|
||||
allocation :
|
||||
|
|
@ -2904,7 +2917,7 @@ void *blas_memory_alloc(int procpos){
|
|||
|
||||
func = &memoryalloc[0];
|
||||
|
||||
while ((func != NULL) && (map_address == (void *) -1)) {
|
||||
while ((*func != NULL) && (map_address == (void *) -1)) {
|
||||
|
||||
map_address = (*func)((void *)base_address);
|
||||
|
||||
|
|
@ -2984,6 +2997,9 @@ void *blas_memory_alloc(int procpos){
|
|||
return (void *)memory[position].addr;
|
||||
|
||||
error:
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
if (memory_overflowed) goto terminate;
|
||||
fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n");
|
||||
memory_overflowed=1;
|
||||
|
|
@ -2997,7 +3013,6 @@ void *blas_memory_alloc(int procpos){
|
|||
newmemory[i].used = 0;
|
||||
newmemory[i].lock = 0;
|
||||
}
|
||||
newmemory[position-NUM_BUFFERS].used = 1;
|
||||
|
||||
allocation2:
|
||||
newmemory[position-NUM_BUFFERS].used = 1;
|
||||
|
|
@ -3015,7 +3030,7 @@ allocation2:
|
|||
|
||||
func = &memoryalloc[0];
|
||||
|
||||
while ((func != NULL) && (map_address == (void *) -1)) {
|
||||
while ((*func != NULL) && (map_address == (void *) -1)) {
|
||||
|
||||
map_address = (*func)((void *)base_address);
|
||||
|
||||
|
|
@ -3069,6 +3084,9 @@ allocation2:
|
|||
return (void *)newmemory[position-NUM_BUFFERS].addr;
|
||||
|
||||
terminate:
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
|
||||
printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS);
|
||||
printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n");
|
||||
|
|
|
|||
|
|
@ -183,7 +183,7 @@ int get_L2_size(void){
|
|||
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
|
||||
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
|
||||
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || \
|
||||
defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
|
||||
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
|
|
@ -269,7 +269,7 @@ void blas_set_parameter(void){
|
|||
int factor;
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || \
|
||||
defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || \
|
||||
defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
int size = 16;
|
||||
#else
|
||||
int size = get_L2_size();
|
||||
|
|
|
getarch.c
@ -469,6 +469,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_SAPPHIRERAPIDS
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
#define ARCHITECTURE "X86"
|
||||
#ifdef NO_AVX512
|
||||
#ifdef NO_AVX2
|
||||
#ifdef NO_AVX
|
||||
#define SUBARCHITECTURE "NEHALEM"
|
||||
#define ARCHCONFIG "-DNEHALEM " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
|
||||
#define LIBNAME "nehalem"
|
||||
#define CORENAME "NEHALEM"
|
||||
#else
|
||||
#define SUBARCHITECTURE "SANDYBRIDGE"
|
||||
#define ARCHCONFIG "-DSANDYBRIDGE " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
|
||||
#define LIBNAME "sandybridge"
|
||||
#define CORENAME "SANDYBRIDGE"
|
||||
#endif
|
||||
#else
|
||||
#define SUBARCHITECTURE "HASWELL"
|
||||
#define ARCHCONFIG "-DHASWELL " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
|
||||
"-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
|
||||
#define LIBNAME "haswell"
|
||||
#define CORENAME "HASWELL"
|
||||
#endif
|
||||
#else
|
||||
#define SUBARCHITECTURE "SAPPHIRERAPIDS"
|
||||
#define ARCHCONFIG "-DSAPPHIRERAPIDS " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
|
||||
"-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=sapphirerapids"
|
||||
#define LIBNAME "sapphirerapids"
|
||||
#define CORENAME "SAPPHIRERAPIDS"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_ATOM
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
|
|
@ -964,7 +1013,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define ARCHCONFIG "-DP5600 " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
|
||||
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA"
|
||||
#define LIBNAME "p5600"
|
||||
#define CORENAME "P5600"
|
||||
#else
|
||||
|
|
@ -978,7 +1027,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define ARCHCONFIG "-DMIPS1004K " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA"
|
||||
#define LIBNAME "mips1004K"
|
||||
#define CORENAME "MIPS1004K"
|
||||
#else
|
||||
|
|
@ -992,7 +1041,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define ARCHCONFIG "-DMIPS24K " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
|
||||
"-DL2_SIZE=32768 -DL2_LINESIZE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA"
|
||||
#define LIBNAME "mips24K"
|
||||
#define CORENAME "MIPS24K"
|
||||
#else
|
||||
|
|
@ -1149,6 +1198,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_ARMV8SVE
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "ARMV8SVE"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DARMV8SVE " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8"
|
||||
#define LIBNAME "armv8sve"
|
||||
#define CORENAME "ARMV8SVE"
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef FORCE_ARMV8
|
||||
#define FORCE
|
||||
|
|
@ -1375,6 +1438,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CORENAME "VORTEX"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_A64FX
|
||||
#define ARMV8
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "A64FX"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DA64FX " \
|
||||
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=256 -DL1_CODE_ASSOCIATIVE=8 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=256 -DL1_DATA_ASSOCIATIVE=8 " \
|
||||
"-DL2_SIZE=8388608 -DL2_LINESIZE=256 -DL2_ASSOCIATIVE=8 " \
|
||||
"-DL3_SIZE=0 -DL3_LINESIZE=0 -DL3_ASSOCIATIVE=0 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8"
|
||||
#define LIBNAME "a64fx"
|
||||
#define CORENAME "A64FX"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_ZARCH_GENERIC
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ZARCH"
|
||||
|
|
|
|||
|
|
@ -188,12 +188,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
|
||||
if (n == 0) return;
|
||||
|
||||
if (incx == 1 && trans == 0 && n < 50) {
|
||||
buffer = NULL;
|
||||
(trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
|
|
|||
|
|
@ -42,14 +42,20 @@
|
|||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#ifndef CBLAS
|
||||
void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C, FLOAT *S){
|
||||
|
||||
BLASLONG n = *N;
|
||||
BLASLONG incx = *INCX;
|
||||
BLASLONG incy = *INCY;
|
||||
FLOAT c = *C;
|
||||
FLOAT s = *S;
|
||||
|
||||
#else
|
||||
void CNAME(blasint n, void *VX, blasint incx, void *VY, blasint incy, FLOAT c, FLOAT s) {
|
||||
FLOAT *x = (FLOAT*) VX;
|
||||
FLOAT *y = (FLOAT*) VY;
|
||||
#endif /* CBLAS */
|
||||
|
||||
PRINT_DEBUG_NAME;
|
||||
|
||||
if (n <= 0) return;
|
||||
|
|
|
|||
|
|
@ -4,8 +4,16 @@
|
|||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#ifndef CBLAS
|
||||
void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
||||
|
||||
#else
|
||||
void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
|
||||
FLOAT *DA = (FLOAT*) VDA;
|
||||
FLOAT *DB = (FLOAT*) VDB;
|
||||
FLOAT *S = (FLOAT*) VS;
|
||||
#endif /* CBLAS */
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86)
|
||||
|
||||
long double da_r = *(DA + 0);
|
||||
|
|
|
|||
|
|
@ -199,12 +199,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
|
||||
if (n == 0) return;
|
||||
|
||||
if (incx == 1 && trans == 0 && n < 50) {
|
||||
buffer = NULL;
|
||||
(trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
|
|
|||
|
|
@ -9,11 +9,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
if (${DYNAMIC_ARCH})
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
|
||||
endif ()
|
||||
ParseMakefileVars("${KERNELDIR}/KERNEL")
|
||||
ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}")
|
||||
SetDefaultL1()
|
||||
SetDefaultL2()
|
||||
SetDefaultL3()
|
||||
ParseMakefileVars("${KERNELDIR}/KERNEL")
|
||||
ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}")
|
||||
|
||||
set(KERNEL_INTERFACE common_level1.h common_level2.h common_level3.h)
|
||||
if(NOT NO_LAPACK)
|
||||
|
|
@ -198,7 +198,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
# Makefile.L3
|
||||
set(USE_TRMM false)
|
||||
string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE)
|
||||
if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE))
|
||||
if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE) OR (UC_TARGET_CORE MATCHES SAPPHIRERAPIDS))
|
||||
set(USE_TRMM true)
|
||||
endif ()
|
||||
if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10))
|
||||
|
|
@ -418,32 +418,50 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type})
|
||||
|
||||
# symm for s and d
|
||||
if (NOT DEFINED ${float_char}SYMMUCOPY_M)
|
||||
set(SYMMUCOPY_M "generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
set(SYMMLCOPY_M "generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
else ()
|
||||
set(SYMMUCOPY_M "${KERNELDIR}/${${float_char}SYMMUCOPY_M}")
|
||||
set(SYMMLCOPY_M "${KERNELDIR}/${${float_char}SYMMLCOPY_M}")
|
||||
endif()
|
||||
GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${SYMMUCOPY_M} "" "symm_iutcopy" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type})
|
||||
|
||||
# These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. Also TRANS is not passed in as a define.
|
||||
# Could simplify it a bit by pairing up by -UUNIT/-DUNIT.
|
||||
|
||||
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type})
|
||||
if (NOT DEFINED ${float_char}TRMMUNCOPY_M)
|
||||
set(TRMMUNCOPY_M "generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
set(TRMMLNCOPY_M "generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
set(TRMMUTCOPY_M "generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
set(TRMMLTCOPY_M "generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
else ()
|
||||
set(TRMMUNCOPY_M "${KERNELDIR}/${${float_char}TRMMUNCOPY_M}")
|
||||
set(TRMMLNCOPY_M "${KERNELDIR}/${${float_char}TRMMLNCOPY_M}")
|
||||
set(TRMMUTCOPY_M "${KERNELDIR}/${${float_char}TRMMUTCOPY_M}")
|
||||
set(TRMMLTCOPY_M "${KERNELDIR}/${${float_char}TRMMLTCOPY_M}")
|
||||
endif ()
|
||||
GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type})
|
||||
|
||||
|
|
@ -578,11 +596,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type})
|
||||
endif ()
|
||||
if (BUILD_BFLOAT16)
|
||||
if (NOT DEFINED SBGEMM_SMALL_M_PERMIT)
|
||||
|
|
@ -616,11 +634,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
|
|
|
|||
|
|
@ -31,7 +31,22 @@ ifdef NO_AVX2
|
|||
endif
|
||||
|
||||
ifdef TARGET_CORE
|
||||
ifeq ($(TARGET_CORE), COOPERLAKE)
|
||||
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
override CFLAGS += -march=sapphirerapids
|
||||
else
|
||||
override CFLAGS += -march=skylake-avx512 -mavx512f
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
override CFLAGS += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
override CFLAGS += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
endif
|
||||
else ifeq ($(TARGET_CORE), COOPERLAKE)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
override CFLAGS += -march=cooperlake
|
||||
|
|
|
|||
|
|
@ -47,6 +47,10 @@ ifeq ($(CORE), COOPERLAKE)
|
|||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), SAPPHIRERAPIDS)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ZEN)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
|
@ -1479,29 +1483,61 @@ $(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XT
|
|||
$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@
|
||||
|
||||
|
||||
ifdef STRMMUNCOPY_M
|
||||
$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef STRMMLNCOPY_M
|
||||
$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef STRMMUTCOPY_M
|
||||
$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef STRMMLTCOPY_M
|
||||
$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)strmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
|
@ -1527,29 +1563,61 @@ $(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N
|
|||
$(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
|
||||
|
||||
ifdef DTRMMUNCOPY_M
|
||||
$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef DTRMMLNCOPY_M
|
||||
$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef DTRMMUTCOPY_M
|
||||
$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef DTRMMLTCOPY_M
|
||||
$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
|
@ -1773,11 +1841,21 @@ $(KDIR)ssymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N).
|
|||
$(KDIR)ssymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@
|
||||
|
||||
ifdef SSYMMUCOPY_M
|
||||
$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMUCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
|
||||
else
|
||||
$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
|
||||
endif
|
||||
|
||||
ifdef SSYMMLCOPY_M
|
||||
$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMLCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
|
||||
else
|
||||
$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@
|
||||
|
|
@ -1785,11 +1863,21 @@ $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).
|
|||
$(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@
|
||||
|
||||
ifdef DSYMMUCOPY_M
|
||||
$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMUCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
|
||||
else
|
||||
$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
|
||||
endif
|
||||
|
||||
ifdef DSYMMLCOPY_M
|
||||
$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMLCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
|
||||
else
|
||||
$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)qsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@
|
||||
|
|
|
|||
|
|
@ -0,0 +1,183 @@
|
|||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = axpy.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
|
||||
SASUMKERNEL = asum.S
|
||||
DASUMKERNEL = asum.S
|
||||
CASUMKERNEL = casum.S
|
||||
ZASUMKERNEL = zasum.S

SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S

SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S

ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S

SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S

DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S

DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S

SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S

SGEMMINCOPY = sgemm_ncopy_sve_v1.c
SGEMMITCOPY = sgemm_tcopy_sve_v1.c
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S

SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c

SSYMMUCOPY_M = symm_ucopy_sve.c
SSYMMLCOPY_M = symm_lcopy_sve.c

DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S

DGEMMINCOPY = dgemm_ncopy_sve_v1.c
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S

DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c

DSYMMUCOPY_M = symm_ucopy_sve.c
DSYMMLCOPY_M = symm_lcopy_sve.c

CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)

ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

@ -0,0 +1,183 @@
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c

SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c

SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c

ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c

ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c

ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c

STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S

SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S

SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S

SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S

SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S

SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S


SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S

SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S

SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S

ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S

SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S

DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S

DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S

SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S

SGEMMINCOPY = sgemm_ncopy_sve_v1.c
SGEMMITCOPY = sgemm_tcopy_sve_v1.c
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S

SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c

SSYMMUCOPY_M = symm_ucopy_sve.c
SSYMMLCOPY_M = symm_lcopy_sve.c

DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S

DGEMMINCOPY = dgemm_ncopy_sve_v1.c
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c

DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c

DSYMMUCOPY_M = symm_ucopy_sve.c
DSYMMLCOPY_M = symm_lcopy_sve.c

CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)

ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

@ -141,7 +141,7 @@ SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S

ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))

@ -169,7 +169,7 @@ endif
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c

@ -182,7 +182,7 @@ CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)

ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c

@ -141,7 +141,7 @@ SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S

ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))

@ -169,7 +169,7 @@ endif
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c

@ -182,7 +182,7 @@ CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)

ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c

@ -1 +1 @@
include $(KERNELDIR)/KERNEL.ARMV8
include $(KERNELDIR)/KERNEL.NEOVERSEN1

@ -0,0 +1,898 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <arm_neon.h>
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
#define FMLA_RI "fmla "
|
||||
#define FMLA_IR "fmla "
|
||||
#define FMLA_II "fmls "
|
||||
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
#define FMLA_RI "fmls "
|
||||
#define FMLA_IR "fmla "
|
||||
#define FMLA_II "fmla "
|
||||
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
#define FMLA_RI "fmla "
|
||||
#define FMLA_IR "fmls "
|
||||
#define FMLA_II "fmla "
|
||||
#else
|
||||
#define FMLA_RI "fmls "
|
||||
#define FMLA_IR "fmls "
|
||||
#define FMLA_II "fmls "
|
||||
#endif
|
||||
#define FMLA_RR "fmla "
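/** The FMLA_* strings pick fmla/fmls so that the accumulators realize the four
 *  complex-multiply variants selected by the CGEMM parameter macros:
 *    NN/NT/TN/TT:  re += ar*br - ai*bi,  im += ar*bi + ai*br   (no conjugation)
 *    NR/NC/TR/TC:  re += ar*br + ai*bi,  im += ai*br - ar*bi   (B conjugated)
 *    RN/RT/CN/CT:  re += ar*br + ai*bi,  im += ar*bi - ai*br   (A conjugated)
 *    otherwise:    re += ar*br - ai*bi,  im -= ar*bi + ai*br   (both conjugated)
 *  RR, RI, IR and II name which real/imaginary parts of the two factors are
 *  multiplied in that instruction. */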
|
||||
|
||||
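/** store_m8n1_contracted: writes back one 8x1 strip of results. c1r/c1i and c2r/c2i
 *  hold the real and imaginary accumulators separately; vld2q/vst2q de-interleave
 *  and re-interleave the r,i,r,i,... storage of C, and the four fused
 *  multiply-add/subtract steps apply C += (alphar + i*alphai) * acc. */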
static inline void store_m8n1_contracted(float *C,
|
||||
float32x4_t c1r, float32x4_t c1i, float32x4_t c2r, float32x4_t c2i,
|
||||
float alphar, float alphai) {
|
||||
|
||||
float32x4x2_t ld1 = vld2q_f32(C), ld2 = vld2q_f32(C + 8);
|
||||
ld1.val[0] = vfmaq_n_f32(ld1.val[0], c1r, alphar);
|
||||
ld2.val[0] = vfmaq_n_f32(ld2.val[0], c2r, alphar);
|
||||
ld1.val[1] = vfmaq_n_f32(ld1.val[1], c1r, alphai);
|
||||
ld2.val[1] = vfmaq_n_f32(ld2.val[1], c2r, alphai);
|
||||
ld1.val[0] = vfmsq_n_f32(ld1.val[0], c1i, alphai);
|
||||
ld2.val[0] = vfmsq_n_f32(ld2.val[0], c2i, alphai);
|
||||
ld1.val[1] = vfmaq_n_f32(ld1.val[1], c1i, alphar);
|
||||
ld2.val[1] = vfmaq_n_f32(ld2.val[1], c2i, alphar);
|
||||
vst2q_f32(C, ld1);
|
||||
vst2q_f32(C + 8, ld2);
|
||||
}
|
||||
|
||||
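/** kernel_8x4: inline-assembly micro-kernel for one full 8x4 block of C. Sixteen
 *  float32x4_t accumulators keep the real and imaginary sums of the tile split
 *  apart; C is prefetched for store up front, the main loop at label 1: is unrolled
 *  by two in K, and labels 2:/3: handle the two- and one-iteration tails before the
 *  transpose and store at label 4:. */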
static inline void kernel_8x4(const float *sa, const float *sb, float *C,
|
||||
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
|
||||
|
||||
const float *c_pref = C;
|
||||
float32x4_t c1r, c1i, c2r, c2i, c3r, c3i, c4r, c4i;
|
||||
float32x4_t c5r, c5i, c6r, c6i, c7r, c7i, c8r, c8i;
|
||||
|
||||
/** x0 for filling A, x1-x6 for filling B (x5 and x6 for real, x2 and x4 for imag) */
|
||||
/** v0-v1 and v10-v11 for B, v2-v9 for A */
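/** Each K step consumes eight complex elements of packed A (held in v2-v9 across the
 *  two unrolled steps) and four complex elements of B: the B values are loaded into
 *  x1-x4 and repacked with bfi/bfxil so that v0/v10 carry their real parts and
 *  v1/v11 their imaginary parts before entering the fmla sequence. */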
|
||||
__asm__ __volatile__(
|
||||
"cmp %[K],#0; mov %[c_pref],%[C]\n\t"
|
||||
"movi %[c1r].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t"
|
||||
"movi %[c1i].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t"
|
||||
"movi %[c2r].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
|
||||
"movi %[c2i].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t"
|
||||
"movi %[c3r].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t"
|
||||
"movi %[c3i].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
|
||||
"movi %[c4r].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t"
|
||||
"movi %[c4i].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t"
|
||||
"movi %[c5r].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
|
||||
"movi %[c5i].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t"
|
||||
"movi %[c6r].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t"
|
||||
"movi %[c6i].16b,#0\n\t"
|
||||
"movi %[c7r].16b,#0; movi %[c7i].16b,#0\n\t"
|
||||
"movi %[c8r].16b,#0; movi %[c8i].16b,#0\n\t"
|
||||
"beq 4f\n\t"
|
||||
"cmp %[K],#2\n\t"
|
||||
"ldp x1,x2,[%[sb]],#16; ldr q2,[%[sa]],#64\n\t"
|
||||
"ldp x3,x4,[%[sb]],#16; ldr d3,[%[sa],#-48]\n\t"
|
||||
"mov w5,w1; mov w6,w3; ldr x0,[%[sa],#-40]\n\t"
|
||||
"bfi x5,x2,#32,#32; bfi x6,x4,#32,#32; fmov d0,x5\n\t"
|
||||
"bfxil x2,x1,#32,#32; bfxil x4,x3,#32,#32; fmov v0.d[1],x6\n\t"
|
||||
|
||||
"blt 3f; beq 2f\n\t"
|
||||
"1:\n\t"
|
||||
"fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t"
|
||||
FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t"
|
||||
FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]; ldr x1,[%[sb]],#64\n\t"
|
||||
FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t"
|
||||
"fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t"
|
||||
FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t"
|
||||
FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]; mov w5,w1\n\t"
|
||||
FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t"
|
||||
"fmov v5.d[1],x0; fmov d1,x2\n\t"
|
||||
FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]; ldr x2,[%[sb],#-56]\n\t"
|
||||
FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]; ldr x3,[%[sb],#-48]\n\t"
|
||||
FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t"
|
||||
"fmov v1.d[1],x4; ldr d6,[%[sa]]\n\t"
|
||||
FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; ldr x0,[%[sa],#8]\n\t"
|
||||
FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]; ldr x4,[%[sb],#-40]\n\t"
|
||||
FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]; bfi x5,x2,#32,#32\n\t"
|
||||
"fmov v6.d[1],x0; ldr d7,[%[sa],#16]\n\t"
|
||||
FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]; ldr x0,[%[sa],#24]\n\t"
|
||||
FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]; mov w6,w3\n\t"
|
||||
FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]; bfxil x2,x1,#32,#32\n\t"
|
||||
"fmov v7.d[1],x0; fmov d10,x5\n\t"
|
||||
FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]; bfi x6,x4,#32,#32\n\t"
|
||||
FMLA_II "%[c1r].4s,v1.4s,v2.s[1]; ldr x1,[%[sb],#-32]\n\t"
|
||||
FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]; bfxil x4,x3,#32,#32\n\t"
|
||||
"fmov v10.d[1],x6; fmov d11,x2\n\t"
|
||||
FMLA_II "%[c2r].4s,v1.4s,v2.s[3]; ldr x2,[%[sb],#-24]\n\t"
|
||||
FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]; ldr x3,[%[sb],#-16]\n\t"
|
||||
FMLA_II "%[c3r].4s,v1.4s,v3.s[1]; mov w5,w1\n\t"
|
||||
"fmov v11.d[1],x4; ldr d8,[%[sa],#32]\n\t"
|
||||
FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]; ldr x0,[%[sa],#40]\n\t"
|
||||
FMLA_II "%[c4r].4s,v1.4s,v3.s[3]; ldr x4,[%[sb],#-8]\n\t"
|
||||
FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]; bfi x5,x2,#32,#32\n\t"
|
||||
"fmov v8.d[1],x0; ldr d9,[%[sa],#48]\n\t"
|
||||
FMLA_II "%[c5r].4s,v1.4s,v4.s[1]; ldr x0,[%[sa],#56]\n\t"
|
||||
FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]; mov w6,w3\n\t"
|
||||
FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t"
|
||||
"fmov v9.d[1],x0; fmov d0,x5\n\t"
|
||||
FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]; bfi x6,x4,#32,#32\n\t"
|
||||
FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t"
|
||||
FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t"
|
||||
"fmov v0.d[1],x6; ldr d2,[%[sa],#64]\n\t"
|
||||
FMLA_II "%[c8r].4s,v1.4s,v5.s[3]; ldr x0,[%[sa],#72]\n\t"
|
||||
FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t"
|
||||
FMLA_RR "%[c1r].4s,v10.4s,v6.s[0]\n\t"
|
||||
"fmov v2.d[1],x0; ldr d3,[%[sa],#80]\n\t"
|
||||
FMLA_IR "%[c1i].4s,v10.4s,v6.s[1]\n\t"
|
||||
FMLA_RR "%[c2r].4s,v10.4s,v6.s[2]; ldr x0,[%[sa],#88]\n\t"
|
||||
FMLA_IR "%[c2i].4s,v10.4s,v6.s[3]; bfxil x2,x1,#32,#32\n\t"
|
||||
FMLA_RR "%[c3r].4s,v10.4s,v7.s[0]; bfxil x4,x3,#32,#32\n\t"
|
||||
FMLA_IR "%[c3i].4s,v10.4s,v7.s[1]; add %[sa],%[sa],#128\n\t"
|
||||
FMLA_RR "%[c4r].4s,v10.4s,v7.s[2]; prfm pldl1keep,[%[sb],#128]\n\t"
|
||||
FMLA_IR "%[c4i].4s,v10.4s,v7.s[3]; sub %[K],%[K],#2\n\t"
|
||||
FMLA_RR "%[c5r].4s,v10.4s,v8.s[0]; prfm pldl1keep,[%[sa],#128]\n\t"
|
||||
FMLA_IR "%[c5i].4s,v10.4s,v8.s[1]; prfm pldl1keep,[%[sa],#192]\n\t"
|
||||
FMLA_RR "%[c6r].4s,v10.4s,v8.s[2]; cmp %[K],#2\n\t"
|
||||
FMLA_IR "%[c6i].4s,v10.4s,v8.s[3]\n\t"
|
||||
FMLA_RR "%[c7r].4s,v10.4s,v9.s[0]\n\t" FMLA_IR "%[c7i].4s,v10.4s,v9.s[1]\n\t"
|
||||
FMLA_RR "%[c8r].4s,v10.4s,v9.s[2]\n\t" FMLA_IR "%[c8i].4s,v10.4s,v9.s[3]\n\t"
|
||||
FMLA_II "%[c1r].4s,v11.4s,v6.s[1]\n\t" FMLA_RI "%[c1i].4s,v11.4s,v6.s[0]\n\t"
|
||||
FMLA_II "%[c2r].4s,v11.4s,v6.s[3]\n\t" FMLA_RI "%[c2i].4s,v11.4s,v6.s[2]\n\t"
|
||||
FMLA_II "%[c3r].4s,v11.4s,v7.s[1]\n\t" FMLA_RI "%[c3i].4s,v11.4s,v7.s[0]\n\t"
|
||||
FMLA_II "%[c4r].4s,v11.4s,v7.s[3]\n\t" FMLA_RI "%[c4i].4s,v11.4s,v7.s[2]\n\t"
|
||||
FMLA_II "%[c5r].4s,v11.4s,v8.s[1]\n\t" FMLA_RI "%[c5i].4s,v11.4s,v8.s[0]\n\t"
|
||||
FMLA_II "%[c6r].4s,v11.4s,v8.s[3]\n\t" FMLA_RI "%[c6i].4s,v11.4s,v8.s[2]\n\t"
|
||||
FMLA_II "%[c7r].4s,v11.4s,v9.s[1]\n\t" FMLA_RI "%[c7i].4s,v11.4s,v9.s[0]\n\t"
|
||||
FMLA_II "%[c8r].4s,v11.4s,v9.s[3]\n\t" FMLA_RI "%[c8i].4s,v11.4s,v9.s[2]\n\t"
|
||||
"bgt 1b; blt 3f\n\t"
|
||||
"2:\n\t"
|
||||
"fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t"
|
||||
FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t"
|
||||
FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]; ldr x1,[%[sb]],#32\n\t"
|
||||
FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t"
|
||||
"fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t"
|
||||
FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t"
|
||||
FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]; mov w5,w1\n\t"
|
||||
FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t"
|
||||
"fmov v5.d[1],x0; fmov d1,x2\n\t"
|
||||
FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]; ldr x2,[%[sb],#-24]\n\t"
|
||||
FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]; ldr x3,[%[sb],#-16]\n\t"
|
||||
FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t"
|
||||
"fmov v1.d[1],x4; ldr d6,[%[sa]]\n\t"
|
||||
FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; ldr x0,[%[sa],#8]\n\t"
|
||||
FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]; ldr x4,[%[sb],#-8]\n\t"
|
||||
FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]; bfi x5,x2,#32,#32\n\t"
|
||||
"fmov v6.d[1],x0; ldr d7,[%[sa],#16]\n\t"
|
||||
FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]; ldr x0,[%[sa],#24]\n\t"
|
||||
FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]; mov w6,w3\n\t"
|
||||
FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]; bfxil x2,x1,#32,#32\n\t"
|
||||
"fmov v7.d[1],x0; fmov d10,x5\n\t"
|
||||
FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]; bfi x6,x4,#32,#32\n\t"
|
||||
FMLA_II "%[c1r].4s,v1.4s,v2.s[1]\n\t"
|
||||
FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]; bfxil x4,x3,#32,#32\n\t"
|
||||
"fmov v10.d[1],x6; fmov d11,x2\n\t"
|
||||
FMLA_II "%[c2r].4s,v1.4s,v2.s[3]\n\t"
|
||||
FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]\n\t"
|
||||
FMLA_II "%[c3r].4s,v1.4s,v3.s[1]\n\t"
|
||||
"fmov v11.d[1],x4; ldr d8,[%[sa],#32]\n\t"
|
||||
FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]; ldr x0,[%[sa],#40]\n\t"
|
||||
FMLA_II "%[c4r].4s,v1.4s,v3.s[3]; sub %[K],%[K],#2\n\t"
|
||||
FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]\n\t"
|
||||
"fmov v8.d[1],x0; ldr d9,[%[sa],#48]\n\t"
|
||||
FMLA_II "%[c5r].4s,v1.4s,v4.s[1]; ldr x0,[%[sa],#56]\n\t"
|
||||
FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]; add %[sa],%[sa],#64\n\t"
|
||||
FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t"
|
||||
"fmov v9.d[1],x0\n\t"
|
||||
FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]\n\t"
|
||||
FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t"
|
||||
FMLA_II "%[c8r].4s,v1.4s,v5.s[3]\n\t" FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t"
|
||||
FMLA_RR "%[c1r].4s,v10.4s,v6.s[0]\n\t" FMLA_IR "%[c1i].4s,v10.4s,v6.s[1]\n\t"
|
||||
FMLA_RR "%[c2r].4s,v10.4s,v6.s[2]\n\t" FMLA_IR "%[c2i].4s,v10.4s,v6.s[3]\n\t"
|
||||
FMLA_RR "%[c3r].4s,v10.4s,v7.s[0]\n\t" FMLA_IR "%[c3i].4s,v10.4s,v7.s[1]\n\t"
|
||||
FMLA_RR "%[c4r].4s,v10.4s,v7.s[2]\n\t" FMLA_IR "%[c4i].4s,v10.4s,v7.s[3]\n\t"
|
||||
FMLA_RR "%[c5r].4s,v10.4s,v8.s[0]\n\t" FMLA_IR "%[c5i].4s,v10.4s,v8.s[1]\n\t"
|
||||
FMLA_RR "%[c6r].4s,v10.4s,v8.s[2]\n\t" FMLA_IR "%[c6i].4s,v10.4s,v8.s[3]\n\t"
|
||||
FMLA_RR "%[c7r].4s,v10.4s,v9.s[0]\n\t" FMLA_IR "%[c7i].4s,v10.4s,v9.s[1]\n\t"
|
||||
FMLA_RR "%[c8r].4s,v10.4s,v9.s[2]\n\t" FMLA_IR "%[c8i].4s,v10.4s,v9.s[3]\n\t"
|
||||
FMLA_II "%[c1r].4s,v11.4s,v6.s[1]\n\t" FMLA_RI "%[c1i].4s,v11.4s,v6.s[0]\n\t"
|
||||
FMLA_II "%[c2r].4s,v11.4s,v6.s[3]\n\t" FMLA_RI "%[c2i].4s,v11.4s,v6.s[2]\n\t"
|
||||
FMLA_II "%[c3r].4s,v11.4s,v7.s[1]\n\t" FMLA_RI "%[c3i].4s,v11.4s,v7.s[0]\n\t"
|
||||
FMLA_II "%[c4r].4s,v11.4s,v7.s[3]\n\t" FMLA_RI "%[c4i].4s,v11.4s,v7.s[2]\n\t"
|
||||
FMLA_II "%[c5r].4s,v11.4s,v8.s[1]\n\t" FMLA_RI "%[c5i].4s,v11.4s,v8.s[0]\n\t"
|
||||
FMLA_II "%[c6r].4s,v11.4s,v8.s[3]\n\t" FMLA_RI "%[c6i].4s,v11.4s,v8.s[2]\n\t"
|
||||
FMLA_II "%[c7r].4s,v11.4s,v9.s[1]\n\t" FMLA_RI "%[c7i].4s,v11.4s,v9.s[0]\n\t"
|
||||
FMLA_II "%[c8r].4s,v11.4s,v9.s[3]\n\t" FMLA_RI "%[c8i].4s,v11.4s,v9.s[2]\n\t"
|
||||
"b 4f\n\t"
|
||||
"3:\n\t"
|
||||
"fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t"
|
||||
FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t"
|
||||
FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]\n\t"
|
||||
FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t"
|
||||
"fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t"
|
||||
FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t"
|
||||
FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]\n\t"
|
||||
FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t"
|
||||
"fmov v5.d[1],x0; fmov d1,x2\n\t"
|
||||
FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]\n\t"
|
||||
FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]\n\t"
|
||||
FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t"
|
||||
"fmov v1.d[1],x4\n\t"
|
||||
FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; sub %[K],%[K],#1\n\t"
|
||||
FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]\n\t" FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]\n\t"
|
||||
FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]\n\t" FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]\n\t"
|
||||
FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]\n\t" FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]\n\t"
|
||||
FMLA_II "%[c1r].4s,v1.4s,v2.s[1]\n\t" FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]\n\t"
|
||||
FMLA_II "%[c2r].4s,v1.4s,v2.s[3]\n\t" FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]\n\t"
|
||||
FMLA_II "%[c3r].4s,v1.4s,v3.s[1]\n\t" FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]\n\t"
|
||||
FMLA_II "%[c4r].4s,v1.4s,v3.s[3]\n\t" FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]\n\t"
|
||||
FMLA_II "%[c5r].4s,v1.4s,v4.s[1]\n\t" FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]\n\t"
|
||||
FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t" FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]\n\t"
|
||||
FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t"
|
||||
FMLA_II "%[c8r].4s,v1.4s,v5.s[3]\n\t" FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t"
|
||||
"4:\n\t"
|
||||
"mov %[c_pref],%[C]\n\t"
|
||||
"zip1 v0.4s,%[c1r].4s,%[c2r].4s; prfm pstl1keep,[%[c_pref]]\n\t"
|
||||
"zip1 v4.4s,%[c1i].4s,%[c2i].4s; prfm pstl1keep,[%[c_pref],#64]\n\t"
|
||||
"zip1 v1.4s,%[c3r].4s,%[c4r].4s; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
|
||||
"zip1 v5.4s,%[c3i].4s,%[c4i].4s; prfm pstl1keep,[%[c_pref]]\n\t"
|
||||
"zip2 v2.4s,%[c1r].4s,%[c2r].4s; prfm pstl1keep,[%[c_pref],#64]\n\t"
|
||||
"zip2 v6.4s,%[c1i].4s,%[c2i].4s; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
|
||||
"zip2 v3.4s,%[c3r].4s,%[c4r].4s; prfm pstl1keep,[%[c_pref]]\n\t"
|
||||
"zip2 v7.4s,%[c3i].4s,%[c4i].4s; prfm pstl1keep,[%[c_pref],#64]\n\t"
|
||||
"zip1 %[c1r].2d,v0.2d,v1.2d; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
|
||||
"zip1 %[c1i].2d,v4.2d,v5.2d; prfm pstl1keep,[%[c_pref]]\n\t"
|
||||
"zip2 %[c2r].2d,v0.2d,v1.2d; prfm pstl1keep,[%[c_pref],#64]\n\t"
|
||||
"zip2 %[c2i].2d,v4.2d,v5.2d\n\t"
|
||||
"zip1 %[c3r].2d,v2.2d,v3.2d; zip1 %[c3i].2d,v6.2d,v7.2d\n\t"
|
||||
"zip2 %[c4r].2d,v2.2d,v3.2d; zip2 %[c4i].2d,v6.2d,v7.2d\n\t"
|
||||
"zip1 v0.4s,%[c5r].4s,%[c6r].4s; zip1 v4.4s,%[c5i].4s,%[c6i].4s\n\t"
|
||||
"zip1 v1.4s,%[c7r].4s,%[c8r].4s; zip1 v5.4s,%[c7i].4s,%[c8i].4s\n\t"
|
||||
"zip2 v2.4s,%[c5r].4s,%[c6r].4s; zip2 v6.4s,%[c5i].4s,%[c6i].4s\n\t"
|
||||
"zip2 v3.4s,%[c7r].4s,%[c8r].4s; zip2 v7.4s,%[c7i].4s,%[c8i].4s\n\t"
|
||||
"zip1 %[c5r].2d,v0.2d,v1.2d; zip1 %[c5i].2d,v4.2d,v5.2d\n\t"
|
||||
"zip2 %[c6r].2d,v0.2d,v1.2d; zip2 %[c6i].2d,v4.2d,v5.2d\n\t"
|
||||
"zip1 %[c7r].2d,v2.2d,v3.2d; zip1 %[c7i].2d,v6.2d,v7.2d\n\t"
|
||||
"zip2 %[c8r].2d,v2.2d,v3.2d; zip2 %[c8i].2d,v6.2d,v7.2d\n\t"
|
||||
:[c1r]"=w"(c1r), [c1i]"=w"(c1i), [c2r]"=w"(c2r), [c2i]"=w"(c2i),
|
||||
[c3r]"=w"(c3r), [c3i]"=w"(c3i), [c4r]"=w"(c4r), [c4i]"=w"(c4i),
|
||||
[c5r]"=w"(c5r), [c5i]"=w"(c5i), [c6r]"=w"(c6r), [c6i]"=w"(c6i),
|
||||
[c7r]"=w"(c7r), [c7i]"=w"(c7i), [c8r]"=w"(c8r), [c8i]"=w"(c8i),
|
||||
[K]"+r"(K), [sa]"+r"(sa), [sb]"+r"(sb), [c_pref]"+r"(c_pref)
|
||||
:[C]"r"(C), [LDC]"r"(LDC)
|
||||
:"cc","memory","x0","x1","x2","x3","x4","x5","x6",
|
||||
"v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11");
|
||||
|
||||
store_m8n1_contracted(C, c1r, c1i, c5r, c5i, alphar, alphai); C += LDC * 2;
|
||||
store_m8n1_contracted(C, c2r, c2i, c6r, c6i, alphar, alphai); C += LDC * 2;
|
||||
store_m8n1_contracted(C, c3r, c3i, c7r, c7i, alphar, alphai); C += LDC * 2;
|
||||
store_m8n1_contracted(C, c4r, c4i, c8r, c8i, alphar, alphai);
|
||||
}
|
||||
|
||||
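/** acc_expanded_m2n2: accumulates a 2x2 block of complex products in "expanded"
 *  form. a holds two complex elements of A, b two complex elements of B, and each
 *  val[j] collects a * b[lane j] without separating real and imaginary parts;
 *  conjugation and the multiply by alpha are deferred to the store routine. */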
static inline float32x4x4_t acc_expanded_m2n2(float32x4x4_t acc,
|
||||
float32x4_t a, float32x4_t b) {
|
||||
|
||||
acc.val[0] = vfmaq_laneq_f32(acc.val[0], a, b, 0);
|
||||
acc.val[1] = vfmaq_laneq_f32(acc.val[1], a, b, 1);
|
||||
acc.val[2] = vfmaq_laneq_f32(acc.val[2], a, b, 2);
|
||||
acc.val[3] = vfmaq_laneq_f32(acc.val[3], a, b, 3);
|
||||
return acc;
|
||||
}
|
||||
|
||||
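/** expand_alpha: builds four vectors of +/-alphar and +/-alphai whose signs depend
 *  on the conjugation variant, so that the store routines can fold conjugation and
 *  the scaling by alpha into four vector fused multiply-adds per store. */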
static inline float32x4x4_t expand_alpha(float alphar, float alphai) {
|
||||
float32x4x4_t ret;
|
||||
const float maskp[] = { -1, 1, -1, 1 };
|
||||
const float maskn[] = { 1, -1, 1, -1 };
|
||||
const float32x4_t vrevp = vld1q_f32(maskp);
|
||||
const float32x4_t vrevn = vld1q_f32(maskn);
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
ret.val[0] = vdupq_n_f32(alphar);
|
||||
ret.val[1] = vdupq_n_f32(-alphai);
|
||||
ret.val[2] = vmulq_f32(ret.val[1], vrevn);
|
||||
ret.val[3] = vmulq_f32(ret.val[0], vrevp);
|
||||
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
ret.val[0] = vdupq_n_f32(alphar);
|
||||
ret.val[1] = vdupq_n_f32(alphai);
|
||||
ret.val[2] = vmulq_f32(ret.val[1], vrevp);
|
||||
ret.val[3] = vmulq_f32(ret.val[0], vrevn);
|
||||
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
ret.val[2] = vdupq_n_f32(alphai);
|
||||
ret.val[3] = vdupq_n_f32(alphar);
|
||||
ret.val[0] = vmulq_f32(ret.val[3], vrevn);
|
||||
ret.val[1] = vmulq_f32(ret.val[2], vrevp);
|
||||
#else
|
||||
ret.val[2] = vdupq_n_f32(alphai);
|
||||
ret.val[3] = vdupq_n_f32(-alphar);
|
||||
ret.val[0] = vmulq_f32(ret.val[3], vrevp);
|
||||
ret.val[1] = vmulq_f32(ret.val[2], vrevn);
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
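/** store_expanded_m2n2: combines an expanded 2x2 accumulator with the expanded
 *  alpha. Each accumulator vector is used once as-is and once after vrev64q (real
 *  and imaginary lanes swapped within each complex pair), which supplies the cross
 *  terms needed for C += alpha * acc under the selected conjugation. */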
static inline void store_expanded_m2n2(float *C, BLASLONG LDC,
|
||||
float32x4x4_t acc, float32x4x4_t expanded_alpha) {
|
||||
|
||||
float32x4_t ld1 = vld1q_f32(C), ld2 = vld1q_f32(C + LDC * 2);
|
||||
ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[0]);
|
||||
ld2 = vfmaq_f32(ld2, acc.val[2], expanded_alpha.val[0]);
|
||||
acc.val[0] = vrev64q_f32(acc.val[0]);
|
||||
acc.val[2] = vrev64q_f32(acc.val[2]);
|
||||
ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[1]);
|
||||
ld2 = vfmaq_f32(ld2, acc.val[3], expanded_alpha.val[1]);
|
||||
acc.val[1] = vrev64q_f32(acc.val[1]);
|
||||
acc.val[3] = vrev64q_f32(acc.val[3]);
|
||||
ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[2]);
|
||||
ld2 = vfmaq_f32(ld2, acc.val[2], expanded_alpha.val[2]);
|
||||
ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[3]);
|
||||
ld2 = vfmaq_f32(ld2, acc.val[3], expanded_alpha.val[3]);
|
||||
vst1q_f32(C, ld1);
|
||||
vst1q_f32(C + LDC * 2, ld2);
|
||||
}
|
||||
|
||||
static inline float32x4x4_t init_expanded_m2n2() {
|
||||
float32x4x4_t ret = {{ vdupq_n_f32(0), vdupq_n_f32(0),
|
||||
vdupq_n_f32(0), vdupq_n_f32(0) }};
|
||||
return ret;
|
||||
}
|
||||
|
||||
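/** kernel_4x4 and the kernel_MxN helpers that follow cover the edge blocks the 8x4
 *  assembly kernel does not reach: each computes an MxN tile of C from the packed
 *  buffers with expanded accumulators, processes any leftover K iterations
 *  separately, and stores through the expanded-alpha routines. */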
static inline void kernel_4x4(const float *sa, const float *sb, float *C,
|
||||
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
|
||||
|
||||
float32x4x4_t c1, c2, c3, c4;
|
||||
c1 = c2 = c3 = c4 = init_expanded_m2n2();
|
||||
|
||||
for (; K > 1; K -= 2) {
|
||||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4),
|
||||
a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16;
|
||||
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4),
|
||||
b3 = vld1q_f32(sb + 8), b4 = vld1q_f32(sb + 12); sb += 16;
|
||||
c1 = acc_expanded_m2n2(c1, a1, b1);
|
||||
c2 = acc_expanded_m2n2(c2, a2, b1);
|
||||
c3 = acc_expanded_m2n2(c3, a1, b2);
|
||||
c4 = acc_expanded_m2n2(c4, a2, b2);
|
||||
c1 = acc_expanded_m2n2(c1, a3, b3);
|
||||
c2 = acc_expanded_m2n2(c2, a4, b3);
|
||||
c3 = acc_expanded_m2n2(c3, a3, b4);
|
||||
c4 = acc_expanded_m2n2(c4, a4, b4);
|
||||
}
|
||||
if (K) {
|
||||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
|
||||
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4);
|
||||
c1 = acc_expanded_m2n2(c1, a1, b1);
|
||||
c2 = acc_expanded_m2n2(c2, a2, b1);
|
||||
c3 = acc_expanded_m2n2(c3, a1, b2);
|
||||
c4 = acc_expanded_m2n2(c4, a2, b2);
|
||||
}
|
||||
|
||||
float32x4x4_t e_alpha = expand_alpha(alphar, alphai);
|
||||
store_expanded_m2n2(C, LDC, c1, e_alpha);
|
||||
store_expanded_m2n2(C + 4, LDC, c2, e_alpha);
|
||||
C += LDC * 4;
|
||||
store_expanded_m2n2(C, LDC, c3, e_alpha);
|
||||
store_expanded_m2n2(C + 4, LDC, c4, e_alpha);
|
||||
}
|
||||
|
||||
static inline void kernel_8x2(const float *sa, const float *sb, float *C,
|
||||
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
|
||||
|
||||
float32x4x4_t c1, c2, c3, c4;
|
||||
c1 = c2 = c3 = c4 = init_expanded_m2n2();
|
||||
|
||||
for (; K > 1; K -= 2) {
|
||||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
|
||||
float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12);
|
||||
float32x4_t a5 = vld1q_f32(sa + 16), a6 = vld1q_f32(sa + 20);
|
||||
float32x4_t a7 = vld1q_f32(sa + 24), a8 = vld1q_f32(sa + 28); sa += 32;
|
||||
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8;
|
||||
c1 = acc_expanded_m2n2(c1, a1, b1);
|
||||
c2 = acc_expanded_m2n2(c2, a2, b1);
|
||||
c3 = acc_expanded_m2n2(c3, a3, b1);
|
||||
c4 = acc_expanded_m2n2(c4, a4, b1);
|
||||
c1 = acc_expanded_m2n2(c1, a5, b2);
|
||||
c2 = acc_expanded_m2n2(c2, a6, b2);
|
||||
c3 = acc_expanded_m2n2(c3, a7, b2);
|
||||
c4 = acc_expanded_m2n2(c4, a8, b2);
|
||||
}
|
||||
if (K) {
|
||||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
|
||||
float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12);
|
||||
float32x4_t b1 = vld1q_f32(sb);
|
||||
c1 = acc_expanded_m2n2(c1, a1, b1);
|
||||
c2 = acc_expanded_m2n2(c2, a2, b1);
|
||||
c3 = acc_expanded_m2n2(c3, a3, b1);
|
||||
c4 = acc_expanded_m2n2(c4, a4, b1);
|
||||
}
|
||||
|
||||
float32x4x4_t e_alpha = expand_alpha(alphar, alphai);
|
||||
store_expanded_m2n2(C, LDC, c1, e_alpha);
|
||||
store_expanded_m2n2(C + 4, LDC, c2, e_alpha);
|
||||
store_expanded_m2n2(C + 8, LDC, c3, e_alpha);
|
||||
store_expanded_m2n2(C + 12, LDC, c4, e_alpha);
|
||||
}
|
||||
|
||||
static inline void kernel_4x2(const float *sa, const float *sb, float *C,
|
||||
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
|
||||
|
||||
float32x4x4_t c1, c2;
|
||||
c1 = c2 = init_expanded_m2n2();
|
||||
|
||||
for (; K > 1; K -= 2) {
|
||||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
|
||||
float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16;
|
||||
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8;
|
||||
c1 = acc_expanded_m2n2(c1, a1, b1);
|
||||
c2 = acc_expanded_m2n2(c2, a2, b1);
|
||||
c1 = acc_expanded_m2n2(c1, a3, b2);
|
||||
c2 = acc_expanded_m2n2(c2, a4, b2);
|
||||
}
|
||||
if (K) {
|
||||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
|
||||
float32x4_t b1 = vld1q_f32(sb);
|
||||
c1 = acc_expanded_m2n2(c1, a1, b1);
|
||||
c2 = acc_expanded_m2n2(c2, a2, b1);
|
||||
}
|
||||
|
||||
float32x4x4_t e_alpha = expand_alpha(alphar, alphai);
|
||||
store_expanded_m2n2(C, LDC, c1, e_alpha);
|
||||
store_expanded_m2n2(C + 4, LDC, c2, e_alpha);
|
||||
}
|
||||
|
||||
static inline void kernel_2x4(const float *sa, const float *sb, float *C,
|
||||
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
|
||||
|
||||
float32x4x4_t c1, c2;
|
||||
c1 = c2 = init_expanded_m2n2();
|
||||
|
||||
for (; K > 1; K -= 2) {
|
||||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); sa += 8;
|
||||
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4);
|
||||
float32x4_t b3 = vld1q_f32(sb + 8), b4 = vld1q_f32(sb + 12); sb += 16;
|
||||
c1 = acc_expanded_m2n2(c1, a1, b1);
|
||||
c2 = acc_expanded_m2n2(c2, a1, b2);
|
||||
c1 = acc_expanded_m2n2(c1, a2, b3);
|
||||
c2 = acc_expanded_m2n2(c2, a2, b4);
|
||||
}
|
||||
if (K) {
|
||||
float32x4_t a1 = vld1q_f32(sa);
|
||||
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4);
|
||||
c1 = acc_expanded_m2n2(c1, a1, b1);
|
||||
c2 = acc_expanded_m2n2(c2, a1, b2);
|
||||
}
|
||||
|
||||
float32x4x4_t e_alpha = expand_alpha(alphar, alphai);
|
||||
store_expanded_m2n2(C, LDC, c1, e_alpha);
|
||||
store_expanded_m2n2(C + LDC * 4, LDC, c2, e_alpha);
|
||||
}
|
||||
|
||||
static inline void kernel_2x2(const float *sa, const float *sb, float *C,
|
||||
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
|
||||
|
||||
float32x4x4_t c1, c2;
|
||||
c1 = c2 = init_expanded_m2n2();
|
||||
|
||||
for (; K > 1; K -= 2) {
|
||||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); sa += 8;
|
||||
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8;
|
||||
c1 = acc_expanded_m2n2(c1, a1, b1);
|
||||
c2 = acc_expanded_m2n2(c2, a2, b2);
|
||||
}
|
||||
c1.val[0] = vaddq_f32(c1.val[0], c2.val[0]);
|
||||
c1.val[1] = vaddq_f32(c1.val[1], c2.val[1]);
|
||||
c1.val[2] = vaddq_f32(c1.val[2], c2.val[2]);
|
||||
c1.val[3] = vaddq_f32(c1.val[3], c2.val[3]);
|
||||
if (K) {
|
||||
float32x4_t a1 = vld1q_f32(sa);
|
||||
float32x4_t b1 = vld1q_f32(sb);
|
||||
c1 = acc_expanded_m2n2(c1, a1, b1);
|
||||
}
|
||||
|
||||
store_expanded_m2n2(C, LDC, c1, expand_alpha(alphar, alphai));
|
||||
}
|
||||
|
||||
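/** The *_m2n1 and *_m1n1 helpers below are narrower versions of the same expanded
 *  accumulator scheme, used for the 1- and 2-wide edges; the q-sized registers hold
 *  two complex elements and the d-sized registers a single one. */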
static inline float32x4x2_t acc_expanded_m2n1(float32x4x2_t acc,
|
||||
float32x4_t a, float32x2_t b) {
|
||||
|
||||
acc.val[0] = vfmaq_lane_f32(acc.val[0], a, b, 0);
|
||||
acc.val[1] = vfmaq_lane_f32(acc.val[1], a, b, 1);
|
||||
return acc;
|
||||
}
|
||||
|
||||
static inline void store_expanded_m2n1(float *C,
|
||||
float32x4x2_t acc, float32x4x4_t expanded_alpha) {
|
||||
|
||||
float32x4_t ld1 = vld1q_f32(C);
|
||||
ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[0]);
|
||||
acc.val[0] = vrev64q_f32(acc.val[0]);
|
||||
ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[1]);
|
||||
acc.val[1] = vrev64q_f32(acc.val[1]);
|
||||
ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[2]);
|
||||
ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[3]);
|
||||
vst1q_f32(C, ld1);
|
||||
}
|
||||
|
||||
static inline float32x4x2_t init_expanded_m2n1() {
|
||||
float32x4x2_t ret = {{ vdupq_n_f32(0), vdupq_n_f32(0) }};
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void kernel_8x1(const float *sa, const float *sb, float *C,
|
||||
float alphar, float alphai, BLASLONG K) {
|
||||
|
||||
float32x4x2_t c1, c2, c3, c4;
|
||||
c1 = c2 = c3 = c4 = init_expanded_m2n1();
|
||||
|
||||
for (; K > 1; K -= 2) {
|
||||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4),
|
||||
a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12),
|
||||
a5 = vld1q_f32(sa + 16), a6 = vld1q_f32(sa + 20),
|
||||
a7 = vld1q_f32(sa + 24), a8 = vld1q_f32(sa + 28); sa += 32;
|
||||
float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2); sb += 4;
|
||||
c1 = acc_expanded_m2n1(c1, a1, b1);
|
||||
c2 = acc_expanded_m2n1(c2, a2, b1);
|
||||
c3 = acc_expanded_m2n1(c3, a3, b1);
|
||||
c4 = acc_expanded_m2n1(c4, a4, b1);
|
||||
c1 = acc_expanded_m2n1(c1, a5, b2);
|
||||
c2 = acc_expanded_m2n1(c2, a6, b2);
|
||||
c3 = acc_expanded_m2n1(c3, a7, b2);
|
||||
c4 = acc_expanded_m2n1(c4, a8, b2);
|
||||
}
|
||||
if (K) {
|
||||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4),
|
||||
a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12);
|
||||
float32x2_t b1 = vld1_f32(sb);
|
||||
c1 = acc_expanded_m2n1(c1, a1, b1);
|
||||
c2 = acc_expanded_m2n1(c2, a2, b1);
|
||||
c3 = acc_expanded_m2n1(c3, a3, b1);
|
||||
c4 = acc_expanded_m2n1(c4, a4, b1);
|
||||
}
|
||||
|
||||
float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai);
|
||||
store_expanded_m2n1(C, c1, expanded_alpha);
|
||||
store_expanded_m2n1(C + 4, c2, expanded_alpha);
|
||||
store_expanded_m2n1(C + 8, c3, expanded_alpha);
|
||||
store_expanded_m2n1(C + 12, c4, expanded_alpha);
|
||||
}
|
||||
|
||||
static inline void kernel_4x1(const float *sa, const float *sb, float *C,
|
||||
float alphar, float alphai, BLASLONG K) {
|
||||
|
||||
float32x4x2_t c1, c2, c3, c4;
|
||||
c1 = c2 = c3 = c4 = init_expanded_m2n1();
|
||||
|
||||
for (; K > 1; K -= 2) {
|
||||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4),
|
||||
a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16;
|
||||
float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2); sb += 4;
|
||||
c1 = acc_expanded_m2n1(c1, a1, b1);
|
||||
c2 = acc_expanded_m2n1(c2, a2, b1);
|
||||
c3 = acc_expanded_m2n1(c3, a3, b2);
|
||||
c4 = acc_expanded_m2n1(c4, a4, b2);
|
||||
}
|
||||
c1.val[0] = vaddq_f32(c1.val[0], c3.val[0]);
|
||||
c1.val[1] = vaddq_f32(c1.val[1], c3.val[1]);
|
||||
c2.val[0] = vaddq_f32(c2.val[0], c4.val[0]);
|
||||
c2.val[1] = vaddq_f32(c2.val[1], c4.val[1]);
|
||||
if (K) {
|
||||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
|
||||
float32x2_t b1 = vld1_f32(sb);
|
||||
c1 = acc_expanded_m2n1(c1, a1, b1);
|
||||
c2 = acc_expanded_m2n1(c2, a2, b1);
|
||||
}
|
||||
|
||||
float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai);
|
||||
store_expanded_m2n1(C, c1, expanded_alpha);
|
||||
store_expanded_m2n1(C + 4, c2, expanded_alpha);
|
||||
}
|
||||
|
||||
static inline void kernel_2x1(const float *sa, const float *sb, float *C,
|
||||
float alphar, float alphai, BLASLONG K) {
|
||||
|
||||
float32x4x2_t c1, c2, c3, c4;
|
||||
c1 = c2 = c3 = c4 = init_expanded_m2n1();
|
||||
|
||||
for (; K > 3; K -= 4) {
|
||||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4),
|
||||
a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16;
|
||||
float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2),
|
||||
b3 = vld1_f32(sb + 4), b4 = vld1_f32(sb + 6); sb += 8;
|
||||
c1 = acc_expanded_m2n1(c1, a1, b1);
|
||||
c2 = acc_expanded_m2n1(c2, a2, b2);
|
||||
c3 = acc_expanded_m2n1(c3, a3, b3);
|
||||
c4 = acc_expanded_m2n1(c4, a4, b4);
|
||||
}
|
||||
c1.val[0] = vaddq_f32(c1.val[0], c3.val[0]);
|
||||
c1.val[1] = vaddq_f32(c1.val[1], c3.val[1]);
|
||||
c2.val[0] = vaddq_f32(c2.val[0], c4.val[0]);
|
||||
c2.val[1] = vaddq_f32(c2.val[1], c4.val[1]);
|
||||
c1.val[0] = vaddq_f32(c1.val[0], c2.val[0]);
|
||||
c1.val[1] = vaddq_f32(c1.val[1], c2.val[1]);
|
||||
for (; K; K--) {
|
||||
float32x4_t a1 = vld1q_f32(sa); sa += 4;
|
||||
float32x2_t b1 = vld1_f32(sb); sb += 2;
|
||||
c1 = acc_expanded_m2n1(c1, a1, b1);
|
||||
}
|
||||
|
||||
float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai);
|
||||
store_expanded_m2n1(C, c1, expanded_alpha);
|
||||
}
|
||||
|
||||
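/** expand_alpha_d: 64-bit-vector counterpart of expand_alpha, feeding the m1n1
 *  stores that update a single complex element of C at a time. */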
static inline float32x2x4_t expand_alpha_d(float alphar, float alphai) {
|
||||
float32x2x4_t ret;
|
||||
const float maskp[] = { -1, 1 };
|
||||
const float maskn[] = { 1, -1 };
|
||||
const float32x2_t vrevp = vld1_f32(maskp);
|
||||
const float32x2_t vrevn = vld1_f32(maskn);
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
ret.val[0] = vdup_n_f32(alphar);
|
||||
ret.val[1] = vdup_n_f32(-alphai);
|
||||
ret.val[2] = vmul_f32(ret.val[1], vrevn);
|
||||
ret.val[3] = vmul_f32(ret.val[0], vrevp);
|
||||
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
ret.val[0] = vdup_n_f32(alphar);
|
||||
ret.val[1] = vdup_n_f32(alphai);
|
||||
ret.val[2] = vmul_f32(ret.val[1], vrevp);
|
||||
ret.val[3] = vmul_f32(ret.val[0], vrevn);
|
||||
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
ret.val[2] = vdup_n_f32(alphai);
|
||||
ret.val[3] = vdup_n_f32(alphar);
|
||||
ret.val[0] = vmul_f32(ret.val[3], vrevn);
|
||||
ret.val[1] = vmul_f32(ret.val[2], vrevp);
|
||||
#else
|
||||
ret.val[2] = vdup_n_f32(alphai);
|
||||
ret.val[3] = vdup_n_f32(-alphar);
|
||||
ret.val[0] = vmul_f32(ret.val[3], vrevp);
|
||||
ret.val[1] = vmul_f32(ret.val[2], vrevn);
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline float32x2x2_t acc_expanded_m1n1(float32x2x2_t acc,
|
||||
float32x2_t a, float32x2_t b) {
|
||||
|
||||
acc.val[0] = vfma_lane_f32(acc.val[0], a, b, 0);
|
||||
acc.val[1] = vfma_lane_f32(acc.val[1], a, b, 1);
|
||||
return acc;
|
||||
}
|
||||
|
||||
static inline void store_expanded_m1n1(float *C,
|
||||
float32x2x2_t acc, float32x2x4_t expanded_alpha) {
|
||||
|
||||
float32x2_t ld1 = vld1_f32(C);
|
||||
ld1 = vfma_f32(ld1, acc.val[0], expanded_alpha.val[0]);
|
||||
acc.val[0] = vrev64_f32(acc.val[0]);
|
||||
ld1 = vfma_f32(ld1, acc.val[1], expanded_alpha.val[1]);
|
||||
acc.val[1] = vrev64_f32(acc.val[1]);
|
||||
ld1 = vfma_f32(ld1, acc.val[0], expanded_alpha.val[2]);
|
||||
ld1 = vfma_f32(ld1, acc.val[1], expanded_alpha.val[3]);
|
||||
vst1_f32(C, ld1);
|
||||
}
|
||||
|
||||
static inline float32x2x2_t init_expanded_m1n1() {
|
||||
float32x2x2_t ret = {{ vdup_n_f32(0), vdup_n_f32(0) }};
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void kernel_1x4(const float *sa, const float *sb, float *C,
|
||||
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
|
||||
|
||||
float32x2x2_t c1, c2, c3, c4;
|
||||
c1 = c2 = c3 = c4 = init_expanded_m1n1();
|
||||
|
||||
for (; K; K--) {
|
||||
float32x2_t a1 = vld1_f32(sa); sa += 2;
|
||||
c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb));
|
||||
c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2));
|
||||
c3 = acc_expanded_m1n1(c3, a1, vld1_f32(sb + 4));
|
||||
c4 = acc_expanded_m1n1(c4, a1, vld1_f32(sb + 6));
|
||||
sb += 8;
|
||||
}
|
||||
|
||||
float32x2x4_t expanded_alpha = expand_alpha_d(alphar, alphai);
|
||||
store_expanded_m1n1(C, c1, expanded_alpha); C += LDC * 2;
|
||||
store_expanded_m1n1(C, c2, expanded_alpha); C += LDC * 2;
|
||||
store_expanded_m1n1(C, c3, expanded_alpha); C += LDC * 2;
|
||||
store_expanded_m1n1(C, c4, expanded_alpha);
|
||||
}
|
||||
|
||||
static inline void kernel_1x2(const float *sa, const float *sb, float *C,
|
||||
float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
|
||||
|
||||
float32x2x2_t c1, c2, c3, c4;
|
||||
c1 = c2 = c3 = c4 = init_expanded_m1n1();
|
||||
|
||||
for (; K > 1; K -= 2) {
|
||||
float32x2_t a1 = vld1_f32(sa), a2 = vld1_f32(sa + 2); sa += 4;
|
||||
c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb));
|
||||
c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2));
|
||||
c3 = acc_expanded_m1n1(c3, a2, vld1_f32(sb + 4));
|
||||
c4 = acc_expanded_m1n1(c4, a2, vld1_f32(sb + 6));
|
||||
sb += 8;
|
||||
}
|
||||
c1.val[0] = vadd_f32(c1.val[0], c3.val[0]);
|
||||
c1.val[1] = vadd_f32(c1.val[1], c3.val[1]);
|
||||
c2.val[0] = vadd_f32(c2.val[0], c4.val[0]);
|
||||
c2.val[1] = vadd_f32(c2.val[1], c4.val[1]);
|
||||
if (K) {
|
||||
float32x2_t a1 = vld1_f32(sa);
|
||||
c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb));
|
||||
c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2));
|
||||
}
|
||||
|
||||
float32x2x4_t expanded_alpha = expand_alpha_d(alphar, alphai);
|
||||
store_expanded_m1n1(C, c1, expanded_alpha); C += LDC * 2;
|
||||
store_expanded_m1n1(C, c2, expanded_alpha);
|
||||
}
|
||||
|
||||
static inline void kernel_1x1(const float *sa, const float *sb, float *C,
|
||||
float alphar, float alphai, BLASLONG K) {
|
||||
|
||||
float32x2x2_t c1, c2, c3, c4;
|
||||
c1 = c2 = c3 = c4 = init_expanded_m1n1();
|
||||
|
||||
for (; K > 3; K -= 4) {
|
||||
c1 = acc_expanded_m1n1(c1, vld1_f32(sa), vld1_f32(sb));
|
||||
c2 = acc_expanded_m1n1(c2, vld1_f32(sa + 2), vld1_f32(sb + 2));
|
||||
c3 = acc_expanded_m1n1(c3, vld1_f32(sa + 4), vld1_f32(sb + 4));
|
||||
c4 = acc_expanded_m1n1(c4, vld1_f32(sa + 6), vld1_f32(sb + 6));
|
||||
sa += 8; sb += 8;
|
||||
}
|
||||
c1.val[0] = vadd_f32(c1.val[0], c3.val[0]);
|
||||
c1.val[1] = vadd_f32(c1.val[1], c3.val[1]);
|
||||
c2.val[0] = vadd_f32(c2.val[0], c4.val[0]);
|
||||
c2.val[1] = vadd_f32(c2.val[1], c4.val[1]);
|
||||
c1.val[0] = vadd_f32(c1.val[0], c2.val[0]);
|
||||
c1.val[1] = vadd_f32(c1.val[1], c2.val[1]);
|
||||
for (; K; K--) {
|
||||
c1 = acc_expanded_m1n1(c1, vld1_f32(sa), vld1_f32(sb));
|
||||
sa += 2; sb += 2;
|
||||
}
|
||||
|
||||
store_expanded_m1n1(C, c1, expand_alpha_d(alphar, alphai));
|
||||
}
|
||||
|
||||
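/** Driver: C[M][N] += alpha * sa[M][K] * sb[K][N] on packed, interleaved complex
 *  data. Columns are processed in strides of 8 (as two 4-wide panels of sb), then
 *  4, 2 and 1; within each column strip the rows are processed in strides of
 *  8/4/2/1, dispatching to the matching micro-kernel above. Pointer offsets into C
 *  are in floats, hence the factor of two per complex element (LDC is given in
 *  complex elements). */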
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai,
|
||||
FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) {
|
||||
|
||||
BLASLONG n_left = N;
|
||||
for (; n_left >= 8; n_left -= 8) {
|
||||
const FLOAT *a_ = sa;
|
||||
FLOAT *c1_ = C;
|
||||
FLOAT *c2_ = C + LDC * 8;
|
||||
const FLOAT *b1_ = sb;
|
||||
const FLOAT *b2_ = sb + K * 8;
|
||||
BLASLONG m_left = M;
|
||||
for (; m_left >= 8; m_left -= 8) {
|
||||
kernel_8x4(a_, b1_, c1_, alphar, alphai, K, LDC);
|
||||
kernel_8x4(a_, b2_, c2_, alphar, alphai, K, LDC);
|
||||
a_ += 16 * K;
|
||||
c1_ += 16;
|
||||
c2_ += 16;
|
||||
}
|
||||
if (m_left >= 4) {
|
||||
m_left -= 4;
|
||||
kernel_4x4(a_, b1_, c1_, alphar, alphai, K, LDC);
|
||||
kernel_4x4(a_, b2_, c2_, alphar, alphai, K, LDC);
|
||||
a_ += 8 * K;
|
||||
c1_ += 8;
|
||||
c2_ += 8;
|
||||
}
|
||||
if (m_left >= 2) {
|
||||
m_left -= 2;
|
||||
kernel_2x4(a_, b1_, c1_, alphar, alphai, K, LDC);
|
||||
kernel_2x4(a_, b2_, c2_, alphar, alphai, K, LDC);
|
||||
a_ += 4 * K;
|
||||
c1_ += 4;
|
||||
c2_ += 4;
|
||||
}
|
||||
if (m_left) {
|
||||
kernel_1x4(a_, b1_, c1_, alphar, alphai, K, LDC);
|
||||
kernel_1x4(a_, b2_, c2_, alphar, alphai, K, LDC);
|
||||
}
|
||||
C += 16 * LDC;
|
||||
sb += 16 * K;
|
||||
}
|
||||
|
||||
if (n_left >= 4) {
|
||||
n_left -= 4;
|
||||
const FLOAT *a_ = sa;
|
||||
FLOAT *c_ = C;
|
||||
BLASLONG m_left = M;
|
||||
for (; m_left >= 8; m_left -= 8) {
|
||||
kernel_8x4(a_, sb, c_, alphar, alphai, K, LDC);
|
||||
a_ += 16 * K;
|
||||
c_ += 16;
|
||||
}
|
||||
if (m_left >= 4) {
|
||||
m_left -= 4;
|
||||
kernel_4x4(a_, sb, c_, alphar, alphai, K, LDC);
|
||||
a_ += 8 * K;
|
||||
c_ += 8;
|
||||
}
|
||||
if (m_left >= 2) {
|
||||
m_left -= 2;
|
||||
kernel_2x4(a_, sb, c_, alphar, alphai, K, LDC);
|
||||
a_ += 4 * K;
|
||||
c_ += 4;
|
||||
}
|
||||
if (m_left) {
|
||||
kernel_1x4(a_, sb, c_, alphar, alphai, K, LDC);
|
||||
}
|
||||
C += 8 * LDC;
|
||||
sb += 8 * K;
|
||||
}
|
||||
|
||||
if (n_left >= 2) {
|
||||
n_left -= 2;
|
||||
const FLOAT *a_ = sa;
|
||||
FLOAT *c_ = C;
|
||||
BLASLONG m_left = M;
|
||||
for (; m_left >= 8; m_left -= 8) {
|
||||
kernel_8x2(a_, sb, c_, alphar, alphai, K, LDC);
|
||||
a_ += 16 * K;
|
||||
c_ += 16;
|
||||
}
|
||||
if (m_left >= 4) {
|
||||
m_left -= 4;
|
||||
kernel_4x2(a_, sb, c_, alphar, alphai, K, LDC);
|
||||
a_ += 8 * K;
|
||||
c_ += 8;
|
||||
}
|
||||
if (m_left >= 2) {
|
||||
m_left -= 2;
|
||||
kernel_2x2(a_, sb, c_, alphar, alphai, K, LDC);
|
||||
a_ += 4 * K;
|
||||
c_ += 4;
|
||||
}
|
||||
if (m_left) {
|
||||
kernel_1x2(a_, sb, c_, alphar, alphai, K, LDC);
|
||||
}
|
||||
C += 4 * LDC;
|
||||
sb += 4 * K;
|
||||
}
|
||||
|
||||
if (n_left) {
|
||||
BLASLONG m_left = M;
|
||||
for (; m_left >= 8; m_left -= 8) {
|
||||
kernel_8x1(sa, sb, C, alphar, alphai, K);
|
||||
sa += 16 * K;
|
||||
C += 16;
|
||||
}
|
||||
if (m_left >= 4) {
|
||||
m_left -= 4;
|
||||
kernel_4x1(sa, sb, C, alphar, alphai, K);
|
||||
sa += 8 * K;
|
||||
C += 8;
|
||||
}
|
||||
if (m_left >= 2) {
|
||||
m_left -= 2;
|
||||
kernel_2x1(sa, sb, C, alphar, alphai, K);
|
||||
sa += 4 * K;
|
||||
C += 4;
|
||||
}
|
||||
if (m_left) {
|
||||
kernel_1x1(sa, sb, C, alphar, alphai, K);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,890 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <arm_neon.h>
|
||||
|
||||
/**********************************************************
|
||||
* Function: dgemm_kernel_arm_cortex_a53_4x4_m4n12
|
||||
* Operation: C[4][12] += alpha * sa[4][K] * sb[K][12]
|
||||
* Matrix orders:
|
||||
* sa: column-major (leading dimension == 4)
|
||||
* sb: 3 concatenated row-major 4-column submatrices
|
||||
* C: column-major (leading dimension == LDC)
|
||||
*********************************************************/
|
||||
static inline void dgemm_kernel_arm_cortex_a53_4x4_m4n12(
|
||||
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
|
||||
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
|
||||
|
||||
/** prefetch 4x12 elements from matrix C for RW purpose */
|
||||
__asm__ __volatile__(
|
||||
"mov x0,%[C]\n\t"
|
||||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
|
||||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
|
||||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
|
||||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
|
||||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
|
||||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
|
||||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
|
||||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
|
||||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
|
||||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
|
||||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t"
|
||||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]\n\t"
|
||||
::[C]"r"(C), [LDC]"r"(LDC):"x0");
|
||||
|
||||
/** 3 pointers to 3 submatrices of sb respectively */
|
||||
const FLOAT *b1_ = sb;
|
||||
const FLOAT *b2_ = sb + K * 4;
|
||||
const FLOAT *b3_ = sb + K * 8;
|
||||
|
||||
/** register mapping of 4x12 elements of C, row-id ==> coordinate-M, column-id ==> coordinate-N */
|
||||
/** v8.d[0] v10.d[0] v12.d[0] v14.d[0] v16.d[0] v18.d[0] v20.d[0] v22.d[0] v24.d[0] v26.d[0] v28.d[0] v30.d[0] */
|
||||
/** v8.d[1] v10.d[1] v12.d[1] v14.d[1] v16.d[1] v18.d[1] v20.d[1] v22.d[1] v24.d[1] v26.d[1] v28.d[1] v30.d[1] */
|
||||
/** v9.d[0] v11.d[0] v13.d[0] v15.d[0] v17.d[0] v19.d[0] v21.d[0] v23.d[0] v25.d[0] v27.d[0] v29.d[0] v31.d[0] */
|
||||
/** v9.d[1] v11.d[1] v13.d[1] v15.d[1] v17.d[1] v19.d[1] v21.d[1] v23.d[1] v25.d[1] v27.d[1] v29.d[1] v31.d[1] */
|
||||
|
||||
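/** Loads are split into 64-bit ldr d / ldr x pairs followed by fmov v.d[1],x0 so
 *  that the integer and NEON pipes of the in-order Cortex-A53 can keep the element
 *  loads overlapped with the fmla stream; each pass through the main loop below
 *  covers two K steps. */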
__asm__ __volatile__(
|
||||
"cmp %[K],#0\n\t"
|
||||
/** fill registers holding elements of C with 0.0 */
|
||||
"movi v8.16b,#0; movi v9.16b,#0; movi v10.16b,#0; movi v11.16b,#0\n\t"
|
||||
"movi v12.16b,#0; movi v13.16b,#0; movi v14.16b,#0; movi v15.16b,#0\n\t"
|
||||
"movi v16.16b,#0; movi v17.16b,#0; movi v18.16b,#0; movi v19.16b,#0\n\t"
|
||||
"movi v20.16b,#0; movi v21.16b,#0; movi v22.16b,#0; movi v23.16b,#0\n\t"
|
||||
"movi v24.16b,#0; movi v25.16b,#0; movi v26.16b,#0; movi v27.16b,#0\n\t"
|
||||
"movi v28.16b,#0; movi v29.16b,#0; movi v30.16b,#0; movi v31.16b,#0\n\t"
|
||||
"beq 4f; cmp %[K],#2\n\t"
|
||||
/** register v0-v3 for loading A, v4-v7 for loading B, x0 for transporting data */
|
||||
"ldp q0,q1,[%[sa]]; ldp q4,q5,[%[b1_]]\n\t"
|
||||
"ldr d6,[%[b2_]]; ldr x0,[%[b2_],#8]\n\t"
|
||||
"blt 3f; beq 2f\n\t"
|
||||
"1:\n\t"
|
||||
/** main loop with unroll_k = 2, specially designed for cortex-A53 NEON pipeline */
|
||||
"ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t"
|
||||
"fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t"
|
||||
"fmla v9.2d,v1.2d,v4.d[0]; prfm pldl1keep,[%[sa],#128]\n\t"
|
||||
"fmla v10.2d,v0.2d,v4.d[1]\n\t"
|
||||
"ldr d2,[%[sa],#32]; fmov v7.d[1],x0\n\t"
|
||||
"fmla v11.2d,v1.2d,v4.d[1]; ldr x0,[%[sa],#40]\n\t"
|
||||
"fmla v12.2d,v0.2d,v5.d[0]\n\t"
|
||||
"fmla v13.2d,v1.2d,v5.d[0]\n\t"
|
||||
"ldr d4,[%[b3_]]; fmov v2.d[1],x0\n\t"
|
||||
"fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t"
|
||||
"fmla v15.2d,v1.2d,v5.d[1]\n\t"
|
||||
"fmla v16.2d,v0.2d,v6.d[0]\n\t"
|
||||
"ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t"
|
||||
"fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t"
|
||||
"fmla v18.2d,v0.2d,v6.d[1]\n\t"
|
||||
"fmla v19.2d,v1.2d,v6.d[1]\n\t"
|
||||
"ldr d3,[%[sa],#48]; fmov v5.d[1],x0\n\t"
|
||||
"fmla v20.2d,v0.2d,v7.d[0]; ldr x0,[%[sa],#56]\n\t"
|
||||
"fmla v21.2d,v1.2d,v7.d[0]; add %[sa],%[sa],#64\n\t"
|
||||
"fmla v22.2d,v0.2d,v7.d[1]\n\t"
|
||||
"ldr d6,[%[b1_],#32]; fmov v3.d[1],x0\n\t"
|
||||
"fmla v23.2d,v1.2d,v7.d[1]; ldr x0,[%[b1_],#40]\n\t"
|
||||
"fmla v24.2d,v0.2d,v4.d[0]; prfm pldl1keep,[%[b1_],#128]\n\t"
|
||||
"fmla v25.2d,v1.2d,v4.d[0]\n\t"
|
||||
"ldr d7,[%[b1_],#48]; fmov v6.d[1],x0\n\t"
|
||||
"fmla v26.2d,v0.2d,v4.d[1]; ldr x0,[%[b1_],#56]\n\t"
|
||||
"fmla v27.2d,v1.2d,v4.d[1]; add %[b1_],%[b1_],#64\n\t"
|
||||
"fmla v28.2d,v0.2d,v5.d[0]\n\t"
|
||||
"ldr d4,[%[b2_],#32]; fmov v7.d[1],x0\n\t"
|
||||
"fmla v29.2d,v1.2d,v5.d[0]; ldr x0,[%[b2_],#40]\n\t"
|
||||
"fmla v30.2d,v0.2d,v5.d[1]; prfm pldl1keep,[%[b2_],#128]\n\t"
|
||||
"fmla v31.2d,v1.2d,v5.d[1]\n\t"
|
||||
"ldr d0,[%[sa]]; fmov v4.d[1],x0\n\t"
|
||||
"fmla v8.2d,v2.2d,v6.d[0]; ldr x0,[%[sa],#8]\n\t"
|
||||
"fmla v9.2d,v3.2d,v6.d[0]\n\t"
|
||||
"fmla v10.2d,v2.2d,v6.d[1]\n\t"
|
||||
"ldr d5,[%[b2_],#48]; fmov v0.d[1],x0\n\t"
|
||||
"fmla v11.2d,v3.2d,v6.d[1]; ldr x0,[%[b2_],#56]\n\t"
|
||||
"fmla v12.2d,v2.2d,v7.d[0]; add %[b2_],%[b2_],#64\n\t"
|
||||
"fmla v13.2d,v3.2d,v7.d[0]\n\t"
|
||||
"ldr d6,[%[b3_],#32]; fmov v5.d[1],x0\n\t"
|
||||
"fmla v14.2d,v2.2d,v7.d[1]; ldr x0,[%[b3_],#40]\n\t"
|
||||
"fmla v15.2d,v3.2d,v7.d[1]; prfm pldl1keep,[%[b3_],#128]\n\t"
|
||||
"fmla v16.2d,v2.2d,v4.d[0]\n\t"
|
||||
"ldr d7,[%[b3_],#48]; fmov v6.d[1],x0\n\t"
|
||||
"fmla v17.2d,v3.2d,v4.d[0]; ldr x0,[%[b3_],#56]\n\t"
|
||||
"fmla v18.2d,v2.2d,v4.d[1]; add %[b3_],%[b3_],#64\n\t"
|
||||
"fmla v19.2d,v3.2d,v4.d[1]\n\t"
|
||||
"ldr d1,[%[sa],#16]; fmov v7.d[1],x0\n\t"
|
||||
"fmla v20.2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#24]\n\t"
|
||||
"fmla v21.2d,v3.2d,v5.d[0]\n\t"
|
||||
"fmla v22.2d,v2.2d,v5.d[1]\n\t"
|
||||
"ldr d4,[%[b1_]]; fmov v1.d[1],x0\n\t"
|
||||
"fmla v23.2d,v3.2d,v5.d[1]; ldr x0,[%[b1_],#8]\n\t"
|
||||
"fmla v24.2d,v2.2d,v6.d[0]\n\t"
|
||||
"fmla v25.2d,v3.2d,v6.d[0]\n\t"
|
||||
"ldr d5,[%[b1_],#16]; fmov v4.d[1],x0\n\t"
|
||||
"fmla v26.2d,v2.2d,v6.d[1]; ldr x0,[%[b1_],#24]\n\t"
|
||||
"fmla v27.2d,v3.2d,v6.d[1]; sub %[K],%[K],#2\n\t"
|
||||
"fmla v28.2d,v2.2d,v7.d[0]\n\t"
|
||||
"ldr d6,[%[b2_]]; fmov v5.d[1],x0\n\t"
|
||||
"fmla v29.2d,v3.2d,v7.d[0]; ldr x0,[%[b2_],#8]\n\t"
|
||||
"fmla v30.2d,v2.2d,v7.d[1]; cmp %[K],#2\n\t"
|
||||
"fmla v31.2d,v3.2d,v7.d[1]\n\t"
|
||||
"bgt 1b; blt 3f\n\t"
|
||||
"2:\n\t"
|
||||
/** tail part with k = 2 */
|
||||
"ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t"
|
||||
"fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t"
|
||||
"fmla v9.2d,v1.2d,v4.d[0]; prfm pldl1keep,[%[sa],#128]\n\t"
|
||||
"fmla v10.2d,v0.2d,v4.d[1]\n\t"
|
||||
"ldr d2,[%[sa],#32]; fmov v7.d[1],x0\n\t"
|
||||
"fmla v11.2d,v1.2d,v4.d[1]; ldr x0,[%[sa],#40]\n\t"
|
||||
"fmla v12.2d,v0.2d,v5.d[0]\n\t"
|
||||
"fmla v13.2d,v1.2d,v5.d[0]\n\t"
|
||||
"ldr d4,[%[b3_]]; fmov v2.d[1],x0\n\t"
|
||||
"fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t"
|
||||
"fmla v15.2d,v1.2d,v5.d[1]\n\t"
|
||||
"fmla v16.2d,v0.2d,v6.d[0]\n\t"
|
||||
"ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t"
|
||||
"fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t"
|
||||
"fmla v18.2d,v0.2d,v6.d[1]\n\t"
|
||||
"fmla v19.2d,v1.2d,v6.d[1]\n\t"
|
||||
"ldr d3,[%[sa],#48]; fmov v5.d[1],x0\n\t"
|
||||
"fmla v20.2d,v0.2d,v7.d[0]; ldr x0,[%[sa],#56]\n\t"
|
||||
"fmla v21.2d,v1.2d,v7.d[0]; add %[sa],%[sa],#64\n\t"
|
||||
"fmla v22.2d,v0.2d,v7.d[1]\n\t"
|
||||
"ldr d6,[%[b1_],#32]; fmov v3.d[1],x0\n\t"
|
||||
"fmla v23.2d,v1.2d,v7.d[1]; ldr x0,[%[b1_],#40]\n\t"
|
||||
"fmla v24.2d,v0.2d,v4.d[0]\n\t"
|
||||
"fmla v25.2d,v1.2d,v4.d[0]\n\t"
|
||||
"ldr d7,[%[b1_],#48]; fmov v6.d[1],x0\n\t"
|
||||
"fmla v26.2d,v0.2d,v4.d[1]; ldr x0,[%[b1_],#56]\n\t"
|
||||
"fmla v27.2d,v1.2d,v4.d[1]; add %[b1_],%[b1_],#64\n\t"
|
||||
"fmla v28.2d,v0.2d,v5.d[0]\n\t"
|
||||
"ldr d4,[%[b2_],#32]; fmov v7.d[1],x0\n\t"
|
||||
"fmla v29.2d,v1.2d,v5.d[0]; ldr x0,[%[b2_],#40]\n\t"
|
||||
"fmla v30.2d,v0.2d,v5.d[1]\n\t"
|
||||
"fmla v31.2d,v1.2d,v5.d[1]\n\t"
|
||||
"fmov v4.d[1],x0\n\t"
|
||||
"fmla v8.2d,v2.2d,v6.d[0]\n\t"
|
||||
"fmla v9.2d,v3.2d,v6.d[0]\n\t"
|
||||
"fmla v10.2d,v2.2d,v6.d[1]\n\t"
|
||||
"ldr d5,[%[b2_],#48]\n\t"
|
||||
"fmla v11.2d,v3.2d,v6.d[1]; ldr x0,[%[b2_],#56]\n\t"
|
||||
"fmla v12.2d,v2.2d,v7.d[0]; add %[b2_],%[b2_],#64\n\t"
|
||||
"fmla v13.2d,v3.2d,v7.d[0]\n\t"
|
||||
"ldr d6,[%[b3_],#32]; fmov v5.d[1],x0\n\t"
|
||||
"fmla v14.2d,v2.2d,v7.d[1]; ldr x0,[%[b3_],#40]\n\t"
|
||||
"fmla v15.2d,v3.2d,v7.d[1]\n\t"
|
||||
"fmla v16.2d,v2.2d,v4.d[0]\n\t"
|
||||
"ldr d7,[%[b3_],#48]; fmov v6.d[1],x0\n\t"
|
||||
"fmla v17.2d,v3.2d,v4.d[0]; ldr x0,[%[b3_],#56]\n\t"
|
||||
"fmla v18.2d,v2.2d,v4.d[1]; add %[b3_],%[b3_],#64\n\t"
|
||||
"fmla v19.2d,v3.2d,v4.d[1]\n\t"
|
||||
"fmov v7.d[1],x0\n\t"
|
||||
"fmla v20.2d,v2.2d,v5.d[0]\n\t"
|
||||
"fmla v21.2d,v3.2d,v5.d[0]\n\t"
|
||||
"fmla v22.2d,v2.2d,v5.d[1]\n\t"
|
||||
"fmla v23.2d,v3.2d,v5.d[1]\n\t"
|
||||
"fmla v24.2d,v2.2d,v6.d[0]\n\t"
|
||||
"fmla v25.2d,v3.2d,v6.d[0]\n\t"
|
||||
"fmla v26.2d,v2.2d,v6.d[1]\n\t"
|
||||
"fmla v27.2d,v3.2d,v6.d[1]; sub %[K],%[K],#2\n\t"
|
||||
"fmla v28.2d,v2.2d,v7.d[0]\n\t"
|
||||
"fmla v29.2d,v3.2d,v7.d[0]\n\t"
|
||||
"fmla v30.2d,v2.2d,v7.d[1]\n\t"
|
||||
"fmla v31.2d,v3.2d,v7.d[1]\n\t"
|
||||
"b 4f\n\t"
|
||||
"3:\n\t"
|
||||
/** tail part with k = 1 */
|
||||
"ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t"
|
||||
"fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t"
|
||||
"fmla v9.2d,v1.2d,v4.d[0]; add %[b2_],%[b2_],#32\n\t"
|
||||
"fmla v10.2d,v0.2d,v4.d[1]\n\t"
|
||||
"fmov v7.d[1],x0\n\t"
|
||||
"fmla v11.2d,v1.2d,v4.d[1]; add %[sa],%[sa],#32\n\t"
|
||||
"fmla v12.2d,v0.2d,v5.d[0]; add %[b1_],%[b1_],#32\n\t"
|
||||
"fmla v13.2d,v1.2d,v5.d[0]; sub %[K],%[K],#1\n\t"
|
||||
"ldr d4,[%[b3_]]\n\t"
|
||||
"fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t"
|
||||
"fmla v15.2d,v1.2d,v5.d[1]\n\t"
|
||||
"fmla v16.2d,v0.2d,v6.d[0]\n\t"
|
||||
"ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t"
|
||||
"fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t"
|
||||
"fmla v18.2d,v0.2d,v6.d[1]; add %[b3_],%[b3_],#32\n\t"
|
||||
"fmla v19.2d,v1.2d,v6.d[1]\n\t"
|
||||
"fmov v5.d[1],x0\n\t"
|
||||
"fmla v20.2d,v0.2d,v7.d[0]\n\t"
|
||||
"fmla v21.2d,v1.2d,v7.d[0]\n\t"
|
||||
"fmla v22.2d,v0.2d,v7.d[1]\n\t"
|
||||
"fmla v23.2d,v1.2d,v7.d[1]\n\t"
|
||||
"fmla v24.2d,v0.2d,v4.d[0]\n\t"
|
||||
"fmla v25.2d,v1.2d,v4.d[0]\n\t"
|
||||
"fmla v26.2d,v0.2d,v4.d[1]\n\t"
|
||||
"fmla v27.2d,v1.2d,v4.d[1]\n\t"
|
||||
"fmla v28.2d,v0.2d,v5.d[0]\n\t"
|
||||
"fmla v29.2d,v1.2d,v5.d[0]\n\t"
|
||||
"fmla v30.2d,v0.2d,v5.d[1]\n\t"
|
||||
"fmla v31.2d,v1.2d,v5.d[1]\n\t"
|
||||
/** store 4x12 elements to C */
|
||||
"4:\n\t"
|
||||
"ldr d0,%[alpha]; add x0,%[C],%[LDC],LSL #3\n\t"
|
||||
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t"
|
||||
"fmla v1.2d,v8.2d,v0.d[0]; fmla v2.2d,v9.2d,v0.d[0]\n\t"
|
||||
"fmla v3.2d,v10.2d,v0.d[0]; fmla v4.2d,v11.2d,v0.d[0]\n\t"
|
||||
"stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t"
|
||||
"stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t"
|
||||
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t"
|
||||
"fmla v1.2d,v12.2d,v0.d[0]; fmla v2.2d,v13.2d,v0.d[0]\n\t"
|
||||
"fmla v3.2d,v14.2d,v0.d[0]; fmla v4.2d,v15.2d,v0.d[0]\n\t"
|
||||
"stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t"
|
||||
"stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t"
|
||||
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t"
|
||||
"fmla v1.2d,v16.2d,v0.d[0]; fmla v2.2d,v17.2d,v0.d[0]\n\t"
|
||||
"fmla v3.2d,v18.2d,v0.d[0]; fmla v4.2d,v19.2d,v0.d[0]\n\t"
|
||||
"stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t"
|
||||
"stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t"
|
||||
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t"
|
||||
"fmla v1.2d,v20.2d,v0.d[0]; fmla v2.2d,v21.2d,v0.d[0]\n\t"
|
||||
"fmla v3.2d,v22.2d,v0.d[0]; fmla v4.2d,v23.2d,v0.d[0]\n\t"
|
||||
"stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t"
|
||||
"stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t"
|
||||
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t"
|
||||
"fmla v1.2d,v24.2d,v0.d[0]; fmla v2.2d,v25.2d,v0.d[0]\n\t"
|
||||
"fmla v3.2d,v26.2d,v0.d[0]; fmla v4.2d,v27.2d,v0.d[0]\n\t"
|
||||
"stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t"
|
||||
"stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t"
|
||||
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t"
|
||||
"fmla v1.2d,v28.2d,v0.d[0]; fmla v2.2d,v29.2d,v0.d[0]\n\t"
|
||||
"fmla v3.2d,v30.2d,v0.d[0]; fmla v4.2d,v31.2d,v0.d[0]\n\t"
|
||||
"stp q1,q2,[%[C]]; stp q3,q4,[x0]\n\t"
|
||||
:[sa]"+r"(sa), [b1_]"+r"(b1_), [b2_]"+r"(b2_), [b3_]"+r"(b3_), [C]"+r"(C), [K]"+r"(K)
|
||||
:[LDC]"r"(LDC), [alpha]"m"(alpha)
|
||||
:"cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
|
||||
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||
}
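For validation, a plain-C reference of the same m4n12 update may be useful. The sketch below is illustrative only (the function name is hypothetical and it is not part of the optimized build); it assumes the packed layouts described in the comments above: sa column-major with leading dimension 4, sb as three concatenated row-major K x 4 panels, C column-major with leading dimension LDC.

static void dgemm_ref_m4n12(const FLOAT *sa, const FLOAT *sb, FLOAT *C,
                            BLASLONG K, BLASLONG LDC, FLOAT alpha) {
  FLOAT acc[4][12] = {{0}};
  for (BLASLONG k = 0; k < K; k++) {
    for (BLASLONG n = 0; n < 12; n++) {
      /* column n of this tile lives in panel n/4, element (k, n%4) of that panel */
      FLOAT b = sb[(n / 4) * K * 4 + k * 4 + (n % 4)];
      for (BLASLONG m = 0; m < 4; m++)
        acc[m][n] += sa[k * 4 + m] * b;   /* sa is column-major with ld = 4 */
    }
  }
  for (BLASLONG n = 0; n < 12; n++)
    for (BLASLONG m = 0; m < 4; m++)
      C[n * LDC + m] += alpha * acc[m][n];
}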
|
||||
|
||||
/**********************************************************
|
||||
* Operation:
|
||||
C[0] += alpha * up[0]; C[1] += alpha * up[1];
|
||||
C[2] += alpha * down[0]; C[3] += alpha * down[1];
|
||||
*********************************************************/
|
||||
static inline void dgemm_store_m4n1(FLOAT *C, float64x2_t up, float64x2_t down, FLOAT alpha) {
|
||||
float64x2_t t1 = vld1q_f64(C), t2 = vld1q_f64(C + 2);
|
||||
t1 = vfmaq_n_f64(t1, up, alpha);
|
||||
t2 = vfmaq_n_f64(t2, down, alpha);
|
||||
vst1q_f64(C, t1);
|
||||
vst1q_f64(C + 2, t2);
|
||||
}
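This helper applies the alpha-scaled accumulators of one four-element column of C; it is shared by all the m = 4 edge kernels below (m4n8 down to m4n1).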
|
||||
|
||||
/**********************************************************
|
||||
* Function: dgemm_kernel_arm64_4x4_m4n8
|
||||
* Operation: C[4][8] += alpha * sa[4][K] * sb[K][8]
|
||||
* Matrix orders:
|
||||
* sa: column-major (leading dimension == 4)
|
||||
* sb: 2 concatenated row-major 4-column submatrices
|
||||
* C: column-major (leading dimension == LDC)
|
||||
*********************************************************/
|
||||
static inline void dgemm_kernel_arm64_4x4_m4n8(
|
||||
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
|
||||
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
|
||||
|
||||
const FLOAT *b1_ = sb;
|
||||
const FLOAT *b2_ = sb + K * 4;
|
||||
|
||||
/** register naming: c + m_id + n_id, m_id=1~2, n_id=1~8 */
|
||||
float64x2_t c11, c12, c13, c14, c15, c16, c17, c18;
|
||||
float64x2_t c21, c22, c23, c24, c25, c26, c27, c28;
|
||||
c11 = c12 = c13 = c14 = c15 = c16 = c17 = c18 = vdupq_n_f64(0);
|
||||
c21 = c22 = c23 = c24 = c25 = c26 = c27 = c28 = vdupq_n_f64(0);
|
||||
|
||||
for (; K; K--) {
|
||||
float64x2_t a1 = vld1q_f64(sa);
|
||||
float64x2_t a2 = vld1q_f64(sa + 2); sa += 4;
|
||||
|
||||
float64x2_t b1 = vld1q_f64(b1_);
|
||||
c11 = vfmaq_laneq_f64(c11, a1, b1, 0);
|
||||
c21 = vfmaq_laneq_f64(c21, a2, b1, 0);
|
||||
c12 = vfmaq_laneq_f64(c12, a1, b1, 1);
|
||||
c22 = vfmaq_laneq_f64(c22, a2, b1, 1);
|
||||
|
||||
float64x2_t b2 = vld1q_f64(b1_ + 2); b1_ += 4;
|
||||
c13 = vfmaq_laneq_f64(c13, a1, b2, 0);
|
||||
c23 = vfmaq_laneq_f64(c23, a2, b2, 0);
|
||||
c14 = vfmaq_laneq_f64(c14, a1, b2, 1);
|
||||
c24 = vfmaq_laneq_f64(c24, a2, b2, 1);
|
||||
|
||||
float64x2_t b3 = vld1q_f64(b2_);
|
||||
c15 = vfmaq_laneq_f64(c15, a1, b3, 0);
|
||||
c25 = vfmaq_laneq_f64(c25, a2, b3, 0);
|
||||
c16 = vfmaq_laneq_f64(c16, a1, b3, 1);
|
||||
c26 = vfmaq_laneq_f64(c26, a2, b3, 1);
|
||||
|
||||
float64x2_t b4 = vld1q_f64(b2_ + 2); b2_ += 4;
|
||||
c17 = vfmaq_laneq_f64(c17, a1, b4, 0);
|
||||
c27 = vfmaq_laneq_f64(c27, a2, b4, 0);
|
||||
c18 = vfmaq_laneq_f64(c18, a1, b4, 1);
|
||||
c28 = vfmaq_laneq_f64(c28, a2, b4, 1);
|
||||
}
|
||||
|
||||
dgemm_store_m4n1(C, c11, c21, alpha); C += LDC;
|
||||
dgemm_store_m4n1(C, c12, c22, alpha); C += LDC;
|
||||
dgemm_store_m4n1(C, c13, c23, alpha); C += LDC;
|
||||
dgemm_store_m4n1(C, c14, c24, alpha); C += LDC;
|
||||
dgemm_store_m4n1(C, c15, c25, alpha); C += LDC;
|
||||
dgemm_store_m4n1(C, c16, c26, alpha); C += LDC;
|
||||
dgemm_store_m4n1(C, c17, c27, alpha); C += LDC;
|
||||
dgemm_store_m4n1(C, c18, c28, alpha);
|
||||
}
|
||||
|
||||
/**********************************************************
|
||||
* Function: dgemm_kernel_arm64_4x4_m4n4
|
||||
* Operation: C[4][4] += alpha * sa[4][K] * sb[K][4]
|
||||
* Matrix orders:
|
||||
* sa: column-major (leading dimension == 4)
|
||||
* sb: row-major (leading dimension == 4)
|
||||
* C: column-major (leading dimension == LDC)
|
||||
*********************************************************/
|
||||
static inline void dgemm_kernel_arm64_4x4_m4n4(
|
||||
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
|
||||
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
|
||||
|
||||
float64x2_t c11, c21, c12, c22, c13, c23, c14, c24;
|
||||
c11 = c21 = c12 = c22 = c13 = c23 = c14 = c24 = vdupq_n_f64(0);
|
||||
|
||||
for (; K; K--) {
|
||||
float64x2_t a1 = vld1q_f64(sa);
|
||||
float64x2_t a2 = vld1q_f64(sa + 2); sa += 4;
|
||||
float64x2_t b1 = vld1q_f64(sb);
|
||||
float64x2_t b2 = vld1q_f64(sb + 2); sb += 4;
|
||||
c11 = vfmaq_laneq_f64(c11, a1, b1, 0);
|
||||
c21 = vfmaq_laneq_f64(c21, a2, b1, 0);
|
||||
c12 = vfmaq_laneq_f64(c12, a1, b1, 1);
|
||||
c22 = vfmaq_laneq_f64(c22, a2, b1, 1);
|
||||
c13 = vfmaq_laneq_f64(c13, a1, b2, 0);
|
||||
c23 = vfmaq_laneq_f64(c23, a2, b2, 0);
|
||||
c14 = vfmaq_laneq_f64(c14, a1, b2, 1);
|
||||
c24 = vfmaq_laneq_f64(c24, a2, b2, 1);
|
||||
}
|
||||
|
||||
dgemm_store_m4n1(C, c11, c21, alpha); C += LDC;
|
||||
dgemm_store_m4n1(C, c12, c22, alpha); C += LDC;
|
||||
dgemm_store_m4n1(C, c13, c23, alpha); C += LDC;
|
||||
dgemm_store_m4n1(C, c14, c24, alpha);
|
||||
}
|
||||
|
||||
static inline void dgemm_kernel_arm64_4x4_m4n2(
|
||||
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
|
||||
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
|
||||
|
||||
float64x2_t c11_1, c11_2, c21_1, c21_2, c12_1, c12_2, c22_1, c22_2;
|
||||
c11_1 = c11_2 = c21_1 = c21_2 = c12_1 = c12_2 = c22_1 = c22_2 = vdupq_n_f64(0);
|
||||
|
||||
for (; K > 1; K -= 2) {
|
||||
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4;
|
||||
float64x2_t a1_1 = vld1q_f64(sa), a2_1 = vld1q_f64(sa + 2),
|
||||
a1_2 = vld1q_f64(sa + 4), a2_2 = vld1q_f64(sa + 6); sa += 8;
|
||||
c11_1 = vfmaq_laneq_f64(c11_1, a1_1, b1, 0);
|
||||
c21_1 = vfmaq_laneq_f64(c21_1, a2_1, b1, 0);
|
||||
c12_1 = vfmaq_laneq_f64(c12_1, a1_1, b1, 1);
|
||||
c22_1 = vfmaq_laneq_f64(c22_1, a2_1, b1, 1);
|
||||
c11_2 = vfmaq_laneq_f64(c11_2, a1_2, b2, 0);
|
||||
c21_2 = vfmaq_laneq_f64(c21_2, a2_2, b2, 0);
|
||||
c12_2 = vfmaq_laneq_f64(c12_2, a1_2, b2, 1);
|
||||
c22_2 = vfmaq_laneq_f64(c22_2, a2_2, b2, 1);
|
||||
}
|
||||
c11_1 = vaddq_f64(c11_1, c11_2);
|
||||
c21_1 = vaddq_f64(c21_1, c21_2);
|
||||
c12_1 = vaddq_f64(c12_1, c12_2);
|
||||
c22_1 = vaddq_f64(c22_1, c22_2);
|
||||
if (K) {
|
||||
float64x2_t b1 = vld1q_f64(sb); sb += 2;
|
||||
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4;
|
||||
c11_1 = vfmaq_laneq_f64(c11_1, a1, b1, 0);
|
||||
c21_1 = vfmaq_laneq_f64(c21_1, a2, b1, 0);
|
||||
c12_1 = vfmaq_laneq_f64(c12_1, a1, b1, 1);
|
||||
c22_1 = vfmaq_laneq_f64(c22_1, a2, b1, 1);
|
||||
}
|
||||
|
||||
dgemm_store_m4n1(C, c11_1, c21_1, alpha); C += LDC;
|
||||
dgemm_store_m4n1(C, c12_1, c22_1, alpha);
|
||||
}
|
||||
|
||||
static inline void dgemm_kernel_arm64_4x4_m4n1(
|
||||
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
|
||||
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
|
||||
|
||||
float64x2_t c11_1, c11_2, c21_1, c21_2;
|
||||
c11_1 = c11_2 = c21_1 = c21_2 = vdupq_n_f64(0);
|
||||
|
||||
for (; K > 1; K -= 2) {
|
||||
float64x2_t b1 = vld1q_f64(sb); sb += 2;
|
||||
c11_1 = vfmaq_laneq_f64(c11_1, vld1q_f64(sa), b1, 0);
|
||||
c21_1 = vfmaq_laneq_f64(c21_1, vld1q_f64(sa + 2), b1, 0);
|
||||
c11_2 = vfmaq_laneq_f64(c11_2, vld1q_f64(sa + 4), b1, 1);
|
||||
c21_2 = vfmaq_laneq_f64(c21_2, vld1q_f64(sa + 6), b1, 1);
|
||||
sa += 8;
|
||||
}
|
||||
c11_1 = vaddq_f64(c11_1, c11_2);
|
||||
c21_1 = vaddq_f64(c21_1, c21_2);
|
||||
if (K) {
|
||||
double b1 = *sb++;
|
||||
c11_1 = vfmaq_n_f64(c11_1, vld1q_f64(sa), b1);
|
||||
c21_1 = vfmaq_n_f64(c21_1, vld1q_f64(sa + 2), b1);
|
||||
sa += 4;
|
||||
}
|
||||
|
||||
dgemm_store_m4n1(C, c11_1, c21_1, alpha);
|
||||
}
|
||||
|
||||
static inline void dgemm_kernel_arm64_4x4_m2n12(
|
||||
const FLOAT *sa, const FLOAT *sb, FLOAT *c,
|
||||
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
|
||||
|
||||
float64x2_t c01, c02, c03, c04, c11, c12, c13, c14, c21, c22, c23, c24;
|
||||
c01 = c02 = c03 = c04 = c11 = c12 = c13 = c14 =
|
||||
c21 = c22 = c23 = c24 = vdupq_n_f64(0);
|
||||
|
||||
const FLOAT *b1_ = sb;
|
||||
const FLOAT *b2_ = sb + 4 * K;
|
||||
const FLOAT *b3_ = b2_ + 4 * K;
|
||||
|
||||
for (; K; K--) {
|
||||
const float64x2_t a1 = vld1q_f64(sa); sa += 2;
|
||||
|
||||
float64x2_t b1 = vld1q_f64(b1_), b2 = vld1q_f64(b1_ + 2); b1_ += 4;
|
||||
c01 = vfmaq_laneq_f64(c01, a1, b1, 0);
|
||||
c02 = vfmaq_laneq_f64(c02, a1, b1, 1);
|
||||
c03 = vfmaq_laneq_f64(c03, a1, b2, 0);
|
||||
c04 = vfmaq_laneq_f64(c04, a1, b2, 1);
|
||||
|
||||
b1 = vld1q_f64(b2_); b2 = vld1q_f64(b2_ + 2); b2_ += 4;
|
||||
c11 = vfmaq_laneq_f64(c11, a1, b1, 0);
|
||||
c12 = vfmaq_laneq_f64(c12, a1, b1, 1);
|
||||
c13 = vfmaq_laneq_f64(c13, a1, b2, 0);
|
||||
c14 = vfmaq_laneq_f64(c14, a1, b2, 1);
|
||||
|
||||
b1 = vld1q_f64(b3_); b2 = vld1q_f64(b3_ + 2); b3_ += 4;
|
||||
c21 = vfmaq_laneq_f64(c21, a1, b1, 0);
|
||||
c22 = vfmaq_laneq_f64(c22, a1, b1, 1);
|
||||
c23 = vfmaq_laneq_f64(c23, a1, b2, 0);
|
||||
c24 = vfmaq_laneq_f64(c24, a1, b2, 1);
|
||||
}
|
||||
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c01, alpha)); c += LDC;
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c02, alpha)); c += LDC;
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c03, alpha)); c += LDC;
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c04, alpha)); c += LDC;
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c11, alpha)); c += LDC;
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c12, alpha)); c += LDC;
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c13, alpha)); c += LDC;
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c14, alpha)); c += LDC;
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c21, alpha)); c += LDC;
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c22, alpha)); c += LDC;
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c23, alpha)); c += LDC;
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c24, alpha));
|
||||
}
|
||||
|
||||
static inline void dgemm_kernel_arm64_4x4_m2n8(
|
||||
const FLOAT *sa, const FLOAT *sb, FLOAT *c,
|
||||
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
|
||||
|
||||
float64x2_t c01, c02, c03, c04, c11, c12, c13, c14;
|
||||
c01 = c02 = c03 = c04 = c11 = c12 = c13 = c14 = vdupq_n_f64(0);
|
||||
|
||||
const FLOAT *b1_ = sb;
|
||||
const FLOAT *b2_ = sb + 4 * K;
|
||||
|
||||
for (; K; K--) {
|
||||
const float64x2_t a1 = vld1q_f64(sa); sa += 2;
|
||||
|
||||
float64x2_t b1 = vld1q_f64(b1_), b2 = vld1q_f64(b1_ + 2); b1_ += 4;
|
||||
c01 = vfmaq_laneq_f64(c01, a1, b1, 0);
|
||||
c02 = vfmaq_laneq_f64(c02, a1, b1, 1);
|
||||
c03 = vfmaq_laneq_f64(c03, a1, b2, 0);
|
||||
c04 = vfmaq_laneq_f64(c04, a1, b2, 1);
|
||||
|
||||
b1 = vld1q_f64(b2_); b2 = vld1q_f64(b2_ + 2); b2_ += 4;
|
||||
c11 = vfmaq_laneq_f64(c11, a1, b1, 0);
|
||||
c12 = vfmaq_laneq_f64(c12, a1, b1, 1);
|
||||
c13 = vfmaq_laneq_f64(c13, a1, b2, 0);
|
||||
c14 = vfmaq_laneq_f64(c14, a1, b2, 1);
|
||||
}
|
||||
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c01, alpha)); c += LDC;
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c02, alpha)); c += LDC;
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c03, alpha)); c += LDC;
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c04, alpha)); c += LDC;
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c11, alpha)); c += LDC;
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c12, alpha)); c += LDC;
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c13, alpha)); c += LDC;
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c14, alpha));
|
||||
}
|
||||
|
||||
static inline void dgemm_kernel_arm64_4x4_m2n4(
|
||||
const FLOAT *sa, const FLOAT *sb, FLOAT *c,
|
||||
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
|
||||
|
||||
float64x2_t c1_1, c1_2, c2_1, c2_2, c3_1, c3_2, c4_1, c4_2;
|
||||
c1_1 = c1_2 = c2_1 = c2_2 = c3_1 = c3_2 = c4_1 = c4_2 = vdupq_n_f64(0);
|
||||
|
||||
for (; K > 1; K -= 2) {
|
||||
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4;
|
||||
float64x2_t b1_1 = vld1q_f64(sb), b2_1 = vld1q_f64(sb + 2);
|
||||
float64x2_t b1_2 = vld1q_f64(sb + 4), b2_2 = vld1q_f64(sb + 6); sb += 8;
|
||||
|
||||
c1_1 = vfmaq_laneq_f64(c1_1, a1, b1_1, 0);
|
||||
c2_1 = vfmaq_laneq_f64(c2_1, a1, b1_1, 1);
|
||||
c3_1 = vfmaq_laneq_f64(c3_1, a1, b2_1, 0);
|
||||
c4_1 = vfmaq_laneq_f64(c4_1, a1, b2_1, 1);
|
||||
|
||||
c1_2 = vfmaq_laneq_f64(c1_2, a2, b1_2, 0);
|
||||
c2_2 = vfmaq_laneq_f64(c2_2, a2, b1_2, 1);
|
||||
c3_2 = vfmaq_laneq_f64(c3_2, a2, b2_2, 0);
|
||||
c4_2 = vfmaq_laneq_f64(c4_2, a2, b2_2, 1);
|
||||
}
|
||||
c1_1 = vaddq_f64(c1_1, c1_2);
|
||||
c2_1 = vaddq_f64(c2_1, c2_2);
|
||||
c3_1 = vaddq_f64(c3_1, c3_2);
|
||||
c4_1 = vaddq_f64(c4_1, c4_2);
|
||||
if (K) {
|
||||
float64x2_t a1 = vld1q_f64(sa); sa += 2;
|
||||
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4;
|
||||
c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0);
|
||||
c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1);
|
||||
c3_1 = vfmaq_laneq_f64(c3_1, a1, b2, 0);
|
||||
c4_1 = vfmaq_laneq_f64(c4_1, a1, b2, 1);
|
||||
}
|
||||
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1_1, alpha)); c += LDC;
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c2_1, alpha)); c += LDC;
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c3_1, alpha)); c += LDC;
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c4_1, alpha));
|
||||
}
|
||||
|
||||
static inline void dgemm_kernel_arm64_4x4_m2n2(
|
||||
const FLOAT *sa, const FLOAT *sb, FLOAT *c,
|
||||
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
|
||||
|
||||
float64x2_t c1_1, c1_2, c2_1, c2_2;
|
||||
c1_1 = c1_2 = c2_1 = c2_2 = vdupq_n_f64(0);
|
||||
|
||||
for (; K > 1; K -= 2) {
|
||||
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4;
|
||||
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4;
|
||||
|
||||
c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0);
|
||||
c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1);
|
||||
c1_2 = vfmaq_laneq_f64(c1_2, a2, b2, 0);
|
||||
c2_2 = vfmaq_laneq_f64(c2_2, a2, b2, 1);
|
||||
}
|
||||
c1_1 = vaddq_f64(c1_1, c1_2);
|
||||
c2_1 = vaddq_f64(c2_1, c2_2);
|
||||
if (K) {
|
||||
float64x2_t a1 = vld1q_f64(sa); sa += 2;
|
||||
float64x2_t b1 = vld1q_f64(sb); sb += 2;
|
||||
c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0);
|
||||
c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1);
|
||||
}
|
||||
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1_1, alpha)); c += LDC;
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c2_1, alpha));
|
||||
}
|
||||
|
||||
static inline void dgemm_kernel_arm64_4x4_m2n1(
|
||||
const FLOAT *sa, const FLOAT *sb, FLOAT *c,
|
||||
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
|
||||
|
||||
float64x2_t c1, c2, c3, c4;
|
||||
c1 = c2 = c3 = c4 = vdupq_n_f64(0);
|
||||
|
||||
for (; K > 3; K -= 4) {
|
||||
float64x2_t b12 = vld1q_f64(sb), b34 = vld1q_f64(sb + 2); sb += 4;
|
||||
c1 = vfmaq_laneq_f64(c1, vld1q_f64(sa), b12, 0);
|
||||
c2 = vfmaq_laneq_f64(c2, vld1q_f64(sa + 2), b12, 1);
|
||||
c3 = vfmaq_laneq_f64(c3, vld1q_f64(sa + 4), b34, 0);
|
||||
c4 = vfmaq_laneq_f64(c4, vld1q_f64(sa + 6), b34, 1);
|
||||
sa += 8;
|
||||
}
|
||||
c1 = vaddq_f64(c1, c2);
|
||||
c3 = vaddq_f64(c3, c4);
|
||||
c1 = vaddq_f64(c1, c3);
|
||||
for (; K; K--) {
|
||||
c1 = vfmaq_n_f64(c1, vld1q_f64(sa), *sb++);
|
||||
sa += 2;
|
||||
}
|
||||
|
||||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1, alpha));
|
||||
}
|
||||
|
||||
static inline void dgemm_store_m1n2(double *C, float64x2_t vc,
|
||||
double alpha, BLASLONG LDC) {
|
||||
double c0 = vgetq_lane_f64(vc, 0);
|
||||
double c1 = vgetq_lane_f64(vc, 1);
|
||||
C[0] += c0 * alpha;
|
||||
C[LDC] += c1 * alpha;
|
||||
}
|
||||
|
||||
static inline void dgemm_kernel_arm64_4x4_m1n12(
|
||||
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
|
||||
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
|
||||
|
||||
float64x2_t c1, c2, c3, c4, c5, c6;
|
||||
c1 = c2 = c3 = c4 = c5 = c6 = vdupq_n_f64(0);
|
||||
|
||||
const double *b1_ = sb;
|
||||
const double *b2_ = sb + 4 * K;
|
||||
const double *b3_ = b2_ + 4 * K;
|
||||
|
||||
for (; K; K--) {
|
||||
const double a1 = *sa++;
|
||||
c1 = vfmaq_n_f64(c1, vld1q_f64(b1_), a1);
|
||||
c2 = vfmaq_n_f64(c2, vld1q_f64(b1_ + 2), a1); b1_ += 4;
|
||||
c3 = vfmaq_n_f64(c3, vld1q_f64(b2_), a1);
|
||||
c4 = vfmaq_n_f64(c4, vld1q_f64(b2_ + 2), a1); b2_ += 4;
|
||||
c5 = vfmaq_n_f64(c5, vld1q_f64(b3_), a1);
|
||||
c6 = vfmaq_n_f64(c6, vld1q_f64(b3_ + 2), a1); b3_ += 4;
|
||||
}
|
||||
|
||||
dgemm_store_m1n2(C, c1, alpha, LDC); C += LDC * 2;
|
||||
dgemm_store_m1n2(C, c2, alpha, LDC); C += LDC * 2;
|
||||
dgemm_store_m1n2(C, c3, alpha, LDC); C += LDC * 2;
|
||||
dgemm_store_m1n2(C, c4, alpha, LDC); C += LDC * 2;
|
||||
dgemm_store_m1n2(C, c5, alpha, LDC); C += LDC * 2;
|
||||
dgemm_store_m1n2(C, c6, alpha, LDC);
|
||||
}
|
||||
|
||||
static inline void dgemm_kernel_arm64_4x4_m1n8(
|
||||
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
|
||||
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
|
||||
|
||||
float64x2_t c1, c2, c3, c4;
|
||||
c1 = c2 = c3 = c4 = vdupq_n_f64(0);
|
||||
|
||||
const double *b1_ = sb;
|
||||
const double *b2_ = sb + 4 * K;
|
||||
|
||||
for (; K; K--) {
|
||||
const double a1 = *sa++;
|
||||
c1 = vfmaq_n_f64(c1, vld1q_f64(b1_), a1);
|
||||
c2 = vfmaq_n_f64(c2, vld1q_f64(b1_ + 2), a1); b1_ += 4;
|
||||
c3 = vfmaq_n_f64(c3, vld1q_f64(b2_), a1);
|
||||
c4 = vfmaq_n_f64(c4, vld1q_f64(b2_ + 2), a1); b2_ += 4;
|
||||
}
|
||||
|
||||
dgemm_store_m1n2(C, c1, alpha, LDC); C += LDC * 2;
|
||||
dgemm_store_m1n2(C, c2, alpha, LDC); C += LDC * 2;
|
||||
dgemm_store_m1n2(C, c3, alpha, LDC); C += LDC * 2;
|
||||
dgemm_store_m1n2(C, c4, alpha, LDC);
|
||||
}
|
||||
|
||||
static inline void dgemm_kernel_arm64_4x4_m1n4(
|
||||
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
|
||||
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
|
||||
|
||||
float64x2_t c1_1, c1_2, c2_1, c2_2;
|
||||
c1_1 = c1_2 = c2_1 = c2_2 = vdupq_n_f64(0);
|
||||
|
||||
for (; K > 1; K -= 2) {
|
||||
float64x2_t a1 = vld1q_f64(sa); sa += 2;
|
||||
c1_1 = vfmaq_laneq_f64(c1_1, vld1q_f64(sb), a1, 0);
|
||||
c2_1 = vfmaq_laneq_f64(c2_1, vld1q_f64(sb + 2), a1, 0);
|
||||
c1_2 = vfmaq_laneq_f64(c1_2, vld1q_f64(sb + 4), a1, 1);
|
||||
c2_2 = vfmaq_laneq_f64(c2_2, vld1q_f64(sb + 6), a1, 1); sb += 8;
|
||||
}
|
||||
c1_1 = vaddq_f64(c1_1, c1_2);
|
||||
c2_1 = vaddq_f64(c2_1, c2_2);
|
||||
if (K) {
|
||||
double a1 = *sa++;
|
||||
c1_1 = vfmaq_n_f64(c1_1, vld1q_f64(sb), a1);
|
||||
c2_1 = vfmaq_n_f64(c2_1, vld1q_f64(sb + 2), a1);
|
||||
sb += 4;
|
||||
}
|
||||
|
||||
dgemm_store_m1n2(C, c1_1, alpha, LDC); C += LDC * 2;
|
||||
dgemm_store_m1n2(C, c2_1, alpha, LDC);
|
||||
}
|
||||
|
||||
static inline void dgemm_kernel_arm64_4x4_m1n2(
|
||||
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
|
||||
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
|
||||
|
||||
float64x2_t c1, c2, c3, c4;
|
||||
c1 = c2 = c3 = c4 = vdupq_n_f64(0);
|
||||
|
||||
for (; K > 3; K -= 4) {
|
||||
float64x2_t a12 = vld1q_f64(sa), a34 = vld1q_f64(sa + 2); sa += 4;
|
||||
c1 = vfmaq_laneq_f64(c1, vld1q_f64(sb), a12, 0);
|
||||
c2 = vfmaq_laneq_f64(c2, vld1q_f64(sb + 2), a12, 1);
|
||||
c3 = vfmaq_laneq_f64(c3, vld1q_f64(sb + 4), a34, 0);
|
||||
c4 = vfmaq_laneq_f64(c4, vld1q_f64(sb + 6), a34, 1); sb += 8;
|
||||
}
|
||||
c1 = vaddq_f64(c1, c2);
|
||||
c3 = vaddq_f64(c3, c4);
|
||||
c1 = vaddq_f64(c1, c3);
|
||||
for (; K; K--) {
|
||||
c1 = vfmaq_n_f64(c1, vld1q_f64(sb), *sa++);
|
||||
sb += 2;
|
||||
}
|
||||
|
||||
dgemm_store_m1n2(C, c1, alpha, LDC);
|
||||
}
|
||||
|
||||
static inline void dgemm_kernel_arm64_4x4_m1n1(
|
||||
const FLOAT *sa, const FLOAT *sb, FLOAT *C,
|
||||
BLASLONG K, BLASLONG LDC, FLOAT alpha) {
|
||||
|
||||
float64x2_t c1, c2, c3, c4;
|
||||
c1 = c2 = c3 = c4 = vdupq_n_f64(0);
|
||||
|
||||
for (; K > 7; K -= 8) {
|
||||
c1 = vfmaq_f64(c1, vld1q_f64(sb), vld1q_f64(sa));
|
||||
c2 = vfmaq_f64(c2, vld1q_f64(sb + 2), vld1q_f64(sa + 2));
|
||||
c3 = vfmaq_f64(c3, vld1q_f64(sb + 4), vld1q_f64(sa + 4));
|
||||
c4 = vfmaq_f64(c4, vld1q_f64(sb + 6), vld1q_f64(sa + 6));
|
||||
sa += 8; sb += 8;
|
||||
}
|
||||
c1 = vaddq_f64(c1, c2);
|
||||
c3 = vaddq_f64(c3, c4);
|
||||
c1 = vaddq_f64(c1, c3);
|
||||
double cs1 = vpaddd_f64(c1); /* horizontal sum of the two lanes */
|
||||
for (; K; K--) {
|
||||
cs1 += (*sa++) * (*sb++);
|
||||
}
|
||||
|
||||
C[0] += cs1 * alpha;
|
||||
}
|
||||
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha,
|
||||
FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) {
|
||||
|
||||
for (; N >= 12; N -= 12) {
|
||||
BLASLONG m_left = M;
|
||||
const FLOAT *a_ = sa;
|
||||
FLOAT *c_ = C;
|
||||
for (; m_left >= 4; m_left -= 4) {
|
||||
dgemm_kernel_arm_cortex_a53_4x4_m4n12(a_, sb, c_, K, LDC, alpha);
|
||||
c_ += 4;
|
||||
a_ += 4 * K;
|
||||
}
|
||||
if (m_left >= 2) {
|
||||
m_left -= 2;
|
||||
dgemm_kernel_arm64_4x4_m2n12(a_, sb, c_, K, LDC, alpha);
|
||||
c_ += 2;
|
||||
a_ += 2 * K;
|
||||
}
|
||||
if (m_left) {
|
||||
dgemm_kernel_arm64_4x4_m1n12(a_, sb, c_, K, LDC, alpha);
|
||||
}
|
||||
sb += 12 * K;
|
||||
C += 12 * LDC;
|
||||
}
|
||||
|
||||
if (N >= 8) {
|
||||
N -= 8;
|
||||
BLASLONG m_left = M;
|
||||
const FLOAT *a_ = sa;
|
||||
FLOAT *c_ = C;
|
||||
for (; m_left >= 4; m_left -= 4) {
|
||||
dgemm_kernel_arm64_4x4_m4n8(a_, sb, c_, K, LDC, alpha);
|
||||
c_ += 4;
|
||||
a_ += 4 * K;
|
||||
}
|
||||
if (m_left >= 2) {
|
||||
m_left -= 2;
|
||||
dgemm_kernel_arm64_4x4_m2n8(a_, sb, c_, K, LDC, alpha);
|
||||
c_ += 2;
|
||||
a_ += 2 * K;
|
||||
}
|
||||
if (m_left) {
|
||||
dgemm_kernel_arm64_4x4_m1n8(a_, sb, c_, K, LDC, alpha);
|
||||
}
|
||||
sb += 8 * K;
|
||||
C += 8 * LDC;
|
||||
} else if (N >= 4) {
|
||||
N -= 4;
|
||||
BLASLONG m_left = M;
|
||||
const FLOAT *a_ = sa;
|
||||
FLOAT *c_ = C;
|
||||
for (; m_left >= 4; m_left -= 4) {
|
||||
dgemm_kernel_arm64_4x4_m4n4(a_, sb, c_, K, LDC, alpha);
|
||||
c_ += 4;
|
||||
a_ += 4 * K;
|
||||
}
|
||||
if (m_left >= 2) {
|
||||
m_left -= 2;
|
||||
dgemm_kernel_arm64_4x4_m2n4(a_, sb, c_, K, LDC, alpha);
|
||||
c_ += 2;
|
||||
a_ += 2 * K;
|
||||
}
|
||||
if (m_left) {
|
||||
dgemm_kernel_arm64_4x4_m1n4(a_, sb, c_, K, LDC, alpha);
|
||||
}
|
||||
sb += 4 * K;
|
||||
C += 4 * LDC;
|
||||
}
|
||||
|
||||
if (N >= 2) {
|
||||
N -= 2;
|
||||
BLASLONG m_left = M;
|
||||
const FLOAT *a_ = sa;
|
||||
FLOAT *c_ = C;
|
||||
for (; m_left >= 4; m_left -= 4) {
|
||||
dgemm_kernel_arm64_4x4_m4n2(a_, sb, c_, K, LDC, alpha);
|
||||
c_ += 4;
|
||||
a_ += 4 * K;
|
||||
}
|
||||
if (m_left >= 2) {
|
||||
m_left -= 2;
|
||||
dgemm_kernel_arm64_4x4_m2n2(a_, sb, c_, K, LDC, alpha);
|
||||
c_ += 2;
|
||||
a_ += 2 * K;
|
||||
}
|
||||
if (m_left) {
|
||||
dgemm_kernel_arm64_4x4_m1n2(a_, sb, c_, K, LDC, alpha);
|
||||
}
|
||||
sb += 2 * K;
|
||||
C += 2 * LDC;
|
||||
}
|
||||
|
||||
if (N) {
|
||||
BLASLONG m_left = M;
|
||||
const FLOAT *a_ = sa;
|
||||
FLOAT *c_ = C;
|
||||
for (; m_left >= 4; m_left -= 4) {
|
||||
dgemm_kernel_arm64_4x4_m4n1(a_, sb, c_, K, LDC, alpha);
|
||||
c_ += 4;
|
||||
a_ += 4 * K;
|
||||
}
|
||||
if (m_left >= 2) {
|
||||
m_left -= 2;
|
||||
dgemm_kernel_arm64_4x4_m2n1(a_, sb, c_, K, LDC, alpha);
|
||||
c_ += 2;
|
||||
a_ += 2 * K;
|
||||
}
|
||||
if (m_left) {
|
||||
dgemm_kernel_arm64_4x4_m1n1(a_, sb, c_, K, LDC, alpha);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
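A worked example of the tiling above (for illustration only): with M = 10 and N = 26, the column loop runs twice with n = 12, the remaining 2 columns fall through to the n = 2 branch, and within every column block the rows are covered as 4 + 4 + 2; the m = 2 and m = 1 tails reuse the same packed panels, so edge cases need no separate packing.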
@ -0,0 +1,874 @@
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/* X0 X1 X2 s0 X3 x4 x5 x6 */
|
||||
/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/
|
||||
|
||||
#define origM x0
|
||||
#define origN x1
|
||||
#define origK x2
|
||||
#define origPA x3
|
||||
#define origPB x4
|
||||
#define pC x5
|
||||
#define LDC x6
|
||||
#define temp x7
|
||||
#define counterL x8
|
||||
#define counterI x9
|
||||
#define counterJ x10
|
||||
#define pB x11
|
||||
#define pCRow0 x12
|
||||
#define pCRow1 x13
|
||||
#define pCRow2 x14
|
||||
|
||||
#define lanes x15
|
||||
#define pA x16
|
||||
#define alpha x17
|
||||
|
||||
#define alpha0 d10
|
||||
#define alphaZ z2.d
|
||||
|
||||
#define A_PRE_SIZE 1536
|
||||
#define B_PRE_SIZE 512
|
||||
#define C_PRE_SIZE 128
|
||||
|
||||
// 00 origM
|
||||
// 01 origN
|
||||
// 02 origK
|
||||
// 03 origPA
|
||||
// 04 origPB
|
||||
// 05 pC
|
||||
// 06 origLDC -> LDC
|
||||
// 07 temp
|
||||
// 08 counterL
|
||||
// 09 counterI
|
||||
// 10 counterJ
|
||||
// 11 pB
|
||||
// 12 pCRow0
|
||||
// 13 pCRow1
|
||||
// 14 pCRow2
|
||||
// 15 lanes
|
||||
// 16 pA
|
||||
// 17
|
||||
// 18 must save
|
||||
// 19 must save
|
||||
// 20 must save
|
||||
// 21 must save
|
||||
// 22 must save
|
||||
// 23 must save
|
||||
// 24 must save
|
||||
// 25 must save
|
||||
// 26 must save
|
||||
// 27 must save
|
||||
// 28 must save
|
||||
// 29 frame
|
||||
// 30 link
|
||||
// 31 sp
|
||||
|
||||
//v00 ALPHA -> pA0_0
|
||||
//v01 pA0_1
|
||||
//v02 ALPHA0
|
||||
//v03
|
||||
//v04
|
||||
//v05
|
||||
//v06
|
||||
//v07
|
||||
//v08 must save pB0_0
|
||||
//v09 must save pB0_1
|
||||
//v10 must save pB0_2
|
||||
//v11 must save pB0_3
|
||||
//v12 must save pB0_4
|
||||
//v13 must save pB0_5
|
||||
//v14 must save pB0_6
|
||||
//v15 must save pB0_7
|
||||
//v16 must save C0
|
||||
//v17 must save C1
|
||||
//v18 must save C2
|
||||
//v19 must save C3
|
||||
//v20 must save C4
|
||||
//v21 must save C5
|
||||
//v22 must save C6
|
||||
//v23 must save C7
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
.macro INITv1x8
|
||||
dup z16.d, #0
|
||||
dup z17.d, #0
|
||||
dup z18.d, #0
|
||||
dup z19.d, #0
|
||||
dup z20.d, #0
|
||||
dup z21.d, #0
|
||||
dup z22.d, #0
|
||||
dup z23.d, #0
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x8_I
|
||||
ld1d z0.d, p1/z, [pA]
|
||||
ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one
|
||||
add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8
|
||||
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
ld1rd z10.d, p0/z, [pB, 16]
|
||||
ld1rd z11.d, p0/z, [pB, 24]
|
||||
ld1rd z12.d, p0/z, [pB, 32]
|
||||
ld1rd z13.d, p0/z, [pB, 40]
|
||||
ld1rd z14.d, p0/z, [pB, 48]
|
||||
ld1rd z15.d, p0/z, [pB, 56]
|
||||
|
||||
add pB, pB, 64
|
||||
|
||||
fmla z16.d, p1/m, z0.d, z8.d
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
fmla z17.d, p1/m, z0.d, z9.d
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
fmla z18.d, p1/m, z0.d, z10.d
|
||||
ld1rd z10.d, p0/z, [pB, 16]
|
||||
fmla z19.d, p1/m, z0.d, z11.d
|
||||
ld1rd z11.d, p0/z, [pB, 24]
|
||||
fmla z20.d, p1/m, z0.d, z12.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
ld1rd z12.d, p0/z, [pB, 32]
|
||||
fmla z21.d, p1/m, z0.d, z13.d
|
||||
ld1rd z13.d, p0/z, [pB, 40]
|
||||
fmla z22.d, p1/m, z0.d, z14.d
|
||||
ld1rd z14.d, p0/z, [pB, 48]
|
||||
fmla z23.d, p1/m, z0.d, z15.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
ld1rd z15.d, p0/z, [pB, 56]
|
||||
|
||||
add pB, pB, 64
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x8_M1
|
||||
ld1d z1.d, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
|
||||
|
||||
fmla z16.d, p1/m, z0.d, z8.d
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
fmla z17.d, p1/m, z0.d, z9.d
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
fmla z18.d, p1/m, z0.d, z10.d
|
||||
ld1rd z10.d, p0/z, [pB, 16]
|
||||
fmla z19.d, p1/m, z0.d, z11.d
|
||||
ld1rd z11.d, p0/z, [pB, 24]
|
||||
fmla z20.d, p1/m, z0.d, z12.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
ld1rd z12.d, p0/z, [pB, 32]
|
||||
fmla z21.d, p1/m, z0.d, z13.d
|
||||
ld1rd z13.d, p0/z, [pB, 40]
|
||||
fmla z22.d, p1/m, z0.d, z14.d
|
||||
ld1rd z14.d, p0/z, [pB, 48]
|
||||
fmla z23.d, p1/m, z0.d, z15.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
ld1rd z15.d, p0/z, [pB, 56]
|
||||
|
||||
add pB, pB, 64
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x8_M2
|
||||
ld1d z0.d, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
|
||||
|
||||
fmla z16.d, p1/m, z1.d, z8.d
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
fmla z17.d, p1/m, z1.d, z9.d
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
fmla z18.d, p1/m, z1.d, z10.d
|
||||
ld1rd z10.d, p0/z, [pB, 16]
|
||||
fmla z19.d, p1/m, z1.d, z11.d
|
||||
ld1rd z11.d, p0/z, [pB, 24]
|
||||
fmla z20.d, p1/m, z1.d, z12.d
|
||||
ld1rd z12.d, p0/z, [pB, 32]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
fmla z21.d, p1/m, z1.d, z13.d
|
||||
ld1rd z13.d, p0/z, [pB, 40]
|
||||
fmla z22.d, p1/m, z1.d, z14.d
|
||||
ld1rd z14.d, p0/z, [pB, 48]
|
||||
fmla z23.d, p1/m, z1.d, z15.d
|
||||
ld1rd z15.d, p0/z, [pB, 56]
|
||||
|
||||
add pB, pB, 64
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x8_E
|
||||
fmla z16.d, p1/m, z1.d, z8.d
|
||||
fmla z17.d, p1/m, z1.d, z9.d
|
||||
fmla z18.d, p1/m, z1.d, z10.d
|
||||
fmla z19.d, p1/m, z1.d, z11.d
|
||||
fmla z20.d, p1/m, z1.d, z12.d
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
fmla z21.d, p1/m, z1.d, z13.d
|
||||
fmla z22.d, p1/m, z1.d, z14.d
|
||||
fmla z23.d, p1/m, z1.d, z15.d
|
||||
.endm
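/* Taken together, the four macros above form a software-pipelined inner loop:
 * KERNELv1x8_I primes it by loading A for the first two K-steps (z0, z1) and
 * B for the first step, then computes the first step while reloading z8-z15
 * with B for the next one; KERNELv1x8_M1 and KERNELv1x8_M2 alternate, each
 * computing one K-step from one A register (z0 or z1) while loading the other
 * A register and the next B values; KERNELv1x8_E drains the last step without
 * issuing further loads, so the unrolled loop runs I, M2, M1, ..., M1, E. */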
|
||||
|
||||
.macro KERNELv1x8_SUB
|
||||
ld1d z0.d, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
|
||||
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
ld1rd z10.d, p0/z, [pB, 16]
|
||||
ld1rd z11.d, p0/z, [pB, 24]
|
||||
ld1rd z12.d, p0/z, [pB, 32]
|
||||
ld1rd z13.d, p0/z, [pB, 40]
|
||||
ld1rd z14.d, p0/z, [pB, 48]
|
||||
ld1rd z15.d, p0/z, [pB, 56]
|
||||
|
||||
add pB, pB, 64
|
||||
|
||||
fmla z16.d, p1/m, z0.d, z8.d
|
||||
fmla z17.d, p1/m, z0.d, z9.d
|
||||
fmla z18.d, p1/m, z0.d, z10.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
fmla z19.d, p1/m, z0.d, z11.d
|
||||
fmla z20.d, p1/m, z0.d, z12.d
|
||||
fmla z21.d, p1/m, z0.d, z13.d
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
fmla z22.d, p1/m, z0.d, z14.d
|
||||
fmla z23.d, p1/m, z0.d, z15.d
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x8
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
ld1d z24.d, p1/z, [pCRow0]
|
||||
fmla z24.d, p1/m, z16.d, alphaZ
|
||||
st1d z24.d, p1, [pCRow0]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
ld1d z25.d, p1/z, [pCRow1]
|
||||
fmla z25.d, p1/m, z17.d, alphaZ
|
||||
st1d z25.d, p1, [pCRow1]
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
ld1d z26.d, p1/z, [pCRow2]
|
||||
fmla z26.d, p1/m, z18.d, alphaZ
|
||||
st1d z26.d, p1, [pCRow2]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
ld1d z27.d, p1/z, [pCRow1]
|
||||
fmla z27.d, p1/m, z19.d, alphaZ
|
||||
st1d z27.d, p1, [pCRow1]
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
ld1d z28.d, p1/z, [pCRow2]
|
||||
fmla z28.d, p1/m, z20.d, alphaZ
|
||||
st1d z28.d, p1, [pCRow2]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
ld1d z29.d, p1/z, [pCRow1]
|
||||
fmla z29.d, p1/m, z21.d, alphaZ
|
||||
st1d z29.d, p1, [pCRow1]
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
ld1d z30.d, p1/z, [pCRow2]
|
||||
fmla z30.d, p1/m, z22.d, alphaZ
|
||||
st1d z30.d, p1, [pCRow2]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
ld1d z31.d, p1/z, [pCRow1]
|
||||
fmla z31.d, p1/m, z23.d, alphaZ
|
||||
st1d z31.d, p1, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
|
||||
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.macro INITv1x4
|
||||
dup z16.d, #0
|
||||
dup z17.d, #0
|
||||
dup z18.d, #0
|
||||
dup z19.d, #0
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_SUB
|
||||
ld1d z0.d, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
|
||||
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
ld1rd z10.d, p0/z, [pB, 16]
|
||||
ld1rd z11.d, p0/z, [pB, 24]
|
||||
|
||||
add pB, pB, 32
|
||||
|
||||
fmla z16.d, p1/m, z0.d, z8.d
|
||||
fmla z17.d, p1/m, z0.d, z9.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
fmla z18.d, p1/m, z0.d, z10.d
|
||||
fmla z19.d, p1/m, z0.d, z11.d
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x4
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
ld1d z24.d, p1/z, [pCRow0]
|
||||
fmla z24.d, p1/m, z16.d, alphaZ
|
||||
st1d z24.d, p1, [pCRow0]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
ld1d z25.d, p1/z, [pCRow1]
|
||||
fmla z25.d, p1/m, z17.d, alphaZ
|
||||
st1d z25.d, p1, [pCRow1]
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
ld1d z26.d, p1/z, [pCRow2]
|
||||
fmla z26.d, p1/m, z18.d, alphaZ
|
||||
st1d z26.d, p1, [pCRow2]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
ld1d z27.d, p1/z, [pCRow1]
|
||||
fmla z27.d, p1/m, z19.d, alphaZ
|
||||
st1d z27.d, p1, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
|
||||
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.macro INITv1x2
|
||||
dup z16.d, #0
|
||||
dup z17.d, #0
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x2_SUB
|
||||
ld1d z0.d, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
|
||||
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
|
||||
add pB, pB, 16
|
||||
|
||||
fmla z16.d, p1/m, z0.d, z8.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
fmla z17.d, p1/m, z0.d, z9.d
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x2
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
ld1d z24.d, p1/z, [pCRow0]
|
||||
fmla z24.d, p1/m, z16.d, alphaZ
|
||||
st1d z24.d, p1, [pCRow0]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
ld1d z25.d, p1/z, [pCRow1]
|
||||
fmla z25.d, p1/m, z17.d, alphaZ
|
||||
st1d z25.d, p1, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
|
||||
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.macro INITv1x1
|
||||
dup z16.d, #0
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x1_SUB
|
||||
ld1d z0.d, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
|
||||
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
|
||||
add pB, pB, 8
|
||||
|
||||
fmla z16.d, p1/m, z0.d, z8.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x1
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ld1d z24.d, p1/z, [pCRow0]
|
||||
fmla z24.d, p1/m, z16.d, alphaZ
|
||||
st1d z24.d, p1, [pCRow0]
|
||||
|
||||
|
||||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
PROLOGUE
|
||||
|
||||
.align 5
|
||||
add sp, sp, #-(11 * 16)
|
||||
stp d8, d9, [sp, #(0 * 16)]
|
||||
stp d10, d11, [sp, #(1 * 16)]
|
||||
stp d12, d13, [sp, #(2 * 16)]
|
||||
stp d14, d15, [sp, #(3 * 16)]
|
||||
stp d16, d17, [sp, #(4 * 16)]
|
||||
stp x18, x19, [sp, #(5 * 16)]
|
||||
stp x20, x21, [sp, #(6 * 16)]
|
||||
stp x22, x23, [sp, #(7 * 16)]
|
||||
stp x24, x25, [sp, #(8 * 16)]
|
||||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
prfm PLDL1KEEP, [origPA]
|
||||
|
||||
fmov alpha, d0
|
||||
dup alphaZ, alpha
|
||||
|
||||
lsl LDC, LDC, #3 // ldc = ldc * 8
|
||||
ptrue p0.d // create true predicate
|
||||
|
||||
mov pB, origPB
|
||||
// Loop over N
|
||||
mov counterJ, origN
|
||||
asr counterJ, counterJ, #3 // J = J / 8
|
||||
cmp counterJ, #0
|
||||
ble .Ldgemm_kernel_L4_BEGIN
|
||||
|
||||
/******************************************************************************/
|
||||
/* Repeat this as long as there are 8 left in N */
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L8_BEGIN:
|
||||
mov pCRow0, pC
|
||||
|
||||
add pC, pC, LDC, lsl #3 // add 8 x LDC
|
||||
|
||||
mov pA, origPA // pA = start of A array
|
||||
|
||||
.Ldgemm_kernel_L8_Mv1_BEGIN:
|
||||
|
||||
/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */
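/* In C-like pseudocode (illustrative only), the predicated loop below is:
 *
 *     for (counterI = 0, p1 = whilelt(counterI, origM); any(p1);
 *          counterI += VL, p1 = whilelt(counterI, origM)) {
 *         lanes = popcount(p1);      // rows handled in this sweep
 *         ... run the v1xN kernel on `lanes` rows under predicate p1 ...
 *     }
 *
 * The M % SVE_LEN tail needs no separate code path: the final predicate
 * simply switches off the unused lanes. */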
|
||||
mov counterI, #0
|
||||
whilelt p1.d, counterI, origM
|
||||
cntp lanes, p0, p1.d // lanes contains the number of active SVE lanes in the M dimension
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L8_Mv1_20:
|
||||
|
||||
mov pB, origPB
|
||||
INITv1x8 // fill with zeros
|
||||
|
||||
asr counterL , origK, #3 // L = K / 8
|
||||
cmp counterL , #2 // are there at least 2 blocks of 8 to do?
|
||||
blt .Ldgemm_kernel_L8_Mv1_32
|
||||
|
||||
KERNELv1x8_I
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
|
||||
subs counterL, counterL, #2 // subtract 2
|
||||
ble .Ldgemm_kernel_L8_Mv1_22a
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L8_Mv1_22:
|
||||
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Ldgemm_kernel_L8_Mv1_22
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L8_Mv1_22a:
|
||||
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_E
|
||||
|
||||
b .Ldgemm_kernel_L8_Mv1_44
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L8_Mv1_32:
|
||||
|
||||
tst counterL, #1
|
||||
ble .Ldgemm_kernel_L8_Mv1_40
|
||||
|
||||
KERNELv1x8_I
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_E
|
||||
|
||||
|
||||
b .Ldgemm_kernel_L8_Mv1_44
|
||||
|
||||
.Ldgemm_kernel_L8_Mv1_40:
|
||||
|
||||
INITv1x8
|
||||
|
||||
.Ldgemm_kernel_L8_Mv1_44:
|
||||
|
||||
ands counterL , origK, #7
|
||||
ble .Ldgemm_kernel_L8_Mv1_100
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L8_Mv1_46:
|
||||
|
||||
KERNELv1x8_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bne .Ldgemm_kernel_L8_Mv1_46
|
||||
|
||||
.Ldgemm_kernel_L8_Mv1_100:
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVEv1x8
|
||||
|
||||
.Ldgemm_kernel_L8_Mv1_END:
|
||||
|
||||
incd counterI
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.d // lanes contains the number of active SVE lanes in the M dimension
|
||||
b.any .Ldgemm_kernel_L8_Mv1_20
|
||||
|
||||
.Ldgemm_kernel_L8_END:
|
||||
|
||||
lsl temp, origK, #6
|
||||
add origPB, origPB, temp // B = B + K * 8 * 8
|
||||
|
||||
subs counterJ, counterJ , #1 // j--
|
||||
bgt .Ldgemm_kernel_L8_BEGIN
|
||||
|
||||
/******************************************************************************/
|
||||
/* Repeat the same thing if 4 left in N */
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L4_BEGIN:
|
||||
|
||||
mov counterJ , origN
|
||||
tst counterJ , #4
|
||||
ble .Ldgemm_kernel_L2_BEGIN
|
||||
|
||||
|
||||
mov pCRow0, pC
|
||||
|
||||
add pC, pC, LDC, lsl #2 // add 4 x LDC
|
||||
|
||||
mov pA, origPA // pA = start of A array
|
||||
|
||||
.Ldgemm_kernel_L4_Mv1_BEGIN:
|
||||
|
||||
mov counterI, #0
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.d
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L4_Mv1_20:
|
||||
|
||||
mov pB, origPB
|
||||
INITv1x4 // fill with zeros
|
||||
|
||||
asr counterL , origK, #3 // L = K / 8
|
||||
cmp counterL , #0 // is there at least one block of 8 to do?
|
||||
ble .Ldgemm_kernel_L4_Mv1_44
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L4_Mv1_22:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x4_SUB
|
||||
KERNELv1x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x4_SUB
|
||||
KERNELv1x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x4_SUB
|
||||
KERNELv1x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x4_SUB
|
||||
KERNELv1x4_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Ldgemm_kernel_L4_Mv1_22
|
||||
|
||||
.Ldgemm_kernel_L4_Mv1_44:
|
||||
|
||||
ands counterL , origK, #7
|
||||
ble .Ldgemm_kernel_L4_Mv1_100
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L4_Mv1_46:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x4_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bne .Ldgemm_kernel_L4_Mv1_46
|
||||
|
||||
.Ldgemm_kernel_L4_Mv1_100:
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVEv1x4
|
||||
|
||||
.Ldgemm_kernel_L4_Mv1_END:
|
||||
|
||||
incd counterI
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.d
|
||||
b.any .Ldgemm_kernel_L4_Mv1_20
|
||||
|
||||
|
||||
.Ldgemm_kernel_L4_END:
|
||||
lsl temp, origK, #5
|
||||
add origPB, origPB, temp // B = B + K * 4 * 8
|
||||
|
||||
/******************************************************************************/
|
||||
/* Repeat the same thing if 2 left in N */
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L2_BEGIN:
|
||||
|
||||
mov counterJ , origN
|
||||
tst counterJ , #2
|
||||
ble .Ldgemm_kernel_L1_BEGIN
|
||||
|
||||
mov pCRow0, pC
|
||||
|
||||
add pC, pC, LDC, lsl #1 // add 2 x LDC
|
||||
|
||||
mov pA, origPA // pA = start of A array
|
||||
|
||||
.Ldgemm_kernel_L2_Mv1_BEGIN:
|
||||
|
||||
mov counterI, #0
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.d
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L2_Mv1_20:
|
||||
|
||||
mov pB, origPB
|
||||
INITv1x2 // fill with zeros
|
||||
|
||||
asr counterL , origK, #3 // L = K / 8
|
||||
cmp counterL , #0 // is there at least one block of 8 to do?
|
||||
ble .Ldgemm_kernel_L2_Mv1_44
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L2_Mv1_22:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Ldgemm_kernel_L2_Mv1_22
|
||||
|
||||
.Ldgemm_kernel_L2_Mv1_44:
|
||||
|
||||
ands counterL , origK, #7
|
||||
ble .Ldgemm_kernel_L2_Mv1_100
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L2_Mv1_46:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x2_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bne .Ldgemm_kernel_L2_Mv1_46
|
||||
|
||||
.Ldgemm_kernel_L2_Mv1_100:
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVEv1x2
|
||||
|
||||
.Ldgemm_kernel_L2_Mv1_END:
|
||||
|
||||
incd counterI
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.d
|
||||
b.any .Ldgemm_kernel_L2_Mv1_20
|
||||
|
||||
|
||||
.Ldgemm_kernel_L2_END:
|
||||
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
|
||||
|
||||
/******************************************************************************/
|
||||
/* Repeat the same thing if 1 left in N */
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L1_BEGIN:
|
||||
|
||||
mov counterJ , origN
|
||||
tst counterJ , #1
|
||||
ble .Ldgemm_kernel_L999 // done
|
||||
|
||||
mov pCRow0, pC
|
||||
|
||||
add pC, pC, LDC // add 1 x LDC
|
||||
|
||||
mov pA, origPA // pA = start of A array
|
||||
|
||||
.Ldgemm_kernel_L1_Mv1_BEGIN:
|
||||
|
||||
mov counterI, #0
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.d
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L1_Mv1_20:
|
||||
|
||||
mov pB, origPB
|
||||
INITv1x1 // fill with zeros
|
||||
|
||||
asr counterL , origK, #3 // L = K / 8
|
||||
cmp counterL , #0 // is there at least 8 to do?
|
||||
ble .Ldgemm_kernel_L1_Mv1_44
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L1_Mv1_22:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Ldgemm_kernel_L1_Mv1_22
|
||||
|
||||
.Ldgemm_kernel_L1_Mv1_44:
|
||||
|
||||
ands counterL , origK, #7
|
||||
ble .Ldgemm_kernel_L1_Mv1_100
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L1_Mv1_46:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x1_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Ldgemm_kernel_L1_Mv1_46
|
||||
|
||||
.Ldgemm_kernel_L1_Mv1_100:
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVEv1x1
|
||||
|
||||
.Ldgemm_kernel_L1_Mv1_END:
|
||||
|
||||
incd counterI
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.d
|
||||
b.any .Ldgemm_kernel_L1_Mv1_20
|
||||
|
||||
|
||||
.Ldgemm_kernel_L1_END:
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.Ldgemm_kernel_L999:
|
||||
mov x0, #0 // set return value
|
||||
ldp d8, d9, [sp, #(0 * 16)]
|
||||
ldp d10, d11, [sp, #(1 * 16)]
|
||||
ldp d12, d13, [sp, #(2 * 16)]
|
||||
ldp d14, d15, [sp, #(3 * 16)]
|
||||
ldp d16, d17, [sp, #(4 * 16)]
|
||||
ldp x18, x19, [sp, #(5 * 16)]
|
||||
ldp x20, x21, [sp, #(6 * 16)]
|
||||
ldp x22, x23, [sp, #(7 * 16)]
|
||||
ldp x24, x25, [sp, #(8 * 16)]
|
||||
ldp x26, x27, [sp, #(9 * 16)]
|
||||
ldr x28, [sp, #(10 * 16)]
|
||||
add sp, sp, #(11*16)
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
|
||||
File diff suppressed because it is too large
@ -0,0 +1,79 @@
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <arm_sve.h>
|
||||
|
||||
// TODO: write in assembly with proper unrolling of inner loop
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||
|
||||
BLASLONG j;
|
||||
IFLOAT *aoffset, *aoffset1, *boffset;
|
||||
|
||||
svint64_t lda_vec = svindex_s64(0LL, lda);
|
||||
uint64_t sve_size = svcntd();
|
||||
|
||||
aoffset = a;
|
||||
boffset = b;
|
||||
|
||||
j = 0;
|
||||
svbool_t pg = svwhilelt_b64(j, n);
|
||||
uint64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||
do {
|
||||
|
||||
aoffset1 = aoffset;
|
||||
|
||||
uint64_t i_cnt = m;
|
||||
while (i_cnt--) {
|
||||
svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec);
|
||||
svst1_f64(pg, (double *) boffset, a_vec);
|
||||
aoffset1++;
|
||||
boffset += active;
|
||||
}
|
||||
aoffset += sve_size * lda;
|
||||
|
||||
j += svcntd();
|
||||
pg = svwhilelt_b64(j, n);
|
||||
active = svcntp_b64(svptrue_b64(), pg);
|
||||
|
||||
|
||||
} while (svptest_any(svptrue_b64(), pg));
|
||||
|
||||
return 0;
|
||||
}
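/*
 * Editor's note (illustration only, not part of the patch): a minimal scalar
 * sketch of the packing order the gather-based loop above produces for a
 * column-major m x n block of A with leading dimension lda. The name
 * ncopy_ref and the parameter sve_len (standing in for svcntd()) are
 * illustrative assumptions.
 */
static void ncopy_ref(long m, long n, long sve_len,
                      const double *a, long lda, double *b)
{
    for (long j = 0; j < n; j += sve_len) {
        long width = (n - j < sve_len) ? (n - j) : sve_len; /* last panel may be narrower */
        for (long i = 0; i < m; i++)                        /* one row of the panel per pass */
            for (long jj = 0; jj < width; jj++)
                *b++ = a[(j + jj) * lda + i];               /* element the gather picks up with stride lda */
    }
}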
|
||||
|
|
@ -0,0 +1,77 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <arm_sve.h>
|
||||
|
||||
// TODO: write in assembly with proper unrolling of inner loop
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||
|
||||
BLASLONG j;
|
||||
IFLOAT *aoffset, *aoffset1, *boffset;
|
||||
|
||||
uint64_t sve_size = svcntd();
|
||||
|
||||
aoffset = a;
|
||||
boffset = b;
|
||||
|
||||
j = 0;
|
||||
svbool_t pg = svwhilelt_b64(j, n);
|
||||
uint64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||
do {
|
||||
|
||||
aoffset1 = aoffset;
|
||||
|
||||
uint64_t i_cnt = m;
|
||||
while (i_cnt--) {
|
||||
svfloat64_t a_vec = svld1(pg, (double *)aoffset1);
|
||||
svst1_f64(pg, (double *) boffset, a_vec);
|
||||
aoffset1 += lda;
|
||||
boffset += active;
|
||||
}
|
||||
aoffset += sve_size;
|
||||
|
||||
j += svcntd();
|
||||
pg = svwhilelt_b64(j, n);
|
||||
active = svcntp_b64(svptrue_b64(), pg);
|
||||
|
||||
} while (svptest_any(svptrue_b64(), pg));
|
||||
|
||||
return 0;
|
||||
}
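/*
 * Editor's note (illustration only, not part of the patch): the contiguous-load
 * variant above packs the transposed order of the previous routine; each
 * predicated load reads `width` adjacent elements and the inner loop then
 * steps by lda. tcopy_ref and sve_len are illustrative names.
 */
static void tcopy_ref(long m, long n, long sve_len,
                      const double *a, long lda, double *b)
{
    for (long j = 0; j < n; j += sve_len) {
        long width = (n - j < sve_len) ? (n - j) : sve_len;
        for (long i = 0; i < m; i++)            /* advance by one lda stride per pass */
            for (long jj = 0; jj < width; jj++)
                *b++ = a[i * lda + j + jj];     /* contiguous run of `width` values   */
    }
}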
|
||||
File diff suppressed because it is too large
|
|
@ -0,0 +1,874 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/* X0 X1 X2 s0 X3 x4 x5 x6 */
|
||||
/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/
|
||||
|
||||
#define origM x0
|
||||
#define origN x1
|
||||
#define origK x2
|
||||
#define origPA x3
|
||||
#define origPB x4
|
||||
#define pC x5
|
||||
#define LDC x6
|
||||
#define temp x7
|
||||
#define counterL x8
|
||||
#define counterI x9
|
||||
#define counterJ x10
|
||||
#define pB x11
|
||||
#define pCRow0 x12
|
||||
#define pCRow1 x13
|
||||
#define pCRow2 x14
|
||||
|
||||
#define lanes x15
|
||||
#define pA x16
|
||||
#define alpha w17
|
||||
|
||||
#define alpha0 s10
|
||||
#define alphaZ z2.s
|
||||
|
||||
#define A_PRE_SIZE 1536
|
||||
#define B_PRE_SIZE 512
|
||||
#define C_PRE_SIZE 128
|
||||
|
||||
// 00 origM
|
||||
// 01 origN
|
||||
// 02 origK
|
||||
// 03 origPA
|
||||
// 04 origPB
|
||||
// 05 pC
|
||||
// 06 origLDC -> LDC
|
||||
// 07 temp
|
||||
// 08 counterL
|
||||
// 09 counterI
|
||||
// 10 counterJ
|
||||
// 11 pB
|
||||
// 12 pCRow0
|
||||
// 13 pCRow1
|
||||
// 14 pCRow2
|
||||
// 15 lanes
|
||||
// 16 pA
|
||||
// 17
|
||||
// 18 must save
|
||||
// 19 must save
|
||||
// 20 must save
|
||||
// 21 must save
|
||||
// 22 must save
|
||||
// 23 must save
|
||||
// 24 must save
|
||||
// 25 must save
|
||||
// 26 must save
|
||||
// 27 must save
|
||||
// 28 must save
|
||||
// 29 frame
|
||||
// 30 link
|
||||
// 31 sp
|
||||
|
||||
//v00 ALPHA -> pA0_0
|
||||
//v01 pA0_1
|
||||
//v02 ALPHA0
|
||||
//v03
|
||||
//v04
|
||||
//v05
|
||||
//v06
|
||||
//v07
|
||||
//v08 must save pB0_0
|
||||
//v09 must save pB0_1
|
||||
//v10 must save pB0_2
|
||||
//v11 must save pB0_3
|
||||
//v12 must save pB0_4
|
||||
//v13 must save pB0_5
|
||||
//v14 must save pB0_6
|
||||
//v15 must save pB0_7
|
||||
//v16 must save C0
|
||||
//v17 must save C1
|
||||
//v18 must save C2
|
||||
//v19 must save C3
|
||||
//v20 must save C4
|
||||
//v21 must save C5
|
||||
//v22 must save C6
|
||||
//v23 must save C7
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
.macro INITv1x8
|
||||
dup z16.s, #0
|
||||
dup z17.s, #0
|
||||
dup z18.s, #0
|
||||
dup z19.s, #0
|
||||
dup z20.s, #0
|
||||
dup z21.s, #0
|
||||
dup z22.s, #0
|
||||
dup z23.s, #0
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x8_I
|
||||
ld1w z0.s, p1/z, [pA]
|
||||
ld1w z1.s, p1/z, [pA, lanes, lsl #2] // A vector for the next k step (consumed by KERNELv1x8_M2)
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4
|
||||
|
||||
ld1rw z8.s, p0/z, [pB]
|
||||
ld1rw z9.s, p0/z, [pB, 4]
|
||||
ld1rw z10.s, p0/z, [pB, 8]
|
||||
ld1rw z11.s, p0/z, [pB, 12]
|
||||
ld1rw z12.s, p0/z, [pB, 16]
|
||||
ld1rw z13.s, p0/z, [pB, 20]
|
||||
ld1rw z14.s, p0/z, [pB, 24]
|
||||
ld1rw z15.s, p0/z, [pB, 28]
|
||||
|
||||
add pB, pB, 32
|
||||
|
||||
fmla z16.s, p1/m, z0.s, z8.s
|
||||
ld1rw z8.s, p0/z, [pB]
|
||||
fmla z17.s, p1/m, z0.s, z9.s
|
||||
ld1rw z9.s, p0/z, [pB, 4]
|
||||
fmla z18.s, p1/m, z0.s, z10.s
|
||||
ld1rw z10.s, p0/z, [pB, 8]
|
||||
fmla z19.s, p1/m, z0.s, z11.s
|
||||
ld1rw z11.s, p0/z, [pB, 12]
|
||||
fmla z20.s, p1/m, z0.s, z12.s
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
ld1rw z12.s, p0/z, [pB, 16]
|
||||
fmla z21.s, p1/m, z0.s, z13.s
|
||||
ld1rw z13.s, p0/z, [pB, 20]
|
||||
fmla z22.s, p1/m, z0.s, z14.s
|
||||
ld1rw z14.s, p0/z, [pB, 24]
|
||||
fmla z23.s, p1/m, z0.s, z15.s
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
ld1rw z15.s, p0/z, [pB, 28]
|
||||
|
||||
add pB, pB, 32
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x8_M1
|
||||
ld1w z1.s, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
|
||||
|
||||
fmla z16.s, p1/m, z0.s, z8.s
|
||||
ld1rw z8.s, p0/z, [pB]
|
||||
fmla z17.s, p1/m, z0.s, z9.s
|
||||
ld1rw z9.s, p0/z, [pB, 4]
|
||||
fmla z18.s, p1/m, z0.s, z10.s
|
||||
ld1rw z10.s, p0/z, [pB, 8]
|
||||
fmla z19.s, p1/m, z0.s, z11.s
|
||||
ld1rw z11.s, p0/z, [pB, 12]
|
||||
fmla z20.s, p1/m, z0.s, z12.s
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
ld1rw z12.s, p0/z, [pB, 16]
|
||||
fmla z21.s, p1/m, z0.s, z13.s
|
||||
ld1rw z13.s, p0/z, [pB, 20]
|
||||
fmla z22.s, p1/m, z0.s, z14.s
|
||||
ld1rw z14.s, p0/z, [pB, 24]
|
||||
fmla z23.s, p1/m, z0.s, z15.s
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
ld1rw z15.s, p0/z, [pB, 28]
|
||||
|
||||
add pB, pB, 32
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x8_M2
|
||||
ld1w z0.s, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
|
||||
|
||||
fmla z16.s, p1/m, z1.s, z8.s
|
||||
ld1rw z8.s, p0/z, [pB]
|
||||
fmla z17.s, p1/m, z1.s, z9.s
|
||||
ld1rw z9.s, p0/z, [pB, 4]
|
||||
fmla z18.s, p1/m, z1.s, z10.s
|
||||
ld1rw z10.s, p0/z, [pB, 8]
|
||||
fmla z19.s, p1/m, z1.s, z11.s
|
||||
ld1rw z11.s, p0/z, [pB, 12]
|
||||
fmla z20.s, p1/m, z1.s, z12.s
|
||||
ld1rw z12.s, p0/z, [pB, 16]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
fmla z21.s, p1/m, z1.s, z13.s
|
||||
ld1rw z13.s, p0/z, [pB, 20]
|
||||
fmla z22.s, p1/m, z1.s, z14.s
|
||||
ld1rw z14.s, p0/z, [pB, 24]
|
||||
fmla z23.s, p1/m, z1.s, z15.s
|
||||
ld1rw z15.s, p0/z, [pB, 28]
|
||||
|
||||
add pB, pB, 32
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x8_E
|
||||
fmla z16.s, p1/m, z1.s, z8.s
|
||||
fmla z17.s, p1/m, z1.s, z9.s
|
||||
fmla z18.s, p1/m, z1.s, z10.s
|
||||
fmla z19.s, p1/m, z1.s, z11.s
|
||||
fmla z20.s, p1/m, z1.s, z12.s
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
fmla z21.s, p1/m, z1.s, z13.s
|
||||
fmla z22.s, p1/m, z1.s, z14.s
|
||||
fmla z23.s, p1/m, z1.s, z15.s
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x8_SUB
|
||||
ld1w z0.s, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
|
||||
|
||||
ld1rw z8.s, p0/z, [pB]
|
||||
ld1rw z9.s, p0/z, [pB, 4]
|
||||
ld1rw z10.s, p0/z, [pB, 8]
|
||||
ld1rw z11.s, p0/z, [pB, 12]
|
||||
ld1rw z12.s, p0/z, [pB, 16]
|
||||
ld1rw z13.s, p0/z, [pB, 20]
|
||||
ld1rw z14.s, p0/z, [pB, 24]
|
||||
ld1rw z15.s, p0/z, [pB, 28]
|
||||
|
||||
add pB, pB, 32
|
||||
|
||||
fmla z16.s, p1/m, z0.s, z8.s
|
||||
fmla z17.s, p1/m, z0.s, z9.s
|
||||
fmla z18.s, p1/m, z0.s, z10.s
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
fmla z19.s, p1/m, z0.s, z11.s
|
||||
fmla z20.s, p1/m, z0.s, z12.s
|
||||
fmla z21.s, p1/m, z0.s, z13.s
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
fmla z22.s, p1/m, z0.s, z14.s
|
||||
fmla z23.s, p1/m, z0.s, z15.s
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x8
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
ld1w z24.s, p1/z, [pCRow0]
|
||||
fmla z24.s, p1/m, z16.s, alphaZ
|
||||
st1w z24.s, p1, [pCRow0]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
ld1w z25.s, p1/z, [pCRow1]
|
||||
fmla z25.s, p1/m, z17.s, alphaZ
|
||||
st1w z25.s, p1, [pCRow1]
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
ld1w z26.s, p1/z, [pCRow2]
|
||||
fmla z26.s, p1/m, z18.s, alphaZ
|
||||
st1w z26.s, p1, [pCRow2]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
ld1w z27.s, p1/z, [pCRow1]
|
||||
fmla z27.s, p1/m, z19.s, alphaZ
|
||||
st1w z27.s, p1, [pCRow1]
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
ld1w z28.s, p1/z, [pCRow2]
|
||||
fmla z28.s, p1/m, z20.s, alphaZ
|
||||
st1w z28.s, p1, [pCRow2]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
ld1w z29.s, p1/z, [pCRow1]
|
||||
fmla z29.s, p1/m, z21.s, alphaZ
|
||||
st1w z29.s, p1, [pCRow1]
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
ld1w z30.s, p1/z, [pCRow2]
|
||||
fmla z30.s, p1/m, z22.s, alphaZ
|
||||
st1w z30.s, p1, [pCRow2]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
ld1w z31.s, p1/z, [pCRow1]
|
||||
fmla z31.s, p1/m, z23.s, alphaZ
|
||||
st1w z31.s, p1, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4
|
||||
|
||||
.endm
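/*
 * Editor's note (illustration only, not part of the patch): a scalar C sketch of
 * what one v1x8 tile computes, i.e. INITv1x8, the KERNELv1x8_* FMA macros over
 * all K steps, and SAVEv1x8, for a single predicated vector of `lanes` rows of C
 * and 8 columns. The macros operate on 32-bit .s elements, hence float here.
 * gemm_v1x8_ref, A_panel, B_panel and ldc are illustrative names; the packed
 * layouts (lanes floats of A and 8 floats of B per k step) follow the pointer
 * increments in the macros above.
 */
static void gemm_v1x8_ref(long K, int lanes, const float *A_panel,
                          const float *B_panel, float *C, long ldc, float alpha)
{
    float acc[8][64] = {{0.0f}};                 /* INITv1x8; 64 >= max SVE float lanes */
    for (long k = 0; k < K; k++)                 /* KERNELv1x8_I/_M1/_M2/_E/_SUB        */
        for (int col = 0; col < 8; col++)
            for (int l = 0; l < lanes; l++)
                acc[col][l] += A_panel[k * lanes + l] * B_panel[k * 8 + col];
    for (int col = 0; col < 8; col++)            /* SAVEv1x8: C += alpha * acc          */
        for (int l = 0; l < lanes; l++)
            C[col * ldc + l] += alpha * acc[col][l];
}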
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.macro INITv1x4
|
||||
dup z16.s, #0
|
||||
dup z17.s, #0
|
||||
dup z18.s, #0
|
||||
dup z19.s, #0
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_SUB
|
||||
ld1w z0.s, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
|
||||
|
||||
ld1rw z8.s, p0/z, [pB]
|
||||
ld1rw z9.s, p0/z, [pB, 4]
|
||||
ld1rw z10.s, p0/z, [pB, 8]
|
||||
ld1rw z11.s, p0/z, [pB, 12]
|
||||
|
||||
add pB, pB, 16
|
||||
|
||||
fmla z16.s, p1/m, z0.s, z8.s
|
||||
fmla z17.s, p1/m, z0.s, z9.s
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
fmla z18.s, p1/m, z0.s, z10.s
|
||||
fmla z19.s, p1/m, z0.s, z11.s
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x4
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
ld1w z24.s, p1/z, [pCRow0]
|
||||
fmla z24.s, p1/m, z16.s, alphaZ
|
||||
st1w z24.s, p1, [pCRow0]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
ld1w z25.s, p1/z, [pCRow1]
|
||||
fmla z25.s, p1/m, z17.s, alphaZ
|
||||
st1w z25.s, p1, [pCRow1]
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
ld1w z26.s, p1/z, [pCRow2]
|
||||
fmla z26.s, p1/m, z18.s, alphaZ
|
||||
st1w z26.s, p1, [pCRow2]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
ld1w z27.s, p1/z, [pCRow1]
|
||||
fmla z27.s, p1/m, z19.s, alphaZ
|
||||
st1w z27.s, p1, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4
|
||||
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.macro INITv1x2
|
||||
dup z16.s, #0
|
||||
dup z17.s, #0
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x2_SUB
|
||||
ld1w z0.s, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
|
||||
|
||||
ld1rw z8.s, p0/z, [pB]
|
||||
ld1rw z9.s, p0/z, [pB, 4]
|
||||
|
||||
add pB, pB, 8
|
||||
|
||||
fmla z16.s, p1/m, z0.s, z8.s
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
fmla z17.s, p1/m, z0.s, z9.s
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x2
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
ld1w z24.s, p1/z, [pCRow0]
|
||||
fmla z24.s, p1/m, z16.s, alphaZ
|
||||
st1w z24.s, p1, [pCRow0]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
ld1w z25.s, p1/z, [pCRow1]
|
||||
fmla z25.s, p1/m, z17.s, alphaZ
|
||||
st1w z25.s, p1, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4
|
||||
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.macro INITv1x1
|
||||
dup z16.s, #0
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x1_SUB
|
||||
ld1w z0.s, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
|
||||
|
||||
ld1rw z8.s, p0/z, [pB]
|
||||
|
||||
add pB, pB, 4
|
||||
|
||||
fmla z16.s, p1/m, z0.s, z8.s
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x1
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ld1w z24.s, p1/z, [pCRow0]
|
||||
fmla z24.s, p1/m, z16.s, alphaZ
|
||||
st1w z24.s, p1, [pCRow0]
|
||||
|
||||
|
||||
add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
PROLOGUE
|
||||
|
||||
.align 5
|
||||
add sp, sp, #-(11 * 16)
|
||||
stp d8, d9, [sp, #(0 * 16)]
|
||||
stp d10, d11, [sp, #(1 * 16)]
|
||||
stp d12, d13, [sp, #(2 * 16)]
|
||||
stp d14, d15, [sp, #(3 * 16)]
|
||||
stp d16, d17, [sp, #(4 * 16)]
|
||||
stp x18, x19, [sp, #(5 * 16)]
|
||||
stp x20, x21, [sp, #(6 * 16)]
|
||||
stp x22, x23, [sp, #(7 * 16)]
|
||||
stp x24, x25, [sp, #(8 * 16)]
|
||||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
prfm PLDL1KEEP, [origPA]
|
||||
|
||||
fmov alpha, s0
|
||||
dup alphaZ, alpha
|
||||
|
||||
lsl LDC, LDC, #2 // ldc = ldc * 4
|
||||
ptrue p0.s // create true predicate
|
||||
|
||||
mov pB, origPB
|
||||
// Loop over N
|
||||
mov counterJ, origN
|
||||
asr counterJ, counterJ, #3 // J = J / 8
|
||||
cmp counterJ, #0
|
||||
ble .Ldgemm_kernel_L4_BEGIN
|
||||
|
||||
/******************************************************************************/
|
||||
/* Repeat this as long as there are 8 left in N */
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L8_BEGIN:
|
||||
mov pCRow0, pC
|
||||
|
||||
add pC, pC, LDC, lsl #3 // add 8 x LDC
|
||||
|
||||
mov pA, origPA // pA = start of A array
|
||||
|
||||
.Ldgemm_kernel_L8_Mv1_BEGIN:
|
||||
|
||||
/* The loop over M is done in an SVE fashion: the predicated vector loop also handles the final M % SVE_LEN rows in the same sweep */
|
||||
mov counterI, #0
|
||||
whilelt p1.s, counterI, origM
|
||||
cntp lanes, p0, p1.s // lanes contains the number of active SVE lanes in the M dimension
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L8_Mv1_20:
|
||||
|
||||
mov pB, origPB
|
||||
INITv1x8 // fill with zeros
|
||||
|
||||
asr counterL , origK, #3 // L = K / 8
|
||||
cmp counterL , #2 // is there at least 2 to do?
|
||||
blt .Ldgemm_kernel_L8_Mv1_32
|
||||
|
||||
KERNELv1x8_I
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
|
||||
subs counterL, counterL, #2 // subtract 2
|
||||
ble .Ldgemm_kernel_L8_Mv1_22a
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L8_Mv1_22:
|
||||
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Ldgemm_kernel_L8_Mv1_22
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L8_Mv1_22a:
|
||||
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_E
|
||||
|
||||
b .Ldgemm_kernel_L8_Mv1_44
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L8_Mv1_32:
|
||||
|
||||
tst counterL, #1
|
||||
ble .Ldgemm_kernel_L8_Mv1_40
|
||||
|
||||
KERNELv1x8_I
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_E
|
||||
|
||||
|
||||
b .Ldgemm_kernel_L8_Mv1_44
|
||||
|
||||
.Ldgemm_kernel_L8_Mv1_40:
|
||||
|
||||
INITv1x8
|
||||
|
||||
.Ldgemm_kernel_L8_Mv1_44:
|
||||
|
||||
ands counterL , origK, #7
|
||||
ble .Ldgemm_kernel_L8_Mv1_100
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L8_Mv1_46:
|
||||
|
||||
KERNELv1x8_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bne .Ldgemm_kernel_L8_Mv1_46
|
||||
|
||||
.Ldgemm_kernel_L8_Mv1_100:
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVEv1x8
|
||||
|
||||
.Ldgemm_kernel_L8_Mv1_END:
|
||||
|
||||
incw counterI
|
||||
whilelt p1.s, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.s // lanes contains the number of active SVE lanes in the M dimension
|
||||
b.any .Ldgemm_kernel_L8_Mv1_20
|
||||
|
||||
.Ldgemm_kernel_L8_END:
|
||||
|
||||
lsl temp, origK, #5
|
||||
add origPB, origPB, temp // B = B + K * 8 * 4
|
||||
|
||||
subs counterJ, counterJ , #1 // j--
|
||||
bgt .Ldgemm_kernel_L8_BEGIN
|
||||
|
||||
/******************************************************************************/
|
||||
/* Repeat the same thing if 4 left in N */
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L4_BEGIN:
|
||||
|
||||
mov counterJ , origN
|
||||
tst counterJ , #4
|
||||
ble .Ldgemm_kernel_L2_BEGIN
|
||||
|
||||
|
||||
mov pCRow0, pC
|
||||
|
||||
add pC, pC, LDC, lsl #2 // add 4 x LDC
|
||||
|
||||
mov pA, origPA // pA = start of A array
|
||||
|
||||
.Ldgemm_kernel_L4_Mv1_BEGIN:
|
||||
|
||||
mov counterI, #0
|
||||
whilelt p1.s, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.s
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L4_Mv1_20:
|
||||
|
||||
mov pB, origPB
|
||||
INITv1x4 // fill with zeros
|
||||
|
||||
asr counterL , origK, #3 // L = K / 8
|
||||
cmp counterL , #0 // is there at least 1 to do?
|
||||
ble .Ldgemm_kernel_L4_Mv1_44
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L4_Mv1_22:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x4_SUB
|
||||
KERNELv1x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x4_SUB
|
||||
KERNELv1x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x4_SUB
|
||||
KERNELv1x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x4_SUB
|
||||
KERNELv1x4_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Ldgemm_kernel_L4_Mv1_22
|
||||
|
||||
.Ldgemm_kernel_L4_Mv1_44:
|
||||
|
||||
ands counterL , origK, #7
|
||||
ble .Ldgemm_kernel_L4_Mv1_100
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L4_Mv1_46:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x4_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bne .Ldgemm_kernel_L4_Mv1_46
|
||||
|
||||
.Ldgemm_kernel_L4_Mv1_100:
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVEv1x4
|
||||
|
||||
.Ldgemm_kernel_L4_Mv1_END:
|
||||
|
||||
incw counterI
|
||||
whilelt p1.s, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.s
|
||||
b.any .Ldgemm_kernel_L4_Mv1_20
|
||||
|
||||
|
||||
.Ldgemm_kernel_L4_END:
|
||||
lsl temp, origK, #4
|
||||
add origPB, origPB, temp // B = B + K * 4 * 4
|
||||
|
||||
/******************************************************************************/
|
||||
/* Repeat the same thing if 2 left in N */
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L2_BEGIN:
|
||||
|
||||
mov counterJ , origN
|
||||
tst counterJ , #2
|
||||
ble .Ldgemm_kernel_L1_BEGIN
|
||||
|
||||
mov pCRow0, pC
|
||||
|
||||
add pC, pC, LDC, lsl #1 // add 2 x LDC
|
||||
|
||||
mov pA, origPA // pA = start of A array
|
||||
|
||||
.Ldgemm_kernel_L2_Mv1_BEGIN:
|
||||
|
||||
mov counterI, #0
|
||||
whilelt p1.s, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.s
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L2_Mv1_20:
|
||||
|
||||
mov pB, origPB
|
||||
INITv1x2 // fill with zeros
|
||||
|
||||
asr counterL , origK, #3 // L = K / 8
|
||||
cmp counterL , #0 // is there at least 1 to do?
|
||||
ble .Ldgemm_kernel_L2_Mv1_44
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L2_Mv1_22:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Ldgemm_kernel_L2_Mv1_22
|
||||
|
||||
.Ldgemm_kernel_L2_Mv1_44:
|
||||
|
||||
ands counterL , origK, #7
|
||||
ble .Ldgemm_kernel_L2_Mv1_100
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L2_Mv1_46:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x2_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bne .Ldgemm_kernel_L2_Mv1_46
|
||||
|
||||
.Ldgemm_kernel_L2_Mv1_100:
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVEv1x2
|
||||
|
||||
.Ldgemm_kernel_L2_Mv1_END:
|
||||
|
||||
incw counterI
|
||||
whilelt p1.s, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.s
|
||||
b.any .Ldgemm_kernel_L2_Mv1_20
|
||||
|
||||
|
||||
.Ldgemm_kernel_L2_END:
|
||||
add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
|
||||
|
||||
/******************************************************************************/
|
||||
/* Repeat the same thing if 1 left in N */
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L1_BEGIN:
|
||||
|
||||
mov counterJ , origN
|
||||
tst counterJ , #1
|
||||
ble .Ldgemm_kernel_L999 // done
|
||||
|
||||
mov pCRow0, pC
|
||||
|
||||
add pC, pC, LDC // add 1 x LDC
|
||||
|
||||
mov pA, origPA // pA = start of A array
|
||||
|
||||
.Ldgemm_kernel_L1_Mv1_BEGIN:
|
||||
|
||||
mov counterI, #0
|
||||
whilelt p1.s, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.s
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L1_Mv1_20:
|
||||
|
||||
mov pB, origPB
|
||||
INITv1x1 // fill with zeros
|
||||
|
||||
asr counterL , origK, #3 // L = K / 8
|
||||
cmp counterL , #0 // is there at least 1 to do?
|
||||
ble .Ldgemm_kernel_L1_Mv1_44
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L1_Mv1_22:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Ldgemm_kernel_L1_Mv1_22
|
||||
|
||||
.Ldgemm_kernel_L1_Mv1_44:
|
||||
|
||||
ands counterL , origK, #7
|
||||
ble .Ldgemm_kernel_L1_Mv1_100
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L1_Mv1_46:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x1_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Ldgemm_kernel_L1_Mv1_46
|
||||
|
||||
.Ldgemm_kernel_L1_Mv1_100:
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVEv1x1
|
||||
|
||||
.Ldgemm_kernel_L1_Mv1_END:
|
||||
|
||||
incw counterI
|
||||
whilelt p1.s, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.s
|
||||
b.any .Ldgemm_kernel_L1_Mv1_20
|
||||
|
||||
|
||||
.Ldgemm_kernel_L1_END:
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.Ldgemm_kernel_L999:
|
||||
mov x0, #0 // set return value
|
||||
ldp d8, d9, [sp, #(0 * 16)]
|
||||
ldp d10, d11, [sp, #(1 * 16)]
|
||||
ldp d12, d13, [sp, #(2 * 16)]
|
||||
ldp d14, d15, [sp, #(3 * 16)]
|
||||
ldp d16, d17, [sp, #(4 * 16)]
|
||||
ldp x18, x19, [sp, #(5 * 16)]
|
||||
ldp x20, x21, [sp, #(6 * 16)]
|
||||
ldp x22, x23, [sp, #(7 * 16)]
|
||||
ldp x24, x25, [sp, #(8 * 16)]
|
||||
ldp x26, x27, [sp, #(9 * 16)]
|
||||
ldr x28, [sp, #(10 * 16)]
|
||||
add sp, sp, #(11*16)
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
|
||||
File diff suppressed because it is too large
|
|
@ -0,0 +1,78 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <arm_sve.h>
|
||||
|
||||
// TODO: write in assembly with proper unrolling of inner loop
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||
|
||||
BLASLONG j;
|
||||
IFLOAT *aoffset, *aoffset1, *boffset;
|
||||
|
||||
svint32_t lda_vec = svindex_s32(0LL, lda);
|
||||
uint32_t sve_size = svcntw();
|
||||
|
||||
aoffset = a;
|
||||
boffset = b;
|
||||
|
||||
j = 0;
|
||||
svbool_t pg = svwhilelt_b32(j, n);
|
||||
uint32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||
do {
|
||||
|
||||
aoffset1 = aoffset;
|
||||
|
||||
uint32_t i_cnt = m;
|
||||
while (i_cnt--) {
|
||||
svfloat32_t a_vec = svld1_gather_index(pg, (float *) aoffset1, lda_vec);
|
||||
svst1_f32(pg, (float *) boffset, a_vec);
|
||||
aoffset1++;
|
||||
boffset += active;
|
||||
}
|
||||
aoffset += sve_size * lda;
|
||||
|
||||
j += svcntw();
|
||||
pg = svwhilelt_b32(j, n);
|
||||
active = svcntp_b32(svptrue_b32(), pg);
|
||||
|
||||
} while (svptest_any(svptrue_b32(), pg));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,77 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <arm_sve.h>
|
||||
|
||||
// TODO: write in assembly with proper unrolling of inner loop
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||
|
||||
BLASLONG j;
|
||||
IFLOAT *aoffset, *aoffset1, *boffset;
|
||||
|
||||
uint32_t sve_size = svcntw();
|
||||
|
||||
aoffset = a;
|
||||
boffset = b;
|
||||
|
||||
j = 0;
|
||||
svbool_t pg = svwhilelt_b32(j, n);
|
||||
uint32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||
do {
|
||||
|
||||
aoffset1 = aoffset;
|
||||
|
||||
uint32_t i_cnt = m;
|
||||
while (i_cnt--) {
|
||||
svfloat32_t a_vec = svld1(pg, (float *) aoffset1);
|
||||
svst1_f32(pg, (float *) boffset, a_vec);
|
||||
aoffset1 += lda;
|
||||
boffset += active;
|
||||
}
|
||||
aoffset += sve_size;
|
||||
|
||||
j += svcntw();
|
||||
pg = svwhilelt_b32(j, n);
|
||||
active = svcntp_b32(svptrue_b32(), pg);
|
||||
|
||||
} while (svptest_any(svptrue_b32(), pg));
|
||||
|
||||
return 0;
|
||||
}
|
||||
File diff suppressed because it is too large
|
|
@ -0,0 +1,143 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <arm_sve.h>
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
|
||||
|
||||
BLASLONG i, offset;
|
||||
|
||||
#if defined(DOUBLE)
|
||||
uint64_t sve_size = svcntd();
|
||||
svint64_t posY_vec = svdup_s64(posY);
|
||||
svint64_t posX_vec = svdup_s64(posX);
|
||||
svint64_t lda_vec = svdup_s64(lda);
|
||||
svint64_t one_vec = svdup_s64(1LL);
|
||||
|
||||
int64_t j = 0;
|
||||
svbool_t pg = svwhilelt_b64(j, n);
|
||||
int64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||
svint64_t index_neg = svindex_s64(0LL, -1LL);
|
||||
svint64_t index = svindex_s64(0LL, 1LL);
|
||||
do {
|
||||
offset = posX - posY;
|
||||
svint64_t vec_off = svdup_s64(offset);
|
||||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
svint64_t temp = svadd_z(pg, posX_vec, index);
|
||||
svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec);
|
||||
svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda);
|
||||
svint64_t gat_ind = svsel(cmp, temp1, temp2);
|
||||
|
||||
i = m;
|
||||
while (i>0) {
|
||||
svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind);
|
||||
|
||||
gat_ind = svadd_m(cmp, gat_ind, lda_vec);
|
||||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, one_vec);
|
||||
|
||||
svst1(pg, b, data_vec);
|
||||
|
||||
b += active;
|
||||
offset --;
|
||||
vec_off = svsub_z(pg, vec_off, one_vec);
|
||||
cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
i--;
|
||||
}
|
||||
|
||||
posX += sve_size;
|
||||
posX_vec = svdup_s64(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b64(j, n);
|
||||
active = svcntp_b64(svptrue_b64(), pg);
|
||||
} while (svptest_any(svptrue_b64(), pg));
|
||||
|
||||
#else
|
||||
uint32_t sve_size = svcntw();
|
||||
svint32_t posY_vec = svdup_s32(posY);
|
||||
svint32_t posX_vec = svdup_s32(posX);
|
||||
svint32_t lda_vec = svdup_s32(lda);
|
||||
svint32_t one_vec = svdup_s32(1);
|
||||
|
||||
int32_t N = n;
|
||||
int32_t j = 0;
|
||||
svbool_t pg = svwhilelt_b32(j, N);
|
||||
int32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||
svint32_t index_neg = svindex_s32(0, -1);
|
||||
svint32_t index = svindex_s32(0, 1);
|
||||
do {
|
||||
offset = posX - posY;
|
||||
svint32_t vec_off = svdup_s32(offset);
|
||||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
svint32_t temp = svadd_z(pg, posX_vec, index);
|
||||
svint32_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec);
|
||||
svint32_t temp2 = svmla_z(pg, posY_vec, temp, lda);
|
||||
svint32_t gat_ind = svsel(cmp, temp1, temp2);
|
||||
|
||||
i = m;
|
||||
while (i>0) {
|
||||
svfloat32_t data_vec = svld1_gather_index(pg, a, gat_ind);
|
||||
|
||||
gat_ind = svadd_m(cmp, gat_ind, lda_vec);
|
||||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, one_vec);
|
||||
|
||||
svst1(pg, b, data_vec);
|
||||
|
||||
b += active;
|
||||
offset --;
|
||||
vec_off = svsub_z(pg, vec_off, one_vec);
|
||||
cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
i--;
|
||||
}
|
||||
|
||||
posX += sve_size;
|
||||
posX_vec = svdup_s32(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b32(j, N);
|
||||
active = svcntp_b32(svptrue_b32(), pg);
|
||||
} while (svptest_any(svptrue_b32(), pg));
|
||||
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
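/*
 * Editor's note (illustration only, not part of the patch): a scalar sketch of
 * the index selection in the DOUBLE branch above, for one panel of `active`
 * lanes. Each lane starts from one side of the diagonal (the temp1 or temp2
 * address) and is then advanced by lda while its cmp predicate holds and by 1
 * afterwards. panel_copy_ref is an illustrative name.
 */
static void panel_copy_ref(long m, long active, long posX, long posY,
                           long lda, const double *a, double *b)
{
    for (long l = 0; l < active; l++) {
        long off = posX - posY;                            /* vec_off before the i loop */
        long idx = (off > -l) ? (posX + l) + posY * lda    /* temp1 path                */
                              :  posY + (posX + l) * lda;  /* temp2 path                */
        for (long i = 0; i < m; i++) {
            b[i * active + l] = a[idx];
            idx += (off > -l) ? lda : 1;                   /* svadd_m on cmp / !cmp     */
            off--;                                         /* offset-- per row          */
        }
    }
}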
|
||||
|
|
@ -0,0 +1,143 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <arm_sve.h>
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
|
||||
|
||||
BLASLONG i, offset;
|
||||
|
||||
#if defined(DOUBLE)
|
||||
uint64_t sve_size = svcntd();
|
||||
svint64_t posY_vec = svdup_s64(posY);
|
||||
svint64_t posX_vec = svdup_s64(posX);
|
||||
svint64_t lda_vec = svdup_s64(lda);
|
||||
svint64_t one_vec = svdup_s64(1LL);
|
||||
|
||||
int64_t j = 0;
|
||||
svbool_t pg = svwhilelt_b64(j, n);
|
||||
int64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||
svint64_t index_neg = svindex_s64(0LL, -1LL);
|
||||
svint64_t index = svindex_s64(0LL, 1LL);
|
||||
do {
|
||||
offset = posX - posY;
|
||||
svint64_t vec_off = svdup_s64(offset);
|
||||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
svint64_t temp = svadd_z(pg, posX_vec, index);
|
||||
svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec);
|
||||
svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda);
|
||||
svint64_t gat_ind = svsel(cmp, temp2, temp1);
|
||||
|
||||
i = m;
|
||||
while (i>0) {
|
||||
svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind);
|
||||
|
||||
gat_ind = svadd_m(cmp, gat_ind, one_vec);
|
||||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
|
||||
|
||||
svst1(pg, b, data_vec);
|
||||
|
||||
b += active;
|
||||
offset --;
|
||||
vec_off = svsub_z(pg, vec_off, one_vec);
|
||||
cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
i--;
|
||||
}
|
||||
|
||||
posX += sve_size;
|
||||
posX_vec = svdup_s64(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b64(j, n);
|
||||
active = svcntp_b64(svptrue_b64(), pg);
|
||||
} while (svptest_any(svptrue_b64(), pg));
|
||||
|
||||
#else
|
||||
uint32_t sve_size = svcntw();
|
||||
svint32_t posY_vec = svdup_s32(posY);
|
||||
svint32_t posX_vec = svdup_s32(posX);
|
||||
svint32_t lda_vec = svdup_s32(lda);
|
||||
svint32_t one_vec = svdup_s32(1);
|
||||
|
||||
int32_t N = n;
|
||||
int32_t j = 0;
|
||||
svbool_t pg = svwhilelt_b32(j, N);
|
||||
int32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||
svint32_t index_neg = svindex_s32(0, -1);
|
||||
svint32_t index = svindex_s32(0, 1);
|
||||
do {
|
||||
offset = posX - posY;
|
||||
svint32_t vec_off = svdup_s32(offset);
|
||||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
svint32_t temp = svadd_z(pg, posX_vec, index);
|
||||
svint32_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec);
|
||||
svint32_t temp2 = svmla_z(pg, posY_vec, temp, lda);
|
||||
svint32_t gat_ind = svsel(cmp, temp2, temp1);
|
||||
|
||||
i = m;
|
||||
while (i>0) {
|
||||
svfloat32_t data_vec = svld1_gather_index(pg, a, gat_ind);
|
||||
|
||||
gat_ind = svadd_m(cmp, gat_ind, one_vec);
|
||||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
|
||||
|
||||
svst1(pg, b, data_vec);
|
||||
|
||||
b += active;
|
||||
offset --;
|
||||
vec_off = svsub_z(pg, vec_off, one_vec);
|
||||
cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
i--;
|
||||
}
|
||||
|
||||
posX += sve_size;
|
||||
posX_vec = svdup_s32(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b32(j, N);
|
||||
active = svcntp_b32(svptrue_b32(), pg);
|
||||
} while (svptest_any(svptrue_b32(), pg));
|
||||
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,136 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
|
||||
|
||||
BLASLONG i, js;
|
||||
BLASLONG X;
|
||||
|
||||
js = 0;
|
||||
FLOAT *ao;
|
||||
#ifdef DOUBLE
|
||||
svint64_t index = svindex_s64(0LL, lda);
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
svint32_t index = svindex_s32(0, lda);
|
||||
svbool_t pn = svwhilelt_b32(js, n);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do
|
||||
{
|
||||
X = posX;
|
||||
|
||||
if (posX <= posY) {
|
||||
ao = a + posY + posX * lda;
|
||||
} else {
|
||||
ao = a + posX + posY * lda;
|
||||
}
|
||||
|
||||
i = 0;
|
||||
do
|
||||
{
|
||||
if (X > posY) {
|
||||
#ifdef DOUBLE
|
||||
svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
|
||||
#else
|
||||
svfloat32_t aj_vec = svld1_gather_index(pn, ao, index);
|
||||
#endif
|
||||
svst1(pn, b, aj_vec);
|
||||
ao ++;
|
||||
b += n_active;
|
||||
X ++;
|
||||
i ++;
|
||||
} else
|
||||
if (X < posY) {
|
||||
ao += lda;
|
||||
b += n_active;
|
||||
X ++;
|
||||
i ++;
|
||||
} else {
|
||||
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
|
||||
#ifdef UNIT
|
||||
int temp = 0;
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
for (int k = 0 ; k < j; k++) {
|
||||
b[temp++] = *(ao+k*lda+j);
|
||||
}
|
||||
b[temp++] = ONE;
|
||||
for (int k = j+1; k < n_active; k++) {
|
||||
b[temp++] = ZERO;
|
||||
}
|
||||
}
|
||||
#else
|
||||
int temp = 0;
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
for (int k = 0 ; k <= j; k++) {
|
||||
b[temp++] = *(ao+k*lda+j);
|
||||
}
|
||||
for (int k = j+1; k < n_active; k++) {
|
||||
b[temp++] = ZERO;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
ao += n_active;
|
||||
b += n_active*n_active;
|
||||
X += n_active;
|
||||
i += n_active;
|
||||
}
|
||||
} while (i < m);
|
||||
|
||||
posY += n_active;
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, n);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
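/*
 * Editor's note (illustration only, not part of the patch): the diagonal-block
 * case above, which the comment says could not be unrolled in a
 * vector-length-agnostic way, reduces to the following scalar reference. It
 * packs an na x na triangular block starting at ao (column stride lda) row by
 * row; pack_diag_block_ref and the `unit` flag are illustrative stand-ins for
 * the UNIT compile-time switch.
 */
static void pack_diag_block_ref(int na, int unit, const FLOAT *ao,
                                BLASLONG lda, FLOAT *b)
{
    int t = 0;
    for (int j = 0; j < na; j++) {
        for (int k = 0; k < j; k++)
            b[t++] = ao[k * lda + j];              /* entries below the diagonal        */
        b[t++] = unit ? ONE : ao[j * lda + j];     /* diagonal, forced to one if UNIT   */
        for (int k = j + 1; k < na; k++)
            b[t++] = ZERO;                         /* other triangle is not referenced  */
    }
}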
|
||||
|
|
@ -0,0 +1,136 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
|
||||
|
||||
BLASLONG i, js;
|
||||
BLASLONG X;
|
||||
|
||||
FLOAT *ao;
|
||||
js = 0;
|
||||
#ifdef DOUBLE
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
svbool_t pn = svwhilelt_b32(js, n);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do
|
||||
{
|
||||
X = posX;
|
||||
|
||||
if (posX <= posY) {
|
||||
ao = a + posY + posX * lda;
|
||||
} else {
|
||||
ao = a + posX + posY * lda;
|
||||
}
|
||||
|
||||
i = 0;
|
||||
do
|
||||
{
|
||||
if (X > posY) {
|
||||
ao ++;
|
||||
b += n_active;
|
||||
X ++;
|
||||
i ++;
|
||||
} else
|
||||
if (X < posY) {
|
||||
#ifdef DOUBLE
|
||||
svfloat64_t aj_vec = svld1(pn, ao);
|
||||
#else
|
||||
svfloat32_t aj_vec = svld1(pn, ao);
|
||||
#endif
|
||||
svst1(pn, b, aj_vec);
|
||||
ao += lda;
|
||||
b += n_active;
|
||||
X ++;
|
||||
i ++;
|
||||
} else {
|
||||
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
|
||||
#ifdef UNIT
|
||||
int temp = 0;
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
for (int k = 0 ; k < j; k++) {
|
||||
b[temp++] = ZERO;
|
||||
}
|
||||
b[temp++] = ONE;
|
||||
for (int k = j+1; k < n_active; k++) {
|
||||
b[temp++] = *(ao+j*lda+k);
|
||||
}
|
||||
}
|
||||
#else
|
||||
int temp = 0;
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
for (int k = 0 ; k < j; k++) {
|
||||
b[temp++] = ZERO;
|
||||
}
|
||||
for (int k = j; k < n_active; k++) {
|
||||
b[temp++] = *(ao+j*lda+k);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
ao += n_active * lda;
|
||||
b += n_active*n_active;
|
||||
X += n_active;
|
||||
i += n_active;
|
||||
}
|
||||
} while (i < m);
|
||||
|
||||
|
||||
posY += n_active;
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, n);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,136 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
|
||||
|
||||
BLASLONG i, js;
|
||||
BLASLONG X;
|
||||
|
||||
js = 0;
|
||||
FLOAT *ao;
|
||||
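  /* index = { 0, lda, 2*lda, ... }: used by svld1_gather_index() below to
     gather elements strided by lda, i.e. to walk along a row of the
     column-major matrix A. */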
#ifdef DOUBLE
|
||||
svint64_t index = svindex_s64(0LL, lda);
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
svint32_t index = svindex_s32(0, lda);
|
||||
svbool_t pn = svwhilelt_b32(js, n);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do
|
||||
{
|
||||
X = posX;
|
||||
|
||||
if (posX <= posY) {
|
||||
ao = a + posX + posY * lda;
|
||||
} else {
|
||||
ao = a + posY + posX * lda;
|
||||
}
|
||||
|
||||
i = 0;
|
||||
do
|
||||
{
|
||||
if (X < posY) {
|
||||
#ifdef DOUBLE
|
||||
svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
|
||||
#else
|
||||
svfloat32_t aj_vec = svld1_gather_index(pn, ao, index);
|
||||
#endif
|
||||
svst1(pn, b, aj_vec);
|
||||
ao ++;
|
||||
b += n_active;
|
||||
X ++;
|
||||
i ++;
|
||||
} else
|
||||
if (X > posY) {
|
||||
ao += lda;
|
||||
b += n_active;
|
||||
X ++;
|
||||
i ++;
|
||||
} else {
|
||||
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
|
||||
#ifdef UNIT
|
||||
int temp = 0;
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
for (int k = 0 ; k < j; k++) {
|
||||
b[temp++] = ZERO;
|
||||
}
|
||||
b[temp++] = ONE;
|
||||
for (int k = j+1; k < n_active; k++) {
|
||||
b[temp++] = *(ao+k*lda+j);
|
||||
}
|
||||
}
|
||||
#else
|
||||
int temp = 0;
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
for (int k = 0 ; k < j; k++) {
|
||||
b[temp++] = ZERO;
|
||||
}
|
||||
for (int k = j; k < n_active; k++) {
|
||||
b[temp++] = *(ao+k*lda+j);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
ao += n_active;
|
||||
b += n_active*n_active;
|
||||
X += n_active;
|
||||
i += n_active;
|
||||
}
|
||||
} while (i < m);
|
||||
|
||||
posY += n_active;
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, n);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,134 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
|
||||
|
||||
BLASLONG i, js;
|
||||
BLASLONG X;
|
||||
|
||||
FLOAT *ao;
|
||||
js = 0;
|
||||
#ifdef DOUBLE
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
svbool_t pn = svwhilelt_b32(js, n);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do
|
||||
{
|
||||
X = posX;
|
||||
|
||||
if (posX <= posY) {
|
||||
ao = a + posX + posY * lda;
|
||||
} else {
|
||||
ao = a + posY + posX * lda;
|
||||
}
|
||||
|
||||
i = 0;
|
||||
do
|
||||
{
|
||||
if (X < posY) {
|
||||
ao ++;
|
||||
b += n_active;
|
||||
X ++;
|
||||
i ++;
|
||||
} else
|
||||
if (X > posY) {
|
||||
#ifdef DOUBLE
|
||||
svfloat64_t aj_vec = svld1(pn, ao);
|
||||
#else
|
||||
svfloat32_t aj_vec = svld1(pn, ao);
|
||||
#endif
|
||||
svst1(pn, b, aj_vec);
|
||||
ao += lda;
|
||||
b += n_active;
|
||||
X ++;
|
||||
i ++;
|
||||
} else {
|
||||
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
|
||||
#ifdef UNIT
|
||||
int temp = 0;
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
for (int k = 0 ; k < j; k++) {
|
||||
b[temp++] = *(ao+j*lda+k);
|
||||
}
|
||||
b[temp++] = ONE;
|
||||
for (int k = j+1; k < n_active; k++) {
|
||||
b[temp++] = ZERO;
|
||||
}
|
||||
}
|
||||
#else
|
||||
int temp = 0;
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
for (int k = 0 ; k <= j; k++) {
|
||||
b[temp++] = *(ao+j*lda+k);
|
||||
}
|
||||
for (int k = j+1; k < n_active; k++) {
|
||||
b[temp++] = ZERO;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
ao += n_active * lda;
|
||||
b += n_active*n_active;
|
||||
X += n_active;
|
||||
i += n_active;
|
||||
}
|
||||
} while (i < m);
|
||||
|
||||
posY += n_active;
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, n);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,736 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <arm_neon.h>
|
||||
|
||||
/*******************************************************************************
|
||||
The complex GEMM kernels in OpenBLAS use static configuration of conjugation
modes via specific macros:

  MACRO_NAME  | conjugation on matrix A | conjugation on matrix B |
  ----------- | ----------------------- | ----------------------- |
  NN/NT/TN/TT | No                      | No                      |
  NR/NC/TR/TC | No                      | Yes                     |
  RN/RT/CN/CT | Yes                     | No                      |
  RR/RC/CR/CC | Yes                     | Yes                     |

"conjugation on matrix A" means the complex conjugates of the elements of A
(rather than the elements themselves) are used in the multiplication;
"conjugation on matrix B" means the same for the elements of B.

Complex numbers in arrays or matrices are usually packed together as an
array of struct (without padding):
  struct complex_number {
    FLOAT real_part;
    FLOAT imag_part;
  };

For a double complex array ARR[], which is usually DEFINED AS AN ARRAY OF
DOUBLE, the real part of its Kth complex number can be accessed as
ARR[K * 2], and the imaginary part of the Kth complex number is ARR[2 * K + 1].

This file uses 2 ways to vectorize matrix multiplication of complex numbers:
|
||||
|
||||
(1) Expanded-form
|
||||
|
||||
During accumulation along direction K:
|
||||
|
||||
Σk(a[0][k].real b[k][n].real)
|
||||
accumulate Σk(a[0][k].imag b[k][n].real)
|
||||
-------------------> .
|
||||
| * b[k][n].real .
|
||||
| (broadcasted) .
|
||||
a[0][k].real Σk(a[v-1][k].real b[k][n].real)
|
||||
a[0][k].imag Σk(a[v-1][k].imag b[k][n].real)
|
||||
. VECTOR I
|
||||
(vec_a) .
|
||||
.
|
||||
a[v-1][k].real Σk(a[0][k].real b[k][n].imag)
|
||||
a[v-1][k].imag Σk(a[0][k].imag b[k][n].imag)
|
||||
| .
|
||||
| accumulate .
|
||||
-------------------> .
|
||||
* b[k][n].imag Σk(a[v-1][k].real b[k][n].imag)
|
||||
(broadcasted) Σk(a[v-1][k].imag b[k][n].imag)
|
||||
VECTOR II
|
||||
|
||||
After accumulation, prior to storage:
|
||||
|
||||
-1 -Σk(a[0][k].imag b[k][n].imag)
|
||||
1 Σk(a[0][k].real b[k][n].imag)
|
||||
. .
|
||||
VECTOR II permute and multiply . to get .
|
||||
. .
|
||||
-1 -Σk(a[v-1][k].imag b[k][n].imag)
|
||||
1 Σk(a[v-1][k].real b[k][n].imag)
|
||||
|
||||
then add with VECTOR I to get the result vector of elements of C.
|
||||
|
||||
2 vector registers are needed for every v elements of C, with
|
||||
v == sizeof(vector) / sizeof(complex)
|
||||
|
||||
(2) Contracted-form
|
||||
|
||||
During accumulation along direction K:
|
||||
|
||||
(the K coordinate is not shown, since the operation is identical for each k)
|
||||
|
||||
(load vector in mem) (load vector in mem)
|
||||
a[0].r a[0].i ... a[v-1].r a[v-1].i a[v].r a[v].i ... a[2v-1].r a[2v-1]i
|
||||
| |
|
||||
| unzip operation (or VLD2 in arm neon) |
|
||||
-----------------------------------------------------
|
||||
|
|
||||
|
|
||||
--------------------------------------------------
|
||||
| |
|
||||
| |
|
||||
v v
|
||||
a[0].real ... a[2v-1].real a[0].imag ... a[2v-1].imag
|
||||
| | | |
|
||||
| | * b[i].imag(broadcast) | |
|
||||
* b[i].real | -----------------------------|---- | * b[i].real
|
||||
(broadcast) | | | | (broadcast)
|
||||
| ------------------------------ | |
|
||||
+ | - | * b[i].imag(broadcast) + | + |
|
||||
v v v v
|
||||
(accumulate) (accumulate)
|
||||
c[0].real ... c[2v-1].real c[0].imag ... c[2v-1].imag
|
||||
VECTOR_REAL VECTOR_IMAG
|
||||
|
||||
After accumulation, VECTOR_REAL and VECTOR_IMAG are zipped (interleaved)
|
||||
then stored to matrix C directly.
|
||||
|
||||
For 2v elements of C, only 2 vector registers are needed, while
|
||||
4 registers are required for expanded-form.
|
||||
(v == sizeof(vector) / sizeof(complex))
|
||||
|
||||
For AArch64 zgemm, the 4x4 kernel would need 32 128-bit NEON registers just
to hold the elements of C in expanded form, which would cause register
spilling, so the contracted form is selected for the 4x4 kernel. All other
combinations of unroll parameters (2x4, 4x2, 2x2, and so on) use the
expanded form, bringing more NEON registers into use to hide the latency of
the multiply-add instructions.
******************************************************************************/
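
/*
 * A minimal scalar sketch of the expanded-form accumulation described above,
 * for a single element of C in the NN (no conjugation) case, with alpha
 * scaling omitted. It is only an illustration and is not called by the
 * kernels below; the function and variable names are hypothetical. The real
 * kernels keep VECTOR I/II in NEON registers (see update_ec() and store_1c()).
 */
static inline void expanded_form_reference(const double *a, const double *b,
                                           double *c_real, double *c_imag,
                                           BLASLONG K) {
  double sum_ar_br = 0.0, sum_ai_br = 0.0;  /* VECTOR I  */
  double sum_ar_bi = 0.0, sum_ai_bi = 0.0;  /* VECTOR II */
  for (BLASLONG k = 0; k < K; k++) {
    double ar = a[2 * k], ai = a[2 * k + 1];  /* packed real/imag pairs */
    double br = b[2 * k], bi = b[2 * k + 1];
    sum_ar_br += ar * br;
    sum_ai_br += ai * br;
    sum_ar_bi += ar * bi;
    sum_ai_bi += ai * bi;
  }
  /* Combine the two accumulators exactly as the permute-and-add step does. */
  *c_real += sum_ar_br - sum_ai_bi;
  *c_imag += sum_ai_br + sum_ar_bi;
}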

static inline float64x2_t set_f64x2(double lo, double hi) {
  float64x2_t ret = vdupq_n_f64(0);
  ret = vsetq_lane_f64(lo, ret, 0);
  ret = vsetq_lane_f64(hi, ret, 1);
  return ret;
}

static inline float64x2x2_t expand_alpha(double alpha_r, double alpha_i) {
  float64x2x2_t ret = {{ set_f64x2(alpha_r, alpha_i), set_f64x2(-alpha_i, alpha_r) }};
  return ret;
}

/*****************************************************************
 * operation: *c += alpha * c_value  // complex multiplication
 * expanded_alpha: {{ alpha_r, alpha_i }, { -alpha_i, alpha_r }}
 * expanded_c: {{ arbr, aibr }, { arbi, aibi }}
 ****************************************************************/
static inline void store_1c(double *c, float64x2x2_t expanded_c,
                            float64x2x2_t expanded_alpha) {
  float64x2_t ld = vld1q_f64(c);
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  double real = vgetq_lane_f64(expanded_c.val[0], 0) - vgetq_lane_f64(expanded_c.val[1], 1);
  double imag = vgetq_lane_f64(expanded_c.val[0], 1) + vgetq_lane_f64(expanded_c.val[1], 0);
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  double real = vgetq_lane_f64(expanded_c.val[0], 0) + vgetq_lane_f64(expanded_c.val[1], 1);
  double imag = vgetq_lane_f64(expanded_c.val[0], 1) - vgetq_lane_f64(expanded_c.val[1], 0);
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  double real = vgetq_lane_f64(expanded_c.val[0], 0) + vgetq_lane_f64(expanded_c.val[1], 1);
  double imag = -vgetq_lane_f64(expanded_c.val[0], 1) + vgetq_lane_f64(expanded_c.val[1], 0);
#else
  double real = vgetq_lane_f64(expanded_c.val[0], 0) - vgetq_lane_f64(expanded_c.val[1], 1);
  double imag = -vgetq_lane_f64(expanded_c.val[0], 1) - vgetq_lane_f64(expanded_c.val[1], 0);
#endif
  ld = vfmaq_n_f64(ld, expanded_alpha.val[0], real);
  vst1q_f64(c, vfmaq_n_f64(ld, expanded_alpha.val[1], imag));
}

|
||||
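/* Prefetch-for-store the 64 bytes (four complex doubles) of C starting at c;
   the second prfm covers the case where that span straddles two cache lines. */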
static inline void pref_c_4(const double *c) {
  __asm__ __volatile__("prfm pstl1keep,[%0]; prfm pstl1keep,[%0,#56]\n\t"::"r"(c):);
}

||||
|
||||
static inline float64x2x2_t add_ec(float64x2x2_t ec1, float64x2x2_t ec2) {
|
||||
float64x2x2_t ret = {{ vaddq_f64(ec1.val[0], ec2.val[0]),
|
||||
vaddq_f64(ec1.val[1], ec2.val[1]) }};
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline float64x2x2_t update_ec(float64x2x2_t ec, float64x2_t a, float64x2_t b) {
|
||||
float64x2x2_t ret = {{ vfmaq_laneq_f64(ec.val[0], a, b, 0), vfmaq_laneq_f64(ec.val[1], a, b, 1) }};
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline float64x2x2_t init() {
|
||||
float64x2x2_t ret = {{ vdupq_n_f64(0), vdupq_n_f64(0) }};
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void kernel_1x1(const double *sa, const double *sb, double *C,
|
||||
BLASLONG K, double alphar, double alphai) {
|
||||
|
||||
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
|
||||
float64x2x2_t c1, c2, c3, c4;
|
||||
c1 = c2 = c3 = c4 = init();
|
||||
|
||||
for (; K > 3; K -= 4) {
|
||||
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2),
|
||||
a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8;
|
||||
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2),
|
||||
b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8;
|
||||
c1 = update_ec(c1, a1, b1);
|
||||
c2 = update_ec(c2, a2, b2);
|
||||
c3 = update_ec(c3, a3, b3);
|
||||
c4 = update_ec(c4, a4, b4);
|
||||
}
|
||||
c1 = add_ec(c1, c2);
|
||||
c3 = add_ec(c3, c4);
|
||||
c1 = add_ec(c1, c3);
|
||||
for (; K; K--) {
|
||||
c1 = update_ec(c1, vld1q_f64(sa), vld1q_f64(sb)); sa += 2; sb += 2;
|
||||
}
|
||||
store_1c(C, c1, expanded_alpha);
|
||||
}
|
||||
|
||||
static inline void kernel_2x1(const double *sa, const double *sb, double *C,
|
||||
BLASLONG K, double alphar, double alphai) {
|
||||
|
||||
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
|
||||
float64x2x2_t c1, c2, c3, c4;
|
||||
c1 = c2 = c3 = c4 = init();
|
||||
|
||||
for (; K > 1; K -= 2) {
|
||||
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2),
|
||||
a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8;
|
||||
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4;
|
||||
c1 = update_ec(c1, a1, b1);
|
||||
c2 = update_ec(c2, a2, b1);
|
||||
c3 = update_ec(c3, a3, b2);
|
||||
c4 = update_ec(c4, a4, b2);
|
||||
}
|
||||
c1 = add_ec(c1, c3);
|
||||
c2 = add_ec(c2, c4);
|
||||
if (K) {
|
||||
float64x2_t b1 = vld1q_f64(sb);
|
||||
c1 = update_ec(c1, vld1q_f64(sa), b1);
|
||||
c2 = update_ec(c2, vld1q_f64(sa + 2), b1);
|
||||
}
|
||||
store_1c(C, c1, expanded_alpha);
|
||||
store_1c(C + 2, c2, expanded_alpha);
|
||||
}
|
||||
|
||||
static inline void kernel_1x2(const double *sa, const double *sb, double *C,
|
||||
BLASLONG LDC, BLASLONG K, double alphar, double alphai) {
|
||||
|
||||
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
|
||||
float64x2x2_t c1, c2, c3, c4;
|
||||
c1 = c2 = c3 = c4 = init();
|
||||
|
||||
for (; K > 1; K -= 2) {
|
||||
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4;
|
||||
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2),
|
||||
b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8;
|
||||
c1 = update_ec(c1, a1, b1);
|
||||
c2 = update_ec(c2, a1, b2);
|
||||
c3 = update_ec(c3, a2, b3);
|
||||
c4 = update_ec(c4, a2, b4);
|
||||
}
|
||||
c1 = add_ec(c1, c3);
|
||||
c2 = add_ec(c2, c4);
|
||||
if (K) {
|
||||
float64x2_t a1 = vld1q_f64(sa);
|
||||
c1 = update_ec(c1, a1, vld1q_f64(sb));
|
||||
c2 = update_ec(c2, a1, vld1q_f64(sb + 2));
|
||||
}
|
||||
store_1c(C, c1, expanded_alpha);
|
||||
store_1c(C + LDC * 2, c2, expanded_alpha);
|
||||
}
|
||||
|
||||
static inline void kernel_2x2(const double *sa, const double *sb, double *C,
|
||||
BLASLONG LDC, BLASLONG K, double alphar, double alphai) {
|
||||
|
||||
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
|
||||
float64x2x2_t c1, c2, c3, c4;
|
||||
c1 = c2 = c3 = c4 = init();
|
||||
|
||||
for (; K; K--) {
|
||||
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4;
|
||||
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4;
|
||||
c1 = update_ec(c1, a1, b1);
|
||||
c2 = update_ec(c2, a2, b1);
|
||||
c3 = update_ec(c3, a1, b2);
|
||||
c4 = update_ec(c4, a2, b2);
|
||||
}
|
||||
store_1c(C, c1, expanded_alpha);
|
||||
store_1c(C + 2, c2, expanded_alpha); C += LDC * 2;
|
||||
store_1c(C, c3, expanded_alpha);
|
||||
store_1c(C + 2, c4, expanded_alpha);
|
||||
}
|
||||
|
||||
static inline void kernel_4x1(const double *sa, const double *sb, double *C,
|
||||
BLASLONG K, double alphar, double alphai) {
|
||||
|
||||
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
|
||||
float64x2x2_t c1, c2, c3, c4;
|
||||
c1 = c2 = c3 = c4 = init();
|
||||
pref_c_4(C);
|
||||
|
||||
for (; K; K--) {
|
||||
float64x2_t b1 = vld1q_f64(sb); sb += 2;
|
||||
c1 = update_ec(c1, vld1q_f64(sa), b1);
|
||||
c2 = update_ec(c2, vld1q_f64(sa + 2), b1);
|
||||
c3 = update_ec(c3, vld1q_f64(sa + 4), b1);
|
||||
c4 = update_ec(c4, vld1q_f64(sa + 6), b1);
|
||||
sa += 8;
|
||||
}
|
||||
store_1c(C, c1, expanded_alpha);
|
||||
store_1c(C + 2, c2, expanded_alpha);
|
||||
store_1c(C + 4, c3, expanded_alpha);
|
||||
store_1c(C + 6, c4, expanded_alpha);
|
||||
}
|
||||
|
||||
static inline void kernel_4x2(const double *sa, const double *sb, double *C,
|
||||
BLASLONG LDC, BLASLONG K, double alphar, double alphai) {
|
||||
|
||||
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
|
||||
float64x2x2_t c1, c2, c3, c4, c5, c6, c7, c8;
|
||||
c1 = c2 = c3 = c4 = c5 = c6 = c7 = c8 = init();
|
||||
pref_c_4(C);
|
||||
pref_c_4(C + LDC * 2);
|
||||
|
||||
for (; K; K--) {
|
||||
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4;
|
||||
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2),
|
||||
a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8;
|
||||
c1 = update_ec(c1, a1, b1);
|
||||
c2 = update_ec(c2, a2, b1);
|
||||
c3 = update_ec(c3, a3, b1);
|
||||
c4 = update_ec(c4, a4, b1);
|
||||
c5 = update_ec(c5, a1, b2);
|
||||
c6 = update_ec(c6, a2, b2);
|
||||
c7 = update_ec(c7, a3, b2);
|
||||
c8 = update_ec(c8, a4, b2);
|
||||
}
|
||||
store_1c(C, c1, expanded_alpha);
|
||||
store_1c(C + 2, c2, expanded_alpha);
|
||||
store_1c(C + 4, c3, expanded_alpha);
|
||||
store_1c(C + 6, c4, expanded_alpha); C += LDC * 2;
|
||||
store_1c(C, c5, expanded_alpha);
|
||||
store_1c(C + 2, c6, expanded_alpha);
|
||||
store_1c(C + 4, c7, expanded_alpha);
|
||||
store_1c(C + 6, c8, expanded_alpha);
|
||||
}
|
||||
|
||||
static inline void kernel_1x4(const double *sa, const double *sb, double *C,
|
||||
BLASLONG LDC, BLASLONG K, double alphar, double alphai) {
|
||||
|
||||
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
|
||||
float64x2x2_t c1, c2, c3, c4;
|
||||
c1 = c2 = c3 = c4 = init();
|
||||
|
||||
for (; K; K--) {
|
||||
float64x2_t a1 = vld1q_f64(sa); sa += 2;
|
||||
c1 = update_ec(c1, a1, vld1q_f64(sb));
|
||||
c2 = update_ec(c2, a1, vld1q_f64(sb + 2));
|
||||
c3 = update_ec(c3, a1, vld1q_f64(sb + 4));
|
||||
c4 = update_ec(c4, a1, vld1q_f64(sb + 6));
|
||||
sb += 8;
|
||||
}
|
||||
store_1c(C, c1, expanded_alpha); C += LDC * 2;
|
||||
store_1c(C, c2, expanded_alpha); C += LDC * 2;
|
||||
store_1c(C, c3, expanded_alpha); C += LDC * 2;
|
||||
store_1c(C, c4, expanded_alpha);
|
||||
}
|
||||
|
||||
static inline void kernel_2x4(const double *sa, const double *sb, double *C,
|
||||
BLASLONG LDC, BLASLONG K, double alphar, double alphai) {
|
||||
|
||||
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
|
||||
float64x2x2_t c1, c2, c3, c4, c5, c6, c7, c8;
|
||||
c1 = c2 = c3 = c4 = c5 = c6 = c7 = c8 = init();
|
||||
|
||||
for (; K; K--) {
|
||||
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4;
|
||||
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2),
|
||||
b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8;
|
||||
c1 = update_ec(c1, a1, b1);
|
||||
c2 = update_ec(c2, a2, b1);
|
||||
c3 = update_ec(c3, a1, b2);
|
||||
c4 = update_ec(c4, a2, b2);
|
||||
c5 = update_ec(c5, a1, b3);
|
||||
c6 = update_ec(c6, a2, b3);
|
||||
c7 = update_ec(c7, a1, b4);
|
||||
c8 = update_ec(c8, a2, b4);
|
||||
}
|
||||
store_1c(C, c1, expanded_alpha);
|
||||
store_1c(C + 2, c2, expanded_alpha); C += LDC * 2;
|
||||
store_1c(C, c3, expanded_alpha);
|
||||
store_1c(C + 2, c4, expanded_alpha); C += LDC * 2;
|
||||
store_1c(C, c5, expanded_alpha);
|
||||
store_1c(C + 2, c6, expanded_alpha); C += LDC * 2;
|
||||
store_1c(C, c7, expanded_alpha);
|
||||
store_1c(C + 2, c8, expanded_alpha);
|
||||
}
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
#define FMLA_RI "fmla "
|
||||
#define FMLA_IR "fmla "
|
||||
#define FMLA_II "fmls "
|
||||
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
#define FMLA_RI "fmls "
|
||||
#define FMLA_IR "fmla "
|
||||
#define FMLA_II "fmla "
|
||||
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
#define FMLA_RI "fmla "
|
||||
#define FMLA_IR "fmls "
|
||||
#define FMLA_II "fmla "
|
||||
#else
|
||||
#define FMLA_RI "fmls "
|
||||
#define FMLA_IR "fmls "
|
||||
#define FMLA_II "fmls "
|
||||
#endif
|
||||
#define FMLA_RR "fmla "
|
||||
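/*
 * Complex multiply-accumulate c += a*b with a = ar + i*ai, b = br + i*bi:
 *   Re(c) += ar*br -/+ ai*bi  (minus when neither or both matrices are
 *                              conjugated, plus when exactly one is)
 *   Im(c) += ai*br + ar*bi    (the ai*br term changes sign when A is
 *                              conjugated, the ar*bi term when B is)
 * The FMLA_* macros above select fmla/fmls accordingly: FMLA_RR handles the
 * ar*br term, FMLA_II the ai*bi term, FMLA_IR the ai*br term and FMLA_RI the
 * ar*bi term.
 */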
|
||||
static inline void store_4c(double *C, float64x2_t up_r, float64x2_t up_i,
|
||||
float64x2_t lo_r, float64x2_t lo_i, double alphar, double alphai) {
|
||||
float64x2x2_t up = vld2q_f64(C), lo = vld2q_f64(C + 4);
|
||||
up.val[0] = vfmaq_n_f64(up.val[0], up_r, alphar);
|
||||
up.val[1] = vfmaq_n_f64(up.val[1], up_r, alphai);
|
||||
lo.val[0] = vfmaq_n_f64(lo.val[0], lo_r, alphar);
|
||||
lo.val[1] = vfmaq_n_f64(lo.val[1], lo_r, alphai);
|
||||
up.val[0] = vfmsq_n_f64(up.val[0], up_i, alphai);
|
||||
up.val[1] = vfmaq_n_f64(up.val[1], up_i, alphar);
|
||||
lo.val[0] = vfmsq_n_f64(lo.val[0], lo_i, alphai);
|
||||
lo.val[1] = vfmaq_n_f64(lo.val[1], lo_i, alphar);
|
||||
vst2q_f64(C, up);
|
||||
vst2q_f64(C + 4, lo);
|
||||
}
|
||||
|
||||
static inline void kernel_4x4(const double *sa, const double *sb, double *C,
|
||||
BLASLONG LDC, BLASLONG K, double alphar, double alphai) {
|
||||
|
||||
float64x2_t c1r, c1i, c2r, c2i;
|
||||
float64x2_t c3r, c3i, c4r, c4i;
|
||||
float64x2_t c5r, c5i, c6r, c6i;
|
||||
float64x2_t c7r, c7i, c8r, c8i;
|
||||
|
||||
const double *pref_ = C;
|
||||
pref_c_4(pref_); pref_ += LDC * 2;
|
||||
pref_c_4(pref_); pref_ += LDC * 2;
|
||||
pref_c_4(pref_); pref_ += LDC * 2;
|
||||
pref_c_4(pref_);
|
||||
|
||||
__asm__ __volatile__(
|
||||
"cmp %[K],#0\n\t"
|
||||
"movi %[c1r].16b,#0; movi %[c1i].16b,#0; movi %[c2r].16b,#0; movi %[c2i].16b,#0\n\t"
|
||||
"movi %[c3r].16b,#0; movi %[c3i].16b,#0; movi %[c4r].16b,#0; movi %[c4i].16b,#0\n\t"
|
||||
"movi %[c5r].16b,#0; movi %[c5i].16b,#0; movi %[c6r].16b,#0; movi %[c6i].16b,#0\n\t"
|
||||
"movi %[c7r].16b,#0; movi %[c7i].16b,#0; movi %[c8r].16b,#0; movi %[c8i].16b,#0\n\t"
|
||||
"beq 4f; cmp %[K],#2\n\t"
|
||||
"ld2 {v0.2d,v1.2d},[%[sa]],#32; ldp q4,q5,[%[sb]],#32\n\t"
|
||||
"ld2 {v2.2d,v3.2d},[%[sa]],#32; ldr q6,[%[sb]]; ldr d7,[%[sb],#16]\n\t"
|
||||
"ldr x0,[%[sb],#24]; add %[sb],%[sb],#32\n\t"
|
||||
"beq 2f; blt 3f\n\t"
|
||||
"1:\n\t"
|
||||
"fmov v7.d[1],x0; ldr d8,[%[sa]]\n\t"
|
||||
FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]; ldr x0,[%[sa],#16]\n\t"
|
||||
FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t"
|
||||
FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t"
|
||||
"fmov v8.d[1],x0; ldr d9,[%[sa],#8]\n\t"
|
||||
FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]; ldr x0,[%[sa],#24]\n\t"
|
||||
FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t"
|
||||
FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t"
|
||||
"fmov v9.d[1],x0; ldr d10,[%[sa],#32]\n\t"
|
||||
FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]; ldr x0,[%[sa],#48]\n\t"
|
||||
FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t"
|
||||
FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t"
|
||||
"fmov v10.d[1],x0; ldr d11,[%[sa],#40]\n\t"
|
||||
FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#56]\n\t"
|
||||
FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t"
|
||||
FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t"
|
||||
"fmov v11.d[1],x0; ldr d12,[%[sb]]\n\t"
|
||||
FMLA_II "%[c3r].2d,v1.2d,v5.d[1]; ldr x0,[%[sb],#8]\n\t"
|
||||
FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t"
|
||||
FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t"
|
||||
"fmov v12.d[1],x0; ldr d13,[%[sb],#16]\n\t"
|
||||
FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]; ldr x0,[%[sb],#24]\n\t"
|
||||
FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t"
|
||||
FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t"
|
||||
"fmov v13.d[1],x0; ldr d14,[%[sb],#32]\n\t"
|
||||
FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]; ldr x0,[%[sb],#40]\n\t"
|
||||
FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t"
|
||||
FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t"
|
||||
"fmov v14.d[1],x0; ldr d15,[%[sb],#48]\n\t"
|
||||
FMLA_II "%[c6r].2d,v3.2d,v6.d[1]; ldr x0,[%[sb],#56]\n\t"
|
||||
FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t"
|
||||
FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t"
|
||||
"fmov v15.d[1],x0; ldr d4,[%[sb],#64]\n\t"
|
||||
FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]; ldr x0,[%[sb],#72]\n\t"
|
||||
FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t"
|
||||
FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t"
|
||||
"fmov v4.d[1],x0; ldr d5,[%[sb],#80]\n\t"
|
||||
FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]; ldr x0,[%[sb],#88]\n\t"
|
||||
FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t"
|
||||
FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t"
|
||||
"fmov v5.d[1],x0; ldr d0,[%[sa],#64]\n\t"
|
||||
FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]; ldr x0,[%[sa],#80]\n\t"
|
||||
FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]\n\t"
|
||||
FMLA_RR "%[c1r].2d,v8.2d,v12.d[0]\n\t"
|
||||
"fmov v0.d[1],x0; ldr d1,[%[sa],#72]\n\t"
|
||||
FMLA_RR "%[c2r].2d,v10.2d,v12.d[0]; ldr x0,[%[sa],#88]\n\t"
|
||||
FMLA_RI "%[c1i].2d,v8.2d,v12.d[1]\n\t"
|
||||
FMLA_RI "%[c2i].2d,v10.2d,v12.d[1]\n\t"
|
||||
"fmov v1.d[1],x0; ldr d2,[%[sa],#96]\n\t"
|
||||
FMLA_II "%[c1r].2d,v9.2d,v12.d[1]; ldr x0,[%[sa],#112]\n\t"
|
||||
FMLA_II "%[c2r].2d,v11.2d,v12.d[1]\n\t"
|
||||
FMLA_IR "%[c1i].2d,v9.2d,v12.d[0]\n\t"
|
||||
"fmov v2.d[1],x0; ldr d3,[%[sa],#104]\n\t"
|
||||
FMLA_IR "%[c2i].2d,v11.2d,v12.d[0]; ldr x0,[%[sa],#120]\n\t"
|
||||
FMLA_RR "%[c3r].2d,v8.2d,v13.d[0]\n\t"
|
||||
FMLA_RR "%[c4r].2d,v10.2d,v13.d[0]\n\t"
|
||||
"fmov v3.d[1],x0; ldr d6,[%[sb],#96]\n\t"
|
||||
FMLA_RI "%[c3i].2d,v8.2d,v13.d[1]; ldr x0,[%[sb],#104]\n\t"
|
||||
FMLA_RI "%[c4i].2d,v10.2d,v13.d[1]\n\t"
|
||||
FMLA_II "%[c3r].2d,v9.2d,v13.d[1]\n\t"
|
||||
"fmov v6.d[1],x0; ldr d7,[%[sb],#112]\n\t"
|
||||
FMLA_II "%[c4r].2d,v11.2d,v13.d[1]; ldr x0,[%[sb],#120]\n\t"
|
||||
FMLA_IR "%[c3i].2d,v9.2d,v13.d[0]\n\t"
|
||||
FMLA_IR "%[c4i].2d,v11.2d,v13.d[0]; prfm pldl1keep,[%[sa],#256]\n\t"
|
||||
FMLA_RR "%[c5r].2d,v8.2d,v14.d[0]\n\t"
|
||||
FMLA_RR "%[c6r].2d,v10.2d,v14.d[0]; prfm pldl1keep,[%[sa],#320]\n\t"
|
||||
FMLA_RI "%[c5i].2d,v8.2d,v14.d[1]\n\t"
|
||||
FMLA_RI "%[c6i].2d,v10.2d,v14.d[1]; prfm pldl1keep,[%[sb],#256]\n\t"
|
||||
FMLA_II "%[c5r].2d,v9.2d,v14.d[1]\n\t"
|
||||
FMLA_II "%[c6r].2d,v11.2d,v14.d[1]; prfm pldl1keep,[%[sb],#320]\n\t"
|
||||
FMLA_IR "%[c5i].2d,v9.2d,v14.d[0]\n\t"
|
||||
FMLA_IR "%[c6i].2d,v11.2d,v14.d[0]; add %[sa],%[sa],#128\n\t"
|
||||
FMLA_RR "%[c7r].2d,v8.2d,v15.d[0]\n\t"
|
||||
FMLA_RR "%[c8r].2d,v10.2d,v15.d[0]; add %[sb],%[sb],#128\n\t"
|
||||
FMLA_RI "%[c7i].2d,v8.2d,v15.d[1]\n\t"
|
||||
FMLA_RI "%[c8i].2d,v10.2d,v15.d[1]; sub %[K],%[K],#2\n\t"
|
||||
FMLA_II "%[c7r].2d,v9.2d,v15.d[1]\n\t"
|
||||
FMLA_II "%[c8r].2d,v11.2d,v15.d[1]; cmp %[K],#2\n\t"
|
||||
FMLA_IR "%[c7i].2d,v9.2d,v15.d[0]\n\t"
|
||||
FMLA_IR "%[c8i].2d,v11.2d,v15.d[0]; bgt 1b; blt 3f\n\t"
|
||||
"2:\n\t"
|
||||
"fmov v7.d[1],x0; ldr d8,[%[sa]]\n\t"
|
||||
FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]; ldr x0,[%[sa],#16]\n\t"
|
||||
FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t"
|
||||
FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t"
|
||||
"fmov v8.d[1],x0; ldr d9,[%[sa],#8]\n\t"
|
||||
FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]; ldr x0,[%[sa],#24]\n\t"
|
||||
FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t"
|
||||
FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t"
|
||||
"fmov v9.d[1],x0; ldr d10,[%[sa],#32]\n\t"
|
||||
FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]; ldr x0,[%[sa],#48]\n\t"
|
||||
FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t"
|
||||
FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t"
|
||||
"fmov v10.d[1],x0; ldr d11,[%[sa],#40]\n\t"
|
||||
FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#56]\n\t"
|
||||
FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t"
|
||||
FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t"
|
||||
"fmov v11.d[1],x0; ldr d12,[%[sb]]\n\t"
|
||||
FMLA_II "%[c3r].2d,v1.2d,v5.d[1]; ldr x0,[%[sb],#8]\n\t"
|
||||
FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t"
|
||||
FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t"
|
||||
"fmov v12.d[1],x0; ldr d13,[%[sb],#16]\n\t"
|
||||
FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]; ldr x0,[%[sb],#24]\n\t"
|
||||
FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t"
|
||||
FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t"
|
||||
"fmov v13.d[1],x0; ldr d14,[%[sb],#32]\n\t"
|
||||
FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]; ldr x0,[%[sb],#40]\n\t"
|
||||
FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t"
|
||||
FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t"
|
||||
"fmov v14.d[1],x0; ldr d15,[%[sb],#48]\n\t"
|
||||
FMLA_II "%[c6r].2d,v3.2d,v6.d[1]; ldr x0,[%[sb],#56]\n\t"
|
||||
FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t"
|
||||
FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t"
|
||||
"fmov v15.d[1],x0\n\t"
|
||||
FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]\n\t"
|
||||
FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t"
|
||||
FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t"
|
||||
FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]\n\t"
|
||||
FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t"
|
||||
FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t"
|
||||
FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]\n\t"
|
||||
FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]\n\t"
|
||||
FMLA_RR "%[c1r].2d,v8.2d,v12.d[0]\n\t"
|
||||
FMLA_RR "%[c2r].2d,v10.2d,v12.d[0]\n\t"
|
||||
FMLA_RI "%[c1i].2d,v8.2d,v12.d[1]\n\t"
|
||||
FMLA_RI "%[c2i].2d,v10.2d,v12.d[1]\n\t"
|
||||
FMLA_II "%[c1r].2d,v9.2d,v12.d[1]\n\t"
|
||||
FMLA_II "%[c2r].2d,v11.2d,v12.d[1]\n\t"
|
||||
FMLA_IR "%[c1i].2d,v9.2d,v12.d[0]\n\t"
|
||||
FMLA_IR "%[c2i].2d,v11.2d,v12.d[0]\n\t"
|
||||
FMLA_RR "%[c3r].2d,v8.2d,v13.d[0]\n\t"
|
||||
FMLA_RR "%[c4r].2d,v10.2d,v13.d[0]\n\t"
|
||||
FMLA_RI "%[c3i].2d,v8.2d,v13.d[1]\n\t"
|
||||
FMLA_RI "%[c4i].2d,v10.2d,v13.d[1]\n\t"
|
||||
FMLA_II "%[c3r].2d,v9.2d,v13.d[1]\n\t"
|
||||
FMLA_II "%[c4r].2d,v11.2d,v13.d[1]\n\t"
|
||||
FMLA_IR "%[c3i].2d,v9.2d,v13.d[0]\n\t"
|
||||
FMLA_IR "%[c4i].2d,v11.2d,v13.d[0]\n\t"
|
||||
FMLA_RR "%[c5r].2d,v8.2d,v14.d[0]\n\t"
|
||||
FMLA_RR "%[c6r].2d,v10.2d,v14.d[0]\n\t"
|
||||
FMLA_RI "%[c5i].2d,v8.2d,v14.d[1]\n\t"
|
||||
FMLA_RI "%[c6i].2d,v10.2d,v14.d[1]\n\t"
|
||||
FMLA_II "%[c5r].2d,v9.2d,v14.d[1]\n\t"
|
||||
FMLA_II "%[c6r].2d,v11.2d,v14.d[1]\n\t"
|
||||
FMLA_IR "%[c5i].2d,v9.2d,v14.d[0]\n\t"
|
||||
FMLA_IR "%[c6i].2d,v11.2d,v14.d[0]; add %[sa],%[sa],#64\n\t"
|
||||
FMLA_RR "%[c7r].2d,v8.2d,v15.d[0]\n\t"
|
||||
FMLA_RR "%[c8r].2d,v10.2d,v15.d[0]; add %[sb],%[sb],#64\n\t"
|
||||
FMLA_RI "%[c7i].2d,v8.2d,v15.d[1]\n\t"
|
||||
FMLA_RI "%[c8i].2d,v10.2d,v15.d[1]; sub %[K],%[K],#2\n\t"
|
||||
FMLA_II "%[c7r].2d,v9.2d,v15.d[1]\n\t"
|
||||
FMLA_II "%[c8r].2d,v11.2d,v15.d[1]\n\t"
|
||||
FMLA_IR "%[c7i].2d,v9.2d,v15.d[0]\n\t"
|
||||
FMLA_IR "%[c8i].2d,v11.2d,v15.d[0]; b 4f\n\t"
|
||||
"3:\n\t"
|
||||
"fmov v7.d[1],x0\n\t"
|
||||
FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]\n\t"
|
||||
FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t"
|
||||
FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t"
|
||||
FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]\n\t"
|
||||
FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t"
|
||||
FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t"
|
||||
FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]\n\t"
|
||||
FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t"
|
||||
FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t"
|
||||
FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]\n\t"
|
||||
FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t"
|
||||
FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t"
|
||||
FMLA_II "%[c3r].2d,v1.2d,v5.d[1]\n\t"
|
||||
FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t"
|
||||
FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t"
|
||||
FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]\n\t"
|
||||
FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t"
|
||||
FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t"
|
||||
FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]\n\t"
|
||||
FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t"
|
||||
FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t"
|
||||
FMLA_II "%[c6r].2d,v3.2d,v6.d[1]\n\t"
|
||||
FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t"
|
||||
FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t"
|
||||
FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]\n\t"
|
||||
FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t"
|
||||
FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t"
|
||||
FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]\n\t"
|
||||
FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t"
|
||||
FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t"
|
||||
FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]\n\t"
|
||||
FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]; sub %[K],%[K],#1\n\t"
|
||||
"4:\n\t"
|
||||
:[c1r]"=w"(c1r), [c1i]"=w"(c1i), [c2r]"=w"(c2r), [c2i]"=w"(c2i),
|
||||
[c3r]"=w"(c3r), [c3i]"=w"(c3i), [c4r]"=w"(c4r), [c4i]"=w"(c4i),
|
||||
[c5r]"=w"(c5r), [c5i]"=w"(c5i), [c6r]"=w"(c6r), [c6i]"=w"(c6i),
|
||||
[c7r]"=w"(c7r), [c7i]"=w"(c7i), [c8r]"=w"(c8r), [c8i]"=w"(c8i),
|
||||
[K]"+r"(K), [sa]"+r"(sa), [sb]"+r"(sb)
|
||||
::"cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
|
||||
|
||||
store_4c(C, c1r, c1i, c2r, c2i, alphar, alphai); C += LDC * 2;
|
||||
store_4c(C, c3r, c3i, c4r, c4i, alphar, alphai); C += LDC * 2;
|
||||
store_4c(C, c5r, c5i, c6r, c6i, alphar, alphai); C += LDC * 2;
|
||||
store_4c(C, c7r, c7i, c8r, c8i, alphar, alphai);
|
||||
}
|
||||
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai,
|
||||
FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) {
|
||||
|
||||
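  /* Tile the 4x4 kernel over the packed panels: each 4-column (complex)
     panel of B is 8*K doubles of sb, each 4-row panel of A is 8*K doubles
     of sa; the edge cases fall through to the 2x and 1x kernels. */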
BLASLONG n_left = N;
|
||||
for (; n_left >= 4; n_left -= 4) {
|
||||
const FLOAT *a_ = sa;
|
||||
FLOAT *c_ = C;
|
||||
BLASLONG m_left = M;
|
||||
for (; m_left >= 4; m_left -= 4) {
|
||||
kernel_4x4(a_, sb, c_, LDC, K, alphar, alphai);
|
||||
a_ += 8 * K;
|
||||
c_ += 8;
|
||||
}
|
||||
if (m_left >= 2) {
|
||||
m_left -= 2;
|
||||
kernel_2x4(a_, sb, c_, LDC, K, alphar, alphai);
|
||||
a_ += 4 * K;
|
||||
c_ += 4;
|
||||
}
|
||||
if (m_left) {
|
||||
kernel_1x4(a_, sb, c_, LDC, K, alphar, alphai);
|
||||
}
|
||||
sb += 8 * K;
|
||||
C += 8 * LDC;
|
||||
}
|
||||
if (n_left >= 2) {
|
||||
n_left -= 2;
|
||||
const FLOAT *a_ = sa;
|
||||
FLOAT *c_ = C;
|
||||
BLASLONG m_left = M;
|
||||
for (; m_left >= 4; m_left -= 4) {
|
||||
kernel_4x2(a_, sb, c_, LDC, K, alphar, alphai);
|
||||
a_ += 8 * K;
|
||||
c_ += 8;
|
||||
}
|
||||
if (m_left >= 2) {
|
||||
m_left -= 2;
|
||||
kernel_2x2(a_, sb, c_, LDC, K, alphar, alphai);
|
||||
a_ += 4 * K;
|
||||
c_ += 4;
|
||||
}
|
||||
if (m_left) {
|
||||
kernel_1x2(a_, sb, c_, LDC, K, alphar, alphai);
|
||||
}
|
||||
sb += 4 * K;
|
||||
C += 4 * LDC;
|
||||
}
|
||||
if (n_left) {
|
||||
const FLOAT *a_ = sa;
|
||||
FLOAT *c_ = C;
|
||||
BLASLONG m_left = M;
|
||||
for (; m_left >= 4; m_left -= 4) {
|
||||
kernel_4x1(a_, sb, c_, K, alphar, alphai);
|
||||
a_ += 8 * K;
|
||||
c_ += 8;
|
||||
}
|
||||
if (m_left >= 2) {
|
||||
m_left -= 2;
|
||||
kernel_2x1(a_, sb, c_, K, alphar, alphai);
|
||||
a_ += 4 * K;
|
||||
c_ += 4;
|
||||
}
|
||||
if (m_left) {
|
||||
kernel_1x1(a_, sb, c_, K, alphar, alphai);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,160 @@
|
|||
SGEMM_BETA = ../generic/gemm_beta.c
|
||||
DGEMM_BETA = ../generic/gemm_beta.c
|
||||
CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
ZGEMM_BETA = ../generic/zgemm_beta.c
|
||||
|
||||
STRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
|
||||
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
|
||||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
#Pure C for other kernels
|
||||
SAMAXKERNEL = ../mips/amax.c
|
||||
DAMAXKERNEL = ../mips/amax.c
|
||||
CAMAXKERNEL = ../mips/zamax.c
|
||||
ZAMAXKERNEL = ../mips/zamax.c
|
||||
|
||||
SAMINKERNEL = ../mips/amin.c
|
||||
DAMINKERNEL = ../mips/amin.c
|
||||
CAMINKERNEL = ../mips/zamin.c
|
||||
ZAMINKERNEL = ../mips/zamin.c
|
||||
|
||||
SMAXKERNEL = ../mips/max.c
|
||||
DMAXKERNEL = ../mips/max.c
|
||||
|
||||
SMINKERNEL = ../mips/min.c
|
||||
DMINKERNEL = ../mips/min.c
|
||||
|
||||
ISAMAXKERNEL = ../mips/iamax.c
|
||||
IDAMAXKERNEL = ../mips/iamax.c
|
||||
ICAMAXKERNEL = ../mips/izamax.c
|
||||
IZAMAXKERNEL = ../mips/izamax.c
|
||||
|
||||
ISAMINKERNEL = ../mips/iamin.c
|
||||
IDAMINKERNEL = ../mips/iamin.c
|
||||
ICAMINKERNEL = ../mips/izamin.c
|
||||
IZAMINKERNEL = ../mips/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../mips/imax.c
|
||||
IDMAXKERNEL = ../mips/imax.c
|
||||
|
||||
ISMINKERNEL = ../mips/imin.c
|
||||
IDMINKERNEL = ../mips/imin.c
|
||||
|
||||
SASUMKERNEL = ../mips/asum.c
|
||||
DASUMKERNEL = ../mips/asum.c
|
||||
CASUMKERNEL = ../mips/zasum.c
|
||||
ZASUMKERNEL = ../mips/zasum.c
|
||||
|
||||
SSUMKERNEL = ../mips/sum.c
|
||||
DSUMKERNEL = ../mips/sum.c
|
||||
CSUMKERNEL = ../mips/zsum.c
|
||||
ZSUMKERNEL = ../mips/zsum.c
|
||||
|
||||
SAXPYKERNEL = ../mips/axpy.c
|
||||
DAXPYKERNEL = ../mips/axpy.c
|
||||
CAXPYKERNEL = ../mips/zaxpy.c
|
||||
ZAXPYKERNEL = ../mips/zaxpy.c
|
||||
|
||||
SCOPYKERNEL = ../mips/copy.c
|
||||
DCOPYKERNEL = ../mips/copy.c
|
||||
CCOPYKERNEL = ../mips/zcopy.c
|
||||
ZCOPYKERNEL = ../mips/zcopy.c
|
||||
|
||||
SDOTKERNEL = ../mips/dot.c
|
||||
DDOTKERNEL = ../mips/dot.c
|
||||
CDOTKERNEL = ../mips/zdot.c
|
||||
ZDOTKERNEL = ../mips/zdot.c
|
||||
|
||||
SNRM2KERNEL = ../mips/nrm2.c
|
||||
DNRM2KERNEL = ../mips/nrm2.c
|
||||
CNRM2KERNEL = ../mips/znrm2.c
|
||||
ZNRM2KERNEL = ../mips/znrm2.c
|
||||
|
||||
SROTKERNEL = ../mips/rot.c
|
||||
DROTKERNEL = ../mips/rot.c
|
||||
CROTKERNEL = ../mips/zrot.c
|
||||
ZROTKERNEL = ../mips/zrot.c
|
||||
|
||||
SSCALKERNEL = ../mips/scal.c
|
||||
DSCALKERNEL = ../mips/scal.c
|
||||
CSCALKERNEL = ../mips/zscal.c
|
||||
ZSCALKERNEL = ../mips/zscal.c
|
||||
|
||||
SSWAPKERNEL = ../mips/swap.c
|
||||
DSWAPKERNEL = ../mips/swap.c
|
||||
CSWAPKERNEL = ../mips/zswap.c
|
||||
ZSWAPKERNEL = ../mips/zswap.c
|
||||
|
||||
SGEMVNKERNEL = ../mips/gemv_n.c
|
||||
DGEMVNKERNEL = ../mips/gemv_n.c
|
||||
CGEMVNKERNEL = ../mips/zgemv_n.c
|
||||
ZGEMVNKERNEL = ../mips/zgemv_n.c
|
||||
|
||||
SGEMVTKERNEL = ../mips/gemv_t.c
|
||||
DGEMVTKERNEL = ../mips/gemv_t.c
|
||||
CGEMVTKERNEL = ../mips/zgemv_t.c
|
||||
ZGEMVTKERNEL = ../mips/zgemv_t.c
|
||||
|
||||
SSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
SSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
DSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
DSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
QSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
QSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
CSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
CSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
ZSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
ZSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
XSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
XSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
|
||||
ZHEMV_U_KERNEL = ../generic/zhemv_k.c
|
||||
ZHEMV_L_KERNEL = ../generic/zhemv_k.c
|
||||
|
||||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
|
|
@ -1,7 +1,6 @@
|
|||
ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__)
|
||||
ifeq ($(HAVE_GAS), 1)
|
||||
include $(KERNELDIR)/KERNEL.POWER8
|
||||
else
|
||||
|
||||
#SGEMM_BETA = ../generic/gemm_beta.c
|
||||
#DGEMM_BETA = ../generic/gemm_beta.c
|
||||
#CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
|
|
@ -33,6 +32,16 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
|||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
SGEMM_SMALL_M_PERMIT = gemm_small_kernel_permit_power10.c
|
||||
SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_power10.c
|
||||
SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_nn_power10.c
|
||||
SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_power10.c
|
||||
SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_nt_power10.c
|
||||
SGEMM_SMALL_K_TN = sgemm_small_kernel_tn_power10.c
|
||||
SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_tn_power10.c
|
||||
SGEMM_SMALL_K_TT = sgemm_small_kernel_tt_power10.c
|
||||
SGEMM_SMALL_K_B0_TT = sgemm_small_kernel_tt_power10.c
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_power10.c
|
||||
DGEMMINCOPY =
|
||||
DGEMMITCOPY =
|
||||
|
|
@ -43,7 +52,18 @@ DGEMMITCOPYOBJ =
|
|||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMM_SMALL_M_PERMIT = gemm_small_kernel_permit_power10.c
|
||||
DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_power10.c
|
||||
DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_nt_power10.c
|
||||
DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_power10.c
|
||||
DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_nn_power10.c
|
||||
DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_power10.c
|
||||
DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c
|
||||
DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c
|
||||
DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_power10.S
|
||||
#CGEMMKERNEL = cgemm_kernel_8x4_power8.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
|
|
@ -218,5 +238,4 @@ QCABS_KERNEL = ../generic/cabs.c
|
|||
#Dump kernel
|
||||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
|
||||
endif
|
||||
|
|
|
|||
|
|
@ -36,9 +36,12 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
|
|||
#endif
|
||||
const float *mvecp = mvec;
|
||||
/* We have to load reverse mask for big endian. */
|
||||
/* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
__vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
|
||||
#else
|
||||
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
|
||||
#endif
|
||||
|
||||
long ytmp;
|
||||
|
||||
__asm__
|
||||
|
|
@ -112,6 +115,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
|
|||
"xvmaddasp 38, 58, 33 \n\t"
|
||||
"xvmaddasp 39, 59, 33 \n\t"
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 48, 0(%4) \n\t"
|
||||
"stxv 49, 16(%4) \n\t"
|
||||
"stxv 50, 32(%4) \n\t"
|
||||
"stxv 51, 48(%4) \n\t"
|
||||
"stxv 34, 64(%4) \n\t"
|
||||
"stxv 35, 80(%4) \n\t"
|
||||
"stxv 38, 96(%4) \n\t"
|
||||
"stxv 39, 112(%4) \n\t"
|
||||
#else
|
||||
"stxv 49, 0(%4) \n\t"
|
||||
"stxv 48, 16(%4) \n\t"
|
||||
"stxv 51, 32(%4) \n\t"
|
||||
|
|
@ -120,6 +133,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
|
|||
"stxv 34, 80(%4) \n\t"
|
||||
"stxv 39, 96(%4) \n\t"
|
||||
"stxv 38, 112(%4) \n\t"
|
||||
#endif
|
||||
|
||||
"addi %4, %4, 128 \n\t"
|
||||
"xxperm 52, 40, %x10 \n\t" // exchange real and imag part
|
||||
|
|
@ -163,6 +177,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
|
|||
"xvmaddasp 38, 58, 33 \n\t"
|
||||
"xvmaddasp 39, 59, 33 \n\t"
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 48, 0(%4) \n\t"
|
||||
"stxv 49, 16(%4) \n\t"
|
||||
"stxv 50, 32(%4) \n\t"
|
||||
"stxv 51, 48(%4) \n\t"
|
||||
"stxv 34, 64(%4) \n\t"
|
||||
"stxv 35, 80(%4) \n\t"
|
||||
"stxv 38, 96(%4) \n\t"
|
||||
"stxv 39, 112(%4) \n\t"
|
||||
#else
|
||||
"stxv 49, 0(%4) \n\t"
|
||||
"stxv 48, 16(%4) \n\t"
|
||||
"stxv 51, 32(%4) \n\t"
|
||||
|
|
@ -171,6 +195,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
|
|||
"stxv 34, 80(%4) \n\t"
|
||||
"stxv 39, 96(%4) \n\t"
|
||||
"stxv 38, 112(%4) \n\t"
|
||||
#endif
|
||||
|
||||
"#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n"
|
||||
:
|
||||
|
|
|
|||
|
|
@ -46,7 +46,16 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
|
|||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 32, 0(%3) \n\t"
|
||||
"stxv 33, 16(%3) \n\t"
|
||||
"stxv 34, 32(%3) \n\t"
|
||||
"stxv 35, 48(%3) \n\t"
|
||||
"stxv 36, 64(%3) \n\t"
|
||||
"stxv 37, 80(%3) \n\t"
|
||||
"stxv 38, 96(%3) \n\t"
|
||||
"stxv 39, 112(%3) \n\t"
|
||||
#else
|
||||
"stxv 33, 0(%3) \n\t"
|
||||
"stxv 32, 16(%3) \n\t"
|
||||
"stxv 35, 32(%3) \n\t"
|
||||
|
|
@ -55,11 +64,21 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
|
|||
"stxv 36, 80(%3) \n\t"
|
||||
"stxv 39, 96(%3) \n\t"
|
||||
"stxv 38, 112(%3) \n\t"
|
||||
#endif
|
||||
"lxvp 32, 0(%2) \n\t"
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"lxvp 36, 64(%2) \n\t"
|
||||
"lxvp 38, 96(%2) \n\t"
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 40, 128(%3) \n\t"
|
||||
"stxv 41, 144(%3) \n\t"
|
||||
"stxv 42, 160(%3) \n\t"
|
||||
"stxv 43, 176(%3) \n\t"
|
||||
"stxv 44, 192(%3) \n\t"
|
||||
"stxv 45, 208(%3) \n\t"
|
||||
"stxv 46, 224(%3) \n\t"
|
||||
"stxv 47, 240(%3) \n\t"
|
||||
#else
|
||||
"stxv 41, 128(%3) \n\t"
|
||||
"stxv 40, 144(%3) \n\t"
|
||||
"stxv 43, 160(%3) \n\t"
|
||||
|
|
@ -68,6 +87,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
|
|||
"stxv 44, 208(%3) \n\t"
|
||||
"stxv 47, 224(%3) \n\t"
|
||||
"stxv 46, 240(%3) \n\t"
|
||||
#endif
|
||||
"lxvp 40, 128(%2) \n\t"
|
||||
"lxvp 42, 160(%2) \n\t"
|
||||
"lxvp 44, 192(%2) \n\t"
|
||||
|
|
@ -81,7 +101,24 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
|
|||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 32, 0(%3) \n\t"
|
||||
"stxv 33, 16(%3) \n\t"
|
||||
"stxv 34, 32(%3) \n\t"
|
||||
"stxv 35, 48(%3) \n\t"
|
||||
"stxv 36, 64(%3) \n\t"
|
||||
"stxv 37, 80(%3) \n\t"
|
||||
"stxv 38, 96(%3) \n\t"
|
||||
"stxv 39, 112(%3) \n\t"
|
||||
"stxv 40, 128(%3) \n\t"
|
||||
"stxv 41, 144(%3) \n\t"
|
||||
"stxv 42, 160(%3) \n\t"
|
||||
"stxv 43, 176(%3) \n\t"
|
||||
"stxv 44, 192(%3) \n\t"
|
||||
"stxv 45, 208(%3) \n\t"
|
||||
"stxv 46, 224(%3) \n\t"
|
||||
"stxv 47, 240(%3) \n\t"
|
||||
#else
|
||||
"stxv 33, 0(%3) \n\t"
|
||||
"stxv 32, 16(%3) \n\t"
|
||||
"stxv 35, 32(%3) \n\t"
|
||||
|
|
@ -98,7 +135,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
|
|||
"stxv 44, 208(%3) \n\t"
|
||||
"stxv 47, 224(%3) \n\t"
|
||||
"stxv 46, 240(%3) \n\t"
|
||||
|
||||
#endif
|
||||
"#n=%1 x=%4=%2 y=%0=%3"
|
||||
:
|
||||
"=m" (*y),
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#else
|
||||
|
||||
#include "common.h"
|
||||
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
|
||||
#if defined(POWER10)
|
||||
#include "cdot_microk_power10.c"
|
||||
#else
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
|
@ -120,7 +120,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
|||
|
||||
if ((inc_x == 1) && (inc_y == 1)) {
|
||||
|
||||
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
|
||||
#if defined(POWER10)
|
||||
BLASLONG n1 = n & -16;
|
||||
#else
|
||||
BLASLONG n1 = n & -8;
|
||||
|
|
|
|||
|
|
@ -29,7 +29,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
static void cdot_kernel_8 (long n, float *x, float *y, float *dot)
|
||||
{
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
__vector unsigned char mask = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
|
||||
#else
|
||||
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
|
||||
#endif
|
||||
__asm__
|
||||
(
|
||||
"dcbt 0, %2 \n\t"
|
||||
|
|
@ -153,7 +157,11 @@ static void cdot_kernel_8 (long n, float *x, float *y, float *dot)
|
|||
"xxswapd 33, 34 \n\t"
|
||||
"xvaddsp 35, 35, 32 \n\t"
|
||||
"xvaddsp 34, 34, 33 \n\t"
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"xxpermdi 34, 35, 34, 0 \n\t"
|
||||
#else
|
||||
"xxpermdi 34, 34, 35, 2 \n\t"
|
||||
#endif
|
||||
"stxv 34, 0(%6) \n\t"
|
||||
|
||||
"#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6"
|
||||
|
|
|
|||
|
|
@ -76,11 +76,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "cgemm_macros_power10.S"
|
||||
|
||||
#if (_AIX)
|
||||
.set perm_const1, 0x0405060700010203
|
||||
.set perm_const2, 0x0c0d0e0f08090a0b
|
||||
.set save_permute_12, 0x1011121300010203
|
||||
.set save_permute_11, 0x18191a1b08090a0b
|
||||
#else
|
||||
.equ perm_const1, 0x0405060700010203
|
||||
.equ perm_const2, 0x0c0d0e0f08090a0b
|
||||
.equ save_permute_12, 0x0c0d0e0f1c1d1e1f
|
||||
.equ save_permute_11, 0x0405060714151617
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef NEEDPARAM
|
||||
|
|
@ -172,24 +178,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/*load reverse permute mask for big endian
|
||||
uint128 = 0x0c0d0e0f08090a0b0405060700010203
|
||||
*/
|
||||
|
||||
#if (_AIX)
|
||||
lis T2, (perm_const2>>48 & 0xFFFF)
|
||||
lis T1, (perm_const1>>48 & 0xFFFF)
|
||||
lis T3, (save_permute_12>>48 & 0xFFFF)
|
||||
lis T4, (save_permute_11>>48 & 0xFFFF)
|
||||
|
||||
ori T2, T2, (perm_const2>>32 & 0xFFFF)
|
||||
ori T1, T1, (perm_const1>>32 & 0xFFFF)
|
||||
ori T3, T3, (save_permute_12>>32 & 0xFFFF)
|
||||
ori T4, T4, (save_permute_11>>32 & 0xFFFF)
|
||||
#else
|
||||
lis T2, perm_const2@highest
|
||||
lis T1, perm_const1@highest
|
||||
lis T3, save_permute_12@highest
|
||||
lis T4, save_permute_11@highest
|
||||
|
||||
|
||||
ori T2, T2, perm_const2@higher
|
||||
ori T1, T1, perm_const1@higher
|
||||
ori T3, T3, save_permute_12@higher
|
||||
ori T4, T4, save_permute_11@higher
|
||||
|
||||
#endif
|
||||
|
||||
rldicr T2, T2, 32, 31
|
||||
rldicr T1, T1, 32, 31
|
||||
rldicr T3, T3, 32, 31
|
||||
rldicr T4, T4, 32, 31
|
||||
|
||||
#if (_AIX)
|
||||
oris T2, T2, (perm_const2>>16 & 0xFFFF)
|
||||
oris T1, T1, (perm_const1>>16 & 0xFFFF)
|
||||
oris T3, T3, (save_permute_12>>16 & 0xFFFF)
|
||||
oris T4, T4, (save_permute_11>>16 & 0xFFFF)
|
||||
|
||||
ori T2, T2, (perm_const2 & 0xFFFF)
|
||||
ori T1, T1, (perm_const1 & 0xFFFF)
|
||||
ori T3, T3, (save_permute_12 & 0xFFFF)
|
||||
ori T4, T4, (save_permute_11 & 0xFFFF)
|
||||
#else
|
||||
oris T2, T2, perm_const2@h
|
||||
oris T1, T1, perm_const1@h
|
||||
oris T3, T3, save_permute_12@h
|
||||
|
|
@ -200,7 +226,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ori T1, T1, perm_const1@l
|
||||
ori T3, T3, save_permute_12@l
|
||||
ori T4, T4, save_permute_11@l
|
||||
|
||||
#endif
|
||||
|
||||
li r0,0
|
||||
li PRE,512
|
||||
|
|
|
|||
|
|
@ -218,6 +218,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.if \OffsetA != 0
|
||||
addi \AREG, \AREG, \OffsetA
|
||||
.endif
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf32gerpp 3, 36, 34
|
||||
xvf32gerpp 2, 37, 34
|
||||
xvf32gerpp 1, 32, 34
|
||||
xvf32gerpp 0, 33, 34
|
||||
xvf32gerpp 7, 36, 35
|
||||
xvf32gerpp 6, 37, 35
|
||||
xvf32gerpp 5, 32, 35
|
||||
xvf32gerpp 4, 33, 35
|
||||
#else
|
||||
xvf32gerpp 3, 36, 35
|
||||
xvf32gerpp 2, 37, 35
|
||||
xvf32gerpp 1, 32, 35
|
||||
|
|
@ -226,6 +236,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xvf32gerpp 6, 37, 34
|
||||
xvf32gerpp 5, 32, 34
|
||||
xvf32gerpp 4, 33, 34
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro LOAD4x8_2
|
||||
|
|
@ -255,6 +266,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
|
||||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
xvf32gerpp 3, 36, 34
|
||||
xvf32gerpp 2, 37, 34
|
||||
xvf32gerpp 1, 32, 34
|
||||
xvf32gerpp 0, 33, 34
|
||||
xvf32gerpp 7, 36, 35
|
||||
xvf32gerpp 6, 37, 35
|
||||
xvf32gerpp 5, 32, 35
|
||||
xvf32gerpp 4, 33, 35
|
||||
#else
|
||||
xvf32gerpp 3, 36, 35
|
||||
xvf32gerpp 2, 37, 35
|
||||
xvf32gerpp 1, 32, 35
|
||||
|
|
@ -263,11 +284,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xvf32gerpp 6, 37, 34
|
||||
xvf32gerpp 5, 32, 34
|
||||
xvf32gerpp 4, 33, 34
|
||||
#endif
|
||||
.if \Complete==0
|
||||
lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
|
||||
lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
|
||||
lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
|
||||
.endif
|
||||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
xvf32gerpp 3, 42, 38
|
||||
xvf32gerpp 2, 43, 38
|
||||
xvf32gerpp 1, 40, 38
|
||||
xvf32gerpp 0, 41, 38
|
||||
xvf32gerpp 7, 42, 39
|
||||
xvf32gerpp 6, 43, 39
|
||||
xvf32gerpp 5, 40, 39
|
||||
xvf32gerpp 4, 41, 39
|
||||
#else
|
||||
xvf32gerpp 3, 42, 39
|
||||
xvf32gerpp 2, 43, 39
|
||||
xvf32gerpp 1, 40, 39
|
||||
|
|
@ -276,6 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xvf32gerpp 6, 43, 38
|
||||
xvf32gerpp 5, 40, 38
|
||||
xvf32gerpp 4, 41, 38
|
||||
#endif
|
||||
.if \Complete==0
|
||||
lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG)
|
||||
lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
|
||||
|
|
@ -393,22 +426,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
RECONSTRUCT_PAIR2
|
||||
#ifndef TRMMKERNEL
|
||||
/* add */
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs1, vs0, vs8, 1
|
||||
xxpermdi vs3, vs2, vs10, 1
|
||||
xxpermdi vs5, vs4, vs12, 1
|
||||
xxpermdi vs7, vs6, vs14, 1
|
||||
xxpermdi vs9, vs8, vs0, 1
|
||||
xxpermdi vs11, vs10, vs2, 1
|
||||
#else
|
||||
xxpermdi vs1, vs8, vs0, 2
|
||||
xxpermdi vs3, vs10, vs2, 2
|
||||
xxpermdi vs5, vs12, vs4, 2
|
||||
xxpermdi vs7, vs14, vs6, 2
|
||||
xxpermdi vs9, vs0, vs8, 2
|
||||
xxpermdi vs11, vs2, vs10, 2
|
||||
#endif
|
||||
xvaddsp vs24, vs24, vs3
|
||||
xvaddsp vs25, vs25, vs1
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs13, vs12, vs4, 1
|
||||
xxpermdi vs15, vs14, vs6, 1
|
||||
#else
|
||||
xxpermdi vs13, vs4, vs12, 2
|
||||
xxpermdi vs15, vs6, vs14, 2
|
||||
#endif
|
||||
xvaddsp vs26, vs26, vs7
|
||||
xvaddsp vs27, vs27, vs5
|
||||
xvaddsp vs28, vs28, vs11
|
||||
xvaddsp vs29, vs29, vs9
|
||||
xvaddsp vs30, vs30, vs15
|
||||
xvaddsp vs31, vs31, vs13
|
||||
#else
|
||||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
xxpermdi vs25, vs0, vs8, 1
|
||||
xxpermdi vs24, vs2, vs10, 1
|
||||
xxpermdi vs27, vs4, vs12, 1
|
||||
xxpermdi vs26, vs6, vs14, 1
|
||||
xxpermdi vs29, vs8, vs0, 1
|
||||
xxpermdi vs28, vs10, vs2, 1
|
||||
xxpermdi vs31, vs12, vs4, 1
|
||||
xxpermdi vs30, vs14, vs6, 1
|
||||
#else
|
||||
xxpermdi vs25, vs8, vs0, 2
|
||||
xxpermdi vs24, vs10, vs2, 2
|
||||
|
|
@ -418,6 +475,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xxpermdi vs28, vs2, vs10, 2
|
||||
xxpermdi vs31, vs4, vs12, 2
|
||||
xxpermdi vs30, vs6, vs14, 2
|
||||
#endif
|
||||
#endif
|
||||
stxvp vs24, 0(CO)
|
||||
MULT_APLHA_PART1 vs48, vs56, vs0, vs1
|
||||
|
|
@ -443,22 +501,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
RECONSTRUCT_PAIR2
|
||||
#ifndef TRMMKERNEL
|
||||
/* add */
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs1, vs0, vs8, 1
|
||||
xxpermdi vs3, vs2, vs10, 1
|
||||
xxpermdi vs5, vs4, vs12, 1
|
||||
xxpermdi vs7, vs6, vs14, 1
|
||||
xxpermdi vs9, vs8, vs0, 1
|
||||
xxpermdi vs11, vs10, vs2, 1
|
||||
#else
|
||||
xxpermdi vs1, vs8, vs0, 2
|
||||
xxpermdi vs3, vs10, vs2, 2
|
||||
xxpermdi vs5, vs12, vs4, 2
|
||||
xxpermdi vs7, vs14, vs6, 2
|
||||
xxpermdi vs9, vs0, vs8, 2
|
||||
xxpermdi vs11, vs2, vs10, 2
|
||||
#endif
|
||||
xvaddsp vs32, vs32, vs3
|
||||
xvaddsp vs33, vs33, vs1
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs13, vs12, vs4, 1
|
||||
xxpermdi vs15, vs14, vs6, 1
|
||||
#else
|
||||
xxpermdi vs13, vs4, vs12, 2
|
||||
xxpermdi vs15, vs6, vs14, 2
|
||||
#endif
|
||||
xvaddsp vs40, vs40, vs7
|
||||
xvaddsp vs41, vs41, vs5
|
||||
xvaddsp vs34, vs34, vs11
|
||||
xvaddsp vs35, vs35, vs9
|
||||
xvaddsp vs42, vs42, vs15
|
||||
xvaddsp vs43, vs43, vs13
|
||||
#else
|
||||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
xxpermdi vs33, vs0, vs8, 1
|
||||
xxpermdi vs32, vs2, vs10, 1
|
||||
xxpermdi vs41, vs4, vs12, 1
|
||||
xxpermdi vs40, vs6, vs14, 1
|
||||
xxpermdi vs35, vs8, vs0, 1
|
||||
xxpermdi vs34, vs10, vs2, 1
|
||||
xxpermdi vs43, vs12, vs4, 1
|
||||
xxpermdi vs42, vs14, vs6, 1
|
||||
#else
|
||||
xxpermdi vs33, vs8, vs0, 2
|
||||
xxpermdi vs32, vs10, vs2, 2
|
||||
|
|
@ -468,6 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xxpermdi vs34, vs2, vs10, 2
|
||||
xxpermdi vs43, vs4, vs12, 2
|
||||
xxpermdi vs42, vs6, vs14, 2
|
||||
#endif
|
||||
#endif
|
||||
stxvp vs32, 0(T2)
|
||||
stxvp vs40, 32(T2)
|
||||
|
|
@ -510,10 +593,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.if \OffsetA != 0
|
||||
addi \AREG, \AREG, \OffsetA
|
||||
.endif
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf32gerpp 3, 32, 35
|
||||
xvf32gerpp 2, 33, 35
|
||||
xvf32gerpp 1, 32, 34
|
||||
xvf32gerpp 0, 33, 34
|
||||
#else
|
||||
xvf32gerpp 3, 32, 34
|
||||
xvf32gerpp 2, 33, 34
|
||||
xvf32gerpp 1, 32, 35
|
||||
xvf32gerpp 0, 33, 35
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro LOAD4x4_2
|
||||
|
|
@ -541,18 +631,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf32gerpp 3, 32, 35
|
||||
xvf32gerpp 2, 33, 35
|
||||
xvf32gerpp 1, 32, 34
|
||||
xvf32gerpp 0, 33, 34
|
||||
#else
|
||||
xvf32gerpp 3, 32, 34
|
||||
xvf32gerpp 2, 33, 34
|
||||
xvf32gerpp 1, 32, 35
|
||||
xvf32gerpp 0, 33, 35
|
||||
#endif
|
||||
.if \Complete==0
|
||||
lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
|
||||
lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
|
||||
.endif
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf32gerpp 3, 36, 39
|
||||
xvf32gerpp 2, 37, 39
|
||||
xvf32gerpp 1, 36, 38
|
||||
xvf32gerpp 0, 37, 38
|
||||
#else
|
||||
xvf32gerpp 3, 36, 38
|
||||
xvf32gerpp 2, 37, 38
|
||||
xvf32gerpp 1, 36, 39
|
||||
xvf32gerpp 0, 37, 39
|
||||
#endif
|
||||
.if \Complete==0
|
||||
lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
|
||||
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
|
||||
|
|
@ -606,6 +710,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
RECONSTRUCT_PAIR2
|
||||
#ifndef TRMMKERNEL
|
||||
/* add */
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs1, vs0, vs8, 1
|
||||
xxpermdi vs3, vs2, vs10, 1
|
||||
xxpermdi vs9, vs8, vs0, 1
|
||||
xxpermdi vs11, vs10, vs2, 1
|
||||
xxpermdi vs5, vs4, vs12, 1
|
||||
xxpermdi vs7, vs6, vs14, 1
|
||||
xxpermdi vs13, vs12, vs4, 1
|
||||
xxpermdi vs15, vs14, vs6, 1
|
||||
#else
|
||||
xxpermdi vs1, vs8, vs0, 2
|
||||
xxpermdi vs3, vs10, vs2, 2
|
||||
xxpermdi vs9, vs0, vs8, 2
|
||||
|
|
@ -614,6 +728,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xxpermdi vs7, vs14, vs6, 2
|
||||
xxpermdi vs13, vs4, vs12, 2
|
||||
xxpermdi vs15, vs6, vs14, 2
|
||||
#endif
|
||||
xvaddsp vs24, vs24, vs3
|
||||
xvaddsp vs25, vs25, vs1
|
||||
xvaddsp vs26, vs26, vs11
|
||||
|
|
@ -622,6 +737,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xvaddsp vs29, vs29, vs5
|
||||
xvaddsp vs30, vs30, vs15
|
||||
xvaddsp vs31, vs31, vs13
|
||||
#else
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs25, vs0, vs8, 1
|
||||
xxpermdi vs24, vs2, vs10, 1
|
||||
xxpermdi vs27, vs8, vs0, 1
|
||||
xxpermdi vs26, vs10, vs2, 1
|
||||
xxpermdi vs29, vs4, vs12, 1
|
||||
xxpermdi vs28, vs6, vs14, 1
|
||||
xxpermdi vs31, vs12, vs4, 1
|
||||
xxpermdi vs30, vs14, vs6, 1
|
||||
#else
|
||||
xxpermdi vs25, vs8, vs0, 2
|
||||
xxpermdi vs24, vs10, vs2, 2
|
||||
|
|
@ -631,6 +756,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xxpermdi vs28, vs14, vs6, 2
|
||||
xxpermdi vs31, vs4, vs12, 2
|
||||
xxpermdi vs30, vs6, vs14, 2
|
||||
#endif
|
||||
#endif
|
||||
stxvp vs24, 0(CO)
|
||||
stxvp vs26, 0(T1)
|
||||
|
|
@ -672,8 +798,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.if \OffsetA != 0
|
||||
addi \AREG, \AREG, \OffsetA
|
||||
.endif
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf32gerpp 1, 35, 32
|
||||
xvf32gerpp 0, 34, 32
|
||||
#else
|
||||
xvf32gerpp 1, 34, 32
|
||||
xvf32gerpp 0, 35, 32
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro LOAD4x2_2
|
||||
|
|
@ -700,13 +831,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf32gerpp 1, 35, 32
|
||||
xvf32gerpp 0, 34, 32
|
||||
#else
|
||||
xvf32gerpp 1, 34, 33
|
||||
xvf32gerpp 0, 35, 33
|
||||
#endif
|
||||
.if \Complete==0
|
||||
lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
|
||||
.endif
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf32gerpp 1, 37, 33
|
||||
xvf32gerpp 0, 36, 33
|
||||
#else
|
||||
xvf32gerpp 1, 36, 32
|
||||
xvf32gerpp 0, 37, 32
|
||||
#endif
|
||||
.if \Complete==0
|
||||
lxvp vs32, DISP4(\Index, \OffsetA)(\AREG)
|
||||
lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
|
||||
|
|
@ -757,19 +898,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
RECONSTRUCT_PAIR1
|
||||
#ifndef TRMMKERNEL
|
||||
/* add */
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs1, vs0, vs8, 0
|
||||
xxpermdi vs9, vs2, vs10, 0
|
||||
xxpermdi vs3, vs8, vs0, 3
|
||||
xxpermdi vs11, vs10, vs2, 3
|
||||
#else
|
||||
xxpermdi vs1, vs8, vs0, 0
|
||||
xxpermdi vs9, vs10, vs2, 0
|
||||
xxpermdi vs3, vs0, vs8, 3
|
||||
xxpermdi vs11, vs2, vs10, 3
|
||||
#endif
|
||||
xvaddsp vs24, vs24, vs1
|
||||
xvaddsp vs26, vs26, vs9
|
||||
xvaddsp vs25, vs25, vs3
|
||||
xvaddsp vs27, vs27, vs11
|
||||
#else
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs24, vs0, vs8, 0
|
||||
xxpermdi vs26, vs2, vs10, 0
|
||||
xxpermdi vs25, vs8, vs0, 3
|
||||
xxpermdi vs27, vs10, vs2, 3
|
||||
#else
|
||||
xxpermdi vs24, vs8, vs0, 0
|
||||
xxpermdi vs26, vs10, vs2, 0
|
||||
xxpermdi vs25, vs0, vs8, 3
|
||||
xxpermdi vs27, vs2, vs10, 3
|
||||
#endif
|
||||
#endif
|
||||
stxv vs24, 0(CO)
|
||||
stxv vs25, 0(T1)
|
||||
|
|
@ -811,8 +966,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.if \OffsetA != 0
|
||||
addi \AREG, \AREG, \OffsetA
|
||||
.endif
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf32gerpp 0, 34, 32
|
||||
xvf32gerpp 1, 35, 32
|
||||
#else
|
||||
xvf32gerpp 0, 35, 32
|
||||
xvf32gerpp 1, 34, 32
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro LOAD4x1_2
|
||||
|
|
@ -822,8 +982,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.macro LOAD4x1_2O OffsetA, OffsetB
|
||||
lxv vs32, (\OffsetA)(AO)
|
||||
vspltisb v6, 0
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs33, vs32, vs38, 2
|
||||
xxpermdi vs32, vs32, vs38, 0
|
||||
#else
|
||||
xxpermdi vs33, vs32, vs38, 0
|
||||
xxpermdi vs32, vs32, vs38, 2
|
||||
#endif
|
||||
lxvp vs34, (0+\OffsetB)(BO)
|
||||
lxvp vs36, (32+\OffsetB)(BO)
|
||||
.endm
|
||||
|
|
@ -842,18 +1007,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf32gerpp 0, 34, 32
|
||||
xvf32gerpp 1, 35, 32
|
||||
#else
|
||||
xvf32gerpp 0, 35, 32
|
||||
xvf32gerpp 1, 34, 32
|
||||
#endif
|
||||
.if \Complete==0
|
||||
lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
|
||||
.endif
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf32gerpp 0, 36, 33
|
||||
xvf32gerpp 1, 37, 33
|
||||
#else
|
||||
xvf32gerpp 0, 37, 33
|
||||
xvf32gerpp 1, 36, 33
|
||||
#endif
|
||||
.if \Complete==0
|
||||
lxv vs32, DISP2(\Index, \OffsetA)(\AREG)
|
||||
lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs33, vs32, vs38, 2
|
||||
xxpermdi vs32, vs32, vs38, 0
|
||||
#else
|
||||
xxpermdi vs33, vs32, vs38, 0
|
||||
xxpermdi vs32, vs32, vs38, 2
|
||||
#endif
|
||||
.endif
|
||||
.if \IsLast==1
|
||||
.if \Complete==1
|
||||
|
|
@ -1001,19 +1181,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf32gerpp 2, 37, 34
|
||||
xvf32gerpp 3, 36, 34
|
||||
xvf32gerpp 0, 33, 34
|
||||
xvf32gerpp 1, 32, 34
|
||||
#else
|
||||
xvf32gerpp 2, 37, 35
|
||||
xvf32gerpp 3, 36, 35
|
||||
xvf32gerpp 0, 33, 35
|
||||
xvf32gerpp 1, 32, 35
|
||||
#endif
|
||||
|
||||
.if \Complete==0
|
||||
lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
|
||||
lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
|
||||
.endif
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf32gerpp 2, 41, 35
|
||||
xvf32gerpp 3, 40, 35
|
||||
xvf32gerpp 0, 39, 35
|
||||
xvf32gerpp 1, 38, 35
|
||||
#else
|
||||
xvf32gerpp 2, 41, 34
|
||||
xvf32gerpp 3, 40, 34
|
||||
xvf32gerpp 0, 39, 34
|
||||
xvf32gerpp 1, 38, 34
|
||||
#endif
|
||||
|
||||
.if \Complete==0
|
||||
lxvp vs34, DISP4(\Index, \OffsetB)(\BREG)
|
||||
|
|
@ -1068,16 +1262,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
RECONSTRUCT_PAIR2
|
||||
#ifndef TRMMKERNEL
|
||||
/* add */
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs1, vs0, vs8, 1
|
||||
xxpermdi vs3, vs2, vs10, 1
|
||||
xxpermdi vs5, vs4, vs12, 1
|
||||
xxpermdi vs7, vs6, vs14, 1
|
||||
xxpermdi vs9, vs8, vs0, 1
|
||||
xxpermdi vs11, vs10, vs2, 1
|
||||
#else
|
||||
xxpermdi vs1, vs8, vs0, 2
|
||||
xxpermdi vs3, vs10, vs2, 2
|
||||
xxpermdi vs5, vs12, vs4, 2
|
||||
xxpermdi vs7, vs14, vs6, 2
|
||||
xxpermdi vs9, vs0, vs8, 2
|
||||
xxpermdi vs11, vs2, vs10, 2
|
||||
#endif
|
||||
xvaddsp vs24, vs24, vs3
|
||||
xvaddsp vs25, vs25, vs1
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs13, vs12, vs4, 1
|
||||
xxpermdi vs15, vs14, vs6, 1
|
||||
#else
|
||||
xxpermdi vs13, vs4, vs12, 2
|
||||
xxpermdi vs15, vs6, vs14, 2
|
||||
#endif
|
||||
xvaddsp vs26, vs26, vs7
|
||||
xvaddsp vs27, vs27, vs5
|
||||
xvaddsp vs28, vs28, vs11
|
||||
|
|
@ -1085,6 +1293,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xvaddsp vs30, vs30, vs15
|
||||
xvaddsp vs31, vs31, vs13
|
||||
#else
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs25, vs0, vs8, 1
|
||||
xxpermdi vs24, vs2, vs10, 1
|
||||
xxpermdi vs27, vs4, vs12, 1
|
||||
xxpermdi vs26, vs6, vs14, 1
|
||||
xxpermdi vs29, vs8, vs0, 1
|
||||
xxpermdi vs28, vs10, vs2, 1
|
||||
xxpermdi vs31, vs12, vs4, 1
|
||||
xxpermdi vs30, vs14, vs6, 1
|
||||
#else
|
||||
xxpermdi vs25, vs8, vs0, 2
|
||||
xxpermdi vs24, vs10, vs2, 2
|
||||
xxpermdi vs27, vs12, vs4, 2
|
||||
|
|
@ -1093,6 +1311,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xxpermdi vs28, vs2, vs10, 2
|
||||
xxpermdi vs31, vs4, vs12, 2
|
||||
xxpermdi vs30, vs6, vs14, 2
|
||||
#endif
|
||||
#endif
|
||||
stxvp vs24, 0(CO)
|
||||
stxvp vs26, 32(CO)
|
||||
|
|
@ -1161,13 +1380,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf32gerpp 0, 33, 34
|
||||
xvf32gerpp 1, 32, 34
|
||||
#else
|
||||
xvf32gerpp 0, 33, 35
|
||||
xvf32gerpp 1, 32, 35
|
||||
#endif
|
||||
.if \Complete==0
|
||||
lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
|
||||
.endif
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf32gerpp 0, 37, 35
|
||||
xvf32gerpp 1, 36, 35
|
||||
#else
|
||||
xvf32gerpp 0, 37, 34
|
||||
xvf32gerpp 1, 36, 34
|
||||
#endif
|
||||
|
||||
.if \Complete==0
|
||||
lxvp vs34, DISP4(\Index, \OffsetB)(\BREG)
|
||||
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
|
||||
|
|
@ -1206,19 +1436,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
RECONSTRUCT_PAIR1
|
||||
#ifndef TRMMKERNEL
|
||||
/* add */
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs1, vs0, vs8, 1
|
||||
xxpermdi vs3, vs2, vs10, 1
|
||||
xxpermdi vs9, vs8, vs0, 1
|
||||
xxpermdi vs11, vs10, vs2, 1
|
||||
#else
|
||||
xxpermdi vs1, vs8, vs0, 2
|
||||
xxpermdi vs3, vs10, vs2, 2
|
||||
xxpermdi vs9, vs0, vs8, 2
|
||||
xxpermdi vs11, vs2, vs10, 2
|
||||
#endif
|
||||
xvaddsp vs24, vs24, vs3
|
||||
xvaddsp vs25, vs25, vs1
|
||||
xvaddsp vs26, vs26, vs11
|
||||
xvaddsp vs27, vs27, vs9
|
||||
#else
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs25, vs0, vs8, 1
|
||||
xxpermdi vs24, vs2, vs10, 1
|
||||
xxpermdi vs27, vs8, vs0, 1
|
||||
xxpermdi vs26, vs10, vs2, 1
|
||||
#else
|
||||
xxpermdi vs25, vs8, vs0, 2
|
||||
xxpermdi vs24, vs10, vs2, 2
|
||||
xxpermdi vs27, vs0, vs8, 2
|
||||
xxpermdi vs26, vs2, vs10, 2
|
||||
#endif
|
||||
#endif
|
||||
stxvp vs24, 0(CO)
|
||||
stxvp vs26, 0(T1)
|
||||
|
|
@ -1330,13 +1574,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xxperm vs8, vs9, save_permute_1
|
||||
#ifndef TRMMKERNEL
|
||||
/* add */
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs1, vs0, vs8, 0
|
||||
xxpermdi vs9, vs8, vs0, 3
|
||||
#else
|
||||
xxpermdi vs1, vs8, vs0, 0
|
||||
xxpermdi vs9, vs0, vs8, 3
|
||||
#endif
|
||||
xvaddsp vs24, vs24, vs1
|
||||
xvaddsp vs26, vs26, vs9
|
||||
#else
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs24, vs0, vs8, 0
|
||||
xxpermdi vs26, vs8, vs0, 3
|
||||
#else
|
||||
xxpermdi vs24, vs8, vs0, 0
|
||||
xxpermdi vs26, vs0, vs8, 3
|
||||
#endif
|
||||
#endif
|
||||
stxv vs24, 0(CO)
|
||||
stxv vs26, 0(T1)
|
||||
|
|
@ -1528,8 +1782,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
lxvp vs32, (0+\OffsetA)(AO)
|
||||
lxvp vs36, (32+\OffsetA)(AO)
|
||||
vspltisb v10, 0
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs35, vs34, vs42, 2
|
||||
xxpermdi vs34, vs34, vs42, 0
|
||||
#else
|
||||
xxpermdi vs35, vs34, vs42, 0
|
||||
xxpermdi vs34, vs34, vs42, 2
|
||||
#endif
|
||||
lxvp vs38, (64+\OffsetA)(AO)
|
||||
lxvp vs40, (64+32+\OffsetA)(AO)
|
||||
.endm
|
||||
|
|
@ -1567,8 +1826,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xvf32gerpp 3, 35, 40
|
||||
.if \Complete==0
|
||||
lxv vs34, DISP2(\Index, \OffsetB)(\BREG)
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs35, vs34, vs42, 2
|
||||
xxpermdi vs34, vs34, vs42, 0
|
||||
#else
|
||||
xxpermdi vs35, vs34, vs42, 0
|
||||
xxpermdi vs34, vs34, vs42, 2
|
||||
#endif
|
||||
lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG)
|
||||
.endif
|
||||
.if \IsLast==1
|
||||
|
|
@ -1634,10 +1898,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
MULT_APLHA_PART2 vs34, vs42, vs4, vs5
|
||||
MULT_APLHA_PART2 vs35, vs43, vs6, vs7
|
||||
/* reconstruct r, i pairs*/
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxperm vs0, vs1, save_permute_1
|
||||
xxperm vs2, vs3, save_permute_1
|
||||
xxperm vs4, vs5, save_permute_1
|
||||
xxperm vs6, vs7, save_permute_1
|
||||
#else
|
||||
xxperm vs0, vs1, vs28
|
||||
xxperm vs2, vs3, vs28
|
||||
xxperm vs4, vs5, vs28
|
||||
xxperm vs6, vs7, vs28
|
||||
#endif
|
||||
#ifndef TRMMKERNEL
|
||||
/* add */
|
||||
xvaddsp vs24, vs24, vs2
|
||||
|
|
@ -1648,10 +1919,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stxvp vs26, 32(CO)
|
||||
#else
|
||||
/* reconstruct r, i pairs*/
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
stxv vs2, 0(CO)
|
||||
stxv vs0, 16(CO)
|
||||
stxv vs6, 32(CO)
|
||||
stxv vs4, 48(CO)
|
||||
#else
|
||||
stxv vs0, 0(CO)
|
||||
stxv vs2, 16(CO)
|
||||
stxv vs4, 32(CO)
|
||||
stxv vs6, 48(CO)
|
||||
#endif
|
||||
#endif
|
||||
addi CO, CO, 64
|
||||
.endm
|
||||
|
|
@ -1701,8 +1979,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
lxv vs34, (\OffsetB)(BO)
|
||||
lxvp vs32, (0+\OffsetA)(AO)
|
||||
vspltisb v6, 0
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs35, vs34, vs38, 2
|
||||
xxpermdi vs34, vs34, vs38, 0
|
||||
#else
|
||||
xxpermdi vs35, vs34, vs38, 0
|
||||
xxpermdi vs34, vs34, vs38, 2
|
||||
#endif
|
||||
lxvp vs36, (32+\OffsetA)(AO)
|
||||
.endm
|
||||
|
||||
|
|
@ -1729,8 +2012,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xvf32gerpp 1, 35, 36
|
||||
.if \Complete==0
|
||||
lxv vs34, DISP2(\Index, \OffsetB)(\BREG)
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs35, vs34, vs38, 2
|
||||
xxpermdi vs34, vs34, vs38, 0
|
||||
#else
|
||||
xxpermdi vs35, vs34, vs38, 0
|
||||
xxpermdi vs34, vs34, vs38, 2
|
||||
#endif
|
||||
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
|
||||
.endif
|
||||
.if \IsLast==1
|
||||
|
|
@ -1775,8 +2063,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
|
||||
MULT_APLHA_PART2 vs33, vs41, vs2, vs3
|
||||
/* reconstruct r, i pairs*/
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxperm vs0, vs1, save_permute_1
|
||||
xxperm vs2, vs3, save_permute_1
|
||||
#else
|
||||
xxperm vs0, vs1, vs28
|
||||
xxperm vs2, vs3, vs28
|
||||
#endif
|
||||
#ifndef TRMMKERNEL
|
||||
/* add */
|
||||
xvaddsp vs24, vs24, vs2
|
||||
|
|
@ -1784,8 +2077,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stxvp vs24, 0(CO)
|
||||
#else
|
||||
/* reconstruct r, i pairs*/
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
stxv vs2, 0(CO)
|
||||
stxv vs0, 16(CO)
|
||||
#else
|
||||
stxv vs0, 0(CO)
|
||||
stxv vs2, 16(CO)
|
||||
#endif
|
||||
#endif
|
||||
addi CO, CO, 32
|
||||
.endm
|
||||
|
|
@ -1904,7 +2202,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
MULT_APLHA_PART1 vs32, vs40, vs0, vs1
|
||||
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
|
||||
/* reconstruct r, i pairs*/
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxperm vs0, vs1, save_permute_1
|
||||
#else
|
||||
xxperm vs0, vs1, vs28
|
||||
#endif
|
||||
#ifndef TRMMKERNEL
|
||||
/* add */
|
||||
xvaddsp vs24, vs24, vs0
|
||||
|
|
@ -2018,7 +2320,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
MULT_APLHA_PART1 vs32, vs40, vs37, vs1
|
||||
MULT_APLHA_PART2 vs32, vs40, vs37, vs1
|
||||
/* reconstruct r, i pairs*/
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxperm vs37, vs1, save_permute_1
|
||||
#else
|
||||
xxperm vs37, vs1, vs28
|
||||
#endif
|
||||
#ifndef TRMMKERNEL
|
||||
/* add */
|
||||
xvaddsp vs36, vs36, vs37
|
||||
|
|
|
|||
|
|
@ -30,7 +30,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i)
{
__vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i};
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
__vector unsigned char mask = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
#else
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
#endif
__asm__
(
"dcbt 0, %2 \n\t"

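For reference when checking the permute masks, the kernel scales an array of single-precision complex values by alpha; the mask pairs each value with its real/imaginary partner so the {-alpha_i, alpha_i, ...} vector can form the cross terms. A scalar sketch of the operation (illustration only; hypothetical name, assuming n counts complex elements stored as real/imaginary pairs and the usual non-conjugated scaling):

/* Illustration only: scalar equivalent of the vectorised complex scaling,
 * x[j] <- alpha * x[j] for complex alpha = alpha_r + i*alpha_i. */
static void cscal_ref(long n, float *x, float alpha_r, float alpha_i)
{
    for (long j = 0; j < n; j++) {
        float r = x[2*j], i = x[2*j + 1];
        x[2*j]     = alpha_r * r - alpha_i * i;
        x[2*j + 1] = alpha_r * i + alpha_i * r;
    }
}
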
@ -39,10 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "cswap_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "cswap_microk_power10.c"
#elif defined(POWER10)
#include "cswap_microk_power8.c"
#include "cswap_microk_power10.c"
#endif
#endif

@ -49,14 +49,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "dasum_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "dasum_microk_power10.c"
#elif defined(POWER10)
#include "dasum_microk_power8.c"
#include "dasum_microk_power10.c"
#endif
#endif


#ifndef HAVE_KERNEL_16

static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)

@ -114,7 +111,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)

if ( inc_x == 1 )
{

#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 32)
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;

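The align expression in the hunk above counts how many leading double-precision elements have to be consumed before x reaches a 32-byte boundary; a small standalone check of that arithmetic (illustration only):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* For each possible 8-byte offset within a 32-byte block, print how
     * many leading elements the kernel would peel off before the
     * vectorised loop starts on an aligned address. */
    for (uintptr_t addr = 0; addr < 32; addr += 8) {
        long align = ((32 - (addr & (uintptr_t)0x1F)) >> 3) & 0x3;
        printf("offset %2lu -> %ld leading elements\n",
               (unsigned long)addr, align);
    }
    return 0;
}
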
@ -0,0 +1,923 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <altivec.h>
|
||||
|
||||
typedef __vector unsigned char vec_t;
|
||||
|
||||
#if !__has_builtin(__builtin_vsx_assemble_pair)
|
||||
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
|
||||
#endif
|
||||
|
||||
#if !defined(B0)
|
||||
#define SAVE_4x2_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[0] = vec_madd(result[0], valpha, rc0); \
|
||||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
|
||||
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[1] = vec_madd(result[1], valpha, rc0); \
|
||||
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
|
||||
rc0 = vec_xl(0, C+(N+2)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[2] = vec_madd(result[2], valpha, rc0); \
|
||||
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
|
||||
rc0 = vec_xl(0, C+(N+3)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[3] = vec_madd(result[3], valpha, rc0); \
|
||||
vec_xst(result[3], 0, C+(N+3)*ldc+M);
|
||||
|
||||
#define SAVE_4x1_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[0] = vec_madd(result[0], valpha, rc0); \
|
||||
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
|
||||
rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[1] = vec_madd(result[1], valpha, rc0); \
|
||||
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \
|
||||
rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[2] = vec_madd(result[2], valpha, rc0); \
|
||||
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \
|
||||
rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[3] = vec_madd(result[3], valpha, rc0); \
|
||||
vec_xst_len(result[3], C+(N+3)*ldc+M, 8);
|
||||
|
||||
#define SAVE_2x2_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[0] = vec_madd(result[0], valpha, rc0); \
|
||||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
|
||||
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[1] = vec_madd(result[1], valpha, rc0); \
|
||||
vec_xst(result[1], 0, C+(N+1)*ldc+M);
|
||||
|
||||
#define SAVE_2x1_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[0] = vec_madd(result[0], valpha, rc0); \
|
||||
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
|
||||
rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[1] = vec_madd(result[1], valpha, rc0); \
|
||||
vec_xst_len(result[1], C+(N+1)*ldc+M, 8);
|
||||
|
||||
#define SAVE_1x4_VSR(result, N, M) \
|
||||
rc0 = vec_xl(0, C+((N)*ldc)+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result = vec_madd(result, valpha, rc0); \
|
||||
vec_xst(result, 0, C+((N)*ldc)+M);
|
||||
|
||||
#else
|
||||
|
||||
#define SAVE_4x2_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
result[0] = vec_mul(result[0], valpha); \
|
||||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
|
||||
result[1] = vec_mul(result[1], valpha); \
|
||||
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
|
||||
result[2] = vec_mul(result[2], valpha); \
|
||||
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
|
||||
result[3] = vec_mul(result[3], valpha); \
|
||||
vec_xst(result[3], 0, C+(N+3)*ldc+M);
|
||||
|
||||
#define SAVE_4x1_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
result[0] = vec_mul(result[0], valpha); \
|
||||
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
|
||||
result[1] = vec_mul(result[1], valpha); \
|
||||
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \
|
||||
result[2] = vec_mul(result[2], valpha); \
|
||||
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \
|
||||
result[3] = vec_mul(result[3], valpha); \
|
||||
vec_xst_len(result[3], C+(N+3)*ldc+M, 8);
|
||||
|
||||
#define SAVE_2x2_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
result[0] = vec_mul(result[0], valpha); \
|
||||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
|
||||
result[1] = vec_mul(result[1], valpha); \
|
||||
vec_xst(result[1], 0, C+(N+1)*ldc+M);
|
||||
|
||||
#define SAVE_2x1_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
result[0] = vec_mul(result[0], valpha); \
|
||||
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
|
||||
result[1] = vec_mul(result[1], valpha); \
|
||||
vec_xst_len(result[1], C+(N+1)*ldc+M, 8);
|
||||
|
||||
#define SAVE_1x4_VSR(result, N, M) \
|
||||
result = vec_mul(result, valpha); \
|
||||
vec_xst(result, 0, C+((N)*ldc)+M);
|
||||
|
||||
#endif
|
||||
|
||||
#define INIT_8ACCS() \
|
||||
__builtin_mma_xxsetaccz(&acc0); \
|
||||
__builtin_mma_xxsetaccz(&acc1); \
|
||||
__builtin_mma_xxsetaccz(&acc2); \
|
||||
__builtin_mma_xxsetaccz(&acc3); \
|
||||
__builtin_mma_xxsetaccz(&acc4); \
|
||||
__builtin_mma_xxsetaccz(&acc5); \
|
||||
__builtin_mma_xxsetaccz(&acc6); \
|
||||
__builtin_mma_xxsetaccz(&acc7);
|
||||
|
||||
#define INIT_4ACCS() \
|
||||
__builtin_mma_xxsetaccz(&acc0); \
|
||||
__builtin_mma_xxsetaccz(&acc1); \
|
||||
__builtin_mma_xxsetaccz(&acc2); \
|
||||
__builtin_mma_xxsetaccz(&acc3);
|
||||
|
||||
#define INIT_2ACCS() \
|
||||
__builtin_mma_xxsetaccz(&acc0); \
|
||||
__builtin_mma_xxsetaccz(&acc1);
|
||||
|
||||
#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
|
||||
|
||||
#if (defined(__GNUC__) && (__GNUC__ == 10))
#if defined(_AIX)
#define LOAD_PAIR(pair, v0, v1) \
__builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1);
#else
#define LOAD_PAIR(pair, v0, v1) \
__builtin_vsx_assemble_pair(&pair, (vec_t)v1, (vec_t)v0);
#endif
#else
#define LOAD_PAIR(pair, v0, v1) \
__builtin_vsx_build_pair(&pair, (vec_t)v0, (vec_t)v1);
#endif

#define LOAD_A_1x8(K, M) \
|
||||
ra0 = vec_xl(0, A+((K)*lda)+M+0); \
|
||||
ra1 = vec_xl(0, A+((K)*lda)+M+2); \
|
||||
ra2 = vec_xl(0, A+((K)*lda)+M+4); \
|
||||
ra3 = vec_xl(0, A+((K)*lda)+M+6);
|
||||
|
||||
#define LOAD_A_1x4(K, M) \
|
||||
ra0 = vec_xl(0, A+((K)*lda)+M+0); \
|
||||
ra1 = vec_xl(0, A+((K)*lda)+M+2); \
|
||||
|
||||
#define LOAD_A_1x2(K, M) \
|
||||
ra0 = vec_xl(0, A+((K)*lda)+M+0);
|
||||
|
||||
#define LOAD_A_1x1(K, M) \
|
||||
ra0 = vec_splats(A[((K)*lda)+M+0]);
|
||||
|
||||
#define LOAD_BTP_8x2(N, K) \
|
||||
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \
|
||||
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \
|
||||
rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \
|
||||
rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \
|
||||
t0 = vec_mergeh(rb0, rb1); \
|
||||
t1 = vec_mergeh(rb2, rb3); \
|
||||
LOAD_PAIR(pb0, t0, t1); \
|
||||
t0 = vec_mergel(rb0, rb1); \
|
||||
t1 = vec_mergel(rb2, rb3); \
|
||||
LOAD_PAIR(pb2, t0, t1); \
|
||||
rb4 = vec_xl(0, B+(N+4)*ldb+K+0); \
|
||||
rb5 = vec_xl(0, B+(N+5)*ldb+K+0); \
|
||||
rb6 = vec_xl(0, B+(N+6)*ldb+K+0); \
|
||||
rb7 = vec_xl(0, B+(N+7)*ldb+K+0); \
|
||||
t0 = vec_mergeh(rb4, rb5); \
|
||||
t1 = vec_mergeh(rb6, rb7); \
|
||||
LOAD_PAIR(pb1, t0, t1); \
|
||||
t0 = vec_mergel(rb4, rb5); \
|
||||
t1 = vec_mergel(rb6, rb7); \
|
||||
LOAD_PAIR(pb3, t0, t1);
|
||||
|
||||
#define LOAD_BTP_8x1(N, K) \
|
||||
rb0 = vec_xor(rb0, rb0); \
|
||||
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
|
||||
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \
|
||||
rb1 = vec_xor(rb1, rb1); \
|
||||
rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \
|
||||
rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \
|
||||
LOAD_PAIR(pb0, rb0, rb1); \
|
||||
rb2 = vec_xor(rb2, rb2); \
|
||||
rb2 = vec_insert(B[(N+4)*ldb+K], rb2, 0); \
|
||||
rb2 = vec_insert(B[(N+5)*ldb+K], rb2, 1); \
|
||||
rb3 = vec_xor(rb3, rb3); \
|
||||
rb3 = vec_insert(B[(N+6)*ldb+K], rb3, 0); \
|
||||
rb3 = vec_insert(B[(N+7)*ldb+K], rb3, 1); \
|
||||
LOAD_PAIR(pb1, rb2, rb3);
|
||||
|
||||
#define LOAD_BTP_4x2(N, K) \
|
||||
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \
|
||||
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \
|
||||
rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \
|
||||
rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \
|
||||
t0 = vec_mergeh(rb0, rb1); \
|
||||
t1 = vec_mergeh(rb2, rb3); \
|
||||
LOAD_PAIR(pb0, t0, t1); \
|
||||
t0 = vec_mergel(rb0, rb1); \
|
||||
t1 = vec_mergel(rb2, rb3); \
|
||||
LOAD_PAIR(pb1, t0, t1);
|
||||
|
||||
#define LOAD_BTP_4x1(N, K) \
|
||||
rb0 = vec_xor(rb0, rb0); \
|
||||
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
|
||||
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \
|
||||
rb1 = vec_xor(rb1, rb1); \
|
||||
rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \
|
||||
rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \
|
||||
LOAD_PAIR(pb0, rb0, rb1);
|
||||
|
||||
#define LOAD_BTP_2x2(N, K) \
|
||||
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \
|
||||
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \
|
||||
t0 = vec_mergeh(rb0, rb1); \
|
||||
__builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); \
|
||||
t1 = vec_mergel(rb0, rb1); \
|
||||
__builtin_vsx_assemble_pair(&pb1, (vec_t)t1, (vec_t)t1);
|
||||
|
||||
#define LOAD_BTP_2x1(N, K) \
|
||||
rb0 = vec_xor(rb0, rb0); \
|
||||
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
|
||||
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \
|
||||
__builtin_vsx_assemble_pair(&pb0, (vec_t)rb0, (vec_t)rb0);
|
||||
|
||||
#define LOAD_B_1x1(N, K) \
|
||||
rb0 = vec_splats(B[((N)*ldb)+K]);
|
||||
|
||||
#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \
|
||||
a0, a1, a2, a3, a4, a5, a6, a7) \
|
||||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
|
||||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
|
||||
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
|
||||
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \
|
||||
__builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \
|
||||
__builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \
|
||||
__builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \
|
||||
__builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7);
|
||||
|
||||
#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \
|
||||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
|
||||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
|
||||
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
|
||||
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3);
|
||||
|
||||
#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \
|
||||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
|
||||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1);
|
||||
|
||||
#define KERNEL_MMA_1ACC(b0, a0) \
|
||||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0);
|
||||
|
||||
#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \
|
||||
result = vec_madd(a0, b0, result); \
|
||||
result1 = vec_madd(a1, b1, result1); \
|
||||
result2 = vec_madd(a2, b2, result2); \
|
||||
result3 = vec_madd(a3, b3, result3);
|
||||
|
||||
#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \
|
||||
result = vec_madd(a0, b0, result); \
|
||||
result1 = vec_madd(a1, b1, result1);
|
||||
|
||||
#define KERNEL_VMADD_1VSR(a0, b0) \
|
||||
result = vec_madd(a0, b0, result);
|
||||
|
||||
#define PACK_B(pb0, pb1, offset) \
|
||||
*((__vector_pair *)(void *)(packB+(k*8)+0+offset)) = pb0; \
|
||||
*((__vector_pair *)(void *)(packB+(k*8)+4+offset)) = pb1;
|
||||
|
||||
#define LOAD_PACKED_B(pb0, pb1, offset) \
|
||||
pb0 = *((__vector_pair *)((void *)(packB+(k*8)+0+offset))); \
|
||||
pb1 = *((__vector_pair *)((void *)(packB+(k*8)+4+offset)));
|
||||
|
||||
#ifdef B0
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
#else
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
#endif
{
BLASLONG m, n, k;

BLASLONG m8 = M & ~7;
BLASLONG m4 = M & ~3;
BLASLONG m2 = M & ~1;

BLASLONG n8 = N & ~7;
BLASLONG n4 = N & ~3;
BLASLONG n2 = N & ~1;

BLASLONG k2 = K & ~1;

#if defined(__GNUC__) && !defined(__clang__)
int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 1 : 0;
#else
int has_packing = 0;
#endif

double *packB;
if (has_packing) packB = (double *)malloc(K*8*sizeof(double));

vector double valpha = vec_splats(alpha);
#if !defined(B0)
vector double vbeta = vec_splats(beta);
#endif

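In the loops that follow, the has_packing path gathers the eight B values needed per k for the current n-block into the packB scratch buffer on the first m-block and simply reloads them for the remaining m-blocks. A hypothetical scalar sketch of that idea (illustration only; the kernel actually stores them as the __vector_pair operands consumed by the MMA instructions, so the in-memory layout differs):

/* Illustration only (hypothetical helper): pack B(n..n+7, k) for all k so
 * later m-blocks read 8 contiguous values per k instead of striding
 * through 8 columns of B. */
static void pack_b_8cols(double *packB, const double *B, long ldb,
                         long n, long K)
{
    for (long k = 0; k < K; k++)
        for (int j = 0; j < 8; j++)
            packB[k*8 + j] = B[(n + j)*ldb + k];
}
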
for (n = 0; n < n8; n += 8) {
|
||||
for (m = 0; m < m8; m += 8) {
|
||||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
|
||||
|
||||
INIT_8ACCS();
|
||||
|
||||
register vector double ra0, ra1, ra2, ra3;
|
||||
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0, pb1, pb2, pb3;
|
||||
|
||||
if (has_packing) {
|
||||
if (m == 0) {
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_A_1x8(k, m);
|
||||
LOAD_BTP_8x2(n, k);
|
||||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
|
||||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
|
||||
PACK_B(pb0, pb1, 0);
|
||||
LOAD_A_1x8(k+1, m);
|
||||
KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3,
|
||||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
|
||||
PACK_B(pb2, pb3, 8);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_A_1x8(k, m);
|
||||
LOAD_BTP_8x1(n, k);
|
||||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
|
||||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
|
||||
PACK_B(pb0, pb1, 0);
|
||||
}
|
||||
} else {
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_A_1x8(k, m);
|
||||
LOAD_PACKED_B(pb0, pb1, 0);
|
||||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
|
||||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
|
||||
LOAD_A_1x8(k+1, m);
|
||||
LOAD_PACKED_B(pb2, pb3, 8);
|
||||
KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3,
|
||||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_A_1x8(k, m);
|
||||
LOAD_PACKED_B(pb0, pb1, 0);
|
||||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
|
||||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_A_1x8(k, m);
|
||||
LOAD_BTP_8x2(n, k);
|
||||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
|
||||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
|
||||
LOAD_A_1x8(k+1, m);
|
||||
KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3,
|
||||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_A_1x8(k, m);
|
||||
LOAD_BTP_8x1(n, k);
|
||||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
|
||||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x2_ACC(&acc2, n+0, m+2);
|
||||
SAVE_4x2_ACC(&acc4, n+0, m+4);
|
||||
SAVE_4x2_ACC(&acc6, n+0, m+6);
|
||||
SAVE_4x2_ACC(&acc1, n+4, m+0);
|
||||
SAVE_4x2_ACC(&acc3, n+4, m+2);
|
||||
SAVE_4x2_ACC(&acc5, n+4, m+4);
|
||||
SAVE_4x2_ACC(&acc7, n+4, m+6);
|
||||
}
|
||||
|
||||
for (; m < m4; m += 4) {
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
|
||||
INIT_4ACCS();
|
||||
|
||||
register vector double ra0, ra1;
|
||||
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0, pb1, pb2, pb3;
|
||||
|
||||
if (!has_packing) {
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_A_1x4(k, m);
|
||||
LOAD_BTP_8x2(n, k);
|
||||
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1);
|
||||
LOAD_A_1x4(k+1, m);
|
||||
KERNEL_MMA_4ACC(pb2, pb3, pb2, pb3, ra0, ra0, ra1, ra1);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_A_1x4(k, m);
|
||||
LOAD_BTP_8x1(n, k);
|
||||
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1);
|
||||
}
|
||||
} else {
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_A_1x4(k, m);
|
||||
LOAD_PACKED_B(pb0, pb1, 0);
|
||||
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1);
|
||||
LOAD_A_1x4(k+1, m);
|
||||
LOAD_PACKED_B(pb2, pb3, 8);
|
||||
KERNEL_MMA_4ACC(pb2, pb3, pb2, pb3, ra0, ra0, ra1, ra1);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_A_1x4(k, m);
|
||||
LOAD_PACKED_B(pb0, pb1, 0);
|
||||
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x2_ACC(&acc2, n+0, m+2);
|
||||
SAVE_4x2_ACC(&acc1, n+4, m+0);
|
||||
SAVE_4x2_ACC(&acc3, n+4, m+2);
|
||||
}
|
||||
|
||||
for (; m < m2; m += 2) {
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
|
||||
INIT_4ACCS();
|
||||
|
||||
register vector double ra0;
|
||||
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0, pb1, pb2, pb3;
|
||||
|
||||
if (!has_packing) {
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_A_1x2(k, m);
|
||||
LOAD_BTP_8x2(n, k);
|
||||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
|
||||
LOAD_A_1x2(k+1, m);
|
||||
KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_A_1x2(k, m);
|
||||
LOAD_BTP_8x1(n, k);
|
||||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
|
||||
}
|
||||
} else {
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_A_1x2(k, m);
|
||||
LOAD_PACKED_B(pb0, pb1, 0);
|
||||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
|
||||
LOAD_A_1x2(k+1, m);
|
||||
LOAD_PACKED_B(pb2, pb3, 8);
|
||||
KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_A_1x2(k, m);
|
||||
LOAD_PACKED_B(pb0, pb1, 0);
|
||||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x2_ACC(&acc1, n+4, m+0);
|
||||
}
|
||||
|
||||
for (; m < M; m++) {
|
||||
__vector_quad acc0, acc1;
|
||||
|
||||
INIT_2ACCS();
|
||||
|
||||
register vector double ra0;
|
||||
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0, pb1, pb2, pb3;
|
||||
|
||||
if (!has_packing) {
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_A_1x1(k, m);
|
||||
LOAD_BTP_8x2(n, k);
|
||||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
|
||||
LOAD_A_1x1(k+1, m);
|
||||
KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_A_1x1(k, m);
|
||||
LOAD_BTP_8x1(n, k);
|
||||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
|
||||
}
|
||||
} else {
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_A_1x1(k, m);
|
||||
LOAD_PACKED_B(pb0, pb1, 0);
|
||||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
|
||||
LOAD_A_1x1(k+1, m);
|
||||
LOAD_PACKED_B(pb2, pb3, 8);
|
||||
KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_A_1x1(k, m);
|
||||
LOAD_PACKED_B(pb0, pb1, 0);
|
||||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x1_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x1_ACC(&acc1, n+4, m+0);
|
||||
}
|
||||
}
|
||||
|
||||
for (; n < n4; n += 4) {
|
||||
for (m = 0; m < m8; m += 8) {
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
|
||||
INIT_4ACCS();
|
||||
|
||||
register vector double ra0, ra1, ra2, ra3;
|
||||
register vector double rb0, rb1, rb2, rb3;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0, pb1;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_A_1x8(k, m);
|
||||
LOAD_BTP_4x2(n, k);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
|
||||
LOAD_A_1x8(k+1, m);
|
||||
KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra0, ra1, ra2, ra3);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_A_1x8(k, m);
|
||||
LOAD_BTP_4x1(n, k);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x2_ACC(&acc1, n+0, m+2);
|
||||
SAVE_4x2_ACC(&acc2, n+0, m+4);
|
||||
SAVE_4x2_ACC(&acc3, n+0, m+6);
|
||||
}
|
||||
|
||||
for (; m < m4; m += 4) {
|
||||
__vector_quad acc0, acc1;
|
||||
|
||||
INIT_2ACCS();
|
||||
|
||||
register vector double ra0, ra1;
|
||||
register vector double rb0, rb1, rb2, rb3;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0, pb1;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_A_1x4(k, m);
|
||||
LOAD_BTP_4x2(n, k);
|
||||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
|
||||
LOAD_A_1x4(k+1, m);
|
||||
KERNEL_MMA_2ACC(pb1, pb1, ra0, ra1);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_A_1x4(k, m);
|
||||
LOAD_BTP_4x1(n, k);
|
||||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x2_ACC(&acc1, n+0, m+2);
|
||||
}
|
||||
|
||||
for (; m < m2; m += 2) {
|
||||
__vector_quad acc0;
|
||||
|
||||
INIT_1ACC();
|
||||
|
||||
register vector double ra0;
|
||||
register vector double rb0, rb1, rb2, rb3;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0, pb1;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_A_1x2(k, m);
|
||||
LOAD_BTP_4x2(n, k);
|
||||
KERNEL_MMA_1ACC(pb0, ra0);
|
||||
LOAD_A_1x2(k+1, m);
|
||||
KERNEL_MMA_1ACC(pb1, ra0);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_A_1x2(k, m);
|
||||
LOAD_BTP_4x1(n, k);
|
||||
KERNEL_MMA_1ACC(pb0, ra0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n, m);
|
||||
}
|
||||
|
||||
for (; m < M; m++) {
|
||||
__vector_quad acc0;
|
||||
|
||||
INIT_1ACC();
|
||||
|
||||
register vector double ra0;
|
||||
register vector double rb0, rb1, rb2, rb3;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0, pb1;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_A_1x1(k, m);
|
||||
LOAD_BTP_4x2(n, k);
|
||||
KERNEL_MMA_1ACC(pb0, ra0);
|
||||
LOAD_A_1x1(k+1, m);
|
||||
KERNEL_MMA_1ACC(pb1, ra0);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_A_1x1(k, m);
|
||||
LOAD_BTP_4x1(n, k);
|
||||
KERNEL_MMA_1ACC(pb0, ra0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x1_ACC(&acc0, n, m);
|
||||
}
|
||||
}
|
||||
|
||||
for (; n < n2; n += 2) {
|
||||
for (m = 0; m < m8; m += 8) {
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
|
||||
INIT_4ACCS();
|
||||
|
||||
register vector double ra0, ra1, ra2, ra3;
|
||||
register vector double rb0, rb1;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0, pb1;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_A_1x8(k, m);
|
||||
LOAD_BTP_2x2(n, k);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
|
||||
LOAD_A_1x8(k+1, m);
|
||||
KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra0, ra1, ra2, ra3);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_A_1x8(k, m);
|
||||
LOAD_BTP_2x1(n, k);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_2x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_2x2_ACC(&acc1, n+0, m+2);
|
||||
SAVE_2x2_ACC(&acc2, n+0, m+4);
|
||||
SAVE_2x2_ACC(&acc3, n+0, m+6);
|
||||
}
|
||||
|
||||
for (; m < m4; m += 4) {
|
||||
__vector_quad acc0, acc1;
|
||||
|
||||
INIT_2ACCS();
|
||||
|
||||
register vector double ra0, ra1;
|
||||
register vector double rb0, rb1;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0, pb1;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_A_1x4(k, m);
|
||||
LOAD_BTP_2x2(n, k);
|
||||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
|
||||
LOAD_A_1x4(k+1, m);
|
||||
KERNEL_MMA_2ACC(pb1, pb1, ra0, ra1);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_A_1x4(k, m);
|
||||
LOAD_BTP_2x1(n, k);
|
||||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_2x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_2x2_ACC(&acc1, n+0, m+2);
|
||||
}
|
||||
|
||||
for (; m < m2; m += 2) {
|
||||
__vector_quad acc0;
|
||||
|
||||
INIT_1ACC();
|
||||
|
||||
register vector double ra0;
|
||||
register vector double rb0, rb1;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0, pb1;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_A_1x2(k, m);
|
||||
LOAD_BTP_2x2(n, k);
|
||||
KERNEL_MMA_1ACC(pb0, ra0);
|
||||
LOAD_A_1x2(k+1, m);
|
||||
KERNEL_MMA_1ACC(pb1, ra0);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_A_1x2(k, m);
|
||||
LOAD_BTP_2x1(n, k);
|
||||
KERNEL_MMA_1ACC(pb0, ra0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_2x2_ACC(&acc0, n+0, m+0);
|
||||
}
|
||||
|
||||
for (; m < M; m++) {
|
||||
__vector_quad acc0;
|
||||
|
||||
INIT_1ACC();
|
||||
|
||||
register vector double ra0;
|
||||
register vector double rb0, rb1;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0, pb1;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_A_1x1(k, m);
|
||||
LOAD_BTP_2x2(n, k);
|
||||
KERNEL_MMA_1ACC(pb0, ra0);
|
||||
LOAD_A_1x1(k+1, m);
|
||||
KERNEL_MMA_1ACC(pb1, ra0);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_A_1x1(k, m);
|
||||
LOAD_BTP_2x1(n, k);
|
||||
KERNEL_MMA_1ACC(pb0, ra0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_2x1_ACC(&acc0, n+0, m+0);
|
||||
}
|
||||
}
|
||||
|
||||
for (; n < N; n++) {
|
||||
for (m = 0; m < m8; m += 8) {
|
||||
vector double result = ((vector double){0.,0.});
|
||||
vector double result1 = ((vector double){0.,0.});
|
||||
vector double result2 = ((vector double){0.,0.});
|
||||
vector double result3 = ((vector double){0.,0.});
|
||||
|
||||
register vector double ra0, ra1, ra2, ra3;
|
||||
register vector double rb0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x8(k, m);
|
||||
LOAD_B_1x1(n, k);
|
||||
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
SAVE_1x4_VSR(result, n, m+0);
|
||||
SAVE_1x4_VSR(result1, n, m+2);
|
||||
SAVE_1x4_VSR(result2, n, m+4);
|
||||
SAVE_1x4_VSR(result3, n, m+6);
|
||||
}
|
||||
|
||||
for (; m < m4; m += 4) {
|
||||
vector double result = ((vector double){0.,0.});
|
||||
vector double result1 = ((vector double){0.,0.});
|
||||
|
||||
register vector double ra0, ra1;
|
||||
register vector double rb0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x4(k, m);
|
||||
LOAD_B_1x1(n, k);
|
||||
KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
SAVE_1x4_VSR(result, n, m+0);
|
||||
SAVE_1x4_VSR(result1, n, m+2);
|
||||
}
|
||||
|
||||
for (; m < m2; m += 2) {
|
||||
vector double result = ((vector double){0.,0.});
|
||||
|
||||
register vector double ra0;
|
||||
register vector double rb0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x2(k, m);
|
||||
LOAD_B_1x1(n, k);
|
||||
KERNEL_VMADD_1VSR(ra0, rb0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
SAVE_1x4_VSR(result, n, m+0);
|
||||
}
|
||||
|
||||
for (; m < M; m++) {
FLOAT result = 0.0;

for (k = 0; k < K; k++) {
result += A[m+k*lda] * B[n*ldb+k];
}
result = result * alpha;

#if !defined(B0)
C[n*ldc+m] = (C[n*ldc+m] * beta) + result;
#else
C[n*ldc+m] = result;
#endif
}
}

if (has_packing) free(packB);

return 0;
}
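A plain reference loop for the whole operation can be useful when validating the blocked MMA code above. This is a sketch only, using double/long in place of the FLOAT/BLASLONG typedefs from common.h (A and B are IFLOAT in the real signature), and matching the indexing of the scalar tail loop:

/* Illustration only: reference for the small-matrix kernel above.
 * Element (m,k) of A lives at A[k*lda + m] and element (k,n) of B at
 * B[n*ldb + k]; when B0 is defined the beta*C term is dropped. */
static void small_gemm_ref(long M, long N, long K,
                           const double *A, long lda, double alpha,
                           const double *B, long ldb, double beta,
                           double *C, long ldc)
{
    for (long n = 0; n < N; n++)
        for (long m = 0; m < M; m++) {
            double s = 0.0;
            for (long k = 0; k < K; k++)
                s += A[k*lda + m] * B[n*ldb + k];
            C[n*ldc + m] = alpha * s + beta * C[n*ldc + m];
        }
}
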
|
||||
|
|
@ -0,0 +1,581 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <altivec.h>
|
||||
|
||||
typedef __vector unsigned char vec_t;
|
||||
|
||||
#if !__has_builtin(__builtin_vsx_assemble_pair)
|
||||
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
|
||||
#endif
|
||||
|
||||
#if !defined(B0)
|
||||
#define SAVE_4x2_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[0] = vec_madd(result[0], valpha, rc0); \
|
||||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
|
||||
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[1] = vec_madd(result[1], valpha, rc0); \
|
||||
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
|
||||
rc0 = vec_xl(0, C+(N+2)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[2] = vec_madd(result[2], valpha, rc0); \
|
||||
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
|
||||
rc0 = vec_xl(0, C+(N+3)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[3] = vec_madd(result[3], valpha, rc0); \
|
||||
vec_xst(result[3], 0, C+(N+3)*ldc+M);
|
||||
|
||||
#define SAVE_2x2_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[0] = vec_madd(result[0], valpha, rc0); \
|
||||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
|
||||
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[1] = vec_madd(result[1], valpha, rc0); \
|
||||
vec_xst(result[1], 0, C+(N+1)*ldc+M);
|
||||
|
||||
#define SAVE_1x4_VSR(result, N, M) \
|
||||
rc0 = vec_xl(0, C+((N)*ldc)+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result = vec_madd(result, valpha, rc0); \
|
||||
vec_xst(result, 0, C+((N)*ldc)+M);
|
||||
|
||||
#define SAVE_4x1_VSR(result, N, M) \
|
||||
result = vec_mul(result, valpha); \
|
||||
C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \
|
||||
C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1];
|
||||
|
||||
#else
|
||||
|
||||
#define SAVE_4x2_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
result[0] = vec_mul(result[0], valpha); \
|
||||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
|
||||
result[1] = vec_mul(result[1], valpha); \
|
||||
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
|
||||
result[2] = vec_mul(result[2], valpha); \
|
||||
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
|
||||
result[3] = vec_mul(result[3], valpha); \
|
||||
vec_xst(result[3], 0, C+(N+3)*ldc+M);
|
||||
|
||||
#define SAVE_2x2_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
result[0] = vec_mul(result[0], valpha); \
|
||||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
|
||||
result[1] = vec_mul(result[1], valpha); \
|
||||
vec_xst(result[1], 0, C+(N+1)*ldc+M);
|
||||
|
||||
#define SAVE_1x4_VSR(result, N, M) \
|
||||
result = vec_mul(result, valpha); \
|
||||
vec_xst(result, 0, C+((N)*ldc)+M);
|
||||
|
||||
#define SAVE_4x1_VSR(result, N, M) \
|
||||
result = vec_mul(result, valpha); \
|
||||
C[(N+0)*ldc+M] = result[0]; \
|
||||
C[(N+1)*ldc+M] = result[1];
|
||||
|
||||
#endif
|
||||
|
||||
#define INIT_8ACCS() \
|
||||
__builtin_mma_xxsetaccz(&acc0); \
|
||||
__builtin_mma_xxsetaccz(&acc1); \
|
||||
__builtin_mma_xxsetaccz(&acc2); \
|
||||
__builtin_mma_xxsetaccz(&acc3); \
|
||||
__builtin_mma_xxsetaccz(&acc4); \
|
||||
__builtin_mma_xxsetaccz(&acc5); \
|
||||
__builtin_mma_xxsetaccz(&acc6); \
|
||||
__builtin_mma_xxsetaccz(&acc7);
|
||||
|
||||
#define INIT_4ACCS() \
|
||||
__builtin_mma_xxsetaccz(&acc0); \
|
||||
__builtin_mma_xxsetaccz(&acc1); \
|
||||
__builtin_mma_xxsetaccz(&acc2); \
|
||||
__builtin_mma_xxsetaccz(&acc3);
|
||||
|
||||
#define INIT_2ACCS() \
|
||||
__builtin_mma_xxsetaccz(&acc0); \
|
||||
__builtin_mma_xxsetaccz(&acc1);
|
||||
|
||||
#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
|
||||
|
||||
#define LOAD_A_1x8(K, M) \
|
||||
ra0 = vec_xl(0, A+(K*lda)+M+0); \
|
||||
ra1 = vec_xl(0, A+(K*lda)+M+2); \
|
||||
ra2 = vec_xl(0, A+(K*lda)+M+4); \
|
||||
ra3 = vec_xl(0, A+(K*lda)+M+6);
|
||||
|
||||
#define LOAD_A_1x4(K, M) \
|
||||
ra0 = vec_xl(0, A+(K*lda)+M+0); \
|
||||
ra1 = vec_xl(0, A+(K*lda)+M+2);
|
||||
|
||||
#define LOAD_A_1x2(K, M) ra0 = vec_xl(0, A+(K*lda)+M);
|
||||
|
||||
#define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M]);
|
||||
|
||||
#define LOAD_BP_1x8(K, N) \
|
||||
pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \
|
||||
pb1 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+4]));
|
||||
|
||||
#define LOAD_BP_1x4(K, N) \
|
||||
pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0]));
|
||||
|
||||
#define LOAD_BP_1x2(K, N) \
|
||||
t0 = vec_xl(0, B+(K*ldb)+N); \
|
||||
__builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0);
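/* Only two B values exist here, so the same vector is placed in both halves
   of the pair; the extra accumulator rows this produces are simply never
   stored (the 2-column blocks are saved with SAVE_2x2_ACC). */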
|
||||
|
||||
#define LOAD_B_1x8(K, N) \
|
||||
rb0 = vec_xl(0, B+(K*ldb)+N+0); \
|
||||
rb1 = vec_xl(0, B+(K*ldb)+N+2); \
|
||||
rb2 = vec_xl(0, B+(K*ldb)+N+4); \
|
||||
rb3 = vec_xl(0, B+(K*ldb)+N+6);
|
||||
|
||||
#define LOAD_B_1x4(K, N) \
|
||||
rb0 = vec_xl(0, B+(K*ldb)+N+0); \
|
||||
rb1 = vec_xl(0, B+(K*ldb)+N+2);
|
||||
|
||||
#define LOAD_B_1x2(K, N) \
|
||||
rb0 = vec_xl(0, B+(K*ldb)+N+0);
|
||||
|
||||
#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[K*ldb+N]);
|
||||
|
||||
#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \
|
||||
a0, a1, a2, a3, a4, a5, a6, a7) \
|
||||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
|
||||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
|
||||
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
|
||||
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \
|
||||
__builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \
|
||||
__builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \
|
||||
__builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \
|
||||
__builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7);
|
||||
|
||||
#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \
|
||||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
|
||||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
|
||||
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
|
||||
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3);
|
||||
|
||||
#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \
|
||||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
|
||||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1);
|
||||
|
||||
#define KERNEL_MMA_1ACC(b0, a0) \
|
||||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0);
|
||||
|
||||
#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \
|
||||
result = vec_madd(a0, b0, result); \
|
||||
result1 = vec_madd(a1, b1, result1); \
|
||||
result2 = vec_madd(a2, b2, result2); \
|
||||
result3 = vec_madd(a3, b3, result3);
|
||||
|
||||
#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \
|
||||
result = vec_madd(a0, b0, result); \
|
||||
result1 = vec_madd(a1, b1, result1);
|
||||
|
||||
#define KERNEL_VMADD_1VSR(a0, b0) \
|
||||
result = vec_madd(a0, b0, result);
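/* How one of these MMA accumulators is used in isolation (illustrative
   sketch only, assuming a POWER10/MMA-capable compiler; not part of the
   kernel): a single xvf64gerpp takes a __vector_pair holding 4 doubles of B
   and a VSR holding 2 doubles of A, and accumulates the 4x2 outer product
   into a __vector_quad:

     __vector_quad acc;
     __builtin_mma_xxsetaccz(&acc);                      // acc = 0
     vector double ra = vec_xl(0, a);                    // a[0..1]
     __vector_pair pb = *((__vector_pair *)(void *)b);   // b[0..3]
     __builtin_mma_xvf64gerpp(&acc, pb, (vec_t)ra);      // acc[i][j] += b[i]*a[j]
     vector double rows[4];
     __builtin_mma_disassemble_acc((void *)rows, &acc);  // rows[i] = row i of acc
*/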
|
||||
|
||||
#ifdef B0
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
|
||||
#else
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
|
||||
#endif
|
||||
{
|
||||
BLASLONG m, n, k;
|
||||
|
||||
BLASLONG m8 = M & ~7;
|
||||
BLASLONG m4 = M & ~3;
|
||||
BLASLONG m2 = M & ~1;
|
||||
|
||||
BLASLONG n8 = N & ~7;
|
||||
BLASLONG n4 = N & ~3;
|
||||
BLASLONG n2 = N & ~1;
|
||||
|
||||
vector double valpha = vec_splats(alpha);
|
||||
#if !defined(B0)
|
||||
vector double vbeta = vec_splats(beta);
|
||||
#endif
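/* C is processed in blocks: 8 rows of m at a time, and within each m-panel
   8, 4, 2 and finally 1 column(s) of n (with 4/2/1-row m tails afterwards);
   each block accumulates over the full K range before its part of C is
   written back once. */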
|
||||
|
||||
for (m = 0; m < m8; m += 8) {
|
||||
for (n = 0; n < n8; n += 8) {
|
||||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
|
||||
|
||||
INIT_8ACCS();
|
||||
|
||||
register vector double ra0, ra1, ra2, ra3;
|
||||
__vector_pair pb0, pb1;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x8(k, m);
|
||||
LOAD_BP_1x8(k, n);
|
||||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
|
||||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x2_ACC(&acc2, n+0, m+2);
|
||||
SAVE_4x2_ACC(&acc4, n+0, m+4);
|
||||
SAVE_4x2_ACC(&acc6, n+0, m+6);
|
||||
SAVE_4x2_ACC(&acc1, n+4, m+0);
|
||||
SAVE_4x2_ACC(&acc3, n+4, m+2);
|
||||
SAVE_4x2_ACC(&acc5, n+4, m+4);
|
||||
SAVE_4x2_ACC(&acc7, n+4, m+6);
|
||||
}
|
||||
|
||||
for (; n < n4; n += 4) {
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
|
||||
INIT_4ACCS();
|
||||
|
||||
register vector double ra0, ra1, ra2, ra3;
|
||||
__vector_pair pb0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x8(k, m);
|
||||
LOAD_BP_1x4(k, n);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x2_ACC(&acc1, n+0, m+2);
|
||||
SAVE_4x2_ACC(&acc2, n+0, m+4);
|
||||
SAVE_4x2_ACC(&acc3, n+0, m+6);
|
||||
}
|
||||
|
||||
for (; n < n2; n += 2) {
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
|
||||
INIT_4ACCS();
|
||||
|
||||
register vector double ra0, ra1, ra2, ra3;
|
||||
register vector double t0;
|
||||
__vector_pair pb0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x8(k, m);
|
||||
LOAD_BP_1x2(k, n);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_2x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_2x2_ACC(&acc1, n+0, m+2);
|
||||
SAVE_2x2_ACC(&acc2, n+0, m+4);
|
||||
SAVE_2x2_ACC(&acc3, n+0, m+6);
|
||||
}
|
||||
|
||||
for (; n < N; n++) {
|
||||
register vector double result = ((vector double){0.,0.});
|
||||
register vector double result1 = ((vector double){0.,0.});
|
||||
register vector double result2 = ((vector double){0.,0.});
|
||||
register vector double result3 = ((vector double){0.,0.});
|
||||
|
||||
register vector double ra0, ra1, ra2, ra3;
|
||||
register vector double rb0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x8(k, m);
|
||||
LOAD_B_1x1(k, n);
|
||||
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
SAVE_1x4_VSR(result, n, m+0);
|
||||
SAVE_1x4_VSR(result1, n, m+2);
|
||||
SAVE_1x4_VSR(result2, n, m+4);
|
||||
SAVE_1x4_VSR(result3, n, m+6);
|
||||
}
|
||||
}
|
||||
|
||||
for (; m < m4; m += 4) {
|
||||
for (n = 0; n < n8; n += 8) {
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
|
||||
INIT_4ACCS();
|
||||
|
||||
register vector double ra0, ra1;
|
||||
__vector_pair pb0, pb1;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x4(k, m);
|
||||
LOAD_BP_1x8(k, n);
|
||||
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x2_ACC(&acc2, n+0, m+2);
|
||||
SAVE_4x2_ACC(&acc1, n+4, m+0);
|
||||
SAVE_4x2_ACC(&acc3, n+4, m+2);
|
||||
}
|
||||
|
||||
for (; n < n4; n += 4) {
|
||||
__vector_quad acc0, acc1;
|
||||
|
||||
INIT_2ACCS();
|
||||
|
||||
register vector double ra0, ra1;
|
||||
__vector_pair pb0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x4(k, m);
|
||||
LOAD_BP_1x4(k, n);
|
||||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x2_ACC(&acc1, n+0, m+2);
|
||||
}
|
||||
|
||||
for (; n < n2; n += 2) {
|
||||
__vector_quad acc0, acc1;
|
||||
|
||||
INIT_2ACCS();
|
||||
|
||||
register vector double ra0, ra1;
|
||||
register vector double t0;
|
||||
__vector_pair pb0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x4(k, m);
|
||||
LOAD_BP_1x2(k, n);
|
||||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_2x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_2x2_ACC(&acc1, n+0, m+2);
|
||||
}
|
||||
|
||||
for (; n < N; n++) {
|
||||
register vector double result = ((vector double){0.,0.});
|
||||
register vector double result1 = ((vector double){0.,0.});
|
||||
|
||||
register vector double ra0, ra1;
|
||||
register vector double rb0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x4(k, m);
|
||||
LOAD_B_1x1(k, n);
|
||||
KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
SAVE_1x4_VSR(result, n, m+0);
|
||||
SAVE_1x4_VSR(result1, n, m+2);
|
||||
}
|
||||
}
|
||||
|
||||
for (; m < m2; m += 2) {
|
||||
for (n = 0; n < n8; n += 8) {
|
||||
__vector_quad acc0, acc1;
|
||||
|
||||
INIT_2ACCS();
|
||||
|
||||
register vector double ra0;
|
||||
__vector_pair pb0, pb1;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x2(k, m);
|
||||
LOAD_BP_1x8(k, n);
|
||||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x2_ACC(&acc1, n+4, m+0);
|
||||
}
|
||||
|
||||
for (; n < n4; n += 4) {
|
||||
__vector_quad acc0;
|
||||
|
||||
INIT_1ACC();
|
||||
|
||||
register vector double ra0;
|
||||
__vector_pair pb0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x2(k, m);
|
||||
LOAD_BP_1x4(k, n);
|
||||
KERNEL_MMA_1ACC(pb0, ra0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n, m);
|
||||
}
|
||||
|
||||
for (; n < n2; n += 2) {
|
||||
__vector_quad acc0;
|
||||
|
||||
INIT_1ACC();
|
||||
|
||||
register vector double ra0;
|
||||
register vector double t0;
|
||||
__vector_pair pb0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x2(k, m);
|
||||
LOAD_BP_1x2(k, n);
|
||||
KERNEL_MMA_1ACC(pb0, ra0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_2x2_ACC(&acc0, n, m);
|
||||
}
|
||||
|
||||
for (; n < N; n++) {
|
||||
register vector double result = ((vector double){0.,0.});
|
||||
|
||||
register vector double ra0;
|
||||
register vector double rb0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x2(k, m);
|
||||
LOAD_B_1x1(k, n);
|
||||
KERNEL_VMADD_1VSR(ra0, rb0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
SAVE_1x4_VSR(result, n, m+0);
|
||||
}
|
||||
}
|
||||
|
||||
for (; m < M; m++) {
|
||||
for (n = 0; n < n8; n += 8) {
|
||||
register vector double result = ((vector double){0.,0.});
|
||||
register vector double result1 = ((vector double){0.,0.});
|
||||
register vector double result2 = ((vector double){0.,0.});
|
||||
register vector double result3 = ((vector double){0.,0.});
|
||||
|
||||
register vector double ra0;
|
||||
register vector double rb0, rb1, rb2, rb3;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x1(k, m);
|
||||
LOAD_B_1x8(k, n);
|
||||
KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3);
|
||||
}
|
||||
|
||||
SAVE_4x1_VSR(result, n, m);
|
||||
SAVE_4x1_VSR(result1, n+2, m);
|
||||
SAVE_4x1_VSR(result2, n+4, m);
|
||||
SAVE_4x1_VSR(result3, n+6, m);
|
||||
}
|
||||
|
||||
for (; n < n4; n += 4) {
|
||||
register vector double result = ((vector double){0.,0.});
|
||||
register vector double result1 = ((vector double){0.,0.});
|
||||
|
||||
register vector double ra0;
|
||||
register vector double rb0, rb1;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x1(k, m);
|
||||
LOAD_B_1x4(k, n);
|
||||
KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1);
|
||||
}
|
||||
|
||||
SAVE_4x1_VSR(result, n, m);
|
||||
SAVE_4x1_VSR(result1, n+2, m);
|
||||
}
|
||||
|
||||
for (; n < n2; n += 2) {
|
||||
register vector double result = ((vector double){0.,0.});
|
||||
|
||||
register vector double ra0;
|
||||
register vector double rb0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x1(k, m);
|
||||
LOAD_B_1x2(k, n);
|
||||
KERNEL_VMADD_1VSR(ra0, rb0);
|
||||
}
|
||||
|
||||
SAVE_4x1_VSR(result, n, m);
|
||||
}
|
||||
|
||||
for (; n < N; n++) {
|
||||
FLOAT result = 0.0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
result += A[k*lda+m] * B[k*ldb+n];
|
||||
}
|
||||
result = result * alpha;
|
||||
|
||||
#if !defined(B0)
|
||||
C[n*ldc+m] = (C[n*ldc+m] * beta) + result;
|
||||
#else
|
||||
C[n*ldc+m] = result;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
@@ -0,0 +1,882 @@
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <altivec.h>
|
||||
|
||||
typedef __vector unsigned char vec_t;
|
||||
|
||||
#if !__has_builtin(__builtin_vsx_assemble_pair)
|
||||
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
|
||||
#endif
|
||||
|
||||
#if !defined(B0)
|
||||
#define SAVE_4x2_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[0] = vec_madd(result[0], valpha, rc0); \
|
||||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
|
||||
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[1] = vec_madd(result[1], valpha, rc0); \
|
||||
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
|
||||
rc0 = vec_xl(0, C+(N+2)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[2] = vec_madd(result[2], valpha, rc0); \
|
||||
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
|
||||
rc0 = vec_xl(0, C+(N+3)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[3] = vec_madd(result[3], valpha, rc0); \
|
||||
vec_xst(result[3], 0, C+(N+3)*ldc+M);
|
||||
|
||||
#define SAVE_4x1_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[0] = vec_madd(result[0], valpha, rc0); \
|
||||
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
|
||||
rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[1] = vec_madd(result[1], valpha, rc0); \
|
||||
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \
|
||||
rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[2] = vec_madd(result[2], valpha, rc0); \
|
||||
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \
|
||||
rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[3] = vec_madd(result[3], valpha, rc0); \
|
||||
vec_xst_len(result[3], C+(N+3)*ldc+M, 8);
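/* SAVE_4x1_ACC above and SAVE_2x1_ACC below handle the single-column (m tail)
   case: each accumulator row still holds two doubles, but only the first one
   belongs to C, so vec_xl_len/vec_xst_len with a length of 8 bytes load and
   store just one double per row of C. */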
|
||||
|
||||
#define SAVE_2x2_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[0] = vec_madd(result[0], valpha, rc0); \
|
||||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
|
||||
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[1] = vec_madd(result[1], valpha, rc0); \
|
||||
vec_xst(result[1], 0, C+(N+1)*ldc+M);
|
||||
|
||||
#define SAVE_2x1_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[0] = vec_madd(result[0], valpha, rc0); \
|
||||
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
|
||||
rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[1] = vec_madd(result[1], valpha, rc0); \
|
||||
vec_xst_len(result[1], C+(N+1)*ldc+M, 8);
|
||||
|
||||
#define SAVE_1x4_VSR(result, N, M) \
|
||||
rc0 = vec_xl(0, C+((N)*ldc)+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result = vec_madd(result, valpha, rc0); \
|
||||
vec_xst(result, 0, C+((N)*ldc)+M);
|
||||
|
||||
#else
|
||||
|
||||
#define SAVE_4x2_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
result[0] = vec_mul(result[0], valpha); \
|
||||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
|
||||
result[1] = vec_mul(result[1], valpha); \
|
||||
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
|
||||
result[2] = vec_mul(result[2], valpha); \
|
||||
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
|
||||
result[3] = vec_mul(result[3], valpha); \
|
||||
vec_xst(result[3], 0, C+(N+3)*ldc+M);
|
||||
|
||||
#define SAVE_4x1_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
result[0] = vec_mul(result[0], valpha); \
|
||||
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
|
||||
result[1] = vec_mul(result[1], valpha); \
|
||||
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \
|
||||
result[2] = vec_mul(result[2], valpha); \
|
||||
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \
|
||||
result[3] = vec_mul(result[3], valpha); \
|
||||
vec_xst_len(result[3], C+(N+3)*ldc+M, 8);
|
||||
|
||||
#define SAVE_2x2_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
result[0] = vec_mul(result[0], valpha); \
|
||||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
|
||||
result[1] = vec_mul(result[1], valpha); \
|
||||
vec_xst(result[1], 0, C+(N+1)*ldc+M);
|
||||
|
||||
#define SAVE_2x1_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
result[0] = vec_mul(result[0], valpha); \
|
||||
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
|
||||
result[1] = vec_mul(result[1], valpha); \
|
||||
vec_xst_len(result[1], C+(N+1)*ldc+M, 8);
|
||||
|
||||
#define SAVE_1x4_VSR(result, N, M) \
|
||||
result = vec_mul(result, valpha); \
|
||||
vec_xst(result, 0, C+((N)*ldc)+M);
|
||||
|
||||
#endif
|
||||
|
||||
#define INIT_8ACCS() \
|
||||
__builtin_mma_xxsetaccz(&acc0); \
|
||||
__builtin_mma_xxsetaccz(&acc1); \
|
||||
__builtin_mma_xxsetaccz(&acc2); \
|
||||
__builtin_mma_xxsetaccz(&acc3); \
|
||||
__builtin_mma_xxsetaccz(&acc4); \
|
||||
__builtin_mma_xxsetaccz(&acc5); \
|
||||
__builtin_mma_xxsetaccz(&acc6); \
|
||||
__builtin_mma_xxsetaccz(&acc7);
|
||||
|
||||
#define INIT_4ACCS() \
|
||||
__builtin_mma_xxsetaccz(&acc0); \
|
||||
__builtin_mma_xxsetaccz(&acc1); \
|
||||
__builtin_mma_xxsetaccz(&acc2); \
|
||||
__builtin_mma_xxsetaccz(&acc3);
|
||||
|
||||
#define INIT_2ACCS() \
|
||||
__builtin_mma_xxsetaccz(&acc0); \
|
||||
__builtin_mma_xxsetaccz(&acc1);
|
||||
|
||||
#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
|
||||
|
||||
#if (defined(__GNUC__) && (__GNUC__ == 10))
|
||||
#if defined(_AIX)
|
||||
#define LOAD_PAIR(pair, v0, v1) \
|
||||
__builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1);
|
||||
#else
|
||||
#define LOAD_PAIR(pair, v0, v1) \
|
||||
__builtin_vsx_assemble_pair(&pair, (vec_t)v1, (vec_t)v0);
|
||||
#endif
|
||||
#else
|
||||
#define LOAD_PAIR(pair, v0, v1) \
|
||||
__builtin_vsx_build_pair(&pair, (vec_t)v0, (vec_t)v1);
|
||||
#endif
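/* Note: with GCC 10 the operand order expected by __builtin_vsx_assemble_pair
   differs between AIX and little-endian Linux, hence the swapped arguments in
   the non-AIX branch above; newer compilers use __builtin_vsx_build_pair,
   which needs no such swap. */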
|
||||
|
||||
#define LOAD_AT_8x2(M, K) \
|
||||
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \
|
||||
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \
|
||||
t0 = vec_mergeh(ra0, ra1); \
|
||||
t1 = vec_mergel(ra0, ra1); \
|
||||
ra0 = t0; \
|
||||
ra1 = t1; \
|
||||
ra2 = vec_xl(0, A+(M+2)*lda+K+0); \
|
||||
ra3 = vec_xl(0, A+(M+3)*lda+K+0); \
|
||||
t0 = vec_mergeh(ra2, ra3); \
|
||||
t1 = vec_mergel(ra2, ra3); \
|
||||
ra2 = t0; \
|
||||
ra3 = t1; \
|
||||
ra4 = vec_xl(0, A+(M+4)*lda+K+0); \
|
||||
ra5 = vec_xl(0, A+(M+5)*lda+K+0); \
|
||||
t0 = vec_mergeh(ra4, ra5); \
|
||||
t1 = vec_mergel(ra4, ra5); \
|
||||
ra4 = t0; \
|
||||
ra5 = t1; \
|
||||
ra6 = vec_xl(0, A+(M+6)*lda+K+0); \
|
||||
ra7 = vec_xl(0, A+(M+7)*lda+K+0); \
|
||||
t0 = vec_mergeh(ra6, ra7); \
|
||||
t1 = vec_mergel(ra6, ra7); \
|
||||
ra6 = t0; \
|
||||
ra7 = t1;
|
||||
|
||||
#define LOAD_AT_8x1(M, K) \
|
||||
ra0 = vec_xor(ra0, ra0); \
|
||||
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
|
||||
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \
|
||||
ra1 = vec_xor(ra1, ra1); \
|
||||
ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \
|
||||
ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \
|
||||
ra2 = vec_xor(ra2, ra2); \
|
||||
ra2 = vec_insert(A[(M+4)*lda+K], ra2, 0); \
|
||||
ra2 = vec_insert(A[(M+5)*lda+K], ra2, 1); \
|
||||
ra3 = vec_xor(ra3, ra3); \
|
||||
ra3 = vec_insert(A[(M+6)*lda+K], ra3, 0); \
|
||||
ra3 = vec_insert(A[(M+7)*lda+K], ra3, 1);
|
||||
|
||||
#define LOAD_AT_4x2(M, K) \
|
||||
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \
|
||||
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \
|
||||
ra2 = vec_xl(0, A+(M+2)*lda+K+0); \
|
||||
ra3 = vec_xl(0, A+(M+3)*lda+K+0); \
|
||||
t0 = vec_mergeh(ra0, ra1); \
|
||||
t1 = vec_mergeh(ra2, ra3); \
|
||||
t2 = vec_mergel(ra0, ra1); \
|
||||
t3 = vec_mergel(ra2, ra3); \
|
||||
ra0 = t0; \
|
||||
ra1 = t2; \
|
||||
ra2 = t1; \
|
||||
ra3 = t3;
|
||||
|
||||
#define LOAD_AT_4x1(M, K) \
|
||||
ra0 = vec_xor(ra0, ra0); \
|
||||
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
|
||||
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \
|
||||
ra1 = vec_xor(ra1, ra1); \
|
||||
ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \
|
||||
ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1);
|
||||
|
||||
#define LOAD_AT_2x2(M, K) \
|
||||
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \
|
||||
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \
|
||||
t0 = vec_mergeh(ra0, ra1); \
|
||||
t1 = vec_mergel(ra0, ra1); \
|
||||
ra0 = t0; \
|
||||
ra1 = t1;
|
||||
|
||||
#define LOAD_AT_2x1(M, K) \
|
||||
ra0 = vec_xor(ra0, ra0); \
|
||||
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
|
||||
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1);
|
||||
|
||||
#define LOAD_A_1x1(K, M) \
|
||||
ra0 = vec_splats(A[((M+0)*lda)+K+0]);
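/* The vec_mergeh/vec_mergel pairs in the LOAD_AT_* macros above implement an
   in-register transpose: given two rows of A loaded as VSRs, mergeh collects
   element 0 of each row and mergel collects element 1, i.e. the two columns
   (hypothetical standalone fragment, shown only to illustrate the idiom):

     vector double r0 = vec_xl(0, row0);     // { row0[0], row0[1] }
     vector double r1 = vec_xl(0, row1);     // { row1[0], row1[1] }
     vector double c0 = vec_mergeh(r0, r1);  // { row0[0], row1[0] }  column k
     vector double c1 = vec_mergel(r0, r1);  // { row0[1], row1[1] }  column k+1
*/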
|
||||
|
||||
#define LOAD_BTP_8x2(N, K) \
|
||||
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \
|
||||
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \
|
||||
rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \
|
||||
rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \
|
||||
t0 = vec_mergeh(rb0, rb1); \
|
||||
t1 = vec_mergeh(rb2, rb3); \
|
||||
LOAD_PAIR(pb0, t0, t1); \
|
||||
t0 = vec_mergel(rb0, rb1); \
|
||||
t1 = vec_mergel(rb2, rb3); \
|
||||
LOAD_PAIR(pb2, t0, t1); \
|
||||
rb4 = vec_xl(0, B+(N+4)*ldb+K+0); \
|
||||
rb5 = vec_xl(0, B+(N+5)*ldb+K+0); \
|
||||
rb6 = vec_xl(0, B+(N+6)*ldb+K+0); \
|
||||
rb7 = vec_xl(0, B+(N+7)*ldb+K+0); \
|
||||
t0 = vec_mergeh(rb4, rb5); \
|
||||
t1 = vec_mergeh(rb6, rb7); \
|
||||
LOAD_PAIR(pb1, t0, t1); \
|
||||
t0 = vec_mergel(rb4, rb5); \
|
||||
t1 = vec_mergel(rb6, rb7); \
|
||||
LOAD_PAIR(pb3, t0, t1);
|
||||
|
||||
#define LOAD_BTP_8x1(N, K) \
|
||||
rb0 = vec_xor(rb0, rb0); \
|
||||
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
|
||||
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \
|
||||
rb1 = vec_xor(rb1, rb1); \
|
||||
rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \
|
||||
rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \
|
||||
LOAD_PAIR(pb0, rb0, rb1); \
|
||||
rb0 = vec_xor(rb0, rb0); \
|
||||
rb0 = vec_insert(B[(N+4)*ldb+K], rb0, 0); \
|
||||
rb0 = vec_insert(B[(N+5)*ldb+K], rb0, 1); \
|
||||
rb1 = vec_xor(rb1, rb1); \
|
||||
rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 0); \
|
||||
rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 1); \
|
||||
LOAD_PAIR(pb1, rb0, rb1);
|
||||
|
||||
#define LOAD_BTP_4x2(N, K) \
|
||||
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \
|
||||
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \
|
||||
rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \
|
||||
rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \
|
||||
t0 = vec_mergeh(rb0, rb1); \
|
||||
t1 = vec_mergeh(rb2, rb3); \
|
||||
LOAD_PAIR(pb0, t0, t1); \
|
||||
t0 = vec_mergel(rb0, rb1); \
|
||||
t1 = vec_mergel(rb2, rb3); \
|
||||
LOAD_PAIR(pb1, t0, t1);
|
||||
|
||||
#define LOAD_BTP_4x1(N, K) \
|
||||
rb0 = vec_xor(rb0, rb0); \
|
||||
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
|
||||
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \
|
||||
rb1 = vec_xor(rb1, rb1); \
|
||||
rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \
|
||||
rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \
|
||||
LOAD_PAIR(pb0, rb0, rb1);
|
||||
|
||||
#define LOAD_BTP_2x2(N, K) \
|
||||
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \
|
||||
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \
|
||||
t0 = vec_mergeh(rb0, rb1); \
|
||||
__builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); \
|
||||
t1 = vec_mergel(rb0, rb1); \
|
||||
__builtin_vsx_assemble_pair(&pb1, (vec_t)t1, (vec_t)t1);
|
||||
|
||||
#define LOAD_BTP_2x1(N, K) \
|
||||
rb0 = vec_xor(rb0, rb0); \
|
||||
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
|
||||
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \
|
||||
__builtin_vsx_assemble_pair(&pb0, (vec_t)rb0, (vec_t)rb0);
|
||||
|
||||
#define LOAD_B_1x1(N, K) rb0 = vec_splats(B[((N)*ldb)+K]);
|
||||
|
||||
#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \
|
||||
a0, a1, a2, a3, a4, a5, a6, a7) \
|
||||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
|
||||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
|
||||
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
|
||||
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \
|
||||
__builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \
|
||||
__builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \
|
||||
__builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \
|
||||
__builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7);
|
||||
|
||||
#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \
|
||||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
|
||||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
|
||||
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
|
||||
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3);
|
||||
|
||||
#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \
|
||||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
|
||||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1);
|
||||
|
||||
#define KERNEL_MMA_1ACC(b0, a0) \
|
||||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0);
|
||||
|
||||
#define KERNEL_MMA_1ACC_(acc, b0, a0) \
|
||||
__builtin_mma_xvf64gerpp(&acc, b0, (vec_t)a0);
|
||||
|
||||
#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \
|
||||
result = vec_madd(a0, b0, result); \
|
||||
result1 = vec_madd(a1, b1, result1); \
|
||||
result2 = vec_madd(a2, b2, result2); \
|
||||
result3 = vec_madd(a3, b3, result3);
|
||||
|
||||
#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \
|
||||
result = vec_madd(a0, b0, result); \
|
||||
result1 = vec_madd(a1, b1, result1);
|
||||
|
||||
#define KERNEL_VMADD_1VSR(a0, b0) \
|
||||
result = vec_madd(a0, b0, result);
|
||||
|
||||
#ifdef B0
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
|
||||
#else
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
|
||||
#endif
|
||||
{
|
||||
BLASLONG m, n, k;
|
||||
|
||||
BLASLONG m8 = M & ~7;
|
||||
BLASLONG m4 = M & ~3;
|
||||
BLASLONG m2 = M & ~1;
|
||||
|
||||
BLASLONG n8 = N & ~7;
|
||||
BLASLONG n4 = N & ~3;
|
||||
BLASLONG n2 = N & ~1;
|
||||
|
||||
BLASLONG k2 = K & ~1;
|
||||
|
||||
vector double valpha = vec_splats(alpha);
|
||||
#if !defined(B0)
|
||||
vector double vbeta = vec_splats(beta);
|
||||
#endif
|
||||
|
||||
for (m = 0; m < m8; m += 8) {
|
||||
for (n = 0; n < n8; n += 8) {
|
||||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
|
||||
|
||||
INIT_8ACCS();
|
||||
|
||||
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
|
||||
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0, pb1, pb2, pb3;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_AT_8x2(m, k);
|
||||
LOAD_BTP_8x2(n, k);
|
||||
KERNEL_MMA_8ACC(pb0, pb0, pb0, pb0, pb1, pb1, pb1, pb1,
|
||||
ra0, ra2, ra4, ra6, ra0, ra2, ra4, ra6);
|
||||
KERNEL_MMA_8ACC(pb2, pb2, pb2, pb2, pb3, pb3, pb3, pb3,
|
||||
ra1, ra3, ra5, ra7, ra1, ra3, ra5, ra7);
|
||||
}
|
||||
// k remainder: reload A and B in narrow 4x1 pieces per accumulator instead of
// the full 8x2 tile, as a workaround to avoid register spilling
|
||||
for (; k < K; k++) {
|
||||
LOAD_AT_4x1(m, k);
|
||||
LOAD_BTP_4x1(n, k);
|
||||
KERNEL_MMA_1ACC_(acc0, pb0, ra0);
|
||||
KERNEL_MMA_1ACC_(acc1, pb0, ra1);
|
||||
LOAD_AT_4x1(m+4, k);
|
||||
KERNEL_MMA_1ACC_(acc2, pb0, ra0);
|
||||
KERNEL_MMA_1ACC_(acc3, pb0, ra1);
|
||||
LOAD_AT_4x1(m, k);
|
||||
LOAD_BTP_4x1(n+4, k);
|
||||
KERNEL_MMA_1ACC_(acc4, pb0, ra0);
|
||||
KERNEL_MMA_1ACC_(acc5, pb0, ra1);
|
||||
LOAD_AT_4x1(m+4, k);
|
||||
KERNEL_MMA_1ACC_(acc6, pb0, ra0);
|
||||
KERNEL_MMA_1ACC_(acc7, pb0, ra1);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x2_ACC(&acc2, n+0, m+4);
|
||||
SAVE_4x2_ACC(&acc4, n+4, m+0);
|
||||
SAVE_4x2_ACC(&acc6, n+4, m+4);
|
||||
SAVE_4x2_ACC(&acc1, n+0, m+2);
|
||||
SAVE_4x2_ACC(&acc3, n+0, m+6);
|
||||
SAVE_4x2_ACC(&acc5, n+4, m+2);
|
||||
SAVE_4x2_ACC(&acc7, n+4, m+6);
|
||||
}
|
||||
|
||||
for (; n < n4; n += 4) {
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
|
||||
INIT_4ACCS();
|
||||
|
||||
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
|
||||
register vector double rb0, rb1, rb2, rb3;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0, pb1;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_AT_8x2(m, k);
|
||||
LOAD_BTP_4x2(n, k);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6);
|
||||
KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra1, ra3, ra5, ra7);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_AT_8x1(m, k);
|
||||
LOAD_BTP_4x1(n, k);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x2_ACC(&acc2, n+0, m+4);
|
||||
SAVE_4x2_ACC(&acc1, n+0, m+2);
|
||||
SAVE_4x2_ACC(&acc3, n+0, m+6);
|
||||
}
|
||||
|
||||
for (; n < n2; n += 2) {
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
|
||||
INIT_4ACCS();
|
||||
|
||||
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
|
||||
register vector double rb0, rb1;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0, pb1;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_AT_8x2(m, k);
|
||||
LOAD_BTP_2x2(n, k);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6);
|
||||
KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra1, ra3, ra5, ra7);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_AT_8x1(m, k);
|
||||
LOAD_BTP_2x1(n, k);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_2x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_2x2_ACC(&acc2, n+0, m+4);
|
||||
SAVE_2x2_ACC(&acc1, n+0, m+2);
|
||||
SAVE_2x2_ACC(&acc3, n+0, m+6);
|
||||
}
|
||||
|
||||
for (; n < N; n++) {
|
||||
register vector double result = ((vector double){0.,0.});
|
||||
register vector double result1 = ((vector double){0.,0.});
|
||||
register vector double result2 = ((vector double){0.,0.});
|
||||
register vector double result3 = ((vector double){0.,0.});
|
||||
|
||||
register vector double ra0, ra1, ra2, ra3;
|
||||
register vector double rb0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_AT_8x1(m, k);
|
||||
LOAD_B_1x1(n, k);
|
||||
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
SAVE_1x4_VSR(result, n, m+0);
|
||||
SAVE_1x4_VSR(result1, n, m+2);
|
||||
SAVE_1x4_VSR(result2, n, m+4);
|
||||
SAVE_1x4_VSR(result3, n, m+6);
|
||||
}
|
||||
}
|
||||
|
||||
for (; m < m4; m += 4) {
|
||||
for (n = 0; n < n8; n += 8) {
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
|
||||
INIT_4ACCS();
|
||||
|
||||
register vector double ra0, ra1, ra2, ra3;
|
||||
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
|
||||
register vector double t0, t1, t2, t3;
|
||||
|
||||
__vector_pair pb0, pb1, pb2, pb3;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_AT_4x2(m, k);
|
||||
LOAD_BTP_8x2(n, k);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb1, pb1, ra0, ra2, ra0, ra2);
|
||||
KERNEL_MMA_4ACC(pb2, pb2, pb3, pb3, ra1, ra3, ra1, ra3);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_AT_4x1(m, k);
|
||||
LOAD_BTP_8x1(n, k);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb1, pb1, ra0, ra1, ra0, ra1);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x2_ACC(&acc1, n+0, m+2);
|
||||
SAVE_4x2_ACC(&acc2, n+4, m+0);
|
||||
SAVE_4x2_ACC(&acc3, n+4, m+2);
|
||||
}
|
||||
|
||||
for (; n < n4; n += 4) {
|
||||
__vector_quad acc0, acc1;
|
||||
|
||||
INIT_2ACCS();
|
||||
|
||||
register vector double ra0, ra1, ra2, ra3;
|
||||
register vector double rb0, rb1, rb2, rb3;
|
||||
register vector double t0, t1, t2, t3;
|
||||
|
||||
__vector_pair pb0, pb1;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_AT_4x2(m, k);
|
||||
LOAD_BTP_4x2(n, k);
|
||||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2);
|
||||
KERNEL_MMA_2ACC(pb1, pb1, ra1, ra3);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_AT_4x1(m, k);
|
||||
LOAD_BTP_4x1(n, k);
|
||||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x2_ACC(&acc1, n+0, m+2);
|
||||
}
|
||||
|
||||
for (; n < n2; n += 2) {
|
||||
__vector_quad acc0, acc1;
|
||||
|
||||
INIT_2ACCS();
|
||||
|
||||
register vector double ra0, ra1, ra2, ra3;
|
||||
register vector double rb0, rb1;
|
||||
register vector double t0, t1, t2, t3;
|
||||
|
||||
__vector_pair pb0, pb1;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_AT_4x2(m, k);
|
||||
LOAD_BTP_2x2(n, k);
|
||||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2);
|
||||
KERNEL_MMA_2ACC(pb1, pb1, ra1, ra3);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_AT_4x1(m, k);
|
||||
LOAD_BTP_2x1(n, k);
|
||||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_2x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_2x2_ACC(&acc1, n+0, m+2);
|
||||
}
|
||||
|
||||
for (; n < N; n++) {
|
||||
register vector double result = ((vector double){0.,0.});
|
||||
register vector double result1 = ((vector double){0.,0.});
|
||||
|
||||
register vector double ra0, ra1;
|
||||
register vector double rb0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_AT_4x1(m, k);
|
||||
LOAD_B_1x1(n, k);
|
||||
KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
SAVE_1x4_VSR(result, n, m+0);
|
||||
SAVE_1x4_VSR(result1, n, m+2);
|
||||
}
|
||||
}
|
||||
|
||||
for (; m < m2; m += 2) {
|
||||
for (n = 0; n < n8; n += 8) {
|
||||
__vector_quad acc0, acc1;
|
||||
|
||||
INIT_2ACCS();
|
||||
|
||||
register vector double ra0, ra1;
|
||||
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0, pb1, pb2, pb3;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_AT_2x2(m, k);
|
||||
LOAD_BTP_8x2(n, k);
|
||||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
|
||||
KERNEL_MMA_2ACC(pb2, pb3, ra1, ra1);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_AT_2x1(m, k);
|
||||
LOAD_BTP_8x1(n, k);
|
||||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x2_ACC(&acc1, n+4, m+0);
|
||||
}
|
||||
|
||||
for (; n < n4; n += 4) {
|
||||
__vector_quad acc0;
|
||||
|
||||
INIT_1ACC();
|
||||
|
||||
register vector double ra0, ra1;
|
||||
register vector double rb0, rb1, rb2, rb3;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0, pb1;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_AT_2x2(m, k);
|
||||
LOAD_BTP_4x2(n, k);
|
||||
KERNEL_MMA_1ACC(pb0, ra0);
|
||||
KERNEL_MMA_1ACC(pb1, ra1);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_AT_2x1(m, k);
|
||||
LOAD_BTP_4x1(n, k);
|
||||
KERNEL_MMA_1ACC(pb0, ra0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n, m);
|
||||
}
|
||||
|
||||
for (; n < n2; n += 2) {
|
||||
__vector_quad acc0;
|
||||
|
||||
INIT_1ACC();
|
||||
|
||||
register vector double ra0, ra1;
|
||||
register vector double rb0, rb1;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0, pb1;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_AT_2x2(m, k);
|
||||
LOAD_BTP_2x2(n, k);
|
||||
KERNEL_MMA_1ACC(pb0, ra0);
|
||||
KERNEL_MMA_1ACC(pb1, ra1);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_AT_2x1(m, k);
|
||||
LOAD_BTP_2x1(n, k);
|
||||
KERNEL_MMA_1ACC(pb0, ra0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_2x2_ACC(&acc0, n, m);
|
||||
}
|
||||
|
||||
for (; n < N; n++) {
|
||||
register vector double result = ((vector double){0.,0.});
|
||||
|
||||
register vector double ra0, ra1;
|
||||
register vector double rb0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_AT_4x1(m, k);
|
||||
LOAD_B_1x1(n, k);
|
||||
KERNEL_VMADD_1VSR(ra0, rb0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
SAVE_1x4_VSR(result, n, m+0);
|
||||
}
|
||||
}
|
||||
|
||||
for (; m < M; m++) {
|
||||
for (n = 0; n < n8; n += 8) {
|
||||
__vector_quad acc0, acc1;
|
||||
|
||||
INIT_2ACCS();
|
||||
|
||||
register vector double ra0;
|
||||
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0, pb1, pb2, pb3;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_A_1x1(k, m);
|
||||
LOAD_BTP_8x2(n, k);
|
||||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
|
||||
LOAD_A_1x1(k+1, m);
|
||||
KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_A_1x1(k, m);
|
||||
LOAD_BTP_8x1(n, k);
|
||||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x1_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x1_ACC(&acc1, n+4, m+0);
|
||||
}
|
||||
|
||||
for (; n < n4; n += 4) {
|
||||
__vector_quad acc0;
|
||||
|
||||
INIT_1ACC();
|
||||
|
||||
register vector double ra0;
|
||||
register vector double rb0, rb1, rb2, rb3;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0, pb1;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_A_1x1(k, m);
|
||||
LOAD_BTP_4x2(n, k);
|
||||
KERNEL_MMA_1ACC(pb0, ra0);
|
||||
LOAD_A_1x1(k+1, m);
|
||||
KERNEL_MMA_1ACC(pb1, ra0);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_A_1x1(k, m);
|
||||
LOAD_BTP_4x1(n, k);
|
||||
KERNEL_MMA_1ACC(pb0, ra0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x1_ACC(&acc0, n, m);
|
||||
}
|
||||
|
||||
for (; n < n2; n += 2) {
|
||||
__vector_quad acc0;
|
||||
|
||||
INIT_1ACC();
|
||||
|
||||
register vector double ra0;
|
||||
register vector double rb0, rb1;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0, pb1;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_A_1x1(k, m);
|
||||
LOAD_BTP_2x2(n, k);
|
||||
KERNEL_MMA_1ACC(pb0, ra0);
|
||||
LOAD_A_1x1(k+1, m);
|
||||
KERNEL_MMA_1ACC(pb1, ra0);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_A_1x1(k, m);
|
||||
LOAD_BTP_2x1(n, k);
|
||||
KERNEL_MMA_1ACC(pb0, ra0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_2x1_ACC(&acc0, n+0, m+0);
|
||||
}
|
||||
|
||||
for (; n < N; n++) {
|
||||
FLOAT result = 0.0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
result += A[m*lda+k] * B[n*ldb+k];
|
||||
}
|
||||
result = result * alpha;
|
||||
|
||||
#if !defined(B0)
|
||||
C[n*ldc+m] = (C[n*ldc+m] * beta) + result;
|
||||
#else
|
||||
C[n*ldc+m] = result;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
@@ -0,0 +1,829 @@
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <altivec.h>
|
||||
|
||||
typedef __vector unsigned char vec_t;
|
||||
|
||||
#if !__has_builtin(__builtin_vsx_assemble_pair)
|
||||
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
|
||||
#endif
|
||||
|
||||
#if !defined(B0)
|
||||
#define SAVE_4x2_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[0] = vec_madd(result[0], valpha, rc0); \
|
||||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
|
||||
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[1] = vec_madd(result[1], valpha, rc0); \
|
||||
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
|
||||
rc0 = vec_xl(0, C+(N+2)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[2] = vec_madd(result[2], valpha, rc0); \
|
||||
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
|
||||
rc0 = vec_xl(0, C+(N+3)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[3] = vec_madd(result[3], valpha, rc0); \
|
||||
vec_xst(result[3], 0, C+(N+3)*ldc+M);
|
||||
|
||||
#define SAVE_2x2_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[0] = vec_madd(result[0], valpha, rc0); \
|
||||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
|
||||
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[1] = vec_madd(result[1], valpha, rc0); \
|
||||
vec_xst(result[1], 0, C+(N+1)*ldc+M);
|
||||
|
||||
#define SAVE_1x4_VSR(result, N, M) \
|
||||
rc0 = vec_xl(0, C+((N)*ldc)+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result = vec_madd(result, valpha, rc0); \
|
||||
vec_xst(result, 0, C+((N)*ldc)+M);
|
||||
|
||||
#define SAVE_4x1_VSR(result, N, M) \
|
||||
result = vec_mul(result, valpha); \
|
||||
C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \
|
||||
C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1];
|
||||
|
||||
#else
|
||||
|
||||
#define SAVE_4x2_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
result[0] = vec_mul(result[0], valpha); \
|
||||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
|
||||
result[1] = vec_mul(result[1], valpha); \
|
||||
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
|
||||
result[2] = vec_mul(result[2], valpha); \
|
||||
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
|
||||
result[3] = vec_mul(result[3], valpha); \
|
||||
vec_xst(result[3], 0, C+(N+3)*ldc+M);
|
||||
|
||||
#define SAVE_2x2_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
result[0] = vec_mul(result[0], valpha); \
|
||||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
|
||||
result[1] = vec_mul(result[1], valpha); \
|
||||
vec_xst(result[1], 0, C+(N+1)*ldc+M);
|
||||
|
||||
#define SAVE_1x4_VSR(result, N, M) \
|
||||
result = vec_mul(result, valpha); \
|
||||
vec_xst(result, 0, C+((N)*ldc)+M);
|
||||
|
||||
#define SAVE_4x1_VSR(result, N, M) \
|
||||
result = vec_mul(result, valpha); \
|
||||
C[(N+0)*ldc+M] = result[0]; \
|
||||
C[(N+1)*ldc+M] = result[1];
|
||||
|
||||
#endif
|
||||
|
||||
#define INIT_8ACCS() \
|
||||
__builtin_mma_xxsetaccz(&acc0); \
|
||||
__builtin_mma_xxsetaccz(&acc1); \
|
||||
__builtin_mma_xxsetaccz(&acc2); \
|
||||
__builtin_mma_xxsetaccz(&acc3); \
|
||||
__builtin_mma_xxsetaccz(&acc4); \
|
||||
__builtin_mma_xxsetaccz(&acc5); \
|
||||
__builtin_mma_xxsetaccz(&acc6); \
|
||||
__builtin_mma_xxsetaccz(&acc7);
|
||||
|
||||
#define INIT_4ACCS() \
|
||||
__builtin_mma_xxsetaccz(&acc0); \
|
||||
__builtin_mma_xxsetaccz(&acc1); \
|
||||
__builtin_mma_xxsetaccz(&acc2); \
|
||||
__builtin_mma_xxsetaccz(&acc3);
|
||||
|
||||
#define INIT_2ACCS() \
|
||||
__builtin_mma_xxsetaccz(&acc0); \
|
||||
__builtin_mma_xxsetaccz(&acc1);
|
||||
|
||||
#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
|
||||
|
||||
#define LOAD_AT_8x2(M, K) \
|
||||
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \
|
||||
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \
|
||||
ra2 = vec_xl(0, A+(M+2)*lda+K+0); \
|
||||
ra3 = vec_xl(0, A+(M+3)*lda+K+0); \
|
||||
t0 = vec_mergeh(ra0, ra1); \
|
||||
t1 = vec_mergeh(ra2, ra3); \
|
||||
t2 = vec_mergel(ra0, ra1); \
|
||||
t3 = vec_mergel(ra2, ra3); \
|
||||
ra0 = t0; \
|
||||
ra1 = t2; \
|
||||
ra2 = t1; \
|
||||
ra3 = t3; \
|
||||
ra4 = vec_xl(0, A+(M+4)*lda+K+0); \
|
||||
ra5 = vec_xl(0, A+(M+5)*lda+K+0); \
|
||||
ra6 = vec_xl(0, A+(M+6)*lda+K+0); \
|
||||
ra7 = vec_xl(0, A+(M+7)*lda+K+0); \
|
||||
t0 = vec_mergeh(ra4, ra5); \
|
||||
t1 = vec_mergeh(ra6, ra7); \
|
||||
t2 = vec_mergel(ra4, ra5); \
|
||||
t3 = vec_mergel(ra6, ra7); \
|
||||
ra4 = t0; \
|
||||
ra5 = t2; \
|
||||
ra6 = t1; \
|
||||
ra7 = t3;
|
||||
|
||||
#define LOAD_AT_8x1(M, K) \
|
||||
ra0 = vec_xor(ra0, ra0); \
|
||||
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
|
||||
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \
|
||||
ra1 = vec_xor(ra1, ra1); \
|
||||
ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \
|
||||
ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \
|
||||
ra2 = vec_xor(ra2, ra2); \
|
||||
ra2 = vec_insert(A[(M+4)*lda+K], ra2, 0); \
|
||||
ra2 = vec_insert(A[(M+5)*lda+K], ra2, 1); \
|
||||
ra3 = vec_xor(ra3, ra3); \
|
||||
ra3 = vec_insert(A[(M+6)*lda+K], ra3, 0); \
|
||||
ra3 = vec_insert(A[(M+7)*lda+K], ra3, 1);
|
||||
|
||||
#define LOAD_AT_4x2(M, K) \
|
||||
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \
|
||||
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \
|
||||
ra2 = vec_xl(0, A+(M+2)*lda+K+0); \
|
||||
ra3 = vec_xl(0, A+(M+3)*lda+K+0); \
|
||||
t0 = vec_mergeh(ra0, ra1); \
|
||||
t1 = vec_mergeh(ra2, ra3); \
|
||||
t2 = vec_mergel(ra0, ra1); \
|
||||
t3 = vec_mergel(ra2, ra3); \
|
||||
ra0 = t0; \
|
||||
ra1 = t2; \
|
||||
ra2 = t1; \
|
||||
ra3 = t3;
|
||||
|
||||
#define LOAD_AT_4x1(M, K) \
|
||||
ra0 = vec_xor(ra0, ra0); \
|
||||
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
|
||||
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \
|
||||
ra1 = vec_xor(ra1, ra1); \
|
||||
ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \
|
||||
ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1);
|
||||
|
||||
#define LOAD_AT_2x2(M, K) \
|
||||
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \
|
||||
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \
|
||||
t0 = vec_mergeh(ra0, ra1); \
|
||||
t1 = vec_mergel(ra0, ra1); \
|
||||
ra0 = t0; \
|
||||
ra1 = t1;
|
||||
|
||||
#define LOAD_AT_2x1(M, K) \
|
||||
ra0 = vec_xor(ra0, ra0); \
|
||||
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
|
||||
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1);
|
||||
|
||||
#define LOAD_A_1x1(M, K) ra0 = vec_splats(A[(M)*lda+K]);
|
||||
|
||||
#define LOAD_BP_1x8(K, N) \
|
||||
pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \
|
||||
pb1 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+4]));
|
||||
|
||||
#define LOAD_BP_1x4(K, N) \
|
||||
pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0]));
|
||||
|
||||
#define LOAD_BP_1x2(K, N) \
|
||||
t0 = vec_xl(0, B+((K)*ldb)+N); \
|
||||
__builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0);
|
||||
|
||||
#define LOAD_B_1x8(K, N) \
|
||||
rb0 = vec_xl(0, B+(K*ldb)+N+0); \
|
||||
rb1 = vec_xl(0, B+(K*ldb)+N+2); \
|
||||
rb2 = vec_xl(0, B+(K*ldb)+N+4); \
|
||||
rb3 = vec_xl(0, B+(K*ldb)+N+6);
|
||||
|
||||
#define LOAD_B_1x4(K, N) \
|
||||
rb0 = vec_xl(0, B+(K*ldb)+N+0); \
|
||||
rb1 = vec_xl(0, B+(K*ldb)+N+2);
|
||||
|
||||
#define LOAD_B_1x2(K, N) \
|
||||
rb0 = vec_xl(0, B+(K*ldb)+N+0);
|
||||
|
||||
#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[(K)*ldb+N]);
|
||||
|
||||
#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \
|
||||
a0, a1, a2, a3, a4, a5, a6, a7) \
|
||||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
|
||||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
|
||||
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
|
||||
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \
|
||||
__builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \
|
||||
__builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \
|
||||
__builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \
|
||||
__builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7);
|
||||
|
||||
#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \
|
||||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
|
||||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \
|
||||
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \
|
||||
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3);
|
||||
|
||||
#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \
|
||||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \
|
||||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1);
|
||||
|
||||
#define KERNEL_MMA_1ACC(b0, a0) \
|
||||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0);
|
||||
|
||||
#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \
|
||||
result = vec_madd(a0, b0, result); \
|
||||
result1 = vec_madd(a1, b1, result1); \
|
||||
result2 = vec_madd(a2, b2, result2); \
|
||||
result3 = vec_madd(a3, b3, result3);
|
||||
|
||||
#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \
|
||||
result = vec_madd(a0, b0, result); \
|
||||
result1 = vec_madd(a1, b1, result1);
|
||||
|
||||
#define KERNEL_VMADD_1VSR(a0, b0) \
|
||||
result = vec_madd(a0, b0, result);
|
||||
|
||||
#define PACK_A(ra0, ra1, ra2, ra3, offset) \
|
||||
vec_xst(ra0, 0, packA+(k*8)+0+offset); \
|
||||
vec_xst(ra1, 0, packA+(k*8)+2+offset); \
|
||||
vec_xst(ra2, 0, packA+(k*8)+4+offset); \
|
||||
vec_xst(ra3, 0, packA+(k*8)+6+offset);
|
||||
|
||||
#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \
|
||||
ra0 = vec_xl(0, packA+(k*8)+0+offset); \
|
||||
ra1 = vec_xl(0, packA+(k*8)+2+offset); \
|
||||
ra2 = vec_xl(0, packA+(k*8)+4+offset); \
|
||||
ra3 = vec_xl(0, packA+(k*8)+6+offset);
|
||||
|
||||
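Taken together, PACK_A and LOAD_PACKED_A implement the packing-on-demand scheme this kernel relies on: while the first block of B columns is being processed, the transposed 8-row panel of A is written out contiguously into packA, and every later column block reloads it with unit stride instead of re-gathering strided elements of A. The scalar sketch below shows the same idea in plain C; it is an illustration only (the function name is invented here, alpha/beta scaling is omitted, and N is assumed to be a multiple of 8), not the kernel's actual code.

#include <stdlib.h>

/* Illustrative only: pack one 8-row panel of A on its first use (n == 0),
   then reuse the contiguous copy for every later block of 8 columns. */
static void panel_8xK_on_demand(const double *A, long lda,
                                const double *B, long ldb,
                                double *C, long ldc, long N, long K)
{
    double *packA = malloc((size_t)K * 8 * sizeof(double));
    for (long n = 0; n < N; n += 8) {         /* blocks of 8 columns of C/B  */
        for (long k = 0; k < K; k++) {
            for (long i = 0; i < 8; i++) {    /* the 8 rows of this A panel  */
                double a;
                if (n == 0) {                 /* first pass: gather and pack */
                    a = A[i * lda + k];
                    packA[k * 8 + i] = a;
                } else {                      /* later passes: unit stride   */
                    a = packA[k * 8 + i];
                }
                for (long j = 0; j < 8; j++)
                    C[(n + j) * ldc + i] += a * B[k * ldb + n + j];
            }
        }
    }
    free(packA);
}
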
#ifdef B0
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
#else
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
#endif
{
  BLASLONG m, n, k;

  BLASLONG m8 = M & ~7;
  BLASLONG m4 = M & ~3;
  BLASLONG m2 = M & ~1;

  BLASLONG n8 = N & ~7;
  BLASLONG n4 = N & ~3;
  BLASLONG n2 = N & ~1;

  BLASLONG k2 = K & ~1;

#if defined(__GNUC__) && !defined(__clang__)
  int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 1 : 0;
#else
  int has_packing = 0;
#endif

  double *packA;
  if (has_packing) packA = (double *)malloc(K*8*sizeof(double));

  vector double valpha = vec_splats(alpha);
#if !defined(B0)
  vector double vbeta = vec_splats(beta);
#endif

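The masks above round M, N and K down to multiples of the panel sizes, so the loop nests that follow peel off progressively narrower remainders; each MMA accumulator (__vector_quad) updated by xvf64gerpp disassembles into four vectors of two doubles, which is why the full 8x8 block below needs eight accumulators. A tiny standalone check of the rounding for an odd size such as M = 13 (purely illustrative, not part of the kernel):

#include <stdio.h>

int main(void)
{
    long M = 13;
    long m8 = M & ~7L;   /* 8  -> rows 0..7  go through the 8-row panels  */
    long m4 = M & ~3L;   /* 12 -> rows 8..11 go through the 4-row panels  */
    long m2 = M & ~1L;   /* 12 -> the 2-row loop is skipped (m4 == m2)    */
    printf("m8=%ld m4=%ld m2=%ld scalar tail rows=%ld\n", m8, m4, m2, M - m2);
    return 0;
}
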
for (m = 0; m < m8; m += 8) {
|
||||
for (n = 0; n < n8; n += 8) {
|
||||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
|
||||
|
||||
INIT_8ACCS();
|
||||
|
||||
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
|
||||
register vector double t0, t1, t2, t3;
|
||||
|
||||
__vector_pair pb0, pb1;
|
||||
|
||||
if (has_packing) {
|
||||
if (n == 0) {
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_AT_8x2(m, k);
|
||||
LOAD_BP_1x8(k, n);
|
||||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
|
||||
ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6);
|
||||
PACK_A(ra0, ra2, ra4, ra6, 0);
|
||||
LOAD_BP_1x8(k+1, n);
|
||||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
|
||||
ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7);
|
||||
PACK_A(ra1, ra3, ra5, ra7, 8);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_AT_8x1(m, k);
|
||||
LOAD_BP_1x8(k, n);
|
||||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
|
||||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
|
||||
PACK_A(ra0, ra1, ra2, ra3, 0);
|
||||
}
|
||||
} else {
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0);
|
||||
LOAD_BP_1x8(k, n);
|
||||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
|
||||
ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6);
|
||||
LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8);
|
||||
LOAD_BP_1x8(k+1, n);
|
||||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
|
||||
ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
|
||||
LOAD_BP_1x8(k, n);
|
||||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
|
||||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_AT_8x2(m, k);
|
||||
LOAD_BP_1x8(k, n);
|
||||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
|
||||
ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6);
|
||||
LOAD_BP_1x8(k+1, n);
|
||||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
|
||||
ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_AT_8x1(m, k);
|
||||
LOAD_BP_1x8(k, n);
|
||||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1,
|
||||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x2_ACC(&acc2, n+0, m+2);
|
||||
SAVE_4x2_ACC(&acc4, n+0, m+4);
|
||||
SAVE_4x2_ACC(&acc6, n+0, m+6);
|
||||
SAVE_4x2_ACC(&acc1, n+4, m+0);
|
||||
SAVE_4x2_ACC(&acc3, n+4, m+2);
|
||||
SAVE_4x2_ACC(&acc5, n+4, m+4);
|
||||
SAVE_4x2_ACC(&acc7, n+4, m+6);
|
||||
}
|
||||
|
||||
for (; n < n4; n += 4) {
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
|
||||
INIT_4ACCS();
|
||||
|
||||
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
|
||||
register vector double t0, t1, t2, t3;
|
||||
|
||||
__vector_pair pb0;
|
||||
|
||||
if (!has_packing) {
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_AT_8x2(m, k);
|
||||
LOAD_BP_1x4(k, n);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6);
|
||||
LOAD_BP_1x4(k+1, n);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_AT_8x1(m, k);
|
||||
LOAD_BP_1x4(k, n);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
|
||||
}
|
||||
} else {
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0);
|
||||
LOAD_BP_1x4(k, n);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6);
|
||||
LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8);
|
||||
LOAD_BP_1x4(k+1, n);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
|
||||
LOAD_BP_1x4(k, n);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x2_ACC(&acc1, n+0, m+2);
|
||||
SAVE_4x2_ACC(&acc2, n+0, m+4);
|
||||
SAVE_4x2_ACC(&acc3, n+0, m+6);
|
||||
}
|
||||
|
||||
for (; n < n2; n += 2) {
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
|
||||
INIT_4ACCS();
|
||||
|
||||
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
|
||||
register vector double t0, t1, t2, t3;
|
||||
|
||||
__vector_pair pb0;
|
||||
|
||||
if (!has_packing) {
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_AT_8x2(m, k);
|
||||
LOAD_BP_1x2(k, n);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6);
|
||||
LOAD_BP_1x2(k+1, n);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_AT_8x1(m, k);
|
||||
LOAD_BP_1x2(k, n);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
|
||||
}
|
||||
} else {
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0);
|
||||
LOAD_BP_1x2(k, n);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6);
|
||||
LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8);
|
||||
LOAD_BP_1x2(k+1, n);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
|
||||
LOAD_BP_1x2(k, n);
|
||||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_2x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_2x2_ACC(&acc1, n+0, m+2);
|
||||
SAVE_2x2_ACC(&acc2, n+0, m+4);
|
||||
SAVE_2x2_ACC(&acc3, n+0, m+6);
|
||||
}
|
||||
|
||||
for (; n < N; n++) {
|
||||
register vector double result = ((vector double){0.,0.});
|
||||
register vector double result1 = ((vector double){0.,0.});
|
||||
register vector double result2 = ((vector double){0.,0.});
|
||||
register vector double result3 = ((vector double){0.,0.});
|
||||
|
||||
register vector double ra0, ra1, ra2, ra3;
|
||||
register vector double rb0;
|
||||
|
||||
if (!has_packing) {
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_AT_8x1(m, k);
|
||||
LOAD_B_1x1(k, n);
|
||||
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
|
||||
}
|
||||
} else {
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
|
||||
LOAD_B_1x1(k, n);
|
||||
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
SAVE_1x4_VSR(result, n, m+0);
|
||||
SAVE_1x4_VSR(result1, n, m+2);
|
||||
SAVE_1x4_VSR(result2, n, m+4);
|
||||
SAVE_1x4_VSR(result3, n, m+6);
|
||||
}
|
||||
}
|
||||
|
||||
for (; m < m4; m += 4) {
|
||||
for (n = 0; n < n8; n += 8) {
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
|
||||
INIT_4ACCS();
|
||||
|
||||
register vector double ra0, ra1, ra2, ra3;
|
||||
register vector double t0, t1, t2, t3;
|
||||
|
||||
__vector_pair pb0, pb1;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_AT_4x2(m, k);
|
||||
LOAD_BP_1x8(k, n);
|
||||
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra2, ra2);
|
||||
LOAD_BP_1x8(k+1, n);
|
||||
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra1, ra1, ra3, ra3);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_AT_4x1(m, k);
|
||||
LOAD_BP_1x8(k, n);
|
||||
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x2_ACC(&acc2, n+0, m+2);
|
||||
SAVE_4x2_ACC(&acc1, n+4, m+0);
|
||||
SAVE_4x2_ACC(&acc3, n+4, m+2);
|
||||
}
|
||||
|
||||
for (; n < n4; n += 4) {
|
||||
__vector_quad acc0, acc1;
|
||||
|
||||
INIT_2ACCS();
|
||||
|
||||
register vector double ra0, ra1, ra2, ra3;
|
||||
register vector double t0, t1, t2, t3;
|
||||
|
||||
__vector_pair pb0;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_AT_4x2(m, k);
|
||||
LOAD_BP_1x4(k, n);
|
||||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2);
|
||||
LOAD_BP_1x4(k+1, n);
|
||||
KERNEL_MMA_2ACC(pb0, pb0, ra1, ra3);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_AT_4x1(m, k);
|
||||
LOAD_BP_1x4(k, n);
|
||||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x2_ACC(&acc1, n+0, m+2);
|
||||
}
|
||||
|
||||
for (; n < n2; n += 2) {
|
||||
__vector_quad acc0, acc1;
|
||||
|
||||
INIT_2ACCS();
|
||||
|
||||
register vector double ra0, ra1, ra2, ra3;
|
||||
register vector double t0, t1, t2, t3;
|
||||
|
||||
__vector_pair pb0;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_AT_4x2(m, k);
|
||||
LOAD_BP_1x2(k, n);
|
||||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2);
|
||||
LOAD_BP_1x2(k+1, n);
|
||||
KERNEL_MMA_2ACC(pb0, pb0, ra1, ra3);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_AT_4x1(m, k);
|
||||
LOAD_BP_1x2(k, n);
|
||||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_2x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_2x2_ACC(&acc1, n+0, m+2);
|
||||
}
|
||||
|
||||
for (; n < N; n++) {
|
||||
register vector double result = ((vector double){0.,0.});
|
||||
register vector double result1 = ((vector double){0.,0.});
|
||||
|
||||
register vector double ra0, ra1;
|
||||
register vector double rb0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_AT_4x1(m, k);
|
||||
LOAD_B_1x1(k, n);
|
||||
KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
SAVE_1x4_VSR(result, n, m+0);
|
||||
SAVE_1x4_VSR(result1, n, m+2);
|
||||
}
|
||||
}
|
||||
|
||||
for (; m < m2; m += 2) {
|
||||
for (n = 0; n < n8; n += 8) {
|
||||
__vector_quad acc0, acc1;
|
||||
|
||||
INIT_2ACCS();
|
||||
|
||||
register vector double ra0, ra1;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0, pb1;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_AT_2x2(m, k);
|
||||
LOAD_BP_1x8(k, n);
|
||||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
|
||||
LOAD_BP_1x8(k+1, n);
|
||||
KERNEL_MMA_2ACC(pb0, pb1, ra1, ra1);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_AT_2x1(m, k);
|
||||
LOAD_BP_1x8(k, n);
|
||||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x2_ACC(&acc1, n+4, m+0);
|
||||
}
|
||||
|
||||
for (; n < n4; n += 4) {
|
||||
__vector_quad acc0;
|
||||
|
||||
INIT_1ACC();
|
||||
|
||||
register vector double ra0, ra1;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_AT_2x2(m, k);
|
||||
LOAD_BP_1x4(k, n);
|
||||
KERNEL_MMA_1ACC(pb0, ra0);
|
||||
LOAD_BP_1x4(k+1, n);
|
||||
KERNEL_MMA_1ACC(pb0, ra1);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_AT_2x1(m, k);
|
||||
LOAD_BP_1x4(k, n);
|
||||
KERNEL_MMA_1ACC(pb0, ra0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_4x2_ACC(&acc0, n, m);
|
||||
}
|
||||
|
||||
for (; n < n2; n += 2) {
|
||||
__vector_quad acc0;
|
||||
|
||||
INIT_1ACC();
|
||||
|
||||
register vector double ra0, ra1;
|
||||
register vector double t0, t1;
|
||||
|
||||
__vector_pair pb0;
|
||||
|
||||
for (k = 0; k < k2; k += 2) {
|
||||
LOAD_AT_2x2(m, k);
|
||||
LOAD_BP_1x2(k, n);
|
||||
KERNEL_MMA_1ACC(pb0, ra0);
|
||||
LOAD_BP_1x2(k+1, n);
|
||||
KERNEL_MMA_1ACC(pb0, ra1);
|
||||
}
|
||||
for (; k < K; k++) {
|
||||
LOAD_AT_2x1(m, k);
|
||||
LOAD_BP_1x2(k, n);
|
||||
KERNEL_MMA_1ACC(pb0, ra0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
vector double result[4];
|
||||
SAVE_2x2_ACC(&acc0, n, m);
|
||||
}
|
||||
|
||||
for (; n < N; n++) {
|
||||
register vector double result = ((vector double){0.,0.});
|
||||
|
||||
register vector double ra0;
|
||||
register vector double rb0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_AT_2x1(m, k);
|
||||
LOAD_B_1x1(k, n);
|
||||
KERNEL_VMADD_1VSR(ra0, rb0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector double rc0;
|
||||
#endif
|
||||
SAVE_1x4_VSR(result, n, m+0);
|
||||
}
|
||||
}
|
||||
|
||||
for (; m < M; m++) {
|
||||
for (n = 0; n < n8; n += 8) {
|
||||
register vector double result = ((vector double){0.,0.});
|
||||
register vector double result1 = ((vector double){0.,0.});
|
||||
register vector double result2 = ((vector double){0.,0.});
|
||||
register vector double result3 = ((vector double){0.,0.});
|
||||
|
||||
register vector double ra0;
|
||||
register vector double rb0, rb1, rb2, rb3;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x1(m, k);
|
||||
LOAD_B_1x8(k, n);
|
||||
KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3);
|
||||
}
|
||||
|
||||
SAVE_4x1_VSR(result, n, m);
|
||||
SAVE_4x1_VSR(result1, n+2, m);
|
||||
SAVE_4x1_VSR(result2, n+4, m);
|
||||
SAVE_4x1_VSR(result3, n+6, m);
|
||||
}
|
||||
|
||||
for (; n < n4; n += 4) {
|
||||
register vector double result = ((vector double){0.,0.});
|
||||
register vector double result1 = ((vector double){0.,0.});
|
||||
|
||||
register vector double ra0;
|
||||
register vector double rb0, rb1;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x1(m, k);
|
||||
LOAD_B_1x4(k, n);
|
||||
KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1);
|
||||
}
|
||||
|
||||
SAVE_4x1_VSR(result, n, m);
|
||||
SAVE_4x1_VSR(result1, n+2, m);
|
||||
}
|
||||
|
||||
for (; n < n2; n += 2) {
|
||||
register vector double result = ((vector double){0.,0.});
|
||||
|
||||
register vector double ra0;
|
||||
register vector double rb0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x1(m, k);
|
||||
LOAD_B_1x2(k, n);
|
||||
KERNEL_VMADD_1VSR(ra0, rb0);
|
||||
}
|
||||
|
||||
SAVE_4x1_VSR(result, n, m);
|
||||
}
|
||||
|
||||
for (; n < N; n++) {
|
||||
FLOAT result = 0.0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
result += A[m*lda+k] * B[k*ldb+n];
|
||||
}
|
||||
result = result * alpha;
|
||||
|
||||
#if !defined(B0)
|
||||
C[n*ldc+m] = (C[n*ldc+m] * beta) + result;
|
||||
#else
|
||||
C[n*ldc+m] = result;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
if(has_packing) free(packA);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -40,18 +40,27 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
|||
XXSPLTD_S(32,%x9,0) // alpha, alpha
|
||||
|
||||
"sldi %6, %13, 3 \n\t" // lda * sizeof (double)
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"xvmuldp 34, 40, 32 \n\t" // x0 * alpha, x1 * alpha
|
||||
"xvmuldp 35, 41, 32 \n\t" // x2 * alpha, x3 * alpha
|
||||
#else
|
||||
"xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha
|
||||
"xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha
|
||||
#endif
|
||||
|
||||
"add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda
|
||||
"add %6, %6, %6 \n\t" // 2 * lda
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha
|
||||
XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha
|
||||
XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha
|
||||
XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha
|
||||
#else
|
||||
XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
|
||||
XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
|
||||
XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha
|
||||
XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha
|
||||
|
||||
#endif
|
||||
"add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda
|
||||
"add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda
|
||||
|
||||
|
|
@ -286,6 +295,16 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
|
|||
|
||||
"add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda
|
||||
"add %10, %10, %10 \n\t" // 2 * lda
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha
|
||||
XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha
|
||||
XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha
|
||||
XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha
|
||||
XXSPLTD_S(48,39,0) // x6 * alpha, x6 * alpha
|
||||
XXSPLTD_S(49,39,1) // x7 * alpha, x7 * alpha
|
||||
XXSPLTD_S(39,38,1) // x5 * alpha, x5 * alpha
|
||||
XXSPLTD_S(38,38,0) // x4 * alpha, x4 * alpha
|
||||
#else
|
||||
XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
|
||||
XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
|
||||
XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha
|
||||
|
|
@ -294,6 +313,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
|
|||
XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha
|
||||
XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha
|
||||
XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha
|
||||
#endif
|
||||
|
||||
"add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda
|
||||
"add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda
|
||||
|
|
@ -319,30 +339,69 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
|
|||
"one%=: \n\t"
|
||||
|
||||
"lxvp 36, 0( %2) \n\t" // y0, y1
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"xvmaddadp 36, 40, 32 \n\t"
|
||||
"xvmaddadp 37, 41, 32 \n\t"
|
||||
#else
|
||||
"xvmaddadp 36, 40, 34 \n\t"
|
||||
"xvmaddadp 37, 41, 34 \n\t"
|
||||
#endif
|
||||
"lxvpx 40, %3, %11 \n\t" // a0[0], a0[1]
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"xvmaddadp 36, 42, 33 \n\t"
|
||||
"xvmaddadp 37, 43, 33 \n\t"
|
||||
#else
|
||||
"xvmaddadp 36, 42, 35 \n\t"
|
||||
"xvmaddadp 37, 43, 35 \n\t"
|
||||
#endif
|
||||
"lxvpx 42, %4, %11 \n\t" // a1[0], a1[1]
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"xvmaddadp 36, 44, 34 \n\t"
|
||||
"xvmaddadp 37, 45, 34 \n\t"
|
||||
#else
|
||||
"xvmaddadp 36, 44, 32 \n\t"
|
||||
"xvmaddadp 37, 45, 32 \n\t"
|
||||
#endif
|
||||
"lxvpx 44, %5, %11 \n\t" // a2[0], a2[1]
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"xvmaddadp 36, 46, 35 \n\t"
|
||||
"xvmaddadp 37, 47, 35 \n\t"
|
||||
#else
|
||||
"xvmaddadp 36, 46, 33 \n\t"
|
||||
"xvmaddadp 37, 47, 33 \n\t"
|
||||
#endif
|
||||
"lxvpx 46, %6, %11 \n\t" // a3[0], a3[1]
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"xvmaddadp 36, 50, 38 \n\t"
|
||||
"xvmaddadp 37, 51, 38 \n\t"
|
||||
#else
|
||||
"xvmaddadp 36, 50, 48 \n\t"
|
||||
"xvmaddadp 37, 51, 48 \n\t"
|
||||
#endif
|
||||
"lxvpx 50, %7, %11 \n\t" // a4[0]
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"xvmaddadp 36, 52, 39 \n\t"
|
||||
"xvmaddadp 37, 53, 39 \n\t"
|
||||
#else
|
||||
"xvmaddadp 36, 52, 49 \n\t"
|
||||
"xvmaddadp 37, 53, 49 \n\t"
|
||||
#endif
|
||||
"lxvpx 52, %8, %11 \n\t" // a5[0]
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"xvmaddadp 36, 54, 48 \n\t"
|
||||
"xvmaddadp 37, 55, 48 \n\t"
|
||||
#else
|
||||
"xvmaddadp 36, 54, 38 \n\t"
|
||||
"xvmaddadp 37, 55, 38 \n\t"
|
||||
#endif
|
||||
"lxvpx 54, %9, %11 \n\t" // a6[0]
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"xvmaddadp 36, 56, 49 \n\t"
|
||||
"xvmaddadp 37, 57, 49 \n\t"
|
||||
#else
|
||||
"xvmaddadp 36, 56, 39 \n\t"
|
||||
"xvmaddadp 37, 57, 39 \n\t"
|
||||
#endif
|
||||
"lxvpx 56, %10, %11 \n\t" // a7[0]
|
||||
"addi %11, %11, 32 \n\t"
|
||||
|
||||
|
|
@ -355,6 +414,24 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
|
|||
"two%=: \n\t"
|
||||
|
||||
"lxvp 36, 0( %2) \n\t" // y0, y1
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"xvmaddadp 36, 40, 32 \n\t"
|
||||
"xvmaddadp 37, 41, 32 \n\t"
|
||||
"xvmaddadp 36, 42, 33 \n\t"
|
||||
"xvmaddadp 37, 43, 33 \n\t"
|
||||
"xvmaddadp 36, 44, 34 \n\t"
|
||||
"xvmaddadp 37, 45, 34 \n\t"
|
||||
"xvmaddadp 36, 46, 35 \n\t"
|
||||
"xvmaddadp 37, 47, 35 \n\t"
|
||||
"xvmaddadp 36, 50, 38 \n\t"
|
||||
"xvmaddadp 37, 51, 38 \n\t"
|
||||
"xvmaddadp 36, 52, 39 \n\t"
|
||||
"xvmaddadp 37, 53, 39 \n\t"
|
||||
"xvmaddadp 36, 54, 48 \n\t"
|
||||
"xvmaddadp 37, 55, 48 \n\t"
|
||||
"xvmaddadp 36, 56, 49 \n\t"
|
||||
"xvmaddadp 37, 57, 49 \n\t"
|
||||
#else
|
||||
"xvmaddadp 36, 40, 34 \n\t"
|
||||
"xvmaddadp 37, 41, 34 \n\t"
|
||||
"xvmaddadp 36, 42, 35 \n\t"
|
||||
|
|
@ -371,6 +448,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
|
|||
"xvmaddadp 37, 55, 38 \n\t"
|
||||
"xvmaddadp 36, 56, 39 \n\t"
|
||||
"xvmaddadp 37, 57, 39 \n\t"
|
||||
#endif
|
||||
"stxvp 36, 0( %2) \n\t" // y0, y1
|
||||
|
||||
:
|
||||
|
|
|
|||
|
|
@ -279,34 +279,58 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
|
|||
"lxvp 40, 32(%[y]) \n\t"
|
||||
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
XXMRGHD_S(42,34,35)
|
||||
XXMRGLD_S(43,34,35)
|
||||
|
||||
XXMRGHD_S(44,4,5)
|
||||
XXMRGLD_S(45,4,5)
|
||||
#else
|
||||
XXMRGLD_S(42,35,34)
|
||||
XXMRGHD_S(43,35,34)
|
||||
|
||||
XXMRGLD_S(44,5,4)
|
||||
XXMRGHD_S(45,5,4)
|
||||
#endif
|
||||
|
||||
"xvadddp 42,42,43 \n\t"
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
XXMRGHD_S(46,6,7)
|
||||
XXMRGLD_S(47,6,7)
|
||||
#else
|
||||
XXMRGLD_S(46,7,6)
|
||||
XXMRGHD_S(47,7,6)
|
||||
|
||||
#endif
|
||||
"xvadddp 44,44,45 \n\t"
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
XXMRGHD_S(48,8,9)
|
||||
XXMRGLD_S(49,8,9)
|
||||
#else
|
||||
XXMRGLD_S(48,9,8)
|
||||
XXMRGHD_S(49,9,8)
|
||||
|
||||
#endif
|
||||
"xvadddp 46,46,47 \n\t"
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"xvmaddadp 38,42,36 \n\t"
|
||||
"xvmaddadp 39,44,36 \n\t"
|
||||
#else
|
||||
"xvmaddadp 39,42,36 \n\t"
|
||||
"xvmaddadp 38,44,36 \n\t"
|
||||
|
||||
#endif
|
||||
"xvadddp 48,48,49 \n\t"
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"xvmaddadp 41,48,36 \n\t"
|
||||
#else
|
||||
"xvmaddadp 41,46,36 \n\t"
|
||||
|
||||
#endif
|
||||
"stxvp 38, 0(%[y]) \n\t"
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"xvmaddadp 40,46,36 \n\t"
|
||||
#else
|
||||
"xvmaddadp 40,48,36 \n\t"
|
||||
#endif
|
||||
"stxvp 40, 32(%[y]) \n\t"
|
||||
|
||||
: [memy] "+m" (*(double (*)[8])y),
|
||||
|
|
|
|||
|
|
@@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "drot_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "drot_microk_power10.c"
#elif defined(POWER10)
#include "drot_microk_power8.c"
#include "drot_microk_power10.c"
#endif
#endif

@@ -117,7 +115,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
    if ( (inc_x == 1) && (inc_y == 1) )
    {

#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
        if ( n >= 16 )
        {
            BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;

@@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "dscal_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "dscal_microk_power10.c"
#elif defined(POWER10)
#include "dscal_microk_power8.c"
#include "dscal_microk_power10.c"
#endif
#endif

@@ -104,7 +102,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
    if ( da == 0.0 )
    {

#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
        if ( n >= 16 )
        {
            BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;

@@ -138,7 +136,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
    else
    {

#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
        if ( n >= 16 )
        {
            BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;

@@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "dswap_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "swap_microk_power10.c"
#elif defined(POWER10)
#include "dswap_microk_power8.c"
#include "swap_microk_power10.c"
#endif
#endif

@@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
    if ( (inc_x == 1) && (inc_y == 1 ))
    {

#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
        if ( n >= 32 )
        {
            BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;

@@ -0,0 +1,84 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta)
{
  double MNK = (double) M * (double) N * (double) K;

#if defined(DOUBLE) // dgemm

  // gcc11 (minor <= 2) has an issue when multiple assemble_pairs are used. This
  // issue affects both dgemm_nn and dgemm_tn.
#if (defined(__GNUC__) && (__GNUC__ == 11 && __GNUC_MINOR__ <= 2))
  if (!transb)
    return 0;
#endif

  if (MNK <= 54.0*54.0*54.0)
    return 1;

#else // sgemm

#if defined(__GNUC__) && defined(__clang__)
  // clang generates code with register spilling for the region of code with
  // packing, thus, we had to disable this optimization for clang. Given that
  // the packing on-demand used in this work is one of the reasons that lead the
  // small kernels to outperform the normal flow (when MNK increases), with it
  // disabled we had to reduce the MNK inputs used by the code generated by clang.
  if (MNK > 84.0*84.0*84.0)
    return 0;

  if (transa && !transb) {
    // sgemm_tn works better when packing on-demand is used
    if (MNK <= 64.0*64.0*64.0 && K >= 4)
      return 1;
    else
      return 0;
  }

#else // gcc

  if (MNK > 100.0*100.0*100.0)
    return 0;

#endif

  // Multi-threading execution outperforms (or approaches) the execution of the
  // small kernel.
  if (num_cpu_avail(3) > 1) {
    if (MNK <= 64.0*64.0*64.0)
      return 1;
  } else {
    return 1;
  }

#endif

  return 0;
}

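For orientation, this permit routine is the gatekeeper for the small-kernel path: the caller evaluates it with the problem shape and scalars and only enters the MMA small-GEMM kernels when it returns 1, otherwise it falls back to the regular blocked GEMM. A hedged sketch of such a dispatcher follows; the function names and signatures are invented for illustration and do not match the actual OpenBLAS interface layer.

/* Hypothetical dispatch sketch (names are illustrative only). */
typedef long blaslong;                      /* stand-in for BLASLONG */

int  small_gemm_permit(int transa, int transb, blaslong M, blaslong N,
                       blaslong K, double alpha, double beta);
void small_gemm_kernel(blaslong M, blaslong N, blaslong K, const double *A,
                       blaslong lda, double alpha, const double *B,
                       blaslong ldb, double beta, double *C, blaslong ldc);
void blocked_gemm(blaslong M, blaslong N, blaslong K, const double *A,
                  blaslong lda, double alpha, const double *B,
                  blaslong ldb, double beta, double *C, blaslong ldc);

void dgemm_dispatch(int transa, int transb, blaslong M, blaslong N, blaslong K,
                    const double *A, blaslong lda, double alpha,
                    const double *B, blaslong ldb, double beta,
                    double *C, blaslong ldc)
{
    /* take the small-GEMM fast path only when the permit heuristic allows it */
    if (small_gemm_permit(transa, transb, M, N, K, alpha, beta))
        small_gemm_kernel(M, N, K, A, lda, alpha, B, ldb, beta, C, ldc);
    else
        blocked_gemm(M, N, K, A, lda, alpha, B, ldb, beta, C, ldc);
}
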
@@ -49,10 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "sasum_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "sasum_microk_power10.c"
#elif defined(POWER10)
#include "sasum_microk_power8.c"
#include "sasum_microk_power10.c"
#endif
#endif

@@ -114,7 +112,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
    if ( inc_x == 1 )
    {

#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
        if ( n >= 32 )
        {
            BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;

File diff suppressed because it is too large

@ -0,0 +1,887 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <altivec.h>
|
||||
|
||||
typedef __vector unsigned char vec_t;
|
||||
|
||||
#if !defined(B0)
|
||||
#define SAVE_4x4_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[0] = vec_madd(result[0], valpha, rc0); \
|
||||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
|
||||
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[1] = vec_madd(result[1], valpha, rc0); \
|
||||
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
|
||||
rc0 = vec_xl(0, C+(N+2)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[2] = vec_madd(result[2], valpha, rc0); \
|
||||
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
|
||||
rc0 = vec_xl(0, C+(N+3)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[3] = vec_madd(result[3], valpha, rc0); \
|
||||
vec_xst(result[3], 0, C+(N+3)*ldc+M);
|
||||
|
||||
#define SAVE_4x2_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[0] = vec_madd(result[0], valpha, rc0); \
|
||||
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
|
||||
rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[1] = vec_madd(result[1], valpha, rc0); \
|
||||
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \
|
||||
rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[2] = vec_madd(result[2], valpha, rc0); \
|
||||
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \
|
||||
rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[3] = vec_madd(result[3], valpha, rc0); \
|
||||
vec_xst_len(result[3], C+(N+3)*ldc+M, 8);
|
||||
|
||||
#define SAVE_2x4_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
rc0 = vec_xl(0, C+(N+0)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[0] = vec_madd(result[0], valpha, rc0); \
|
||||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
|
||||
rc0 = vec_xl(0, C+(N+1)*ldc+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result[1] = vec_madd(result[1], valpha, rc0); \
|
||||
vec_xst(result[1], 0, C+(N+1)*ldc+M);
|
||||
|
||||
#define SAVE_1x4_VSR(result, N, M) \
|
||||
rc0 = vec_xl(0, C+((N)*ldc)+M); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result = vec_madd(result, valpha, rc0); \
|
||||
vec_xst(result, 0, C+((N)*ldc)+M);
|
||||
|
||||
#define SAVE_2x2_VSR(result, N, M) \
|
||||
rc0 = vec_xl_len(C+(N*ldc)+M, 8); \
|
||||
rc0 = vec_insert(C[(N+1)*ldc+M+0], rc0, 2); \
|
||||
rc0 = vec_insert(C[(N+1)*ldc+M+1], rc0, 3); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result = vec_madd(result, valpha, rc0); \
|
||||
vec_xst_len(result, C+(N*ldc)+M, 8); \
|
||||
C[(N+1)*ldc+M+0] = result[2]; \
|
||||
C[(N+1)*ldc+M+1] = result[3];
|
||||
|
||||
#define SAVE_1x2_VSR(result, N, M) \
|
||||
rc0 = vec_xl_len(C+(N*ldc)+M, 8); \
|
||||
rc0 = vec_mul(rc0, vbeta); \
|
||||
result = vec_madd(result, valpha, rc0); \
|
||||
vec_xst_len(result, C+(N*ldc)+M, 8);
|
||||
|
||||
#define SAVE_4x1_VSR(result, N, M) \
|
||||
result = vec_mul(result, valpha); \
|
||||
C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \
|
||||
C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; \
|
||||
C[(N+2)*ldc+M] = (C[(N+2)*ldc+M] * beta) + result[2]; \
|
||||
C[(N+3)*ldc+M] = (C[(N+3)*ldc+M] * beta) + result[3];
|
||||
|
||||
#define SAVE_2x1_VSR(result, N, M) \
|
||||
result = vec_mul(result, valpha); \
|
||||
C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \
|
||||
C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1];
|
||||
|
||||
#else
|
||||
|
||||
#define SAVE_4x4_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
result[0] = vec_mul(result[0], valpha); \
|
||||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
|
||||
result[1] = vec_mul(result[1], valpha); \
|
||||
vec_xst(result[1], 0, C+(N+1)*ldc+M); \
|
||||
result[2] = vec_mul(result[2], valpha); \
|
||||
vec_xst(result[2], 0, C+(N+2)*ldc+M); \
|
||||
result[3] = vec_mul(result[3], valpha); \
|
||||
vec_xst(result[3], 0, C+(N+3)*ldc+M);
|
||||
|
||||
#define SAVE_4x2_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
result[0] = vec_mul(result[0], valpha); \
|
||||
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
|
||||
result[1] = vec_mul(result[1], valpha); \
|
||||
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \
|
||||
result[2] = vec_mul(result[2], valpha); \
|
||||
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \
|
||||
result[3] = vec_mul(result[3], valpha); \
|
||||
vec_xst_len(result[3], C+(N+3)*ldc+M, 8);
|
||||
|
||||
#define SAVE_2x4_ACC(ACC, N, M) \
|
||||
__builtin_mma_disassemble_acc((void *)result, ACC); \
|
||||
result[0] = vec_mul(result[0], valpha); \
|
||||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \
|
||||
result[1] = vec_mul(result[1], valpha); \
|
||||
vec_xst(result[1], 0, C+(N+1)*ldc+M);
|
||||
|
||||
#define SAVE_1x4_VSR(result, N, M) \
|
||||
result = vec_mul(result, valpha); \
|
||||
vec_xst(result, 0, C+((N)*ldc)+M);
|
||||
|
||||
#define SAVE_2x2_VSR(result, N, M) \
|
||||
result = vec_mul(result, valpha); \
|
||||
vec_xst_len(result, C+(N*ldc)+M, 8); \
|
||||
C[(N+1)*ldc+M+0] = result[2]; \
|
||||
C[(N+1)*ldc+M+1] = result[3];
|
||||
|
||||
#define SAVE_1x2_VSR(result, N, M) \
|
||||
result = vec_mul(result, valpha); \
|
||||
vec_xst_len(result, C+(N*ldc)+M, 8);
|
||||
|
||||
#define SAVE_4x1_VSR(result, N, M) \
|
||||
result = vec_mul(result, valpha); \
|
||||
C[(N+0)*ldc+M] = result[0]; \
|
||||
C[(N+1)*ldc+M] = result[1]; \
|
||||
C[(N+2)*ldc+M] = result[2]; \
|
||||
C[(N+3)*ldc+M] = result[3];
|
||||
|
||||
#define SAVE_2x1_VSR(result, N, M) \
|
||||
result = vec_mul(result, valpha); \
|
||||
C[(N+0)*ldc+M] = result[0]; \
|
||||
C[(N+1)*ldc+M] = result[1];
|
||||
|
||||
#endif
|
||||
|
||||
#define INIT_8ACCS() \
|
||||
__builtin_mma_xxsetaccz(&acc0); \
|
||||
__builtin_mma_xxsetaccz(&acc1); \
|
||||
__builtin_mma_xxsetaccz(&acc2); \
|
||||
__builtin_mma_xxsetaccz(&acc3); \
|
||||
__builtin_mma_xxsetaccz(&acc4); \
|
||||
__builtin_mma_xxsetaccz(&acc5); \
|
||||
__builtin_mma_xxsetaccz(&acc6); \
|
||||
__builtin_mma_xxsetaccz(&acc7);
|
||||
|
||||
#define INIT_4ACCS() \
|
||||
__builtin_mma_xxsetaccz(&acc0); \
|
||||
__builtin_mma_xxsetaccz(&acc1); \
|
||||
__builtin_mma_xxsetaccz(&acc2); \
|
||||
__builtin_mma_xxsetaccz(&acc3);
|
||||
|
||||
#define INIT_2ACCS() \
|
||||
__builtin_mma_xxsetaccz(&acc0); \
|
||||
__builtin_mma_xxsetaccz(&acc1);
|
||||
|
||||
#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
|
||||
|
||||
#define LOAD_A_1x16(K, M) \
|
||||
ra0 = vec_xl(0, A+(K*lda)+M+0); \
|
||||
ra1 = vec_xl(0, A+(K*lda)+M+4); \
|
||||
ra2 = vec_xl(0, A+(K*lda)+M+8); \
|
||||
ra3 = vec_xl(0, A+(K*lda)+M+12);
|
||||
|
||||
#define LOAD_A_1x8(K, M) \
|
||||
ra0 = vec_xl(0, A+(K*lda)+M+0); \
|
||||
ra1 = vec_xl(0, A+(K*lda)+M+4);
|
||||
|
||||
#define LOAD_A_1x4(K, M) ra0 = vec_xl(0, A+(K*lda)+M);
|
||||
|
||||
#define LOAD_A_2x2(K, M) \
|
||||
ra0 = vec_splats(A[K*lda+M+0]); \
|
||||
ra0 = vec_insert(A[K*lda+M+1], ra0, 1); \
|
||||
ra0 = vec_insert(A[K*lda+M+1], ra0, 3);
|
||||
|
||||
#define LOAD_A_1x2(K, M) ra0 = vec_xl_len(A+(K*lda)+M, 8);
|
||||
|
||||
#define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M+0]);
|
||||
|
||||
#define LOAD_B_1x16(K, N) \
|
||||
rb0 = vec_xl(0, B+(K*ldb)+N+0); \
|
||||
rb1 = vec_xl(0, B+(K*ldb)+N+4); \
|
||||
rb2 = vec_xl(0, B+(K*ldb)+N+8); \
|
||||
rb3 = vec_xl(0, B+(K*ldb)+N+12);
|
||||
|
||||
#define LOAD_B_1x8(K, N) \
|
||||
rb0 = vec_xl(0, B+(K*ldb)+N+0); \
|
||||
rb1 = vec_xl(0, B+(K*ldb)+N+4);
|
||||
|
||||
#define LOAD_B_1x4(K, N) rb0 = vec_xl(0, B+(K*ldb)+N);
|
||||
|
||||
#define LOAD_B_2x2(K, N) \
|
||||
rb0 = vec_splats(B[K*ldb+N]); \
|
||||
rb0 = vec_insert(B[K*ldb+N+1], rb0, 2); \
|
||||
rb0 = vec_insert(B[K*ldb+N+1], rb0, 3);
|
||||
|
||||
#define LOAD_B_1x2(K, N) rb0 = vec_xl_len(B+(K*ldb)+N, 8);
|
||||
|
||||
#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[K*ldb+N]);
|
||||
|
||||
#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \
|
||||
a0, a1, a2, a3, a4, a5, a6, a7) \
|
||||
__builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \
|
||||
__builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \
|
||||
__builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \
|
||||
__builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); \
|
||||
__builtin_mma_xvf32gerpp(&acc4, (vec_t)b4, (vec_t)a4); \
|
||||
__builtin_mma_xvf32gerpp(&acc5, (vec_t)b5, (vec_t)a5); \
|
||||
__builtin_mma_xvf32gerpp(&acc6, (vec_t)b6, (vec_t)a6); \
|
||||
__builtin_mma_xvf32gerpp(&acc7, (vec_t)b7, (vec_t)a7);
|
||||
|
||||
#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \
|
||||
__builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \
|
||||
__builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \
|
||||
__builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \
|
||||
__builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3);
|
||||
|
||||
#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \
|
||||
__builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \
|
||||
__builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1);
|
||||
|
||||
#define KERNEL_MMA_1ACC(b0, a0) \
|
||||
__builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0);
|
||||
|
||||
#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \
|
||||
result = vec_madd(a0, b0, result); \
|
||||
result1 = vec_madd(a1, b1, result1); \
|
||||
result2 = vec_madd(a2, b2, result2); \
|
||||
result3 = vec_madd(a3, b3, result3);
|
||||
|
||||
#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \
|
||||
result = vec_madd(a0, b0, result); \
|
||||
result1 = vec_madd(a1, b1, result1);
|
||||
|
||||
#define KERNEL_VMADD_1VSR(a0, b0) \
|
||||
result = vec_madd(a0, b0, result);
|
||||
|
||||
#define PACK_A(ra0, ra1, ra2, ra3, offset) \
|
||||
vec_xst(ra0, 0, packA+(k*16)+0+offset); \
|
||||
vec_xst(ra1, 0, packA+(k*16)+4+offset); \
|
||||
vec_xst(ra2, 0, packA+(k*16)+8+offset); \
|
||||
vec_xst(ra3, 0, packA+(k*16)+12+offset);
|
||||
|
||||
#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \
|
||||
ra0 = vec_xl(0, packA+(k*16)+0+offset); \
|
||||
ra1 = vec_xl(0, packA+(k*16)+4+offset); \
|
||||
ra2 = vec_xl(0, packA+(k*16)+8+offset); \
|
||||
ra3 = vec_xl(0, packA+(k*16)+12+offset);
|
||||
|
||||
#ifdef B0
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
|
||||
#else
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
|
||||
#endif
|
||||
{
|
||||
BLASLONG m, n, k;
|
||||
|
||||
BLASLONG m16 = M & ~15;
|
||||
BLASLONG m8 = M & ~7;
|
||||
BLASLONG m4 = M & ~3;
|
||||
BLASLONG m2 = M & ~1;
|
||||
|
||||
BLASLONG n16 = N & ~15;
|
||||
BLASLONG n8 = N & ~7;
|
||||
BLASLONG n4 = N & ~3;
|
||||
BLASLONG n2 = N & ~1;
|
||||
|
||||
vector float valpha = vec_splats(alpha);
|
||||
#if !defined(B0)
|
||||
vector float vbeta = vec_splats(beta);
|
||||
#endif
|
||||
|
||||
#if defined(__GNUC__) && !defined(__clang__)
|
||||
int has_packing = (M >= 40 && N >= 40 && K >= 40) ? 1 : 0;
|
||||
#else
|
||||
int has_packing = 0;
|
||||
#endif
|
||||
|
||||
float *packA;
|
||||
if (has_packing) packA = (float *)malloc(K*16*sizeof(float));
|
||||
|
||||
for (m = 0; m < m16; m += 16) {
|
||||
for (n = 0; n < n8; n += 8) {
|
||||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
|
||||
|
||||
INIT_8ACCS();
|
||||
|
||||
register vector float ra0, ra1, ra2, ra3;
|
||||
register vector float rb0, rb1;
|
||||
|
||||
if (has_packing) {
|
||||
if (n == 0) {
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x16(k, m);
|
||||
LOAD_B_1x8(k, n);
|
||||
KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1,
|
||||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
|
||||
PACK_A(ra0, ra1, ra2, ra3, 0);
|
||||
}
|
||||
} else {
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
|
||||
LOAD_B_1x8(k, n);
|
||||
KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1,
|
||||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x16(k, m);
|
||||
LOAD_B_1x8(k, n);
|
||||
KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1,
|
||||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector float rc0;
|
||||
#endif
|
||||
vector float result[4];
|
||||
SAVE_4x4_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x4_ACC(&acc2, n+0, m+4);
|
||||
SAVE_4x4_ACC(&acc4, n+0, m+8);
|
||||
SAVE_4x4_ACC(&acc6, n+0, m+12);
|
||||
SAVE_4x4_ACC(&acc1, n+4, m+0);
|
||||
SAVE_4x4_ACC(&acc3, n+4, m+4);
|
||||
SAVE_4x4_ACC(&acc5, n+4, m+8);
|
||||
SAVE_4x4_ACC(&acc7, n+4, m+12);
|
||||
}
|
||||
|
||||
for (; n < n4; n += 4) {
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
|
||||
INIT_4ACCS();
|
||||
|
||||
register vector float ra0, ra1, ra2, ra3;
|
||||
register vector float rb0;
|
||||
|
||||
if (!has_packing) {
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x16(k, m);
|
||||
LOAD_B_1x4(k, n);
|
||||
KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3);
|
||||
}
|
||||
} else {
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
|
||||
LOAD_B_1x4(k, n);
|
||||
KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector float rc0;
|
||||
#endif
|
||||
vector float result[4];
|
||||
SAVE_4x4_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x4_ACC(&acc1, n+0, m+4);
|
||||
SAVE_4x4_ACC(&acc2, n+0, m+8);
|
||||
SAVE_4x4_ACC(&acc3, n+0, m+12);
|
||||
}
|
||||
|
||||
for (; n < n2; n += 2) {
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
|
||||
INIT_4ACCS();
|
||||
|
||||
register vector float ra0, ra1, ra2, ra3;
|
||||
register vector float rb0;
|
||||
|
||||
if (!has_packing) {
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x16(k, m);
|
||||
LOAD_B_1x2(k, n);
|
||||
KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3);
|
||||
}
|
||||
} else {
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
|
||||
LOAD_B_1x2(k, n);
|
||||
KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector float rc0;
|
||||
#endif
|
||||
vector float result[4];
|
||||
SAVE_2x4_ACC(&acc0, n, m+0);
|
||||
SAVE_2x4_ACC(&acc1, n, m+4);
|
||||
SAVE_2x4_ACC(&acc2, n, m+8);
|
||||
SAVE_2x4_ACC(&acc3, n, m+12);
|
||||
}
|
||||
|
||||
for (; n < N; n++) {
|
||||
vector float result = ((vector float){0., 0., 0., 0.});
|
||||
vector float result1 = ((vector float){0., 0., 0., 0.});
|
||||
vector float result2 = ((vector float){0., 0., 0., 0.});
|
||||
vector float result3 = ((vector float){0., 0., 0., 0.});
|
||||
|
||||
register vector float ra0, ra1, ra2, ra3;
|
||||
register vector float rb0;
|
||||
|
||||
if (!has_packing) {
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x16(k, m);
|
||||
LOAD_B_1x1(k, n);
|
||||
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
|
||||
}
|
||||
} else {
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
|
||||
LOAD_B_1x1(k, n);
|
||||
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector float rc0;
|
||||
#endif
|
||||
SAVE_1x4_VSR(result, n, m);
|
||||
SAVE_1x4_VSR(result1, n, m+4);
|
||||
SAVE_1x4_VSR(result2, n, m+8);
|
||||
SAVE_1x4_VSR(result3, n, m+12);
|
||||
}
|
||||
}
|
||||
|
||||
for (; m < m8; m += 8) {
|
||||
for (n = 0; n < n16; n += 16) {
|
||||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
|
||||
|
||||
INIT_8ACCS();
|
||||
|
||||
register vector float ra0, ra1;
|
||||
register vector float rb0, rb1, rb2, rb3;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x8(k, m);
|
||||
LOAD_B_1x16(k, n);
|
||||
KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3,
|
||||
ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector float rc0;
|
||||
#endif
|
||||
vector float result[4];
|
||||
SAVE_4x4_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x4_ACC(&acc4, n+0, m+4);
|
||||
SAVE_4x4_ACC(&acc1, n+4, m+0);
|
||||
SAVE_4x4_ACC(&acc5, n+4, m+4);
|
||||
SAVE_4x4_ACC(&acc2, n+8, m+0);
|
||||
SAVE_4x4_ACC(&acc6, n+8, m+4);
|
||||
SAVE_4x4_ACC(&acc3, n+12, m+0);
|
||||
SAVE_4x4_ACC(&acc7, n+12, m+4);
|
||||
}
|
||||
|
||||
for (; n < n8; n += 8) {
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
|
||||
INIT_4ACCS();
|
||||
|
||||
register vector float ra0, ra1;
|
||||
register vector float rb0, rb1;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x8(k, m);
|
||||
LOAD_B_1x8(k, n);
|
||||
KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra1, ra1);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector float rc0;
|
||||
#endif
|
||||
vector float result[4];
|
||||
SAVE_4x4_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x4_ACC(&acc2, n+0, m+4);
|
||||
SAVE_4x4_ACC(&acc1, n+4, m+0);
|
||||
SAVE_4x4_ACC(&acc3, n+4, m+4);
|
||||
}
|
||||
|
||||
for (; n < n4; n += 4) {
|
||||
__vector_quad acc0, acc1;
|
||||
|
||||
INIT_2ACCS();
|
||||
|
||||
register vector float ra0, ra1;
|
||||
register vector float rb0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x8(k, m);
|
||||
LOAD_B_1x4(k, n);
|
||||
KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector float rc0;
|
||||
#endif
|
||||
vector float result[4];
|
||||
SAVE_4x4_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x4_ACC(&acc1, n+0, m+4);
|
||||
}
|
||||
|
||||
for (; n < n2; n += 2) {
|
||||
__vector_quad acc0, acc1;
|
||||
|
||||
INIT_2ACCS();
|
||||
|
||||
register vector float ra0, ra1;
|
||||
register vector float rb0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x8(k, m);
|
||||
LOAD_B_1x2(k, n);
|
||||
KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector float rc0;
|
||||
#endif
|
||||
vector float result[4];
|
||||
SAVE_2x4_ACC(&acc0, n, m+0);
|
||||
SAVE_2x4_ACC(&acc1, n, m+4);
|
||||
}
|
||||
|
||||
for (; n < N; n++) {
|
||||
vector float result = ((vector float){0.,0.,0.,0.});
|
||||
vector float result1 = ((vector float){0.,0.,0.,0.});
|
||||
|
||||
register vector float ra0, ra1;
|
||||
register vector float rb0;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x8(k, m);
|
||||
LOAD_B_1x1(k, n);
|
||||
KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector float rc0;
|
||||
#endif
|
||||
SAVE_1x4_VSR(result, n, m);
|
||||
SAVE_1x4_VSR(result1, n, m+4);
|
||||
}
|
||||
}
|
||||
|
||||
for (; m < m4; m += 4) {
|
||||
for (n = 0; n < n16; n += 16) {
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
|
||||
INIT_4ACCS();
|
||||
|
||||
register vector float ra0;
|
||||
register vector float rb0, rb1, rb2, rb3;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x4(k, m);
|
||||
LOAD_B_1x16(k, n);
|
||||
KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector float rc0;
|
||||
#endif
|
||||
vector float result[4];
|
||||
SAVE_4x4_ACC(&acc0, n+0, m+0);
|
||||
SAVE_4x4_ACC(&acc1, n+4, m+0);
|
||||
SAVE_4x4_ACC(&acc2, n+8, m+0);
|
||||
SAVE_4x4_ACC(&acc3, n+12, m+0);
|
||||
}
|
||||
|
||||
for (; n < n8; n += 8) {
|
||||
__vector_quad acc0, acc1;
|
||||
|
||||
INIT_2ACCS();
|
||||
|
||||
register vector float ra0;
|
||||
register vector float rb0, rb1;
|
||||
|
||||
for (k = 0; k < K; k++) {
|
||||
LOAD_A_1x4(k, m);
|
||||
LOAD_B_1x8(k, n);
|
||||
KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0);
|
||||
}
|
||||
|
||||
#if !defined(B0)
|
||||
register vector float rc0;
|
||||
#endif
|
||||
vector float result[4];
|
||||
SAVE_4x4_ACC(&acc0, n+0, m+0);
|
||||
      SAVE_4x4_ACC(&acc1, n+4, m+0);
    }

    for (; n < n4; n += 4) {
      __vector_quad acc0;

      INIT_1ACC();

      register vector float ra0;
      register vector float rb0;

      for (k = 0; k < K; k++) {
        LOAD_A_1x4(k, m);
        LOAD_B_1x4(k, n);
        KERNEL_MMA_1ACC(rb0, ra0);
      }

#if !defined(B0)
      register vector float rc0;
#endif
      vector float result[4];
      SAVE_4x4_ACC(&acc0, n+0, m+0);
    }

    for (; n < n2; n += 2) {
      __vector_quad acc0;

      INIT_1ACC();

      register vector float ra0;
      register vector float rb0;

      for (k = 0; k < K; k++) {
        LOAD_A_1x4(k, m);
        LOAD_B_1x2(k, n);
        KERNEL_MMA_1ACC(rb0, ra0);
      }

#if !defined(B0)
      register vector float rc0;
#endif
      vector float result[4];
      SAVE_2x4_ACC(&acc0, n, m);
    }

    for (; n < N; n++) {
      vector float result = ((vector float){0.,0.,0.,0.});

      register vector float ra0;
      register vector float rb0;

      for (k = 0; k < K; k++) {
        LOAD_A_1x4(k, m);
        LOAD_B_1x1(k, n);
        KERNEL_VMADD_1VSR(ra0, rb0);
      }

#if !defined(B0)
      register vector float rc0;
#endif
      SAVE_1x4_VSR(result, n, m);
    }
  }

  for (; m < m2; m += 2) {
    for (n = 0; n < n16; n += 16) {
      __vector_quad acc0, acc1, acc2, acc3;

      INIT_4ACCS();

      register vector float ra0;
      register vector float rb0, rb1, rb2, rb3;

      for (k = 0; k < K; k++) {
        LOAD_A_1x2(k, m);
        LOAD_B_1x16(k, n);
        KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0);
      }

#if !defined(B0)
      register vector float rc0;
#endif
      vector float result[4];
      SAVE_4x2_ACC(&acc0, n+0, m+0);
      SAVE_4x2_ACC(&acc1, n+4, m+0);
      SAVE_4x2_ACC(&acc2, n+8, m+0);
      SAVE_4x2_ACC(&acc3, n+12, m+0);
    }

    for (; n < n8; n += 8) {
      __vector_quad acc0, acc1;

      INIT_2ACCS();

      register vector float ra0;
      register vector float rb0, rb1;

      for (k = 0; k < K; k++) {
        LOAD_A_1x2(k, m);
        LOAD_B_1x8(k, n);
        KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0);
      }

#if !defined(B0)
      register vector float rc0;
#endif
      vector float result[4];
      SAVE_4x2_ACC(&acc0, n+0, m+0);
      SAVE_4x2_ACC(&acc1, n+4, m+0);
    }

    for (; n < n4; n += 4) {
      __vector_quad acc0;

      INIT_1ACC();

      register vector float ra0;
      register vector float rb0;

      for (k = 0; k < K; k++) {
        LOAD_A_1x2(k, m);
        LOAD_B_1x4(k, n);
        KERNEL_MMA_1ACC(rb0, ra0);
      }

#if !defined(B0)
      register vector float rc0;
#endif
      vector float result[4];
      SAVE_4x2_ACC(&acc0, n+0, m+0);
    }

    for (; n < n2; n += 2) {
      vector float result = ((vector float){0.,0.,0.,0.});

      register vector float ra0;
      register vector float rb0;

      for (k = 0; k < K; k++) {
        LOAD_A_2x2(k, m);
        LOAD_B_2x2(k, n);
        KERNEL_VMADD_1VSR(ra0, rb0);
      }

#if !defined(B0)
      register vector float rc0;
#endif
      SAVE_2x2_VSR(result, n, m);
    }

    for (; n < N; n++) {
      vector float result = ((vector float){0.,0.,0.,0.});

      register vector float ra0;
      register vector float rb0;

      for (k = 0; k < K; k++) {
        LOAD_A_1x2(k, m);
        LOAD_B_1x1(k, n);
        KERNEL_VMADD_1VSR(ra0, rb0);
      }

#if !defined(B0)
      register vector float rc0;
#endif
      SAVE_1x2_VSR(result, n, m);
    }
  }

  for (; m < M; m++) {
    for (n = 0; n < n16; n += 16) {
      vector float result = ((vector float){0.,0.,0.,0.});
      vector float result1 = ((vector float){0.,0.,0.,0.});
      vector float result2 = ((vector float){0.,0.,0.,0.});
      vector float result3 = ((vector float){0.,0.,0.,0.});

      register vector float ra0;
      register vector float rb0, rb1, rb2, rb3;

      for (k = 0; k < K; k++) {
        LOAD_A_1x1(k, m);
        LOAD_B_1x16(k, n);
        KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3);
      }

      SAVE_4x1_VSR(result, n+0, m);
      SAVE_4x1_VSR(result1, n+4, m);
      SAVE_4x1_VSR(result2, n+8, m);
      SAVE_4x1_VSR(result3, n+12, m);
    }

    for (; n < n8; n += 8) {
      vector float result = ((vector float){0.,0.,0.,0.});
      vector float result1 = ((vector float){0.,0.,0.,0.});

      register vector float ra0;
      register vector float rb0, rb1;

      for (k = 0; k < K; k++) {
        LOAD_A_1x1(k, m);
        LOAD_B_1x8(k, n);
        KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1);
      }

      SAVE_4x1_VSR(result, n+0, m);
      SAVE_4x1_VSR(result1, n+4, m);
    }

    for (; n < n4; n += 4) {
      vector float result = ((vector float){0.,0.,0.,0.});

      register vector float ra0;
      register vector float rb0;

      for (k = 0; k < K; k++) {
        LOAD_A_1x1(k, m);
        LOAD_B_1x4(k, n);
        KERNEL_VMADD_1VSR(ra0, rb0);
      }

      SAVE_4x1_VSR(result, n+0, m);
    }

    for (; n < n2; n += 2) {
      vector float result = ((vector float){0.,0.,0.,0.});

      register vector float ra0;
      register vector float rb0;

      for (k = 0; k < K; k++) {
        LOAD_A_1x1(k, m);
        LOAD_B_1x2(k, n);
        KERNEL_VMADD_1VSR(ra0, rb0);
      }

      SAVE_2x1_VSR(result, n+0, m);
    }

    for (; n < N; n++) {
      FLOAT result = 0.0f;

      for (k = 0; k < K; k++) {
        result += A[k*lda+m] * B[k*ldb+n];
      }
      result = result * alpha;

#if !defined(B0)
      C[n*ldc+m] = (C[n*ldc+m] * beta) + result;
#else
      C[n*ldc+m] = result;
#endif
    }
  }

  if (has_packing) free(packA);

  return 0;
}
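For orientation: the INIT_*ACC / KERNEL_MMA_*ACC / SAVE_*_ACC macros in the kernel above wrap the POWER10 Matrix-Multiply Assist (MMA) builtins, with each __vector_quad holding a 4x4 float tile that collects one rank-1 update per k iteration; the final scalar loop spells out the same arithmetic, C[n*ldc+m] = alpha * sum_k A[k*lda+m]*B[k*ldb+n] (+ beta*C when B0 is not defined). Below is a minimal, hedged sketch of that builtin pattern for a single accumulator. It is not the kernel's own macro expansion: the helper name mma_4x4_f32 and the packed 4-element layout of a and b are assumptions made only for the example (build with GCC 10+ and -mcpu=power10 -mmma).

#include <altivec.h>

typedef __vector unsigned char vec_t;

/* One 4x4 float tile: acc accumulates K rank-1 updates, then is spilled
 * to four vector registers, roughly what INIT_1ACC / KERNEL_MMA_1ACC /
 * SAVE_*_ACC presumably do per (m,n) tile in the kernel above. */
static void mma_4x4_f32(int K, const float *a, const float *b, float *c)
{
    __vector_quad acc;
    __builtin_mma_xxsetaccz(&acc);               /* zero the accumulator */

    for (int k = 0; k < K; k++) {
        vector float ra = vec_xl(0, a + 4 * k);  /* 4 values of A for this k */
        vector float rb = vec_xl(0, b + 4 * k);  /* 4 values of B for this k */
        /* rank-1 multiply-accumulate; operand order mirrors KERNEL_MMA_1ACC(rb0, ra0) */
        __builtin_mma_xvf32gerpp(&acc, (vec_t)rb, (vec_t)ra);
    }

    vector float result[4];
    __builtin_mma_disassemble_acc(result, &acc); /* spill the 4x4 tile into 4 VSRs */
    for (int i = 0; i < 4; i++)
        vec_xst(result[i], 0, c + 4 * i);
}

The scalar cleanup loop at the end of the kernel is the simplest correctness reference for all of the blocked paths above it: every MMA and VSX branch must produce the same C as that loop, only for 2-, 4-, 8- or 16-wide column groups at a time.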
@@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "srot_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "srot_microk_power10.c"
#elif defined(POWER10)
#include "srot_microk_power8.c"
#include "srot_microk_power10.c"
#endif
#endif

@@ -119,7 +117,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
	if ( (inc_x == 1) && (inc_y == 1) )
	{

#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
		if ( n >= 16 )
		{
			BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
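The change in these hunks only drops the big-endian guard around the POWER10 path; the alignment arithmetic is untouched. That "align" expression counts how many leading float elements must be handled one at a time before y reaches a 32-byte boundary (0 when already aligned). A small self-contained check of that arithmetic, assuming GCC extensions; the helper name peel_count is illustrative, not from the source:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static unsigned peel_count(const float *y)
{
    uintptr_t off = (uintptr_t)y & (uintptr_t)0x1F;   /* byte offset inside a 32-byte block */
    return (unsigned)(((32 - off) >> 2) & 0x7);       /* bytes to the boundary -> floats, 8 -> 0 */
}

int main(void)
{
    float buf[16] __attribute__((aligned(32)));
    assert(peel_count(&buf[0]) == 0);   /* already aligned: nothing to peel */
    assert(peel_count(&buf[1]) == 7);   /* 28 bytes, i.e. 7 floats, to the next boundary */
    assert(peel_count(&buf[5]) == 3);   /* 12 bytes, i.e. 3 floats */
    puts("ok");
    return 0;
}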
@@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "sscal_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "sscal_microk_power10.c"
#elif defined(POWER10)
#include "sscal_microk_power8.c"
#include "sscal_microk_power10.c"
#endif
#endif

@@ -106,7 +104,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
	if ( da == 0.0 )
	{

#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
		if ( n >= 32 )
		{
			BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;

@@ -140,7 +138,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
	else
	{

#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
		if ( n >= 32 )
		{
			BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
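Both sscal hunks sit on the two branches of the same control flow: a da == 0.0 fast path that just clears the vector, and the general scale, each now reaching the POWER10 microkernel on either byte order once n is large enough. In plain C (unit stride, illustrative only; the real kernel adds the alignment peel and the vector call inside each branch):

static void sscal_ref(long n, float da, float *x)
{
    if (da == 0.0f) {
        for (long i = 0; i < n; i++)
            x[i] = 0.0f;     /* zero path: overwrite, no multiplies */
    } else {
        for (long i = 0; i < n; i++)
            x[i] *= da;      /* general path: x = da * x */
    }
}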
@@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "sswap_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "swap_microk_power10.c"
#elif defined(POWER10)
#include "sswap_microk_power8.c"
#include "swap_microk_power10.c"
#endif
#endif

@@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
	if ( (inc_x == 1) && (inc_y == 1 ))
	{

#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
		if ( n >= 64 )
		{
			BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
@@ -30,9 +30,17 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
			    double alpha_r, double alpha_i)
{
#if !defined(CONJ)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	static const double mvec[2] = { -1.0, 1.0 };
#else
	static const double mvec[2] = { 1.0, -1.0 };
#endif
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	static const double mvec[2] = { 1.0, -1.0 };
#else
	static const double mvec[2] = { -1.0, 1.0 };
#endif
#endif
	const double *mvecp = mvec;
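The added preprocessor block only reorders the two constants in mvec for big-endian lane numbering. mvec itself is the plus/minus-one pair that lets the vector kernel fold the sign difference between the real and imaginary parts of a complex multiply into a fused multiply-add. In scalar form the non-CONJ case looks like the sketch below; the helper name zaxpy_ref_nonconj is ours, and no claim is made about which mvec element maps to which hardware lane, which is exactly the detail the byte-order #if settles:

/* y += alpha * x over n complex doubles stored as interleaved (re, im). */
static void zaxpy_ref_nonconj(long n, const double *x, double *y,
                              double alpha_r, double alpha_i)
{
    const double sign[2] = { -1.0, 1.0 };   /* plays the role of mvec */
    for (long i = 0; i < n; i++) {
        double xr = x[2*i], xi = x[2*i+1];
        y[2*i]   += alpha_r * xr + sign[0] * alpha_i * xi;  /* re: alpha_r*xr - alpha_i*xi */
        y[2*i+1] += alpha_r * xi + sign[1] * alpha_i * xr;  /* im: alpha_r*xi + alpha_i*xr */
    }
}

The CONJ build flips the sign pair, and big-endian lane order swaps which constant lands on which lane, hence the four combinations selected above.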