diff --git a/CMakeLists.txt b/CMakeLists.txt index 6f2046a3d..c1d69da13 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,10 +3,13 @@ ## cmake_minimum_required(VERSION 2.8.5) + project(OpenBLAS C ASM) + set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) set(OpenBLAS_PATCH_VERSION 19) + set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -20,51 +23,68 @@ endif() ####### if(MSVC) -option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) + option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) endif() + option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) + option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) + option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) + option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) + option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF) + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") -option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) + option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) else() -set(NO_AFFINITY 1) + set(NO_AFFINITY 1) endif() + option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF) + option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) +option(BUILD_STATIC_LIBS "Build static library" OFF) +if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS) + set(BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE) +endif() +if((BUILD_STATIC_LIBS AND BUILD_SHARED_LIBS) AND MSVC) + message(WARNING "Could not enable both BUILD_STATIC_LIBS and BUILD_SHARED_LIBS with MSVC, Disable BUILD_SHARED_LIBS") + set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build static library" FORCE) +endif() # Add a prefix or suffix to all exported symbol names in the shared library. # Avoids conflicts with other BLAS libraries, especially when using # 64 bit integer interfaces in OpenBLAS. - set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in the shared library to avoid conflicts with other BLAS libraries" ) + set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. 
_64 for INTERFACE64 builds" ) + ####### if(BUILD_WITHOUT_LAPACK) -set(NO_LAPACK 1) -set(NO_LAPACKE 1) + set(NO_LAPACK 1) + set(NO_LAPACKE 1) endif() if(BUILD_WITHOUT_CBLAS) -set(NO_CBLAS 1) + set(NO_CBLAS 1) endif() ####### if(MSVC AND MSVC_STATIC_CRT) - set(CompilerFlags - CMAKE_CXX_FLAGS - CMAKE_CXX_FLAGS_DEBUG - CMAKE_CXX_FLAGS_RELEASE - CMAKE_C_FLAGS - CMAKE_C_FLAGS_DEBUG - CMAKE_C_FLAGS_RELEASE - ) - foreach(CompilerFlag ${CompilerFlags}) - string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") - endforeach() + set(CompilerFlags + CMAKE_CXX_FLAGS + CMAKE_CXX_FLAGS_DEBUG + CMAKE_CXX_FLAGS_RELEASE + CMAKE_C_FLAGS + CMAKE_C_FLAGS_DEBUG + CMAKE_C_FLAGS_RELEASE + ) + foreach(CompilerFlag ${CompilerFlags}) + string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") + endforeach() endif() message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") @@ -98,7 +118,7 @@ endif () # set which float types we want to build for if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) # if none are defined, build for all -# set(BUILD_BFLOAT16 true) + # set(BUILD_BFLOAT16 true) set(BUILD_SINGLE true) set(BUILD_DOUBLE true) set(BUILD_COMPLEX true) @@ -143,9 +163,10 @@ endif () set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) if(MSVC) -set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug) -set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release) + set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug) + set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release) endif () + # get obj vars into format that add_library likes: $ (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html) set(TARGET_OBJS "") foreach (SUBDIR ${SUBDIRS}) @@ -183,12 +204,61 @@ if (${DYNAMIC_ARCH}) endif () # add objects to the openblas lib -add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) -target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $) +if(NOT NO_LAPACK) + add_library(LAPACK OBJECT ${LA_SOURCES}) + list(APPEND TARGET_OBJS "$") +endif() +if(NOT NO_LAPACKE) + add_library(LAPACKE OBJECT ${LAPACKE_SOURCES}) + list(APPEND TARGET_OBJS "$") +endif() +if(BUILD_RELAPACK) + add_library(RELAPACK OBJECT ${RELA_SOURCES}) + list(APPEND TARGET_OBJS "$") +endif() +set(OpenBLAS_LIBS "") +if(BUILD_STATIC_LIBS) + add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) + target_include_directories(${OpenBLAS_LIBNAME}_static INTERFACE $) + list(APPEND OpenBLAS_LIBS ${OpenBLAS_LIBNAME}_static) +endif() +if(BUILD_SHARED_LIBS) + add_library(${OpenBLAS_LIBNAME}_shared SHARED ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) + target_include_directories(${OpenBLAS_LIBNAME}_shared INTERFACE $) + list(APPEND OpenBLAS_LIBS ${OpenBLAS_LIBNAME}_shared) +endif() +if(BUILD_STATIC_LIBS) + add_library(${OpenBLAS_LIBNAME} ALIAS ${OpenBLAS_LIBNAME}_static) +else() + add_library(${OpenBLAS_LIBNAME} ALIAS ${OpenBLAS_LIBNAME}_shared) +endif() + +set_target_properties(${OpenBLAS_LIBS} PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME}) # Android needs to explicitly link against libm if(ANDROID) - target_link_libraries(${OpenBLAS_LIBNAME} m) + if(BUILD_STATIC_LIBS) + target_link_libraries(${OpenBLAS_LIBNAME}_static m) + endif() + 
if(BUILD_SHARED_LIBS) + target_link_libraries(${OpenBLAS_LIBNAME}_shared m) + endif() +endif() + +if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS) + set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) + if (NOT NOFORTRAN) + set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) + set (CMAKE_Fortran_CREATE_SHARED_LIBRARY + "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " + "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" + "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" + "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") + else () + set (CMAKE_C_CREATE_SHARED_LIBRARY + "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " + "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") + endif () endif() # Handle MSVC exports @@ -197,21 +267,21 @@ if(MSVC AND BUILD_SHARED_LIBS) include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") else() # Creates verbose .def file (51KB vs 18KB) - set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true) + set_target_properties(${OpenBLAS_LIBNAME}_shared PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true) endif() endif() # Set output for libopenblas -set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) -set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") -set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS") +set_target_properties( ${OpenBLAS_LIBS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) +set_target_properties( ${OpenBLAS_LIBS} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") +set_target_properties( ${OpenBLAS_LIBS} PROPERTIES EXPORT_NAME "OpenBLAS") foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG ) - set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) - set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) - set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) + set_target_properties( ${OpenBLAS_LIBS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) + set_target_properties( ${OpenBLAS_LIBS} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) + set_target_properties( ${OpenBLAS_LIBS} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) endforeach() enable_testing() @@ -220,10 +290,17 @@ if (USE_THREAD) # Add threading library to linker find_package(Threads) if (THREADS_HAVE_PTHREAD_ARG) - set_property(TARGET ${OpenBLAS_LIBNAME} PROPERTY COMPILE_OPTIONS "-pthread") - set_property(TARGET ${OpenBLAS_LIBNAME} PROPERTY INTERFACE_COMPILE_OPTIONS "-pthread") + set_target_properties(${OpenBLAS_LIBS} PROPERTIES + COMPILE_OPTIONS "-pthread" + INTERFACE_COMPILE_OPTIONS 
"-pthread" + ) + endif() + if(BUILD_STATIC_LIBS) + target_link_libraries(${OpenBLAS_LIBNAME}_static ${CMAKE_THREAD_LIBS_INIT}) + endif() + if(BUILD_SHARED_LIBS) + target_link_libraries(${OpenBLAS_LIBNAME}_shared ${CMAKE_THREAD_LIBS_INIT}) endif() - target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT}) endif() #if (MSVC OR NOT NOFORTRAN) @@ -239,97 +316,109 @@ if (NOT NOFORTRAN) add_subdirectory(ctest) endif() add_subdirectory(lapack-netlib/TESTING) - if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) - add_subdirectory(cpp_thread_test) - endif() + if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) + add_subdirectory(cpp_thread_test) + endif() endif() -set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES +set_target_properties(${OpenBLAS_LIBS} PROPERTIES VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION} SOVERSION ${OpenBLAS_MAJOR_VERSION} ) if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) if (NOT MSVC) - target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition") + target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition") else() - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE") endif() endif() if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") -if (NOT DEFINED ARCH) - set(ARCH_IN "x86_64") -else() - set(ARCH_IN ${ARCH}) -endif() + if (NOT DEFINED ARCH) + set(ARCH_IN "x86_64") + else() + set(ARCH_IN ${ARCH}) + endif() -if (${CORE} STREQUAL "generic") - set(ARCH_IN "GENERIC") -endif () + if (${CORE} STREQUAL "generic") + set(ARCH_IN "GENERIC") + endif () -if (NOT DEFINED EXPRECISION) - set(EXPRECISION_IN 0) -else() - set(EXPRECISION_IN ${EXPRECISION}) -endif() + if (NOT DEFINED EXPRECISION) + set(EXPRECISION_IN 0) + else() + set(EXPRECISION_IN ${EXPRECISION}) + endif() -if (NOT DEFINED NO_CBLAS) - set(NO_CBLAS_IN 0) -else() - set(NO_CBLAS_IN ${NO_CBLAS}) -endif() + if (NOT DEFINED NO_CBLAS) + set(NO_CBLAS_IN 0) + else() + set(NO_CBLAS_IN ${NO_CBLAS}) + endif() -if (NOT DEFINED NO_LAPACK) - set(NO_LAPACK_IN 0) -else() - set(NO_LAPACK_IN ${NO_LAPACK}) -endif() + if (NOT DEFINED NO_LAPACK) + set(NO_LAPACK_IN 0) + else() + set(NO_LAPACK_IN ${NO_LAPACK}) + endif() -if (NOT DEFINED NO_LAPACKE) - set(NO_LAPACKE_IN 0) -else() - set(NO_LAPACKE_IN ${NO_LAPACKE}) -endif() + if (NOT DEFINED NO_LAPACKE) + set(NO_LAPACKE_IN 0) + else() + set(NO_LAPACKE_IN ${NO_LAPACKE}) + endif() -if (NOT DEFINED NEED2UNDERSCORES) - set(NEED2UNDERSCORES_IN 0) -else() - set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES}) -endif() + if (NOT DEFINED NEED2UNDERSCORES) + set(NEED2UNDERSCORES_IN 0) + else() + set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES}) + endif() -if (NOT DEFINED ONLY_CBLAS) - set(ONLY_CBLAS_IN 0) -else() - set(ONLY_CBLAS_IN ${ONLY_CBLAS}) -endif() + if (NOT DEFINED ONLY_CBLAS) + set(ONLY_CBLAS_IN 0) + else() + set(ONLY_CBLAS_IN ${ONLY_CBLAS}) + endif() -if (NOT DEFINED BU) - set(BU _) -endif() + if (NOT DEFINED BU) + set(BU _) + endif() -if (NOT ${SYMBOLPREFIX} STREQUAL "") -message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") -endif() -if (NOT ${SYMBOLSUFFIX} STREQUAL "") -message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") -endif() - add_custom_command(TARGET ${OpenBLAS_LIBNAME} POST_BUILD - COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" 
"${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def - COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so - COMMENT "renaming symbols" - ) + if (NOT ${SYMBOLPREFIX} STREQUAL "") + message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") + endif() + if (NOT ${SYMBOLSUFFIX} STREQUAL "") + message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") + endif() + + add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD + COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def + COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so + COMMENT "renaming symbols" + ) endif() # Install project # Install libraries -install(TARGETS ${OpenBLAS_LIBNAME} - EXPORT "OpenBLAS${SUFFIX64}Targets" - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) +if(BUILD_SHARED_LIBS AND BUILD_STATIC_LIBS) + install(TARGETS ${OpenBLAS_LIBNAME}_shared + EXPORT "OpenBLAS${SUFFIX64}Targets" + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) + install(TARGETS ${OpenBLAS_LIBNAME}_static + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) +else() + install(TARGETS ${OpenBLAS_LIBS} + EXPORT "OpenBLAS${SUFFIX64}Targets" + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) +endif() # Install headers set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) @@ -365,36 +454,41 @@ if(NOT NOFORTRAN) endif() if(NOT NO_CBLAS) - message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") - set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) - string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - if (NOT ${SYMBOLPREFIX} STREQUAL "") - string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") - string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") - string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - endif() - if (NOT ${SYMBOLSUFFIX} STREQUAL "") - string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") - string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") - string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - endif() - file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") - install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + message 
(STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") + set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) + file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) + string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + if (NOT ${SYMBOLPREFIX} STREQUAL "") + string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + endif() + if (NOT ${SYMBOLSUFFIX} STREQUAL "") + string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + endif() + file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") + install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() if(NOT NO_LAPACKE) - message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}") - add_dependencies( ${OpenBLAS_LIBNAME} genlapacke) - FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h") - install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}") + if(BUILD_STATIC_LIBS) + add_dependencies( ${OpenBLAS_LIBNAME}_static genlapacke) + endif() + if(BUILD_SHARED_LIBS) + add_dependencies( ${OpenBLAS_LIBNAME}_shared genlapacke) + endif() + FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h") + install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) - ADD_CUSTOM_TARGET(genlapacke - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" - ) - install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) + ADD_CUSTOM_TARGET(genlapacke + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" + ) + install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) endif() # Install pkg-config files @@ -419,4 +513,3 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake install(EXPORT "${PN}${SUFFIX64}Targets" NAMESPACE "${PN}${SUFFIX64}::" DESTINATION ${CMAKECONFIG_INSTALL_DIR}) - diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 6be41960c..39ec96246 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -197,3 +197,7 @@ In chronological order: * River Dillon * [2021-07-10] fix compilation with musl libc + +* Bine Brank + * [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE + * [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM diff --git a/Changelog.txt b/Changelog.txt index 59fe1d45e..180f7adec 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,51 @@ OpenBLAS ChangeLog 
+==================================================================== +Version 0.3.19 + 19-Dec-2021 + + general: + - reverted unsafe TRSV/ZRSV optimizations introduced in 0.3.16 + - fixed a potential thread race in the thread buffer reallocation routines + that were introduced in 0.3.18 + - fixed miscounting of thread pool size on Linux with OMP_PROC_BIND=TRUE + - fixed CBLAS interfaces for CSROT/ZSROT and CROTG/ZROTG + - made automatic library suffix for CMAKE builds with INTERFACE64 available + to CBLAS-only builds + +x86_64: + - DYNAMIC_ARCH builds now fall back to the cpu with most similar capabilities + when an unknown CPUID is encountered, instead of defaulting to Prescott + - added cpu detection for Intel Alder Lake + - added cpu detection for Intel Sapphire Rapids + - added an optimized SBGEMM kernel for Sapphire Rapids + - fixed DYNAMIC_ARCH builds on OSX with CMAKE + - worked around DYNAMIC_ARCH builds made on Sandybridge failing on SkylakeX + - fixed missing thread initialization for static builds on Windows/MSVC + - fixed an excessive read in ZSYMV + +POWER: + - added support for POWER10 in big-endian mode + - added support for building with CMAKE + - added optimized SGEMM and DGEMM kernels for small matrix sizes + +ARMV8: + - added basic support and cputype detection for Fujitsu A64FX + - added a generic ARMV8SVE target + - added SVE-enabled SGEMM and DGEMM kernels for ARMV8SVE and A64FX + - added optimized CGEMM and ZGEMM kernels for Cortex A53 and A55 cpus + - fixed cpuid detection for Apple M1 and improved performance + - improved compiler flag setting in CMAKE builds + +RISCV64: + - fixed improper initialization in CSCAL/ZSCAL for strided access patterns + +MIPS: + - added a GENERIC target for MIPS32 + - added support for cross-compiling to MIPS32 on x86_64 using CMAKE + +MIPS64: + - fixed misdetection of MSA capability + ==================================================================== Version 0.3.18 02-Oct-2021 diff --git a/Makefile b/Makefile index 49fd57ff2..1bb3f6b90 100644 --- a/Makefile +++ b/Makefile @@ -32,7 +32,7 @@ export NOFORTRAN export NO_LAPACK endif -LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) +LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS)) SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test diff --git a/Makefile.arm64 b/Makefile.arm64 index 2656a17f9..801601030 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -1,6 +1,9 @@ ifneq ($(C_COMPILER), PGI) -ifneq ($(GCCVERSIONGT4), 1) +ifeq ($(C_COMPILER), CLANG) +ISCLANG=1 +endif +ifneq (1, $(filter 1,$(GCCVERSIONGT4) $(ISCLANG))) CCOMMON_OPT += -march=armv8-a ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a @@ -17,6 +20,13 @@ FCOMMON_OPT += -march=armv8-a endif endif +ifeq ($(CORE), ARMV8SVE) +CCOMMON_OPT += -march=armv8-a+sve +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a+sve +endif +endif + ifeq ($(CORE), CORTEXA53) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 ifneq ($(F_COMPILER), NAG) @@ -48,7 +58,7 @@ endif # Use a72 tunings because Neoverse-N1 is only available # in GCC>=9 ifeq ($(CORE), NEOVERSEN1) -ifeq ($(GCCVERSIONGTEQ7), 1) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) ifeq ($(GCCVERSIONGTEQ9), 1) CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 ifneq ($(F_COMPILER), NAG) @@ -70,7 +80,7 @@ endif # Use a53 tunings because a55 is only available in GCC>=8.1 ifeq ($(CORE), CORTEXA55) -ifeq ($(GCCVERSIONGTEQ7), 1) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) 
ifeq ($(GCCVERSIONGTEQ8), 1) CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55 ifneq ($(F_COMPILER), NAG) @@ -132,7 +142,7 @@ FCOMMON_OPT += -march=armv8.3-a endif endif -ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG))) ifeq ($(CORE), TSV110) CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 ifneq ($(F_COMPILER), NAG) @@ -150,6 +160,15 @@ endif endif endif +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) +ifeq ($(CORE), A64FX) +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx +endif +endif endif -endif \ No newline at end of file +endif + +endif diff --git a/Makefile.rule b/Makefile.rule index 57dab1152..500b7c44f 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.18 +VERSION = 0.3.18.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library diff --git a/Makefile.system b/Makefile.system index 150dbef50..97fdc3f91 100644 --- a/Makefile.system +++ b/Makefile.system @@ -9,11 +9,10 @@ ifndef TOPDIR TOPDIR = . endif - # If ARCH is not set, we use the host system's architecture for getarch compile options. -ifndef ARCH +# we need to use the host system's architecture for getarch compile options even especially when cross-compiling HOSTARCH := $(shell uname -m) -else -HOSTARCH = $(ARCH) +ifeq ($(HOSTARCH), amd64) +HOSTARCH=x86_64 endif # Catch conflicting usage of ARCH in some BSD environments @@ -102,7 +101,7 @@ GETARCH_FLAGS += -DUSER_TARGET ifeq ($(TARGET), GENERIC) ifeq ($(DYNAMIC_ARCH), 1) override NO_EXPRECISION=1 -export NO_EXPRECiSION +export NO_EXPRECISION endif endif endif @@ -119,6 +118,9 @@ endif ifeq ($(TARGET), COOPERLAKE) GETARCH_FLAGS := -DFORCE_NEHALEM endif +ifeq ($(TARGET), SAPPHIRERAPIDS) +GETARCH_FLAGS := -DFORCE_NEHALEM +endif ifeq ($(TARGET), SANDYBRIDGE) GETARCH_FLAGS := -DFORCE_NEHALEM endif @@ -143,8 +145,13 @@ endif ifeq ($(TARGET), POWER8) GETARCH_FLAGS := -DFORCE_POWER6 endif +ifeq ($(TARGET), POWER9) +GETARCH_FLAGS := -DFORCE_POWER6 +endif +ifeq ($(TARGET), POWER10) +GETARCH_FLAGS := -DFORCE_POWER6 +endif endif - #TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. # @@ -164,6 +171,9 @@ endif ifeq ($(TARGET_CORE), COOPERLAKE) GETARCH_FLAGS := -DFORCE_NEHALEM endif +ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) +GETARCH_FLAGS := -DFORCE_NEHALEM +endif ifeq ($(TARGET_CORE), SANDYBRIDGE) GETARCH_FLAGS := -DFORCE_NEHALEM endif @@ -251,6 +261,8 @@ endif #For small matrix optimization ifeq ($(ARCH), x86_64) SMALL_MATRIX_OPT = 1 +else ifeq ($(CORE), POWER10) +SMALL_MATRIX_OPT = 1 endif ifeq ($(SMALL_MATRIX_OPT), 1) CCOMMON_OPT += -DSMALL_MATRIX_OPT @@ -260,6 +272,10 @@ endif ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 +# Determine if the assembler is GNU Assembler +HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?) 
+GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS) + # Generating Makefile.conf and config.h DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) @@ -307,7 +323,7 @@ else SMP = 1 endif else -ifeq ($(NUM_THREAD), 1) +ifeq ($(NUM_THREADS), 1) SMP = else SMP = 1 @@ -892,15 +908,25 @@ endif ifeq ($(C_COMPILER), PGI) PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20) -PGCVERSIONGTEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \>= 20) -PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -c 4-5` == 11) +PGCVERSIONEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 20) +PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |cut -d "-" -f 1 |sed -e "s/[^0-9.]//g" |cut -c 4-5` \>= 11) PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONEQ20)$(PGCMINORVERSIONGE11) -ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 110 111 011)) +ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 100 101 011)) NEWPGI := 1 +PGCVERSIONGT21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 21) +PGCVERSIONEQ21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 21) +PGCVERSIONCHECK2 := $(PGCVERSIONGT21)$(PGCVERSIONEQ21)$(PGCMINORVERSIONGE11) +ifeq ($(PGCVERSIONCHECK2), $(filter $(PGCVERSIONCHECK2), 100 101 011)) +NEWPGI2 := 1 +endif endif ifdef BINARY64 ifeq ($(ARCH), x86_64) +ifneq ($(NEWPGI2),1) CCOMMON_OPT += -tp p7-64 +else +CCOMMON_OPT += -tp px +endif ifneq ($(NEWPGI),1) CCOMMON_OPT += -D__MMX__ -Mnollvm endif @@ -915,7 +941,11 @@ endif endif endif else +ifneq ($(NEWPGI2),1) CCOMMON_OPT += -tp p7 +else +CCOMMON_OPT += -tp px +endif endif endif @@ -1092,8 +1122,12 @@ FCOMMON_OPT += -i8 endif endif ifeq ($(ARCH), x86_64) +ifneq ($(NEWPGI2),1) FCOMMON_OPT += -tp p7-64 else +FCOMMON_OPT += -tp px +endif +else ifeq ($(ARCH), power) ifeq ($(CORE), POWER6) $(warning NVIDIA HPC compilers do not support POWER6.) 
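A brief worked example of the PGI/NVHPC version-check pattern in the hunk above may help; the concrete compiler output is an assumption, the rest only restates what the probes in the diff compute.
# Illustration only (not part of the patch): each probe expands to 0 or 1 and the
# concatenation is matched against the filter list 100/101/011. Assuming the second
# line of `nvc --version` reports release 21.11, the probes give
#   PGCVERSIONGT21=0  PGCVERSIONEQ21=1  PGCMINORVERSIONGE11=1  ->  PGCVERSIONCHECK2=011
# which is in the list, so NEWPGI2 is set. In effect NEWPGI means "PGI/NVHPC 20.11 or
# newer" and NEWPGI2 means "21.11 or newer", which is what selects -tp px instead of
# the legacy -tp p7 / p7-64 flags above.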
@@ -1643,8 +1677,10 @@ export HAVE_VFP export HAVE_VFPV3 export HAVE_VFPV4 export HAVE_NEON -export HAVE_MSA -export MSA_FLAGS +ifndef NO_MSA + export HAVE_MSA + export MSA_FLAGS +endif export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 307cbe1d9..f14a8a8ff 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -81,6 +81,40 @@ CCOMMON_OPT += -march=cooperlake ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=cooperlake endif +else # gcc not support, fallback to avx512 +CCOMMON_OPT += -march=skylake-avx512 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=skylake-avx512 +endif +endif +endif +ifeq ($(OSNAME), CYGWIN_NT) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +FCOMMON_OPT += -fno-asynchronous-unwind-tables +endif +ifeq ($(OSNAME), WINNT) +ifeq ($(C_COMPILER), GCC) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +FCOMMON_OPT += -fno-asynchronous-unwind-tables +endif +endif +endif +endif + +ifeq ($(CORE), SAPPHIRERAPIDS) +ifndef NO_AVX512 +ifeq ($(C_COMPILER), GCC) +# sapphire rapids support was added in 11 +ifeq ($(GCCVERSIONGTEQ11), 1) +CCOMMON_OPT += -march=sapphirerapids +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=sapphirerapids +endif +else # gcc not support, fallback to avx512 +CCOMMON_OPT += -march=skylake-avx512 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=skylake-avx512 +endif endif endif ifeq ($(OSNAME), CYGWIN_NT) diff --git a/TargetList.txt b/TargetList.txt index 963545cdd..b02a011d5 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -23,6 +23,7 @@ HASWELL SKYLAKEX ATOM COOPERLAKE +SAPPHIRERAPIDS b)AMD CPU: ATHLON diff --git a/appveyor.yml b/appveyor.yml index d575c5b7f..96a967387 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -29,15 +29,15 @@ environment: global: CONDA_INSTALL_LOCN: C:\\Miniconda36-x64 matrix: - - COMPILER: clang-cl - WITH_FORTRAN: ON - - COMPILER: clang-cl - DYNAMIC_ARCH: ON - WITH_FORTRAN: OFF - - COMPILER: cl - - COMPILER: MinGW64-gcc-7.2.0-mingw - DYNAMIC_ARCH: OFF - WITH_FORTRAN: ignore +# - COMPILER: clang-cl +# WITH_FORTRAN: ON +# - COMPILER: clang-cl +# DYNAMIC_ARCH: ON +# WITH_FORTRAN: OFF +# - COMPILER: cl +# - COMPILER: MinGW64-gcc-7.2.0-mingw +# DYNAMIC_ARCH: OFF +# WITH_FORTRAN: ignore - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 COMPILER: MinGW-gcc-6.3.0-32 - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 @@ -46,6 +46,7 @@ environment: install: - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat + - if [%COMPILER%]==[clang-cl] conda update --yes -n base conda - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force - if [%COMPILER%]==[clang-cl] conda config --set auto_update_conda false - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1 @@ -64,8 +65,8 @@ before_build: - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. - if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. - - if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. - - if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. + - if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DMSVC_STATIC_CRT=ON .. 
+ - if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - if [%USE_OPENMP%]==[ON] cmake -DUSE_OPENMP=ON .. - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. diff --git a/azure-pipelines.yml b/azure-pipelines.yml index f9e79018b..710940924 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -75,7 +75,50 @@ jobs: cd utest dir openblas_utest.exe - + +- job: Windows_mingw_gmake + pool: + vmImage: 'windows-latest' + steps: + - script: | + mingw32-make CC=gcc FC=gfortran DYNAMIC_ARCH=1 DYNAMIC_LIST="NEHALEM SANDYBRIDGE HASWELL" + +- job: Windows_clang_cmake + pool: + vmImage: 'windows-latest' + steps: + - script: | + set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%" + set "LIB=C:\Miniconda\Library\lib;%LIB%" + set "CPATH=C:\Miniconda\Library\include;%CPATH% + conda config --add channels conda-forge --force + conda config --set auto_update_conda false + conda install --yes ninja + call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" + mkdir build + cd build + cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DNOFORTRAN=1 -DMSVC_STATIC_CRT=ON .. + cmake --build . --config Release + ctest + +- job: Windows_flang_clang + pool: + vmImage: 'windows-latest' + steps: + - script: | + set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%" + set "LIB=C:\Miniconda\Library\lib;%LIB%" + set "CPATH=C:\Miniconda\Library\include;%CPATH%" + conda config --add channels conda-forge --force + conda config --set auto_update_conda false + conda install --yes --quiet ninja flang + mkdir build + cd build + call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" + cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. + cmake --build . --config Release + ctest + - job: OSX_OpenMP pool: vmImage: 'macOS-10.15' @@ -122,7 +165,7 @@ jobs: make ctest -- job: OSX_OpenMP_Clang_gf_cmake +- job: OSX_dynarch_cmake pool: vmImage: 'macOS-10.15' variables: @@ -130,14 +173,12 @@ jobs: LIBRARY_PATH: /usr/local/opt/llvm/lib steps: - script: | - brew update - brew install llvm libomp mkdir build cd build - cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNO_AVX512=1 .. - make + cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON .. + cmake --build . 
ctest - + - job: OSX_Ifort_Clang pool: vmImage: 'macOS-10.15' @@ -179,7 +220,7 @@ jobs: brew update brew install --cask android-ndk export ANDROID_NDK_HOME=/usr/local/share/android-ndk - make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4 + make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/llvm-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4 - job: OSX_IOS_ARMV8 pool: @@ -206,9 +247,9 @@ jobs: vmImage: 'ubuntu-latest' steps: - script: | - wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.1/alpine-chroot-install \ - && echo '7c7e3fa378e69aecc7f5f01bbc759e5f0a9d9b74 alpine-chroot-install' | sha1sum -c \ - || exit 1 + wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \ + && echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc alpine-chroot-install' | sha1sum -c \ + || exit 1 alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 diff --git a/benchmark/gemv.c b/benchmark/gemv.c index a0001277a..fc39f3f3d 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -125,7 +125,7 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6dx%d : ", (int)m,(int)n); for(j = 0; j < m; j++){ for(i = 0; i < n * COMPSIZE; i++){ - a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } @@ -162,7 +162,7 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6dx%d : ", (int)m,(int)n); for(j = 0; j < m; j++){ for(i = 0; i < n * COMPSIZE; i++){ - a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 57ee5a4fb..d468eb60b 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -109,7 +109,7 @@ if (${ARCH} STREQUAL "ia64") endif () endif () -if (MIPS64) +if (MIPS32 OR MIPS64) set(NO_BINARY_MODE 1) endif () diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 1794b5e5b..06bc14986 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -15,6 +15,11 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS if (NO_BINARY_MODE) + if (MIPS32) + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=32") + set(BINARY_DEFINED 1) + endif () + if (MIPS64) if (BINARY64) set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=64") @@ -126,6 +131,65 @@ if (${CORE} STREQUAL COOPERLAKE) endif () endif () +if (${CORE} STREQUAL SAPPHIRERAPIDS) + if (NOT DYNAMIC_ARCH) + if (NOT NO_AVX512) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=sapphirerapids") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") + endif() + endif () + endif () +endif () + +if (${CORE} STREQUAL A64FX) + if (NOT DYNAMIC_ARCH) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) + set 
(CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + endif() + endif () +endif () + +if (${CORE} STREQUAL ARMV8SVE) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + endif () +endif () + +if (${CORE} STREQUAL POWER10) + if (NOT DYNAMIC_ARCH) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2) + set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math") + else () + message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10." ) + endif() + endif () +endif () + +if (${CORE} STREQUAL POWER9) + if (NOT DYNAMIC_ARCH) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0) + set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") + message(WARNING "Compiler GCC.${GCC_VERSION} does not fully support Power9.") + endif () + endif () +endif () + +if (${CORE} STREQUAL POWER8) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") + endif () +endif () + if (NOT DYNAMIC_ARCH) if (HAVE_AVX2) set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2") diff --git a/cmake/fc.cmake b/cmake/fc.cmake index f7aa4c5c9..9feda9be3 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -3,11 +3,6 @@ ## Description: Ported from portion of OpenBLAS/Makefile.system ## Sets Fortran related variables. -if (INTERFACE64) - set(SUFFIX64 64) - set(SUFFIX64_UNDERSCORE _64) -endif() - if (${F_COMPILER} STREQUAL "FLANG") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") if (BINARY64 AND INTERFACE64) diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 09ca5eb57..efededcf3 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -1,214 +1,218 @@ # helper functions for the kernel CMakeLists.txt +function(SetFallback KERNEL SOURCE_PATH) + if (NOT (DEFINED ${KERNEL})) + set(${KERNEL} ${SOURCE_PATH} PARENT_SCOPE) + endif () +endfunction() -# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file. 
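To make the sweeping set() -> SetFallback() change below easier to follow, here is a minimal sketch of what the new helper guarantees; the kernel file names are hypothetical, the behaviour is taken from the SetFallback definition above.
# Sketch (illustrative only): SetFallback assigns a default only when the variable is
# still undefined, so a value read earlier from a target's KERNEL file survives.
set(SAMAXKERNEL samax_custom.S)    # pretend this came from parsing KERNEL.<target>
SetFallback(SAMAXKERNEL amax.S)    # no-op: the target-specific kernel is kept
SetFallback(DAMAXKERNEL amax.S)    # DAMAXKERNEL was unset, so the generic default applies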
macro(SetDefaultL1) - set(SAMAXKERNEL amax.S) - set(DAMAXKERNEL amax.S) - set(QAMAXKERNEL amax.S) - set(CAMAXKERNEL zamax.S) - set(ZAMAXKERNEL zamax.S) - set(XAMAXKERNEL zamax.S) - set(SAMINKERNEL amin.S) - set(DAMINKERNEL amin.S) - set(QAMINKERNEL amin.S) - set(CAMINKERNEL zamin.S) - set(ZAMINKERNEL zamin.S) - set(XAMINKERNEL zamin.S) - set(SMAXKERNEL max.S) - set(DMAXKERNEL max.S) - set(QMAXKERNEL max.S) - set(SMINKERNEL min.S) - set(DMINKERNEL min.S) - set(QMINKERNEL min.S) - set(ISAMAXKERNEL iamax.S) - set(IDAMAXKERNEL iamax.S) - set(IQAMAXKERNEL iamax.S) - set(ICAMAXKERNEL izamax.S) - set(IZAMAXKERNEL izamax.S) - set(IXAMAXKERNEL izamax.S) - set(ISAMINKERNEL iamin.S) - set(IDAMINKERNEL iamin.S) - set(IQAMINKERNEL iamin.S) - set(ICAMINKERNEL izamin.S) - set(IZAMINKERNEL izamin.S) - set(IXAMINKERNEL izamin.S) - set(ISMAXKERNEL iamax.S) - set(IDMAXKERNEL iamax.S) - set(IQMAXKERNEL iamax.S) - set(ISMINKERNEL iamin.S) - set(IDMINKERNEL iamin.S) - set(IQMINKERNEL iamin.S) - set(SASUMKERNEL asum.S) - set(DASUMKERNEL asum.S) - set(CASUMKERNEL zasum.S) - set(ZASUMKERNEL zasum.S) - set(QASUMKERNEL asum.S) - set(XASUMKERNEL zasum.S) - set(SAXPYKERNEL axpy.S) - set(DAXPYKERNEL axpy.S) - set(CAXPYKERNEL zaxpy.S) - set(ZAXPYKERNEL zaxpy.S) - set(QAXPYKERNEL axpy.S) - set(XAXPYKERNEL zaxpy.S) - set(SCOPYKERNEL copy.S) - set(DCOPYKERNEL copy.S) - set(CCOPYKERNEL zcopy.S) - set(ZCOPYKERNEL zcopy.S) - set(QCOPYKERNEL copy.S) - set(XCOPYKERNEL zcopy.S) - set(SDOTKERNEL dot.S) - set(DDOTKERNEL dot.S) - set(CDOTKERNEL zdot.S) - set(ZDOTKERNEL zdot.S) - set(QDOTKERNEL dot.S) - set(XDOTKERNEL zdot.S) - set(SNRM2KERNEL nrm2.S) - set(DNRM2KERNEL nrm2.S) - set(QNRM2KERNEL nrm2.S) - set(CNRM2KERNEL znrm2.S) - set(ZNRM2KERNEL znrm2.S) - set(XNRM2KERNEL znrm2.S) - set(SROTKERNEL rot.S) - set(DROTKERNEL rot.S) - set(QROTKERNEL rot.S) - set(CROTKERNEL zrot.S) - set(ZROTKERNEL zrot.S) - set(XROTKERNEL zrot.S) - set(SSCALKERNEL scal.S) - set(DSCALKERNEL scal.S) - set(CSCALKERNEL zscal.S) - set(ZSCALKERNEL zscal.S) - set(QSCALKERNEL scal.S) - set(XSCALKERNEL zscal.S) - set(SSWAPKERNEL swap.S) - set(DSWAPKERNEL swap.S) - set(CSWAPKERNEL zswap.S) - set(ZSWAPKERNEL zswap.S) - set(QSWAPKERNEL swap.S) - set(XSWAPKERNEL zswap.S) - set(SGEMVNKERNEL gemv_n.S) - set(SGEMVTKERNEL gemv_t.S) - set(DGEMVNKERNEL gemv_n.S) - set(DGEMVTKERNEL gemv_t.S) - set(CGEMVNKERNEL zgemv_n.S) - set(CGEMVTKERNEL zgemv_t.S) - set(ZGEMVNKERNEL zgemv_n.S) - set(ZGEMVTKERNEL zgemv_t.S) - set(QGEMVNKERNEL gemv_n.S) - set(QGEMVTKERNEL gemv_t.S) - set(XGEMVNKERNEL zgemv_n.S) - set(XGEMVTKERNEL zgemv_t.S) - set(SCABS_KERNEL ../generic/cabs.c) - set(DCABS_KERNEL ../generic/cabs.c) - set(QCABS_KERNEL ../generic/cabs.c) - set(LSAME_KERNEL ../generic/lsame.c) - set(SAXPBYKERNEL ../arm/axpby.c) - set(DAXPBYKERNEL ../arm/axpby.c) - set(CAXPBYKERNEL ../arm/zaxpby.c) - set(ZAXPBYKERNEL ../arm/zaxpby.c) - set(SSUMKERNEL sum.S) - set(DSUMKERNEL sum.S) - set(CSUMKERNEL zsum.S) - set(ZSUMKERNEL zsum.S) - set(QSUMKERNEL sum.S) - set(XSUMKERNEL zsum.S) + SetFallback(SAMAXKERNEL amax.S) + SetFallback(DAMAXKERNEL amax.S) + SetFallback(QAMAXKERNEL amax.S) + SetFallback(CAMAXKERNEL zamax.S) + SetFallback(ZAMAXKERNEL zamax.S) + SetFallback(XAMAXKERNEL zamax.S) + SetFallback(SAMINKERNEL amin.S) + SetFallback(DAMINKERNEL amin.S) + SetFallback(QAMINKERNEL amin.S) + SetFallback(CAMINKERNEL zamin.S) + SetFallback(ZAMINKERNEL zamin.S) + SetFallback(XAMINKERNEL zamin.S) + SetFallback(SMAXKERNEL max.S) + SetFallback(DMAXKERNEL max.S) + SetFallback(QMAXKERNEL max.S) + 
SetFallback(SMINKERNEL min.S) + SetFallback(DMINKERNEL min.S) + SetFallback(QMINKERNEL min.S) + SetFallback(ISAMAXKERNEL iamax.S) + SetFallback(IDAMAXKERNEL iamax.S) + SetFallback(IQAMAXKERNEL iamax.S) + SetFallback(ICAMAXKERNEL izamax.S) + SetFallback(IZAMAXKERNEL izamax.S) + SetFallback(IXAMAXKERNEL izamax.S) + SetFallback(ISAMINKERNEL iamin.S) + SetFallback(IDAMINKERNEL iamin.S) + SetFallback(IQAMINKERNEL iamin.S) + SetFallback(ICAMINKERNEL izamin.S) + SetFallback(IZAMINKERNEL izamin.S) + SetFallback(IXAMINKERNEL izamin.S) + SetFallback(ISMAXKERNEL iamax.S) + SetFallback(IDMAXKERNEL iamax.S) + SetFallback(IQMAXKERNEL iamax.S) + SetFallback(ISMINKERNEL iamin.S) + SetFallback(IDMINKERNEL iamin.S) + SetFallback(IQMINKERNEL iamin.S) + SetFallback(SASUMKERNEL asum.S) + SetFallback(DASUMKERNEL asum.S) + SetFallback(CASUMKERNEL zasum.S) + SetFallback(ZASUMKERNEL zasum.S) + SetFallback(QASUMKERNEL asum.S) + SetFallback(XASUMKERNEL zasum.S) + SetFallback(SAXPYKERNEL axpy.S) + SetFallback(DAXPYKERNEL axpy.S) + SetFallback(CAXPYKERNEL zaxpy.S) + SetFallback(ZAXPYKERNEL zaxpy.S) + SetFallback(QAXPYKERNEL axpy.S) + SetFallback(XAXPYKERNEL zaxpy.S) + SetFallback(SCOPYKERNEL copy.S) + SetFallback(DCOPYKERNEL copy.S) + SetFallback(CCOPYKERNEL zcopy.S) + SetFallback(ZCOPYKERNEL zcopy.S) + SetFallback(QCOPYKERNEL copy.S) + SetFallback(XCOPYKERNEL zcopy.S) + SetFallback(SDOTKERNEL dot.S) + SetFallback(DDOTKERNEL dot.S) + SetFallback(CDOTKERNEL zdot.S) + SetFallback(ZDOTKERNEL zdot.S) + SetFallback(QDOTKERNEL dot.S) + SetFallback(XDOTKERNEL zdot.S) + SetFallback(SNRM2KERNEL nrm2.S) + SetFallback(DNRM2KERNEL nrm2.S) + SetFallback(QNRM2KERNEL nrm2.S) + SetFallback(CNRM2KERNEL znrm2.S) + SetFallback(ZNRM2KERNEL znrm2.S) + SetFallback(XNRM2KERNEL znrm2.S) + SetFallback(SROTKERNEL rot.S) + SetFallback(DROTKERNEL rot.S) + SetFallback(QROTKERNEL rot.S) + SetFallback(CROTKERNEL zrot.S) + SetFallback(ZROTKERNEL zrot.S) + SetFallback(XROTKERNEL zrot.S) + SetFallback(SSCALKERNEL scal.S) + SetFallback(DSCALKERNEL scal.S) + SetFallback(CSCALKERNEL zscal.S) + SetFallback(ZSCALKERNEL zscal.S) + SetFallback(QSCALKERNEL scal.S) + SetFallback(XSCALKERNEL zscal.S) + SetFallback(SSWAPKERNEL swap.S) + SetFallback(DSWAPKERNEL swap.S) + SetFallback(CSWAPKERNEL zswap.S) + SetFallback(ZSWAPKERNEL zswap.S) + SetFallback(QSWAPKERNEL swap.S) + SetFallback(XSWAPKERNEL zswap.S) + SetFallback(SGEMVNKERNEL gemv_n.S) + SetFallback(SGEMVTKERNEL gemv_t.S) + SetFallback(DGEMVNKERNEL gemv_n.S) + SetFallback(DGEMVTKERNEL gemv_t.S) + SetFallback(CGEMVNKERNEL zgemv_n.S) + SetFallback(CGEMVTKERNEL zgemv_t.S) + SetFallback(ZGEMVNKERNEL zgemv_n.S) + SetFallback(ZGEMVTKERNEL zgemv_t.S) + SetFallback(QGEMVNKERNEL gemv_n.S) + SetFallback(QGEMVTKERNEL gemv_t.S) + SetFallback(XGEMVNKERNEL zgemv_n.S) + SetFallback(XGEMVTKERNEL zgemv_t.S) + SetFallback(SCABS_KERNEL ../generic/cabs.c) + SetFallback(DCABS_KERNEL ../generic/cabs.c) + SetFallback(QCABS_KERNEL ../generic/cabs.c) + SetFallback(LSAME_KERNEL ../generic/lsame.c) + SetFallback(SAXPBYKERNEL ../arm/axpby.c) + SetFallback(DAXPBYKERNEL ../arm/axpby.c) + SetFallback(CAXPBYKERNEL ../arm/zaxpby.c) + SetFallback(ZAXPBYKERNEL ../arm/zaxpby.c) + SetFallback(SSUMKERNEL sum.S) + SetFallback(DSUMKERNEL sum.S) + SetFallback(CSUMKERNEL zsum.S) + SetFallback(ZSUMKERNEL zsum.S) + SetFallback(QSUMKERNEL sum.S) + SetFallback(XSUMKERNEL zsum.S) if (BUILD_BFLOAT16) - set(SHAMINKERNEL ../arm/amin.c) - set(SHAMAXKERNEL ../arm/amax.c) - set(SHMAXKERNEL ../arm/max.c) - set(SHMINKERNEL ../arm/min.c) - set(ISHAMAXKERNEL 
../arm/iamax.c) - set(ISHAMINKERNEL ../arm/iamin.c) - set(ISHMAXKERNEL ../arm/imax.c) - set(ISHMINKERNEL ../arm/imin.c) - set(SHASUMKERNEL ../arm/asum.c) - set(SHAXPYKERNEL ../arm/axpy.c) - set(SHAXPBYKERNEL ../arm/axpby.c) - set(SHCOPYKERNEL ../arm/copy.c) - set(SBDOTKERNEL ../x86_64/sbdot.c) - set(SHROTKERNEL ../arm/rot.c) - set(SHSCALKERNEL ../arm/scal.c) - set(SHNRM2KERNEL ../arm/nrm2.c) - set(SHSUMKERNEL ../arm/sum.c) - set(SHSWAPKERNEL ../arm/swap.c) - set(TOBF16KERNEL ../x86_64/tobf16.c) - set(BF16TOKERNEL ../x86_64/bf16to.c) - set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) - set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) + SetFallback(SHAMINKERNEL ../arm/amin.c) + SetFallback(SHAMAXKERNEL ../arm/amax.c) + SetFallback(SHMAXKERNEL ../arm/max.c) + SetFallback(SHMINKERNEL ../arm/min.c) + SetFallback(ISHAMAXKERNEL ../arm/iamax.c) + SetFallback(ISHAMINKERNEL ../arm/iamin.c) + SetFallback(ISHMAXKERNEL ../arm/imax.c) + SetFallback(ISHMINKERNEL ../arm/imin.c) + SetFallback(SHASUMKERNEL ../arm/asum.c) + SetFallback(SHAXPYKERNEL ../arm/axpy.c) + SetFallback(SHAXPBYKERNEL ../arm/axpby.c) + SetFallback(SHCOPYKERNEL ../arm/copy.c) + SetFallback(SBDOTKERNEL ../x86_64/sbdot.c) + SetFallback(SHROTKERNEL ../arm/rot.c) + SetFallback(SHSCALKERNEL ../arm/scal.c) + SetFallback(SHNRM2KERNEL ../arm/nrm2.c) + SetFallback(SHSUMKERNEL ../arm/sum.c) + SetFallback(SHSWAPKERNEL ../arm/swap.c) + SetFallback(TOBF16KERNEL ../x86_64/tobf16.c) + SetFallback(BF16TOKERNEL ../x86_64/bf16to.c) + SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) + SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) endif () endmacro () macro(SetDefaultL2) - set(SGEMVNKERNEL ../arm/gemv_n.c) - set(SGEMVTKERNEL ../arm/gemv_t.c) - set(DGEMVNKERNEL gemv_n.S) - set(DGEMVTKERNEL gemv_t.S) - set(CGEMVNKERNEL zgemv_n.S) - set(CGEMVTKERNEL zgemv_t.S) - set(ZGEMVNKERNEL zgemv_n.S) - set(ZGEMVTKERNEL zgemv_t.S) - set(QGEMVNKERNEL gemv_n.S) - set(QGEMVTKERNEL gemv_t.S) - set(XGEMVNKERNEL zgemv_n.S) - set(XGEMVTKERNEL zgemv_t.S) - set(SGERKERNEL ../generic/ger.c) - set(DGERKERNEL ../generic/ger.c) - set(QGERKERNEL ../generic/ger.c) - set(CGERUKERNEL ../generic/zger.c) - set(CGERCKERNEL ../generic/zger.c) - set(ZGERUKERNEL ../generic/zger.c) - set(ZGERCKERNEL ../generic/zger.c) - set(XGERUKERNEL ../generic/zger.c) - set(XGERCKERNEL ../generic/zger.c) - set(SSYMV_U_KERNEL ../generic/symv_k.c) - set(SSYMV_L_KERNEL ../generic/symv_k.c) - set(DSYMV_U_KERNEL ../generic/symv_k.c) - set(DSYMV_L_KERNEL ../generic/symv_k.c) - set(QSYMV_U_KERNEL ../generic/symv_k.c) - set(QSYMV_L_KERNEL ../generic/symv_k.c) - set(CSYMV_U_KERNEL ../generic/zsymv_k.c) - set(CSYMV_L_KERNEL ../generic/zsymv_k.c) - set(ZSYMV_U_KERNEL ../generic/zsymv_k.c) - set(ZSYMV_L_KERNEL ../generic/zsymv_k.c) - set(XSYMV_U_KERNEL ../generic/zsymv_k.c) - set(XSYMV_L_KERNEL ../generic/zsymv_k.c) - set(CHEMV_U_KERNEL ../generic/zhemv_k.c) - set(CHEMV_L_KERNEL ../generic/zhemv_k.c) - set(CHEMV_V_KERNEL ../generic/zhemv_k.c) - set(CHEMV_M_KERNEL ../generic/zhemv_k.c) - set(ZHEMV_U_KERNEL ../generic/zhemv_k.c) - set(ZHEMV_L_KERNEL ../generic/zhemv_k.c) - set(ZHEMV_V_KERNEL ../generic/zhemv_k.c) - set(ZHEMV_M_KERNEL ../generic/zhemv_k.c) - set(XHEMV_U_KERNEL ../generic/zhemv_k.c) - set(XHEMV_L_KERNEL ../generic/zhemv_k.c) - set(XHEMV_V_KERNEL ../generic/zhemv_k.c) - set(XHEMV_M_KERNEL ../generic/zhemv_k.c) + SetFallback(SGEMVNKERNEL ../arm/gemv_n.c) + SetFallback(SGEMVTKERNEL ../arm/gemv_t.c) + SetFallback(DGEMVNKERNEL gemv_n.S) + SetFallback(DGEMVTKERNEL gemv_t.S) + SetFallback(CGEMVNKERNEL zgemv_n.S) + 
SetFallback(CGEMVTKERNEL zgemv_t.S) + SetFallback(ZGEMVNKERNEL zgemv_n.S) + SetFallback(ZGEMVTKERNEL zgemv_t.S) + SetFallback(QGEMVNKERNEL gemv_n.S) + SetFallback(QGEMVTKERNEL gemv_t.S) + SetFallback(XGEMVNKERNEL zgemv_n.S) + SetFallback(XGEMVTKERNEL zgemv_t.S) + SetFallback(SGERKERNEL ../generic/ger.c) + SetFallback(DGERKERNEL ../generic/ger.c) + SetFallback(QGERKERNEL ../generic/ger.c) + SetFallback(CGERUKERNEL ../generic/zger.c) + SetFallback(CGERCKERNEL ../generic/zger.c) + SetFallback(ZGERUKERNEL ../generic/zger.c) + SetFallback(ZGERCKERNEL ../generic/zger.c) + SetFallback(XGERUKERNEL ../generic/zger.c) + SetFallback(XGERCKERNEL ../generic/zger.c) + SetFallback(SSYMV_U_KERNEL ../generic/symv_k.c) + SetFallback(SSYMV_L_KERNEL ../generic/symv_k.c) + SetFallback(DSYMV_U_KERNEL ../generic/symv_k.c) + SetFallback(DSYMV_L_KERNEL ../generic/symv_k.c) + SetFallback(QSYMV_U_KERNEL ../generic/symv_k.c) + SetFallback(QSYMV_L_KERNEL ../generic/symv_k.c) + SetFallback(CSYMV_U_KERNEL ../generic/zsymv_k.c) + SetFallback(CSYMV_L_KERNEL ../generic/zsymv_k.c) + SetFallback(ZSYMV_U_KERNEL ../generic/zsymv_k.c) + SetFallback(ZSYMV_L_KERNEL ../generic/zsymv_k.c) + SetFallback(XSYMV_U_KERNEL ../generic/zsymv_k.c) + SetFallback(XSYMV_L_KERNEL ../generic/zsymv_k.c) + SetFallback(CHEMV_U_KERNEL ../generic/zhemv_k.c) + SetFallback(CHEMV_L_KERNEL ../generic/zhemv_k.c) + SetFallback(CHEMV_V_KERNEL ../generic/zhemv_k.c) + SetFallback(CHEMV_M_KERNEL ../generic/zhemv_k.c) + SetFallback(ZHEMV_U_KERNEL ../generic/zhemv_k.c) + SetFallback(ZHEMV_L_KERNEL ../generic/zhemv_k.c) + SetFallback(ZHEMV_V_KERNEL ../generic/zhemv_k.c) + SetFallback(ZHEMV_M_KERNEL ../generic/zhemv_k.c) + SetFallback(XHEMV_U_KERNEL ../generic/zhemv_k.c) + SetFallback(XHEMV_L_KERNEL ../generic/zhemv_k.c) + SetFallback(XHEMV_V_KERNEL ../generic/zhemv_k.c) + SetFallback(XHEMV_M_KERNEL ../generic/zhemv_k.c) if (BUILD_BFLOAT16) - set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) - set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) - set(SHGERKERNEL ../generic/ger.c) + SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) + SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) + SetFallback(SHGERKERNEL ../generic/ger.c) endif () endmacro () macro(SetDefaultL3) - set(SGEADD_KERNEL ../generic/geadd.c) - set(DGEADD_KERNEL ../generic/geadd.c) - set(CGEADD_KERNEL ../generic/zgeadd.c) - set(ZGEADD_KERNEL ../generic/zgeadd.c) + SetFallback(SGEADD_KERNEL ../generic/geadd.c) + SetFallback(DGEADD_KERNEL ../generic/geadd.c) + SetFallback(CGEADD_KERNEL ../generic/zgeadd.c) + SetFallback(ZGEADD_KERNEL ../generic/zgeadd.c) if (BUILD_BFLOAT16) - set(SHGEADD_KERNEL ../generic/geadd.c) - set(SBGEMMKERNEL ../generic/gemmkernel_2x2.c) - set(SBGEMM_BETA ../generic/gemm_beta.c) - set(SBGEMMINCOPY ../generic/gemm_ncopy_2.c) - set(SBGEMMITCOPY ../generic/gemm_tcopy_2.c) - set(SBGEMMONCOPY ../generic/gemm_ncopy_2.c) - set(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c) - set(SBGEMMINCOPYOBJ sbgemm_incopy.o) - set(SBGEMMITCOPYOBJ sbgemm_itcopy.o) - set(SBGEMMONCOPYOBJ sbgemm_oncopy.o) - set(SBGEMMOTCOPYOBJ sbgemm_otcopy.o) + SetFallback(SHGEADD_KERNEL ../generic/geadd.c) + SetFallback(SBGEMMKERNEL ../generic/gemmkernel_2x2.c) + SetFallback(SBGEMM_BETA ../generic/gemm_beta.c) + SetFallback(SBGEMMINCOPY ../generic/gemm_ncopy_2.c) + SetFallback(SBGEMMITCOPY ../generic/gemm_tcopy_2.c) + SetFallback(SBGEMMONCOPY ../generic/gemm_ncopy_2.c) + SetFallback(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c) + SetFallback(SBGEMMINCOPYOBJ sbgemm_incopy.o) + SetFallback(SBGEMMITCOPYOBJ sbgemm_itcopy.o) + SetFallback(SBGEMMONCOPYOBJ 
sbgemm_oncopy.o) + SetFallback(SBGEMMOTCOPYOBJ sbgemm_otcopy.o) endif () endmacro () diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index d86e10035..259d9c738 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -416,7 +416,7 @@ endif () set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) -elseif ("${TCORE}" STREQUAL "VORTEX") + elseif ("${TCORE}" STREQUAL "VORTEX") file(APPEND ${TARGET_CONF_TEMP} "#define ARMV8\n" "#define L1_CODE_SIZE\t32768\n" @@ -439,6 +439,34 @@ elseif ("${TCORE}" STREQUAL "VORTEX") set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "P5600") + file(APPEND ${TARGET_CONF_TEMP} + "#define L2_SIZE 1048576\n" + "#define DTB_SIZE 4096\n" + "#define DTB_DEFAULT_ENTRIES 64\n") + set(SGEMM_UNROLL_M 2) + set(SGEMM_UNROLL_N 2) + set(DGEMM_UNROLL_M 2) + set(DGEMM_UNROLL_N 2) + set(CGEMM_UNROLL_M 2) + set(CGEMM_UNROLL_N 2) + set(ZGEMM_UNROLL_M 2) + set(ZGEMM_UNROLL_N 2) + set(SYMV_P 16) + elseif ("${TCORE}" MATCHES "MIPS") + file(APPEND ${TARGET_CONF_TEMP} + "#define L2_SIZE 262144\n" + "#define DTB_SIZE 4096\n" + "#define DTB_DEFAULT_ENTRIES 64\n") + set(SGEMM_UNROLL_M 2) + set(SGEMM_UNROLL_N 2) + set(DGEMM_UNROLL_M 2) + set(DGEMM_UNROLL_N 2) + set(CGEMM_UNROLL_M 2) + set(CGEMM_UNROLL_N 2) + set(ZGEMM_UNROLL_M 2) + set(ZGEMM_UNROLL_N 2) + set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "POWER6") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE 32768\n" diff --git a/cmake/system.cmake b/cmake/system.cmake index f56ded966..e0e92bde7 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -33,7 +33,7 @@ endif () if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) message(STATUS "Compiling a ${BINARY}-bit binary.") set(NO_AVX 1) - if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE") + if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE" OR ${TARGET} STREQUAL "SAPPHIRERAPIDS") set(TARGET "NEHALEM") endif () if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") @@ -42,6 +42,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53" OR ${TARGET} STREQUAL "CORTEXA55") set(TARGET "ARMV7") endif () + if (${TARGET} STREQUAL "POWER8" OR ${TARGET} STREQUAL "POWER9" OR ${TARGET} STREQUAL "POWER10") + set(TARGET "POWER6") + endif () endif () @@ -102,6 +105,18 @@ if (CMAKE_C_COMPILER STREQUAL loongcc) set(GETARCH_FLAGS "${GETARCH_FLAGS} -static") endif () +if (POWER) + set(NO_WARMUP 1) + set(HAVE_GAS 1) + if (CMAKE_ASM_COMPILER_ID STREQUAL "GNU") + set(HAVE_GAS 0) + elseif (CMAKE_ASM_COMPILER_ID STREQUAL "Clang") + set(CCOMMON_OPT "${CCOMMON_OPT} -fno-integrated-as") + set(HAVE_GAS 0) + endif () + set(GETARCH_FLAGS "${GETARCH_FLAGS} -DHAVE_GAS=${HAVE_GAS}") +endif () + #if don't use Fortran, it will only compile CBLAS. 
if (ONLY_CBLAS) set(NO_LAPACK 1) @@ -163,6 +178,22 @@ if (DEFINED TARGET) endif() endif() endif() + if (${TARGET} STREQUAL SAPPHIRERAPIDS AND NOT NO_AVX512) + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 11.0) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids") + else() + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 12.0) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids") + else() + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + endif() + endif() if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") endif() @@ -206,6 +237,27 @@ if (DEFINED TARGET) if (DEFINED HAVE_SSE4_1) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1") endif() + + if (${TARGET} STREQUAL POWER10) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math") + else () + message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10.") + endif() + endif() + if (${TARGET} STREQUAL POWER9) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math") + else () + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") + message(WARNING "Compiler GCC.${GCC_VERSION} does not support fully Power9.") + endif() + endif() + if (${TARGET} STREQUAL POWER8) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") + endif() endif() if (DEFINED BINARY) message(STATUS "Compiling a ${BINARY}-bit binary.") @@ -223,6 +275,11 @@ include("${PROJECT_SOURCE_DIR}/cmake/arch.cmake") # C Compiler dependent settings include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake") +if (INTERFACE64) + set(SUFFIX64 64) + set(SUFFIX64_UNDERSCORE _64) +endif() + if (NOT NOFORTRAN) # Fortran Compiler dependent settings include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") @@ -258,7 +315,7 @@ if (NEED_PIC) endif() endif () -if (X86_64) +if (X86_64 OR ${CORE} STREQUAL POWER10) set(SMALL_MATRIX_OPT TRUE) endif () if (SMALL_MATRIX_OPT) @@ -266,7 +323,7 @@ if (SMALL_MATRIX_OPT) endif () if (DYNAMIC_ARCH) - if (X86 OR X86_64 OR ARM64 OR PPC) + if (X86 OR X86_64 OR ARM64 OR POWER) set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") if (DYNAMIC_OLDER) set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 8d0558c0e..86ce3dfb0 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -20,11 +20,11 @@ endif() -if(CMAKE_COMPILER_IS_GNUCC AND WIN32) +if(MINGW) execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine - OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE + OUTPUT_VARIABLE OPENBLAS_MINGW_TARGET_MACHINE OUTPUT_STRIP_TRAILING_WHITESPACE) - if(OPENBLAS_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64") + if(OPENBLAS_MINGW_TARGET_MACHINE MATCHES 
"amd64|x86_64|AMD64") set(MINGW64 1) endif() endif() @@ -35,7 +35,7 @@ if(CMAKE_CL_64 OR MINGW64) elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING)) set(X86 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") - set(PPC 1) + set(POWER 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") set(MIPS64 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") @@ -73,6 +73,8 @@ elseif (${CMAKE_CROSSCOMPILING}) else () set(X86 1) endif() + elseif (${TARGET} STREQUAL "P5600" OR ${TARGET} MATCHES "MIPS.*") + set(MIPS32 1) elseif (${TARGET} STREQUAL "ARMV7") set(ARM 1) else() @@ -86,8 +88,12 @@ if (X86_64) set(ARCH "x86_64") elseif(X86) set(ARCH "x86") -elseif(PPC) +elseif(POWER) set(ARCH "power") +elseif(MIPS32) + set(ARCH "mips") +elseif(MIPS64) + set(ARCH "mips64") elseif(ARM) set(ARCH "arm") elseif(ARM64) @@ -97,7 +103,7 @@ else() endif () if (NOT BINARY) - if (X86_64 OR ARM64 OR PPC OR MIPS64 OR LOONGARCH64) + if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64) set(BINARY 64) else () set(BINARY 32) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 01b489f2a..c5ee65384 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -15,35 +15,83 @@ endfunction () # Reads a Makefile into CMake vars. macro(ParseMakefileVars MAKEFILE_IN) message(STATUS "Reading vars from ${MAKEFILE_IN}...") - set (IfElse 0) - set (ElseSeen 0) + set (C_COMPILER ${CMAKE_C_COMPILER_ID}) + set (IfElse 0) + set (ElseSeen 0) + set (SkipIfs 0) + set (SkipElse 0) file(STRINGS ${MAKEFILE_IN} makefile_contents) foreach (makefile_line ${makefile_contents}) -#message(STATUS "parsing ${makefile_line}") - if (${IfElse} GREATER 0) + #message(STATUS "parsing ${makefile_line}") + # Skip the entire scope of the else statement given that the if statement that precedes it has the valid condition. + # The variable SkipIfs is used to identify which endif statement closes the scope of the else statement. + if (${SkipElse} EQUAL 1) + #message(STATUS "skipping ${makefile_line}") + string(REGEX MATCH "(ifeq|ifneq|ifdef|ifndef) .*$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + MATH(EXPR SkipIfs "${SkipIfs}+1") + endif () string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") -# message(STATUS "ENDIF ${makefile_line}") - set (IfElse 0) - set (ElseSeen 0) + if (${SkipIfs} EQUAL 0) + set (SkipElse 0) + else () + MATH(EXPR SkipIfs "${SkipIfs}-1") + endif () + endif () + continue () + endif () + # The variable IfElse is greater than 0 if and only if the previously parsed line is an if statement. + if (${IfElse} GREATER 0) + # If the current scope is the one that has to be skipped, the if/endif/else statements + # along with it till the endif that closes the current scope have to be ignored as well. 
+ string(REGEX MATCH "(ifeq|ifneq|ifdef|ifndef) .*$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + if ((${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR (${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1)) + #message(STATUS "skipping ${makefile_line}") + MATH(EXPR SkipIfs "${SkipIfs}+1") + continue () + endif () + endif () + string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + if (${SkipIfs} EQUAL 0) + #message(STATUS "ENDIF ${makefile_line}") + set (IfElse 0) + set (ElseSeen 0) + else () + #message(STATUS "skipping ${makefile_line}") + MATH(EXPR SkipIfs "${SkipIfs}-1") + endif () continue () endif () string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}") - if (NOT "${line_match}" STREQUAL "") -# message(STATUS "ELSE ${makefile_line}") - set (ElseSeen 1) - continue () - endif() - if ( (${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR ( ${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1)) -# message(STATUS "skipping ${makefile_line}") - continue () + if (NOT "${line_match}" STREQUAL "") + if (${SkipIfs} EQUAL 0) + #message(STATUS "ELSE ${makefile_line}") + set (ElseSeen 1) + else () + #message(STATUS "skipping ${makefile_line}") + endif () + continue () + endif() + # Skip the lines that are not part of the path that has to be taken. + if ((${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR (${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1) OR (${SkipIfs} GREATER 0)) + #message(STATUS "skipping ${makefile_line}") + continue () endif () - endif () + endif () + # Skip commented lines (the ones that start with '#') + string(REGEX MATCH "[ \t]*\\#.*$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "skipping ${makefile_line}") + continue () + endif () string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") -#message(STATUS "match on ${line_match}") + #message(STATUS "match on ${line_match}") set(var_name ${CMAKE_MATCH_1}) -# set(var_value ${CMAKE_MATCH_2}) + #set(var_value ${CMAKE_MATCH_2}) string(STRIP ${CMAKE_MATCH_2} var_value) # check for Makefile variables in the string, e.g. 
$(TSUFFIX) string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value}) @@ -54,39 +102,93 @@ macro(ParseMakefileVars MAKEFILE_IN) string(REPLACE "$(${make_var})" "${${make_var}}" var_value ${var_value}) endforeach () set(${var_name} ${var_value}) - else () - string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") - if (NOT "${line_match}" STREQUAL "") -#message(STATUS "match on include ${line_match}") - ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) - else () -# message(STATUS "unmatched line ${line_match}") - string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") - if (NOT "${line_match}" STREQUAL "") -# message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") - if (DEFINED ${${CMAKE_MATCH_1}} AND ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}) -# message (STATUS "condition is true") - set (IfElse 1) - else () - set (IfElse 2) - endif () + continue () + endif () + # Include a new file to be parsed + string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "match on include ${line_match}") + ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) + continue () + endif () + # The if statement that precedes this else has the path taken + # Thus, this else statement has to be skipped. + string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "skipping ${makefile_line}") + set (SkipElse 1) + continue() + endif() + # Example 1: ifdef HAVE_MSA + # Example 2: ifndef ZNRM2KERNEL + string(REGEX MATCH "(ifdef|ifndef) ([0-9_A-Z]+)" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "${CMAKE_MATCH_1} first: ${CMAKE_MATCH_2}") + set (ElseSeen 0) + if (DEFINED ${CMAKE_MATCH_2}) + if (${CMAKE_MATCH_1} STREQUAL "ifdef") + #message (STATUS "condition is true") + set (IfElse 1) else () - string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") - if (NOT "${line_match}" STREQUAL "") -# message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") - if ( ${CMAKE_MATCH_1} STREQUAL C_COMPILER) - set (CMAKE_MATCH_1 CMAKE_C_COMPILER) - endif () - if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})) -# message (STATUS "condition is true") - set (IfElse 1) - else () - set (IfElse 2) - endif () - endif () + set (IfElse 2) + endif () + else () + if (${CMAKE_MATCH_1} STREQUAL "ifdef") + set (IfElse 2) + else () + #message (STATUS "condition is true") + set (IfElse 1) endif () endif () + continue () endif () + # Example 1: ifeq ($(SGEMM_UNROLL_M), 16) + # Example 2: ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) + # Example 3: ifeq ($(__BYTE_ORDER__)$(ELF_VERSION),__ORDER_BIG_ENDIAN__2) + # Ignore the second group since (?:...) 
does not work on cmake + string(REGEX MATCH "ifeq \\(\\$\\(([0-9_A-Z]+)\\)(([0-9_A-Za-z]*)\\$\\(([0-9_A-Z]+)\\))?,[ \t]*([0-9_A-Za-z]+)\\)" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_3} third: ${CMAKE_MATCH_4} fourth: ${CMAKE_MATCH_5}") + if (DEFINED ${CMAKE_MATCH_1}) + if (DEFINED ${CMAKE_MATCH_4}) + set (STR ${${CMAKE_MATCH_1}}${CMAKE_MATCH_3}${${CMAKE_MATCH_4}}) + else () + set (STR ${${CMAKE_MATCH_1}}) + endif () + if (${STR} STREQUAL ${CMAKE_MATCH_5}) + #message (STATUS "condition is true") + set (IfElse 1) + continue () + endif () + endif () + set (IfElse 2) + continue () + endif () + # Example 1 (Group 3): ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) + # Example 2 (Group 4): ifneq ($(C_COMPILER), PGI) + string(REGEX MATCH "ifneq \\(\\$\\(([0-9_A-Z]+)\\),[ \t]*(\\$\\(([0-9_A-Z]+)\\)|([0-9_A-Z]+))\\)" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_3} third: ${CMAKE_MATCH_4}") + set (ElseSeen 0) + set (HasValidGroup 0) + if (DEFINED ${CMAKE_MATCH_3}) + set (HasValidGroup 1) + set (STR ${${CMAKE_MATCH_3}}) + elseif (NOT ${CMAKE_MATCH_4} STREQUAL "") + set (HasValidGroup 1) + set (STR ${CMAKE_MATCH_4}) + endif () + if (DEFINED ${CMAKE_MATCH_1} AND ${HasValidGroup} EQUAL 1) + if (NOT (${${CMAKE_MATCH_1}} STREQUAL ${STR})) + #message (STATUS "condition is true") + set (IfElse 1) + continue () + endif () + endif () + set (IfElse 2) + continue () + endif () + #message(STATUS "unmatched line ${line_match}") endforeach () endmacro () diff --git a/cpp_thread_test/Makefile b/cpp_thread_test/Makefile index 81e3470ef..be8313e65 100644 --- a/cpp_thread_test/Makefile +++ b/cpp_thread_test/Makefile @@ -1,13 +1,14 @@ -include ../Makefile.rule +TOPDIR = .. 
+include $(TOPDIR)/Makefile.system
 
 all :: dgemv_tester dgemm_tester
 
 dgemv_tester :
-	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester
+	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester
 	./dgemv_tester
 
 dgemm_tester : dgemv_tester
-	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester
+	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester
 	./dgemm_tester
 
 clean ::
diff --git a/cpuid.h b/cpuid.h
index 2c43922e7..55478893c 100644
--- a/cpuid.h
+++ b/cpuid.h
@@ -120,6 +120,7 @@
 #define CORE_SKYLAKEX 28
 #define CORE_DHYANA 29
 #define CORE_COOPERLAKE 30
+#define CORE_SAPPHIRERAPIDS 31
 
 #define HAVE_SSE (1 << 0)
 #define HAVE_SSE2 (1 << 1)
@@ -145,6 +146,7 @@
 #define HAVE_AVX512VL (1 << 21)
 #define HAVE_AVX2 (1 << 22)
 #define HAVE_AVX512BF16 (1 << 23)
+#define HAVE_AMXBF16 (1 << 24)
 
 #define CACHE_INFO_L1_I 1
 #define CACHE_INFO_L1_D 2
@@ -222,6 +224,7 @@ typedef struct {
 #define CPUTYPE_SKYLAKEX 52
 #define CPUTYPE_DHYANA 53
 #define CPUTYPE_COOPERLAKE 54
+#define CPUTYPE_SAPPHIRERAPIDS 55
 
 #define CPUTYPE_HYGON_UNKNOWN 99
 
diff --git a/cpuid_arm64.c b/cpuid_arm64.c
index 2a9399f7d..958e94abc 100644
--- a/cpuid_arm64.c
+++ b/cpuid_arm64.c
@@ -26,10 +26,12 @@
 *****************************************************************************/
 #include <string.h>
 
-#ifdef OS_DARWIN
+#ifdef __APPLE__
 #include <sys/sysctl.h>
 int32_t value;
 size_t length=sizeof(value);
+int64_t value64;
+size_t length64=sizeof(value64);
 #endif
 
 #define CPU_UNKNOWN 0
@@ -53,6 +55,8 @@ size_t length=sizeof(value);
 #define CPU_EMAG8180 10
 // Apple
 #define CPU_VORTEX 13
+// Fujitsu
+#define CPU_A64FX 15
 
 static char *cpuname[] = {
   "UNKNOWN",
@@ -69,7 +73,8 @@ static char *cpuname[] = {
   "NEOVERSEN1",
   "THUNDERX3T110",
   "VORTEX",
-  "CORTEXA55"
+  "CORTEXA55",
+  "A64FX"
 };
 
 static char *cpuname_lower[] = {
@@ -87,7 +92,8 @@ static char *cpuname_lower[] = {
   "neoversen1",
   "thunderx3t110",
   "vortex",
-  "cortexa55"
+  "cortexa55",
+  "a64fx"
 };
 
 int get_feature(char *search)
@@ -183,6 +189,9 @@
 	// Ampere
 	else if (strstr(cpu_implementer, "0x50") && strstr(cpu_part, "0x000"))
 		return CPU_EMAG8180;
+	// Fujitsu
+	else if (strstr(cpu_implementer, "0x46") && strstr(cpu_part, "0x001"))
+		return CPU_A64FX;
 	}
 
 	p = (char *) NULL ;
@@ -212,9 +221,9 @@
 	}
 #else
-#ifdef DARWIN
+#ifdef __APPLE__
 	sysctlbyname("hw.cpufamily",&value,&length,NULL,0);
-	if (value ==131287967) return CPU_VORTEX;
+	if (value ==131287967|| value == 458787763 ) return CPU_VORTEX;
 #endif
 	return CPU_ARMV8;
 #endif
 
@@ -265,7 +274,7 @@ int n=0;
 	printf("#define NUM_CORES %d\n",n);
 #endif
 
-#ifdef DARWIN
+#ifdef __APPLE__
 	sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0);
 	printf("#define NUM_CORES %d\n",value);
 #endif
@@ -285,154 +294,166 @@ void get_cpuconfig(void)
 	switch (d) {
-	case CPU_CORTEXA53:
-	case
CPU_CORTEXA55: + printf("#define %s\n", cpuname[d]); + // Fall-through + case CPU_ARMV8: + // Minimum parameters for ARMv8 (based on A53) + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 262144\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); break; - case CPU_CORTEXA57: - case CPU_CORTEXA72: - case CPU_CORTEXA73: + case CPU_CORTEXA57: + case CPU_CORTEXA72: + case CPU_CORTEXA73: // Common minimum settings for these Arm cores // Can change a lot, but we need to be conservative // TODO: detect info from /sys if possible - printf("#define %s\n", cpuname[d]); - printf("#define L1_CODE_SIZE 49152\n"); - printf("#define L1_CODE_LINESIZE 64\n"); - printf("#define L1_CODE_ASSOCIATIVE 3\n"); - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L1_DATA_ASSOCIATIVE 2\n"); - printf("#define L2_SIZE 524288\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - break; - case CPU_NEOVERSEN1: - printf("#define %s\n", cpuname[d]); - printf("#define L1_CODE_SIZE 65536\n"); - printf("#define L1_CODE_LINESIZE 64\n"); - printf("#define L1_CODE_ASSOCIATIVE 4\n"); - printf("#define L1_DATA_SIZE 65536\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L1_DATA_ASSOCIATIVE 4\n"); - printf("#define L2_SIZE 1048576\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - break; + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 49152\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 3\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 2\n"); + printf("#define L2_SIZE 524288\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + break; + case CPU_NEOVERSEN1: + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 4\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + break; - case CPU_FALKOR: - printf("#define FALKOR\n"); - printf("#define L1_CODE_SIZE 65536\n"); - printf("#define L1_CODE_LINESIZE 64\n"); - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 128\n"); - printf("#define L2_SIZE 524288\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - break; + case CPU_FALKOR: + printf("#define FALKOR\n"); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 128\n"); + printf("#define L2_SIZE 524288\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + 
printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + break; - case CPU_THUNDERX: - printf("#define THUNDERX\n"); - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 128\n"); - printf("#define L2_SIZE 16777216\n"); - printf("#define L2_LINESIZE 128\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - break; + case CPU_THUNDERX: + printf("#define THUNDERX\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 128\n"); + printf("#define L2_SIZE 16777216\n"); + printf("#define L2_LINESIZE 128\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + break; - case CPU_THUNDERX2T99: - printf("#define THUNDERX2T99 \n"); - printf("#define L1_CODE_SIZE 32768 \n"); - printf("#define L1_CODE_LINESIZE 64 \n"); - printf("#define L1_CODE_ASSOCIATIVE 8 \n"); - printf("#define L1_DATA_SIZE 32768 \n"); - printf("#define L1_DATA_LINESIZE 64 \n"); - printf("#define L1_DATA_ASSOCIATIVE 8 \n"); - printf("#define L2_SIZE 262144 \n"); - printf("#define L2_LINESIZE 64 \n"); - printf("#define L2_ASSOCIATIVE 8 \n"); - printf("#define L3_SIZE 33554432 \n"); - printf("#define L3_LINESIZE 64 \n"); - printf("#define L3_ASSOCIATIVE 32 \n"); - printf("#define DTB_DEFAULT_ENTRIES 64 \n"); - printf("#define DTB_SIZE 4096 \n"); - break; + case CPU_THUNDERX2T99: + printf("#define THUNDERX2T99 \n"); + printf("#define L1_CODE_SIZE 32768 \n"); + printf("#define L1_CODE_LINESIZE 64 \n"); + printf("#define L1_CODE_ASSOCIATIVE 8 \n"); + printf("#define L1_DATA_SIZE 32768 \n"); + printf("#define L1_DATA_LINESIZE 64 \n"); + printf("#define L1_DATA_ASSOCIATIVE 8 \n"); + printf("#define L2_SIZE 262144 \n"); + printf("#define L2_LINESIZE 64 \n"); + printf("#define L2_ASSOCIATIVE 8 \n"); + printf("#define L3_SIZE 33554432 \n"); + printf("#define L3_LINESIZE 64 \n"); + printf("#define L3_ASSOCIATIVE 32 \n"); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; - case CPU_TSV110: - printf("#define TSV110 \n"); - printf("#define L1_CODE_SIZE 65536 \n"); - printf("#define L1_CODE_LINESIZE 64 \n"); - printf("#define L1_CODE_ASSOCIATIVE 4 \n"); - printf("#define L1_DATA_SIZE 65536 \n"); - printf("#define L1_DATA_LINESIZE 64 \n"); - printf("#define L1_DATA_ASSOCIATIVE 4 \n"); - printf("#define L2_SIZE 524228 \n"); - printf("#define L2_LINESIZE 64 \n"); - printf("#define L2_ASSOCIATIVE 8 \n"); - printf("#define DTB_DEFAULT_ENTRIES 64 \n"); - printf("#define DTB_SIZE 4096 \n"); - break; + case CPU_TSV110: + printf("#define TSV110 \n"); + printf("#define L1_CODE_SIZE 65536 \n"); + printf("#define L1_CODE_LINESIZE 64 \n"); + printf("#define L1_CODE_ASSOCIATIVE 4 \n"); + printf("#define L1_DATA_SIZE 65536 \n"); + printf("#define L1_DATA_LINESIZE 64 \n"); + printf("#define L1_DATA_ASSOCIATIVE 4 \n"); + printf("#define L2_SIZE 524228 \n"); + printf("#define L2_LINESIZE 64 \n"); + printf("#define L2_ASSOCIATIVE 8 \n"); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; - case CPU_EMAG8180: - // Minimum parameters for ARMv8 (based on A53) - printf("#define EMAG8180\n"); - printf("#define L1_CODE_SIZE 32768\n"); - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L2_SIZE 262144\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 
4096\n"); - break; + case CPU_EMAG8180: + // Minimum parameters for ARMv8 (based on A53) + printf("#define EMAG8180\n"); + printf("#define L1_CODE_SIZE 32768\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 262144\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + break; - case CPU_THUNDERX3T110: - printf("#define THUNDERX3T110 \n"); - printf("#define L1_CODE_SIZE 65536 \n"); - printf("#define L1_CODE_LINESIZE 64 \n"); - printf("#define L1_CODE_ASSOCIATIVE 8 \n"); - printf("#define L1_DATA_SIZE 32768 \n"); - printf("#define L1_DATA_LINESIZE 64 \n"); - printf("#define L1_DATA_ASSOCIATIVE 8 \n"); - printf("#define L2_SIZE 524288 \n"); - printf("#define L2_LINESIZE 64 \n"); - printf("#define L2_ASSOCIATIVE 8 \n"); - printf("#define L3_SIZE 94371840 \n"); - printf("#define L3_LINESIZE 64 \n"); - printf("#define L3_ASSOCIATIVE 32 \n"); - printf("#define DTB_DEFAULT_ENTRIES 64 \n"); - printf("#define DTB_SIZE 4096 \n"); - break; -#ifdef DARWIN - case CPU_VORTEX: - printf("#define VORTEX \n"); - sysctlbyname("hw.l1icachesize",&value,&length,NULL,0); - printf("#define L1_CODE_SIZE %d \n",value); - sysctlbyname("hw.cachelinesize",&value,&length,NULL,0); - printf("#define L1_CODE_LINESIZE %d \n",value); - sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0); - printf("#define L1_DATA_SIZE %d \n",value); - sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0); - printf("#define L2_SIZE %d \n",value); - break; + case CPU_THUNDERX3T110: + printf("#define THUNDERX3T110 \n"); + printf("#define L1_CODE_SIZE 65536 \n"); + printf("#define L1_CODE_LINESIZE 64 \n"); + printf("#define L1_CODE_ASSOCIATIVE 8 \n"); + printf("#define L1_DATA_SIZE 32768 \n"); + printf("#define L1_DATA_LINESIZE 64 \n"); + printf("#define L1_DATA_ASSOCIATIVE 8 \n"); + printf("#define L2_SIZE 524288 \n"); + printf("#define L2_LINESIZE 64 \n"); + printf("#define L2_ASSOCIATIVE 8 \n"); + printf("#define L3_SIZE 94371840 \n"); + printf("#define L3_LINESIZE 64 \n"); + printf("#define L3_ASSOCIATIVE 32 \n"); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; +#ifdef __APPLE__ + case CPU_VORTEX: + printf("#define VORTEX \n"); + sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); + printf("#define L1_CODE_SIZE %lld \n",value64); + sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); + printf("#define L1_CODE_LINESIZE %lld \n",value64); + sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); + printf("#define L1_DATA_SIZE %lld \n",value64); + sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); + printf("#define L2_SIZE %lld \n",value64); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; #endif + case CPU_A64FX: + printf("#define A64FX\n"); + printf("#define L1_CODE_SIZE 65535\n"); + printf("#define L1_DATA_SIZE 65535\n"); + printf("#define L1_DATA_LINESIZE 256\n"); + printf("#define L2_SIZE 8388608\n"); + printf("#define L2_LINESIZE 256\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + break; } get_cpucount(); } diff --git a/cpuid_mips.c b/cpuid_mips.c index e6e837f73..1946455d8 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -165,6 +165,7 @@ void get_cpuconfig(void){ }else{ printf("#define UNKNOWN\n"); } + if (!get_feature(msa)) printf("#define NO_MSA\n"); } void get_libname(void){ @@ -178,3 +179,38 @@ void get_libname(void){ printf("mips\n"); } } + 
+int get_feature(char *search) +{ + +#ifdef __linux + FILE *infile; + char buffer[2048], *p,*t; + p = (char *) NULL ; + + infile = fopen("/proc/cpuinfo", "r"); + + while (fgets(buffer, sizeof(buffer), infile)) + { + + if (!strncmp("Features", buffer, 8)) + { + p = strchr(buffer, ':') + 2; + break; + } + } + + fclose(infile); + + if( p == NULL ) return 0; + + t = strtok(p," "); + while( t = strtok(NULL," ")) + { + if (!strcmp(t, search)) { return(1); } + } + +#endif + return(0); +} + diff --git a/cpuid_mips64.c b/cpuid_mips64.c index 674b65908..97743bc43 100644 --- a/cpuid_mips64.c +++ b/cpuid_mips64.c @@ -104,17 +104,17 @@ int detect(void){ } } fclose(infile); - if(p != NULL){ - if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ - return CPU_LOONGSON3R3; - }else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ - return CPU_LOONGSON3R4; - } else{ - return CPU_SICORTEX; + if (p != NULL){ + if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ + return CPU_LOONGSON3R3; + } else if (strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ + return CPU_LOONGSON3R4; + } else{ + return CPU_SICORTEX; + } } #endif return CPU_UNKNOWN; - } } char *get_corename(void){ @@ -201,6 +201,7 @@ void get_cpuconfig(void){ printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 8\n"); } + if (!get_feature(msa)) printf("#define NO_MSA\n"); } void get_libname(void){ @@ -218,3 +219,38 @@ void get_libname(void){ printf("mips64\n"); } } + +int get_feature(char *search) +{ + +#ifdef __linux + FILE *infile; + char buffer[2048], *p,*t; + p = (char *) NULL ; + + infile = fopen("/proc/cpuinfo", "r"); + + while (fgets(buffer, sizeof(buffer), infile)) + { + + if (!strncmp("Features", buffer, 8)) + { + p = strchr(buffer, ':') + 2; + break; + } + } + + fclose(infile); + + if( p == NULL ) return 0; + + t = strtok(p," "); + while( t = strtok(NULL," ")) + { + if (!strcmp(t, search)) { return(1); } + } + +#endif + return(0); +} + diff --git a/cpuid_x86.c b/cpuid_x86.c index 5aa49055a..72e95214e 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1,3 +1,4 @@ +//{ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ @@ -266,6 +267,31 @@ int support_avx512_bf16(){ #endif } +#define BIT_AMX_TILE 0x01000000 +#define BIT_AMX_BF16 0x00400000 +#define BIT_AMX_ENBD 0x00060000 + +int support_amx_bf16() { +#if !defined(NO_AVX) && !defined(NO_AVX512) + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx512()) + return 0; + // CPUID.7.0:EDX indicates AMX support + cpuid_count(7, 0, &eax, &ebx, &ecx, &edx); + if ((edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) { + // CPUID.D.0:EAX[17:18] indicates AMX enabled + cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + if ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD) + ret = 1; + } + return ret; +#else + return 0; +#endif +} + int get_vendor(void){ int eax, ebx, ecx, edx; char vendor[13]; @@ -353,6 +379,7 @@ int get_cputype(int gettype){ if (support_avx2()) feature |= HAVE_AVX2; if (support_avx512()) feature |= HAVE_AVX512VL; if (support_avx512_bf16()) feature |= HAVE_AVX512BF16; + if (support_amx_bf16()) feature |= HAVE_AMXBF16; if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; #endif @@ -1429,10 +1456,10 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 9: case 8: switch (model) { case 12: // Tiger Lake + case 13: // Tiger Lake (11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz) if(support_avx512()) return CPUTYPE_SKYLAKEX; if(support_avx2()) @@ -1448,30 +1475,70 @@ int get_cpuname(void){ return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; - } - case 10: //family 6 exmodel 10 - switch (model) { - case 5: // Comet Lake H and S - case 6: // Comet Lake U - if(support_avx2()) - return CPUTYPE_HASWELL; - if(support_avx()) - return CPUTYPE_SANDYBRIDGE; - else - return CPUTYPE_NEHALEM; - case 7: // Rocket Lake - if(support_avx512()) + case 15: // Sapphire Rapids + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; + if(support_avx512()) return CPUTYPE_SKYLAKEX; if(support_avx2()) return CPUTYPE_HASWELL; if(support_avx()) return CPUTYPE_SANDYBRIDGE; else - return CPUTYPE_NEHALEM; - } - break; - } + return CPUTYPE_NEHALEM; + } break; + case 9: + switch (model) { + case 7: // Alder Lake desktop + case 10: // Alder Lake mobile + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + case 13: // Ice Lake NNPI + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + case 14: // Kaby Lake and refreshes + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } + break; + case 10: //family 6 exmodel 10 + switch (model) { + case 5: // Comet Lake H and S + case 6: // Comet Lake U + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + case 7: // Rocket Lake + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } + break; + } + break; case 0x7: return CPUTYPE_ITANIUM; case 0xf: @@ -2042,32 +2109,7 @@ int get_coretype(void){ return CORE_NEHALEM; } break; - case 10: - switch (model) { - case 5: // Comet Lake H and S - case 6: // Comet Lake U - if(support_avx()) - #ifndef NO_AVX2 - return CORE_HASWELL; - #else - return CORE_SANDYBRIDGE; - #endif - else - return CORE_NEHALEM; - case 7:// Rocket Lake -#ifndef NO_AVX512 - if(support_avx512()) - return CORE_SKYLAKEX; -#endif -#ifndef NO_AVX2 - 
if(support_avx2()) - return CORE_HASWELL; -#endif - if(support_avx()) - return CORE_SANDYBRIDGE; - else - return CORE_NEHALEM; - } + case 5: switch (model) { case 6: @@ -2121,6 +2163,7 @@ int get_coretype(void){ return CORE_NEHALEM; } break; + case 6: if (model == 6) #ifndef NO_AVX512 @@ -2135,7 +2178,7 @@ int get_coretype(void){ else return CORE_NEHALEM; #endif - if (model == 10) + if (model == 10 || model == 12) #ifndef NO_AVX512 if(support_avx512_bf16()) return CORE_COOPERLAKE; @@ -2151,10 +2194,11 @@ int get_coretype(void){ return CORE_NEHALEM; #endif break; + case 7: if (model == 10) return CORE_NEHALEM; - if (model == 14) + if (model == 13 || model == 14) // Ice Lake #ifndef NO_AVX512 return CORE_SKYLAKEX; #else @@ -2168,9 +2212,9 @@ int get_coretype(void){ return CORE_NEHALEM; #endif break; - case 9: + case 8: - if (model == 12) { // Tiger Lake + if (model == 12 || model == 13) { // Tiger Lake if(support_avx512()) return CORE_SKYLAKEX; if(support_avx2()) @@ -2180,7 +2224,7 @@ int get_coretype(void){ else return CORE_NEHALEM; } - if (model == 14) { // Kaby Lake + if (model == 14) { // Kaby Lake mobile if(support_avx()) #ifndef NO_AVX2 return CORE_HASWELL; @@ -2190,12 +2234,82 @@ int get_coretype(void){ else return CORE_NEHALEM; } - } + if (model == 15) { // Sapphire Rapids + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } break; + case 9: + if (model == 7 || model == 10) { // Alder Lake + if(support_avx2()) + return CORE_HASWELL; + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; + } + if (model == 13) { // Ice Lake NNPI + if(support_avx512()) + return CORE_SKYLAKEX; + if(support_avx2()) + return CORE_HASWELL; + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; + } + if (model == 14) { // Kaby Lake desktop + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; + } + break; + + case 10: + switch (model) { + case 5: // Comet Lake H and S + case 6: // Comet Lake U + if(support_avx()) + #ifndef NO_AVX2 + return CORE_HASWELL; + #else + return CORE_SANDYBRIDGE; + #endif + else + return CORE_NEHALEM; + case 7:// Rocket Lake +#ifndef NO_AVX512 + if(support_avx512()) + return CORE_SKYLAKEX; +#endif +#ifndef NO_AVX2 + if(support_avx2()) + return CORE_HASWELL; +#endif + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; + } + case 15: if (model <= 0x2) return CORE_NORTHWOOD; else return CORE_PRESCOTT; + } } } @@ -2389,6 +2503,7 @@ void get_cpuconfig(void){ if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n"); if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n"); if (features & HAVE_AVX512BF16 ) printf("#define HAVE_AVX512BF16\n"); + if (features & HAVE_AMXBF16 ) printf("#define HAVE_AMXBF16\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); @@ -2460,9 +2575,11 @@ void get_sse(void){ if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n"); if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n"); if (features & HAVE_AVX512BF16 ) printf("HAVE_AVX512BF16=1\n"); + if (features & HAVE_AMXBF16 ) printf("HAVE_AMXBF16=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOW) 
printf("HAVE_3DNOW=1\n"); if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n"); } +//} \ No newline at end of file diff --git a/cpuid_zarch.c b/cpuid_zarch.c index df3b7898f..a6b953dd9 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -27,57 +27,11 @@ #include -#define CPU_GENERIC 0 -#define CPU_Z13 1 -#define CPU_Z14 2 -#define CPU_Z15 3 +#include "cpuid_zarch.h" -static char *cpuname[] = { - "ZARCH_GENERIC", - "Z13", - "Z14", - "Z15" -}; - -static char *cpuname_lower[] = { - "zarch_generic", - "z13", - "z14", - "z15" -}; - -int detect(void) -{ - FILE *infile; - char buffer[512], *p; - - p = (char *)NULL; - infile = fopen("/proc/sysinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("Type", buffer, 4)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - fclose(infile); - - if (strstr(p, "2964")) return CPU_Z13; - if (strstr(p, "2965")) return CPU_Z13; - if (strstr(p, "3906")) return CPU_Z14; - if (strstr(p, "3907")) return CPU_Z14; - if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14 - if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14 - - return CPU_GENERIC; -} void get_libname(void) { - int d = detect(); printf("%s", cpuname_lower[d]); } diff --git a/cpuid_zarch.h b/cpuid_zarch.h new file mode 100644 index 000000000..686f2eb17 --- /dev/null +++ b/cpuid_zarch.h @@ -0,0 +1,101 @@ +#include + +#define CPU_GENERIC 0 +#define CPU_Z13 1 +#define CPU_Z14 2 +#define CPU_Z15 3 + +static char *cpuname[] = { + "ZARCH_GENERIC", + "Z13", + "Z14", + "Z15" +}; + +static char *cpuname_lower[] = { + "zarch_generic", + "z13", + "z14", + "z15" +}; + +// Guard the use of getauxval() on glibc version >= 2.16 +#ifdef __GLIBC__ +#include +#if __GLIBC_PREREQ(2, 16) +#include +#define HAVE_GETAUXVAL 1 + +static unsigned long get_hwcap(void) +{ + unsigned long hwcap = getauxval(AT_HWCAP); + char *maskenv; + + // honor requests for not using specific CPU features in LD_HWCAP_MASK + maskenv = getenv("LD_HWCAP_MASK"); + if (maskenv) + hwcap &= strtoul(maskenv, NULL, 0); + + return hwcap; + // note that a missing auxval is interpreted as no capabilities + // available, which is safe. +} + +#else // __GLIBC_PREREQ(2, 16) +#warn "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16" + +static unsigned long get_hwcap(void) { + // treat missing support for getauxval() as no capabilities available, + // which is safe. + return 0; +} +#endif // __GLIBC_PREREQ(2, 16) +#endif // __GLIBC + +static int detect(void) +{ + unsigned long hwcap = get_hwcap(); + + // Choose the architecture level for optimized kernels based on hardware + // capability bits (just like glibc chooses optimized implementations). + // + // The hardware capability bits that are used here indicate both + // hardware support for a particular ISA extension and the presence of + // software support to enable its use. For example, when HWCAP_S390_VX + // is set then both the CPU can execute SIMD instructions and the Linux + // kernel can manage applications using the vector registers and SIMD + // instructions. + // + // See glibc's sysdeps/s390/dl-procinfo.h for an overview (also in + // sysdeps/unix/sysv/linux/s390/bits/hwcap.h) of the defined hardware + // capability bits. They are derived from the information that the + // "store facility list (extended)" instructions provide. 
+ // (https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/s390/dl-procinfo.h;hb=HEAD) + // + // currently used: + // HWCAP_S390_VX - vector facility for z/Architecture (introduced with + // IBM z13), enables level CPU_Z13 (SIMD) + // HWCAP_S390_VXE - vector enhancements facility 1 (introduced with IBM + // z14), together with VX enables level CPU_Z14 + // (single-precision SIMD instructions) + // + // When you add optimized kernels that make use of other ISA extensions + // (e.g., for exploiting the vector-enhancements facility 2 that was introduced + // with IBM z15), then add a new architecture level (e.g., CPU_Z15) and gate + // it on the hwcap that represents it here (e.g., HWCAP_S390_VXRS_EXT2 + // for the z15 vector enhancements). + // + // To learn the value of hwcaps on a given system, set the environment + // variable LD_SHOW_AUXV and let ld.so dump it (e.g., by running + // LD_SHOW_AUXV=1 /bin/true). + // Also, the init function for dynamic arch support will print hwcaps + // when OPENBLAS_VERBOSE is set to 2 or higher. + if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) + return CPU_Z14; + + if (hwcap & HWCAP_S390_VX) + return CPU_Z13; + + return CPU_GENERIC; +} + diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 9b44deb85..4a8e193be 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -333,7 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #else for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 2b33c9589..dfc7107b8 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -367,7 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Split local region of B into parts */ for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){ min_jj = MIN(n_to, js + div_n) - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/level3/trmm_L.c b/driver/level3/trmm_L.c index 880de4df4..e25ea7afe 100644 --- a/driver/level3/trmm_L.c +++ b/driver/level3/trmm_L.c @@ -138,7 +138,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -215,7 +215,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 
6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -320,7 +320,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -399,7 +399,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/level3/trmm_R.c b/driver/level3/trmm_R.c index 3be43edde..ab9cdfae8 100644 --- a/driver/level3/trmm_R.c +++ b/driver/level3/trmm_R.c @@ -122,7 +122,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < ls - js; jjs += min_jj){ min_jj = ls - js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -146,7 +146,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -203,7 +203,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -258,7 +258,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -283,7 +283,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ min_jj = js - ls - min_l - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -344,7 +344,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + 
min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index a07e00b3b..1a38740a3 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -49,6 +49,8 @@ GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" if (DYNAMIC_ARCH) if (ARM64) list(APPEND COMMON_SOURCES dynamic_arm64.c) + elseif (POWER) + list(APPEND COMMON_SOURCES dynamic_power.c) else () list(APPEND COMMON_SOURCES dynamic.c) endif () diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 42f289441..33b58f134 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -40,7 +40,7 @@ #include #include "common.h" -#if defined(OS_CYGWIN_NT) && !defined(unlikely) +#if !defined(unlikely) #ifdef __GNUC__ #define unlikely(x) __builtin_expect(!!(x), 0) #else @@ -391,8 +391,9 @@ int blas_thread_init(void){ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ -#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) +#if defined(SMP_SERVER) // Handle lazy re-init of the thread-pool after a POSIX fork + // on Cygwin or as delayed init when a static library is used if (unlikely(blas_server_avail == 0)) blas_thread_init(); #endif diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 071788a9b..b12fb069a 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -624,7 +624,7 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; } } - if (model == 10) { + if (model == 10 || model == 12){ // Ice Lake SP if(support_avx512_bf16()) return &gotoblas_COOPERLAKE; @@ -639,12 +639,12 @@ static gotoblas_t *get_coretype(void){ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; } - } + } return NULL; case 7: if (model == 10) // Goldmont Plus return &gotoblas_NEHALEM; - if (model == 14) { + if (model == 13 || model == 14) { // Ice Lake if (support_avx512()) return &gotoblas_SKYLAKEX; @@ -661,9 +661,8 @@ static gotoblas_t *get_coretype(void){ } } return NULL; - case 9: case 8: - if (model == 12) { // Tiger Lake + if (model == 12 || model == 13) { // Tiger Lake if (support_avx512()) return &gotoblas_SKYLAKEX; if(support_avx2()){ @@ -689,6 +688,50 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
} } + if (model == 15){ // Sapphire Rapids + if(support_avx512_bf16()) + return &gotoblas_COOPERLAKE; + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + return NULL; + + + case 9: + if (model == 7 || model == 10) { // Alder Lake + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + if (model == 14 ) { // Kaby Lake, Coffee Lake + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + return NULL; case 10: if (model == 5 || model == 6) { if(support_avx2()) @@ -1018,7 +1061,13 @@ void gotoblas_dynamic_init(void) { #ifdef ARCH_X86 if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI; #else - if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT; + if (gotoblas == NULL) { + if (support_avx512_bf16()) gotoblas = &gotoblas_COOPERLAKE; + else if (support_avx512()) gotoblas = &gotoblas_SKYLAKEX; + else if (support_avx2()) gotoblas = &gotoblas_HASWELL; + else if (support_avx()) gotoblas = &gotoblas_SANDYBRIDGE; + else gotoblas = &gotoblas_PRESCOTT; + } /* sanity check, if 64bit pointer we can't have a 32 bit cpu */ if (sizeof(void*) == 8) { if (gotoblas == &gotoblas_KATMAI || diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c index bf5eab9b2..5b45aae2f 100644 --- a/driver/others/dynamic_zarch.c +++ b/driver/others/dynamic_zarch.c @@ -1,38 +1,7 @@ #include "common.h" +#include "cpuid_zarch.h" #include -// Guard the use of getauxval() on glibc version >= 2.16 -#ifdef __GLIBC__ -#include -#if __GLIBC_PREREQ(2, 16) -#include -#define HAVE_GETAUXVAL 1 - -static unsigned long get_hwcap(void) -{ - unsigned long hwcap = getauxval(AT_HWCAP); - char *maskenv; - - // honor requests for not using specific CPU features in LD_HWCAP_MASK - maskenv = getenv("LD_HWCAP_MASK"); - if (maskenv) - hwcap &= strtoul(maskenv, NULL, 0); - - return hwcap; - // note that a missing auxval is interpreted as no capabilities - // available, which is safe. -} - -#else // __GLIBC_PREREQ(2, 16) -#warn "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16" - -static unsigned long get_hwcap(void) { - // treat missing support for getauxval() as no capabilities available, - // which is safe. 
- return 0; -} -#endif // __GLIBC_PREREQ(2, 16) -#endif // __GLIBC extern gotoblas_t gotoblas_ZARCH_GENERIC; #ifdef DYN_Z13 @@ -44,25 +13,19 @@ extern gotoblas_t gotoblas_Z14; #define NUM_CORETYPES 4 +extern int openblas_verbose(); extern void openblas_warning(int verbose, const char* msg); -static char* corename[] = { - "unknown", - "Z13", - "Z14", - "ZARCH_GENERIC", -}; - char* gotoblas_corename(void) { #ifdef DYN_Z13 - if (gotoblas == &gotoblas_Z13) return corename[1]; + if (gotoblas == &gotoblas_Z13) return cpuname[CPU_Z13]; #endif #ifdef DYN_Z14 - if (gotoblas == &gotoblas_Z14) return corename[2]; + if (gotoblas == &gotoblas_Z14) return cpuname[CPU_Z14]; #endif - if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3]; + if (gotoblas == &gotoblas_ZARCH_GENERIC) return cpuname[CPU_GENERIC]; - return corename[0]; + return "unknown"; } #ifndef HWCAP_S390_VXE @@ -79,25 +42,28 @@ char* gotoblas_corename(void) { */ static gotoblas_t* get_coretype(void) { - unsigned long hwcap __attribute__((unused)) = get_hwcap(); + int cpu = detect(); -#ifdef DYN_Z14 + switch(cpu) { // z14 and z15 systems: exploit Vector Facility (SIMD) and // Vector-Enhancements Facility 1 (float SIMD instructions), if present. - if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) + case CPU_Z14: +#ifdef DYN_Z14 return &gotoblas_Z14; #endif -#ifdef DYN_Z13 // z13: Vector Facility (SIMD for double) - if (hwcap & HWCAP_S390_VX) + case CPU_Z13: +#ifdef DYN_Z13 return &gotoblas_Z13; #endif + default: // fallback in case of missing compiler support, systems before z13, or // when the OS does not advertise support for the Vector Facility (e.g., // missing support in the OS kernel) - return &gotoblas_ZARCH_GENERIC; + return &gotoblas_ZARCH_GENERIC; + } } static gotoblas_t* force_coretype(char* coretype) { @@ -108,28 +74,28 @@ static gotoblas_t* force_coretype(char* coretype) { for (i = 0; i < NUM_CORETYPES; i++) { - if (!strncasecmp(coretype, corename[i], 20)) + if (!strncasecmp(coretype, cpuname[i], 20)) { found = i; break; } } - if (found == 1) { + if (found == CPU_Z13) { #ifdef DYN_Z13 return &gotoblas_Z13; #else openblas_warning(1, "Z13 support not compiled in"); return NULL; #endif - } else if (found == 2) { + } else if (found == CPU_Z14) { #ifdef DYN_Z14 return &gotoblas_Z14; #else openblas_warning(1, "Z14 support not compiled in"); return NULL; #endif - } else if (found == 3) { + } else if (found == CPU_GENERIC) { return &gotoblas_ZARCH_GENERIC; } @@ -155,6 +121,11 @@ void gotoblas_dynamic_init(void) { else { gotoblas = get_coretype(); + if (openblas_verbose() >= 2) { + snprintf(coremsg, sizeof(coremsg), "Choosing kernels based on getauxval(AT_HWCAP)=0x%lx\n", + getauxval(AT_HWCAP)); + openblas_warning(2, coremsg); + } } if (gotoblas == NULL) @@ -165,9 +136,11 @@ void gotoblas_dynamic_init(void) { } if (gotoblas && gotoblas->init) { - strncpy(coren, gotoblas_corename(), 20); - sprintf(coremsg, "Core: %s\n", coren); - openblas_warning(2, coremsg); + if (openblas_verbose() >= 2) { + strncpy(coren, gotoblas_corename(), 20); + sprintf(coremsg, "Core: %s\n", coren); + openblas_warning(2, coremsg); + } gotoblas->init(); } else { diff --git a/driver/others/memory.c b/driver/others/memory.c index 0185fa683..bd0553ca9 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -246,6 +246,14 @@ int get_num_procs(void) { #endif if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + +#if defined(USE_OPENMP) +#if _OPENMP >= 201511 + nums = omp_get_num_places(); +#endif + return nums; +#endif + #if !defined(OS_LINUX) 
return nums; #endif @@ -1806,10 +1814,19 @@ int get_num_procs(void) { #endif if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + +#if defined(USE_OPENMP) +/* if (omp_get_proc_bind() != omp_proc_bind_false) */ +#if _OPENMP >= 201511 + nums = omp_get_num_places(); +#endif + return nums; +#endif + #if !defined(OS_LINUX) return nums; #endif - + #if !defined(__GLIBC_PREREQ) return nums; #else @@ -2854,32 +2871,28 @@ void *blas_memory_alloc(int procpos){ position ++; } while (position < NUM_BUFFERS); -#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif + if (memory_overflowed) { -#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif - do { - RMB; + + do { + RMB; #if defined(USE_OPENMP) - if (!newmemory[position-NUM_BUFFERS].used) { - blas_lock(&newmemory[position-NUM_BUFFERS].lock); + if (!newmemory[position-NUM_BUFFERS].used) { + blas_lock(&newmemory[position-NUM_BUFFERS].lock); #endif - if (!newmemory[position-NUM_BUFFERS].used) goto allocation2; + if (!newmemory[position-NUM_BUFFERS].used) goto allocation2; #if defined(USE_OPENMP) - blas_unlock(&newmemory[position-NUM_BUFFERS].lock); - } + blas_unlock(&newmemory[position-NUM_BUFFERS].lock); + } #endif - position ++; + position ++; - } while (position < 512+NUM_BUFFERS); + } while (position < 512+NUM_BUFFERS); + } #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif -} goto error; allocation : @@ -2904,7 +2917,7 @@ void *blas_memory_alloc(int procpos){ func = &memoryalloc[0]; - while ((func != NULL) && (map_address == (void *) -1)) { + while ((*func != NULL) && (map_address == (void *) -1)) { map_address = (*func)((void *)base_address); @@ -2984,6 +2997,9 @@ void *blas_memory_alloc(int procpos){ return (void *)memory[position].addr; error: +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + LOCK_COMMAND(&alloc_lock); +#endif if (memory_overflowed) goto terminate; fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n"); memory_overflowed=1; @@ -2997,7 +3013,6 @@ void *blas_memory_alloc(int procpos){ newmemory[i].used = 0; newmemory[i].lock = 0; } - newmemory[position-NUM_BUFFERS].used = 1; allocation2: newmemory[position-NUM_BUFFERS].used = 1; @@ -3015,7 +3030,7 @@ allocation2: func = &memoryalloc[0]; - while ((func != NULL) && (map_address == (void *) -1)) { + while ((*func != NULL) && (map_address == (void *) -1)) { map_address = (*func)((void *)base_address); @@ -3069,6 +3084,9 @@ allocation2: return (void *)newmemory[position-NUM_BUFFERS].addr; terminate: +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif printf("OpenBLAS : Program is Terminated. 
Because you tried to allocate too many memory regions.\n"); printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS); printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n"); diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 791e5dc27..0d5c6aec0 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -183,7 +183,7 @@ int get_L2_size(void){ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || \ - defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) + defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -269,7 +269,7 @@ void blas_set_parameter(void){ int factor; #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || \ defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || \ - defined(SKYLAKEX) || defined(COOPERLAKE) + defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) int size = 16; #else int size = get_L2_size(); diff --git a/getarch.c b/getarch.c index 094feaadd..6063a2a1d 100644 --- a/getarch.c +++ b/getarch.c @@ -469,6 +469,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif +#ifdef FORCE_SAPPHIRERAPIDS +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#ifdef NO_AVX512 +#ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#endif +#else +#define SUBARCHITECTURE "HASWELL" +#define ARCHCONFIG "-DHASWELL " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" +#define LIBNAME "haswell" +#define CORENAME "HASWELL" +#endif +#else +#define SUBARCHITECTURE "SAPPHIRERAPIDS" +#define ARCHCONFIG "-DSAPPHIRERAPIDS " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=sapphirerapids" +#define LIBNAME "sapphirerapids" +#define CORENAME "SAPPHIRERAPIDS" +#endif +#endif + #ifdef FORCE_ATOM #define FORCE #define FORCE_INTEL @@ -964,7 +1013,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DP5600 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA" #define LIBNAME "p5600" #define CORENAME "P5600" #else @@ -978,7 +1027,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DMIPS1004K " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA" #define LIBNAME "mips1004K" #define CORENAME "MIPS1004K" #else @@ -992,7 +1041,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DMIPS24K " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=32768 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA" #define LIBNAME "mips24K" #define CORENAME "MIPS24K" #else @@ -1149,6 +1198,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_ARMV8SVE +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "ARMV8SVE" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DARMV8SVE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8" +#define LIBNAME "armv8sve" +#define CORENAME "ARMV8SVE" +#endif + #ifdef FORCE_ARMV8 #define FORCE @@ -1375,6 +1438,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "VORTEX" #endif +#ifdef FORCE_A64FX +#define ARMV8 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "A64FX" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DA64FX " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=256 -DL1_CODE_ASSOCIATIVE=8 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=256 -DL1_DATA_ASSOCIATIVE=8 " \ + "-DL2_SIZE=8388608 -DL2_LINESIZE=256 -DL2_ASSOCIATIVE=8 " \ + "-DL3_SIZE=0 -DL3_LINESIZE=0 -DL3_ASSOCIATIVE=0 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8" +#define LIBNAME "a64fx" +#define CORENAME "A64FX" +#else +#endif + #ifdef FORCE_ZARCH_GENERIC #define FORCE #define ARCHITECTURE "ZARCH" diff --git a/interface/trsv.c b/interface/trsv.c index 6a6e8f8ba..a054d8eeb 100644 --- a/interface/trsv.c +++ b/interface/trsv.c @@ -188,12 +188,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (n == 0) return; - if (incx == 1 && trans == 0 && n < 50) { - buffer = NULL; - (trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); - return; - } - IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/zrot.c b/interface/zrot.c index 1c45f685b..228c5ee45 100644 --- a/interface/zrot.c +++ b/interface/zrot.c @@ -42,14 +42,20 @@ #include "functable.h" #endif +#ifndef CBLAS void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C, FLOAT *S){ - BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; FLOAT c = *C; FLOAT s = *S; +#else +void CNAME(blasint n, void *VX, blasint incx, void *VY, blasint incy, FLOAT c, FLOAT s) { + FLOAT *x = (FLOAT*) VX; + FLOAT *y = (FLOAT*) VY; +#endif /* CBLAS */ + PRINT_DEBUG_NAME; if (n <= 0) return; diff --git a/interface/zrotg.c b/interface/zrotg.c index bc4f06492..123f4da85 100644 --- a/interface/zrotg.c +++ b/interface/zrotg.c @@ -4,8 +4,16 @@ #include "functable.h" #endif +#ifndef CBLAS void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ +#else +void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { + FLOAT *DA = (FLOAT*) VDA; + FLOAT *DB = (FLOAT*) VDB; + FLOAT *S = (FLOAT*) VS; +#endif /* CBLAS */ + #if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86) long double da_r = *(DA + 0); diff --git a/interface/ztrsv.c b/interface/ztrsv.c index cf750b0b0..cbb7bba13 100644 --- a/interface/ztrsv.c +++ b/interface/ztrsv.c @@ -199,12 +199,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (n == 0) return; - if (incx == 1 && trans == 0 && n < 50) { - buffer = NULL; - (trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); - return; - } - IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 9ffbd944f..9849ddc93 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -9,11 +9,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) if (${DYNAMIC_ARCH}) include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") endif () + ParseMakefileVars("${KERNELDIR}/KERNEL") + ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}") SetDefaultL1() SetDefaultL2() SetDefaultL3() - ParseMakefileVars("${KERNELDIR}/KERNEL") - ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}") set(KERNEL_INTERFACE common_level1.h common_level2.h common_level3.h) if(NOT NO_LAPACK) @@ -198,7 +198,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) # Makefile.L3 set(USE_TRMM false) string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) - if (ARM OR ARM64 OR (UC_TARGET_CORE 
MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE)) + if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE) OR (UC_TARGET_CORE MATCHES SAPPHIRERAPIDS)) set(USE_TRMM true) endif () if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10)) @@ -418,32 +418,50 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type}) # symm for s and d +if (NOT DEFINED ${float_char}SYMMUCOPY_M) + set(SYMMUCOPY_M "generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c") + set(SYMMLCOPY_M "generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(SYMMUCOPY_M "${KERNELDIR}/${${float_char}SYMMUCOPY_M}") + set(SYMMLCOPY_M "${KERNELDIR}/${${float_char}SYMMLCOPY_M}") +endif() GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${SYMMUCOPY_M} "" "symm_iutcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type}) # These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. Also TRANS is not passed in as a define. # Could simplify it a bit by pairing up by -UUNIT/-DUNIT. 
- GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) +if (NOT DEFINED ${float_char}TRMMUNCOPY_M) + set(TRMMUNCOPY_M "generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMLNCOPY_M "generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMUTCOPY_M "generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMLTCOPY_M "generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(TRMMUNCOPY_M "${KERNELDIR}/${${float_char}TRMMUNCOPY_M}") + set(TRMMLNCOPY_M "${KERNELDIR}/${${float_char}TRMMLNCOPY_M}") + set(TRMMUTCOPY_M "${KERNELDIR}/${${float_char}TRMMUTCOPY_M}") + set(TRMMLTCOPY_M "${KERNELDIR}/${${float_char}TRMMLTCOPY_M}") +endif () + GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) @@ 
-578,11 +596,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) endif () if (BUILD_BFLOAT16) if (NOT DEFINED SBGEMM_SMALL_M_PERMIT) @@ -616,11 +634,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "BFLOAT16") - GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16") - GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16") endif () endif () diff --git a/kernel/Makefile b/kernel/Makefile index 1a6c9413f..cbe4cde6e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -31,7 +31,22 @@ ifdef NO_AVX2 endif ifdef TARGET_CORE -ifeq ($(TARGET_CORE), COOPERLAKE) +ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) + ifeq ($(GCCVERSIONGTEQ10), 1) + override CFLAGS += -march=sapphirerapids + else + override CFLAGS += -march=skylake-avx512 -mavx512f + endif + ifeq ($(OSNAME), CYGWIN_NT) + override CFLAGS += -fno-asynchronous-unwind-tables + endif + ifeq ($(OSNAME), WINNT) + ifeq ($(C_COMPILER), GCC) + override CFLAGS += -fno-asynchronous-unwind-tables + endif + endif +else ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) 
ifeq ($(GCCVERSIONGTEQ10), 1) override CFLAGS += -march=cooperlake diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 2d274d33b..d22bd46a5 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -47,6 +47,10 @@ ifeq ($(CORE), COOPERLAKE) USE_TRMM = 1 endif +ifeq ($(CORE), SAPPHIRERAPIDS) +USE_TRMM = 1 +endif + ifeq ($(CORE), ZEN) USE_TRMM = 1 endif @@ -1479,29 +1483,61 @@ $(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XT $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@ +ifdef STRMMUNCOPY_M +$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef STRMMLNCOPY_M +$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef STRMMUTCOPY_M +$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef STRMMLTCOPY_M +$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)strmm_ounucopy$(TSUFFIX).$(SUFFIX) : 
generic/trmm_uncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -1527,29 +1563,61 @@ $(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N $(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef DTRMMUNCOPY_M +$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef DTRMMLNCOPY_M +$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef DTRMMUTCOPY_M +$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef DTRMMLTCOPY_M +$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ 
-1773,11 +1841,21 @@ $(KDIR)ssymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N). $(KDIR)ssymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ +ifdef SSYMMUCOPY_M +$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ +else $(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ +endif +ifdef SSYMMLCOPY_M +$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +endif $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ @@ -1785,11 +1863,21 @@ $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N). $(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ +ifdef DSYMMUCOPY_M +$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ +else $(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ +endif +ifdef DSYMMLCOPY_M +$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +endif $(KDIR)qsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX new file mode 100644 index 000000000..80be4ddd0 --- /dev/null +++ b/kernel/arm64/KERNEL.A64FX @@ -0,0 +1,183 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = 
../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S + +SGEMMINCOPY = sgemm_ncopy_sve_v1.c +SGEMMITCOPY = sgemm_tcopy_sve_v1.c +SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S +SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S + +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRMMUNCOPY_M = trmm_uncopy_sve_v1.c +STRMMLNCOPY_M = trmm_lncopy_sve_v1.c +STRMMUTCOPY_M = trmm_utcopy_sve_v1.c +STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +SSYMMUCOPY_M = symm_ucopy_sve.c +SSYMMLCOPY_M = symm_lcopy_sve.c + +DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S + +DGEMMINCOPY = dgemm_ncopy_sve_v1.c +DGEMMITCOPY = dgemm_tcopy_sve_v1.c +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c +DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c +DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +DSYMMUCOPY_M = symm_ucopy_sve.c +DSYMMLCOPY_M = symm_lcopy_sve.c + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = 
cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE new file mode 100644 index 000000000..0364a929c --- /dev/null +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -0,0 +1,183 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S +STRMMKERNEL = 
strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S + +SGEMMINCOPY = sgemm_ncopy_sve_v1.c +SGEMMITCOPY = sgemm_tcopy_sve_v1.c +SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S +SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S + +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRMMUNCOPY_M = trmm_uncopy_sve_v1.c +STRMMLNCOPY_M = trmm_lncopy_sve_v1.c +STRMMUTCOPY_M = trmm_utcopy_sve_v1.c +STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +SSYMMUCOPY_M = symm_ucopy_sve.c +SSYMMLCOPY_M = symm_lcopy_sve.c + +DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S + +DGEMMINCOPY = dgemm_ncopy_sve_v1.c +DGEMMITCOPY = dgemm_tcopy_sve_v1.c +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c +DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c +DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +DSYMMUCOPY_M = symm_ucopy_sve.c +DSYMMLCOPY_M = symm_lcopy_sve.c + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/KERNEL.CORTEXA53 b/kernel/arm64/KERNEL.CORTEXA53 index db322dd0d..e2e006770 100644 --- a/kernel/arm64/KERNEL.CORTEXA53 +++ b/kernel/arm64/KERNEL.CORTEXA53 @@ -141,7 +141,7 @@ SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) @@ -169,7 +169,7 @@ endif DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S ifneq 
($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c @@ -182,7 +182,7 @@ CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c diff --git a/kernel/arm64/KERNEL.CORTEXA55 b/kernel/arm64/KERNEL.CORTEXA55 index db322dd0d..e2e006770 100644 --- a/kernel/arm64/KERNEL.CORTEXA55 +++ b/kernel/arm64/KERNEL.CORTEXA55 @@ -141,7 +141,7 @@ SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) @@ -169,7 +169,7 @@ endif DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c @@ -182,7 +182,7 @@ CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c diff --git a/kernel/arm64/KERNEL.VORTEX b/kernel/arm64/KERNEL.VORTEX index e3efef1f5..46a34469c 100644 --- a/kernel/arm64/KERNEL.VORTEX +++ b/kernel/arm64/KERNEL.VORTEX @@ -1 +1 @@ -include $(KERNELDIR)/KERNEL.ARMV8 +include $(KERNELDIR)/KERNEL.NEOVERSEN1 diff --git a/kernel/arm64/cgemm_kernel_8x4_cortexa53.c b/kernel/arm64/cgemm_kernel_8x4_cortexa53.c new file mode 100644 index 000000000..f9cd97852 --- /dev/null +++ b/kernel/arm64/cgemm_kernel_8x4_cortexa53.c @@ -0,0 +1,898 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <arm_neon.h> + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define FMLA_RI "fmla " +#define FMLA_IR "fmla " +#define FMLA_II "fmls " +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define FMLA_RI "fmls " +#define FMLA_IR "fmla " +#define FMLA_II "fmla " +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define FMLA_RI "fmla " +#define FMLA_IR "fmls " +#define FMLA_II "fmla " +#else +#define FMLA_RI "fmls " +#define FMLA_IR "fmls " +#define FMLA_II "fmls " +#endif +#define FMLA_RR "fmla " + +static inline void store_m8n1_contracted(float *C, + float32x4_t c1r, float32x4_t c1i, float32x4_t c2r, float32x4_t c2i, + float alphar, float alphai) { + + float32x4x2_t ld1 = vld2q_f32(C), ld2 = vld2q_f32(C + 8); + ld1.val[0] = vfmaq_n_f32(ld1.val[0], c1r, alphar); + ld2.val[0] = vfmaq_n_f32(ld2.val[0], c2r, alphar); + ld1.val[1] = vfmaq_n_f32(ld1.val[1], c1r, alphai); + ld2.val[1] = vfmaq_n_f32(ld2.val[1], c2r, alphai); + ld1.val[0] = vfmsq_n_f32(ld1.val[0], c1i, alphai); + ld2.val[0] = vfmsq_n_f32(ld2.val[0], c2i, alphai); + ld1.val[1] = vfmaq_n_f32(ld1.val[1], c1i, alphar); + ld2.val[1] = vfmaq_n_f32(ld2.val[1], c2i, alphar); + vst2q_f32(C, ld1); + vst2q_f32(C + 8, ld2); +} + +static inline void kernel_8x4(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + const float *c_pref = C; + float32x4_t c1r, c1i, c2r, c2i, c3r, c3i, c4r, c4i; + float32x4_t c5r, c5i, c6r, c6i, c7r, c7i, c8r, c8i; + + /** x0 for filling A, x1-x6 for filling B (x5 and x6 for real, x2 and x4 for imag) */ + /** v0-v1 and v10-v11 for B, v2-v9 for A */ + __asm__ __volatile__( + "cmp %[K],#0; mov %[c_pref],%[C]\n\t" + "movi %[c1r].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" + "movi %[c1i].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" + "movi %[c2r].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "movi %[c2i].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" + "movi %[c3r].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" + "movi %[c3i].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "movi %[c4r].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" + "movi %[c4i].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" + "movi %[c5r].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "movi %[c5i].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" + "movi %[c6r].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" + "movi %[c6i].16b,#0\n\t" + "movi %[c7r].16b,#0; movi %[c7i].16b,#0\n\t" + "movi %[c8r].16b,#0; movi %[c8i].16b,#0\n\t" + "beq 4f\n\t" + "cmp %[K],#2\n\t" + "ldp x1,x2,[%[sb]],#16; ldr q2,[%[sa]],#64\n\t" + "ldp x3,x4,[%[sb]],#16; ldr d3,[%[sa],#-48]\n\t" + "mov w5,w1; mov w6,w3; ldr x0,[%[sa],#-40]\n\t" + "bfi x5,x2,#32,#32;
bfi x6,x4,#32,#32; fmov d0,x5\n\t" + "bfxil x2,x1,#32,#32; bfxil x4,x3,#32,#32; fmov v0.d[1],x6\n\t" + + "blt 3f; beq 2f\n\t" + "1:\n\t" + "fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t" + FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t" + FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]; ldr x1,[%[sb]],#64\n\t" + FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t" + "fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t" + FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t" + FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]; mov w5,w1\n\t" + FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t" + "fmov v5.d[1],x0; fmov d1,x2\n\t" + FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]; ldr x2,[%[sb],#-56]\n\t" + FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]; ldr x3,[%[sb],#-48]\n\t" + FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t" + "fmov v1.d[1],x4; ldr d6,[%[sa]]\n\t" + FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; ldr x0,[%[sa],#8]\n\t" + FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]; ldr x4,[%[sb],#-40]\n\t" + FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]; bfi x5,x2,#32,#32\n\t" + "fmov v6.d[1],x0; ldr d7,[%[sa],#16]\n\t" + FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]; ldr x0,[%[sa],#24]\n\t" + FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]; mov w6,w3\n\t" + FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]; bfxil x2,x1,#32,#32\n\t" + "fmov v7.d[1],x0; fmov d10,x5\n\t" + FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]; bfi x6,x4,#32,#32\n\t" + FMLA_II "%[c1r].4s,v1.4s,v2.s[1]; ldr x1,[%[sb],#-32]\n\t" + FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]; bfxil x4,x3,#32,#32\n\t" + "fmov v10.d[1],x6; fmov d11,x2\n\t" + FMLA_II "%[c2r].4s,v1.4s,v2.s[3]; ldr x2,[%[sb],#-24]\n\t" + FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]; ldr x3,[%[sb],#-16]\n\t" + FMLA_II "%[c3r].4s,v1.4s,v3.s[1]; mov w5,w1\n\t" + "fmov v11.d[1],x4; ldr d8,[%[sa],#32]\n\t" + FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]; ldr x0,[%[sa],#40]\n\t" + FMLA_II "%[c4r].4s,v1.4s,v3.s[3]; ldr x4,[%[sb],#-8]\n\t" + FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]; bfi x5,x2,#32,#32\n\t" + "fmov v8.d[1],x0; ldr d9,[%[sa],#48]\n\t" + FMLA_II "%[c5r].4s,v1.4s,v4.s[1]; ldr x0,[%[sa],#56]\n\t" + FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]; mov w6,w3\n\t" + FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t" + "fmov v9.d[1],x0; fmov d0,x5\n\t" + FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]; bfi x6,x4,#32,#32\n\t" + FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" + FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t" + "fmov v0.d[1],x6; ldr d2,[%[sa],#64]\n\t" + FMLA_II "%[c8r].4s,v1.4s,v5.s[3]; ldr x0,[%[sa],#72]\n\t" + FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t" + FMLA_RR "%[c1r].4s,v10.4s,v6.s[0]\n\t" + "fmov v2.d[1],x0; ldr d3,[%[sa],#80]\n\t" + FMLA_IR "%[c1i].4s,v10.4s,v6.s[1]\n\t" + FMLA_RR "%[c2r].4s,v10.4s,v6.s[2]; ldr x0,[%[sa],#88]\n\t" + FMLA_IR "%[c2i].4s,v10.4s,v6.s[3]; bfxil x2,x1,#32,#32\n\t" + FMLA_RR "%[c3r].4s,v10.4s,v7.s[0]; bfxil x4,x3,#32,#32\n\t" + FMLA_IR "%[c3i].4s,v10.4s,v7.s[1]; add %[sa],%[sa],#128\n\t" + FMLA_RR "%[c4r].4s,v10.4s,v7.s[2]; prfm pldl1keep,[%[sb],#128]\n\t" + FMLA_IR "%[c4i].4s,v10.4s,v7.s[3]; sub %[K],%[K],#2\n\t" + FMLA_RR "%[c5r].4s,v10.4s,v8.s[0]; prfm pldl1keep,[%[sa],#128]\n\t" + FMLA_IR "%[c5i].4s,v10.4s,v8.s[1]; prfm pldl1keep,[%[sa],#192]\n\t" + FMLA_RR "%[c6r].4s,v10.4s,v8.s[2]; cmp %[K],#2\n\t" + FMLA_IR "%[c6i].4s,v10.4s,v8.s[3]\n\t" + FMLA_RR "%[c7r].4s,v10.4s,v9.s[0]\n\t" FMLA_IR "%[c7i].4s,v10.4s,v9.s[1]\n\t" + FMLA_RR "%[c8r].4s,v10.4s,v9.s[2]\n\t" FMLA_IR "%[c8i].4s,v10.4s,v9.s[3]\n\t" + FMLA_II "%[c1r].4s,v11.4s,v6.s[1]\n\t" FMLA_RI "%[c1i].4s,v11.4s,v6.s[0]\n\t" + FMLA_II "%[c2r].4s,v11.4s,v6.s[3]\n\t" FMLA_RI "%[c2i].4s,v11.4s,v6.s[2]\n\t" + FMLA_II "%[c3r].4s,v11.4s,v7.s[1]\n\t" FMLA_RI "%[c3i].4s,v11.4s,v7.s[0]\n\t" + FMLA_II "%[c4r].4s,v11.4s,v7.s[3]\n\t" FMLA_RI 
"%[c4i].4s,v11.4s,v7.s[2]\n\t" + FMLA_II "%[c5r].4s,v11.4s,v8.s[1]\n\t" FMLA_RI "%[c5i].4s,v11.4s,v8.s[0]\n\t" + FMLA_II "%[c6r].4s,v11.4s,v8.s[3]\n\t" FMLA_RI "%[c6i].4s,v11.4s,v8.s[2]\n\t" + FMLA_II "%[c7r].4s,v11.4s,v9.s[1]\n\t" FMLA_RI "%[c7i].4s,v11.4s,v9.s[0]\n\t" + FMLA_II "%[c8r].4s,v11.4s,v9.s[3]\n\t" FMLA_RI "%[c8i].4s,v11.4s,v9.s[2]\n\t" + "bgt 1b; blt 3f\n\t" + "2:\n\t" + "fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t" + FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t" + FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]; ldr x1,[%[sb]],#32\n\t" + FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t" + "fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t" + FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t" + FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]; mov w5,w1\n\t" + FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t" + "fmov v5.d[1],x0; fmov d1,x2\n\t" + FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]; ldr x2,[%[sb],#-24]\n\t" + FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]; ldr x3,[%[sb],#-16]\n\t" + FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t" + "fmov v1.d[1],x4; ldr d6,[%[sa]]\n\t" + FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; ldr x0,[%[sa],#8]\n\t" + FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]; ldr x4,[%[sb],#-8]\n\t" + FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]; bfi x5,x2,#32,#32\n\t" + "fmov v6.d[1],x0; ldr d7,[%[sa],#16]\n\t" + FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]; ldr x0,[%[sa],#24]\n\t" + FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]; mov w6,w3\n\t" + FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]; bfxil x2,x1,#32,#32\n\t" + "fmov v7.d[1],x0; fmov d10,x5\n\t" + FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]; bfi x6,x4,#32,#32\n\t" + FMLA_II "%[c1r].4s,v1.4s,v2.s[1]\n\t" + FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]; bfxil x4,x3,#32,#32\n\t" + "fmov v10.d[1],x6; fmov d11,x2\n\t" + FMLA_II "%[c2r].4s,v1.4s,v2.s[3]\n\t" + FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]\n\t" + FMLA_II "%[c3r].4s,v1.4s,v3.s[1]\n\t" + "fmov v11.d[1],x4; ldr d8,[%[sa],#32]\n\t" + FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]; ldr x0,[%[sa],#40]\n\t" + FMLA_II "%[c4r].4s,v1.4s,v3.s[3]; sub %[K],%[K],#2\n\t" + FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]\n\t" + "fmov v8.d[1],x0; ldr d9,[%[sa],#48]\n\t" + FMLA_II "%[c5r].4s,v1.4s,v4.s[1]; ldr x0,[%[sa],#56]\n\t" + FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]; add %[sa],%[sa],#64\n\t" + FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t" + "fmov v9.d[1],x0\n\t" + FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]\n\t" + FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t" + FMLA_II "%[c8r].4s,v1.4s,v5.s[3]\n\t" FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t" + FMLA_RR "%[c1r].4s,v10.4s,v6.s[0]\n\t" FMLA_IR "%[c1i].4s,v10.4s,v6.s[1]\n\t" + FMLA_RR "%[c2r].4s,v10.4s,v6.s[2]\n\t" FMLA_IR "%[c2i].4s,v10.4s,v6.s[3]\n\t" + FMLA_RR "%[c3r].4s,v10.4s,v7.s[0]\n\t" FMLA_IR "%[c3i].4s,v10.4s,v7.s[1]\n\t" + FMLA_RR "%[c4r].4s,v10.4s,v7.s[2]\n\t" FMLA_IR "%[c4i].4s,v10.4s,v7.s[3]\n\t" + FMLA_RR "%[c5r].4s,v10.4s,v8.s[0]\n\t" FMLA_IR "%[c5i].4s,v10.4s,v8.s[1]\n\t" + FMLA_RR "%[c6r].4s,v10.4s,v8.s[2]\n\t" FMLA_IR "%[c6i].4s,v10.4s,v8.s[3]\n\t" + FMLA_RR "%[c7r].4s,v10.4s,v9.s[0]\n\t" FMLA_IR "%[c7i].4s,v10.4s,v9.s[1]\n\t" + FMLA_RR "%[c8r].4s,v10.4s,v9.s[2]\n\t" FMLA_IR "%[c8i].4s,v10.4s,v9.s[3]\n\t" + FMLA_II "%[c1r].4s,v11.4s,v6.s[1]\n\t" FMLA_RI "%[c1i].4s,v11.4s,v6.s[0]\n\t" + FMLA_II "%[c2r].4s,v11.4s,v6.s[3]\n\t" FMLA_RI "%[c2i].4s,v11.4s,v6.s[2]\n\t" + FMLA_II "%[c3r].4s,v11.4s,v7.s[1]\n\t" FMLA_RI "%[c3i].4s,v11.4s,v7.s[0]\n\t" + FMLA_II "%[c4r].4s,v11.4s,v7.s[3]\n\t" FMLA_RI "%[c4i].4s,v11.4s,v7.s[2]\n\t" + FMLA_II "%[c5r].4s,v11.4s,v8.s[1]\n\t" FMLA_RI "%[c5i].4s,v11.4s,v8.s[0]\n\t" + FMLA_II "%[c6r].4s,v11.4s,v8.s[3]\n\t" FMLA_RI "%[c6i].4s,v11.4s,v8.s[2]\n\t" + FMLA_II 
"%[c7r].4s,v11.4s,v9.s[1]\n\t" FMLA_RI "%[c7i].4s,v11.4s,v9.s[0]\n\t" + FMLA_II "%[c8r].4s,v11.4s,v9.s[3]\n\t" FMLA_RI "%[c8i].4s,v11.4s,v9.s[2]\n\t" + "b 4f\n\t" + "3:\n\t" + "fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t" + FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t" + FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]\n\t" + FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t" + "fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t" + FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t" + FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]\n\t" + FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t" + "fmov v5.d[1],x0; fmov d1,x2\n\t" + FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]\n\t" + FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]\n\t" + FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t" + "fmov v1.d[1],x4\n\t" + FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; sub %[K],%[K],#1\n\t" + FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]\n\t" FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]\n\t" + FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]\n\t" FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]\n\t" + FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]\n\t" FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]\n\t" + FMLA_II "%[c1r].4s,v1.4s,v2.s[1]\n\t" FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]\n\t" + FMLA_II "%[c2r].4s,v1.4s,v2.s[3]\n\t" FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]\n\t" + FMLA_II "%[c3r].4s,v1.4s,v3.s[1]\n\t" FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]\n\t" + FMLA_II "%[c4r].4s,v1.4s,v3.s[3]\n\t" FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]\n\t" + FMLA_II "%[c5r].4s,v1.4s,v4.s[1]\n\t" FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]\n\t" + FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t" FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]\n\t" + FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t" + FMLA_II "%[c8r].4s,v1.4s,v5.s[3]\n\t" FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t" + "4:\n\t" + "mov %[c_pref],%[C]\n\t" + "zip1 v0.4s,%[c1r].4s,%[c2r].4s; prfm pstl1keep,[%[c_pref]]\n\t" + "zip1 v4.4s,%[c1i].4s,%[c2i].4s; prfm pstl1keep,[%[c_pref],#64]\n\t" + "zip1 v1.4s,%[c3r].4s,%[c4r].4s; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "zip1 v5.4s,%[c3i].4s,%[c4i].4s; prfm pstl1keep,[%[c_pref]]\n\t" + "zip2 v2.4s,%[c1r].4s,%[c2r].4s; prfm pstl1keep,[%[c_pref],#64]\n\t" + "zip2 v6.4s,%[c1i].4s,%[c2i].4s; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "zip2 v3.4s,%[c3r].4s,%[c4r].4s; prfm pstl1keep,[%[c_pref]]\n\t" + "zip2 v7.4s,%[c3i].4s,%[c4i].4s; prfm pstl1keep,[%[c_pref],#64]\n\t" + "zip1 %[c1r].2d,v0.2d,v1.2d; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "zip1 %[c1i].2d,v4.2d,v5.2d; prfm pstl1keep,[%[c_pref]]\n\t" + "zip2 %[c2r].2d,v0.2d,v1.2d; prfm pstl1keep,[%[c_pref],#64]\n\t" + "zip2 %[c2i].2d,v4.2d,v5.2d\n\t" + "zip1 %[c3r].2d,v2.2d,v3.2d; zip1 %[c3i].2d,v6.2d,v7.2d\n\t" + "zip2 %[c4r].2d,v2.2d,v3.2d; zip2 %[c4i].2d,v6.2d,v7.2d\n\t" + "zip1 v0.4s,%[c5r].4s,%[c6r].4s; zip1 v4.4s,%[c5i].4s,%[c6i].4s\n\t" + "zip1 v1.4s,%[c7r].4s,%[c8r].4s; zip1 v5.4s,%[c7i].4s,%[c8i].4s\n\t" + "zip2 v2.4s,%[c5r].4s,%[c6r].4s; zip2 v6.4s,%[c5i].4s,%[c6i].4s\n\t" + "zip2 v3.4s,%[c7r].4s,%[c8r].4s; zip2 v7.4s,%[c7i].4s,%[c8i].4s\n\t" + "zip1 %[c5r].2d,v0.2d,v1.2d; zip1 %[c5i].2d,v4.2d,v5.2d\n\t" + "zip2 %[c6r].2d,v0.2d,v1.2d; zip2 %[c6i].2d,v4.2d,v5.2d\n\t" + "zip1 %[c7r].2d,v2.2d,v3.2d; zip1 %[c7i].2d,v6.2d,v7.2d\n\t" + "zip2 %[c8r].2d,v2.2d,v3.2d; zip2 %[c8i].2d,v6.2d,v7.2d\n\t" + :[c1r]"=w"(c1r), [c1i]"=w"(c1i), [c2r]"=w"(c2r), [c2i]"=w"(c2i), + [c3r]"=w"(c3r), [c3i]"=w"(c3i), [c4r]"=w"(c4r), [c4i]"=w"(c4i), + [c5r]"=w"(c5r), [c5i]"=w"(c5i), [c6r]"=w"(c6r), [c6i]"=w"(c6i), + [c7r]"=w"(c7r), [c7i]"=w"(c7i), [c8r]"=w"(c8r), [c8i]"=w"(c8i), + [K]"+r"(K), [sa]"+r"(sa), [sb]"+r"(sb), [c_pref]"+r"(c_pref) + :[C]"r"(C), [LDC]"r"(LDC) + 
:"cc","memory","x0","x1","x2","x3","x4","x5","x6", + "v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11"); + + store_m8n1_contracted(C, c1r, c1i, c5r, c5i, alphar, alphai); C += LDC * 2; + store_m8n1_contracted(C, c2r, c2i, c6r, c6i, alphar, alphai); C += LDC * 2; + store_m8n1_contracted(C, c3r, c3i, c7r, c7i, alphar, alphai); C += LDC * 2; + store_m8n1_contracted(C, c4r, c4i, c8r, c8i, alphar, alphai); +} + +static inline float32x4x4_t acc_expanded_m2n2(float32x4x4_t acc, + float32x4_t a, float32x4_t b) { + + acc.val[0] = vfmaq_laneq_f32(acc.val[0], a, b, 0); + acc.val[1] = vfmaq_laneq_f32(acc.val[1], a, b, 1); + acc.val[2] = vfmaq_laneq_f32(acc.val[2], a, b, 2); + acc.val[3] = vfmaq_laneq_f32(acc.val[3], a, b, 3); + return acc; +} + +static inline float32x4x4_t expand_alpha(float alphar, float alphai) { + float32x4x4_t ret; + const float maskp[] = { -1, 1, -1, 1 }; + const float maskn[] = { 1, -1, 1, -1 }; + const float32x4_t vrevp = vld1q_f32(maskp); + const float32x4_t vrevn = vld1q_f32(maskn); +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ret.val[0] = vdupq_n_f32(alphar); + ret.val[1] = vdupq_n_f32(-alphai); + ret.val[2] = vmulq_f32(ret.val[1], vrevn); + ret.val[3] = vmulq_f32(ret.val[0], vrevp); +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + ret.val[0] = vdupq_n_f32(alphar); + ret.val[1] = vdupq_n_f32(alphai); + ret.val[2] = vmulq_f32(ret.val[1], vrevp); + ret.val[3] = vmulq_f32(ret.val[0], vrevn); +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + ret.val[2] = vdupq_n_f32(alphai); + ret.val[3] = vdupq_n_f32(alphar); + ret.val[0] = vmulq_f32(ret.val[3], vrevn); + ret.val[1] = vmulq_f32(ret.val[2], vrevp); +#else + ret.val[2] = vdupq_n_f32(alphai); + ret.val[3] = vdupq_n_f32(-alphar); + ret.val[0] = vmulq_f32(ret.val[3], vrevp); + ret.val[1] = vmulq_f32(ret.val[2], vrevn); +#endif + return ret; +} + +static inline void store_expanded_m2n2(float *C, BLASLONG LDC, + float32x4x4_t acc, float32x4x4_t expanded_alpha) { + + float32x4_t ld1 = vld1q_f32(C), ld2 = vld1q_f32(C + LDC * 2); + ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[0]); + ld2 = vfmaq_f32(ld2, acc.val[2], expanded_alpha.val[0]); + acc.val[0] = vrev64q_f32(acc.val[0]); + acc.val[2] = vrev64q_f32(acc.val[2]); + ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[1]); + ld2 = vfmaq_f32(ld2, acc.val[3], expanded_alpha.val[1]); + acc.val[1] = vrev64q_f32(acc.val[1]); + acc.val[3] = vrev64q_f32(acc.val[3]); + ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[2]); + ld2 = vfmaq_f32(ld2, acc.val[2], expanded_alpha.val[2]); + ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[3]); + ld2 = vfmaq_f32(ld2, acc.val[3], expanded_alpha.val[3]); + vst1q_f32(C, ld1); + vst1q_f32(C + LDC * 2, ld2); +} + +static inline float32x4x4_t init_expanded_m2n2() { + float32x4x4_t ret = {{ vdupq_n_f32(0), vdupq_n_f32(0), + vdupq_n_f32(0), vdupq_n_f32(0) }}; + return ret; +} + +static inline void kernel_4x4(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x4x4_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m2n2(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), + a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4), + b3 = vld1q_f32(sb + 8), b4 = vld1q_f32(sb + 12); sb += 16; + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + c3 = acc_expanded_m2n2(c3, a1, b2); + c4 = acc_expanded_m2n2(c4, 
a2, b2); + c1 = acc_expanded_m2n2(c1, a3, b3); + c2 = acc_expanded_m2n2(c2, a4, b3); + c3 = acc_expanded_m2n2(c3, a3, b4); + c4 = acc_expanded_m2n2(c4, a4, b4); + } + if (K) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + c3 = acc_expanded_m2n2(c3, a1, b2); + c4 = acc_expanded_m2n2(c4, a2, b2); + } + + float32x4x4_t e_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n2(C, LDC, c1, e_alpha); + store_expanded_m2n2(C + 4, LDC, c2, e_alpha); + C += LDC * 4; + store_expanded_m2n2(C, LDC, c3, e_alpha); + store_expanded_m2n2(C + 4, LDC, c4, e_alpha); +} + +static inline void kernel_8x2(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x4x4_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m2n2(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); + float32x4_t a5 = vld1q_f32(sa + 16), a6 = vld1q_f32(sa + 20); + float32x4_t a7 = vld1q_f32(sa + 24), a8 = vld1q_f32(sa + 28); sa += 32; + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8; + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + c3 = acc_expanded_m2n2(c3, a3, b1); + c4 = acc_expanded_m2n2(c4, a4, b1); + c1 = acc_expanded_m2n2(c1, a5, b2); + c2 = acc_expanded_m2n2(c2, a6, b2); + c3 = acc_expanded_m2n2(c3, a7, b2); + c4 = acc_expanded_m2n2(c4, a8, b2); + } + if (K) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); + float32x4_t b1 = vld1q_f32(sb); + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + c3 = acc_expanded_m2n2(c3, a3, b1); + c4 = acc_expanded_m2n2(c4, a4, b1); + } + + float32x4x4_t e_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n2(C, LDC, c1, e_alpha); + store_expanded_m2n2(C + 4, LDC, c2, e_alpha); + store_expanded_m2n2(C + 8, LDC, c3, e_alpha); + store_expanded_m2n2(C + 12, LDC, c4, e_alpha); +} + +static inline void kernel_4x2(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x4x4_t c1, c2; + c1 = c2 = init_expanded_m2n2(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8; + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + c1 = acc_expanded_m2n2(c1, a3, b2); + c2 = acc_expanded_m2n2(c2, a4, b2); + } + if (K) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x4_t b1 = vld1q_f32(sb); + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + } + + float32x4x4_t e_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n2(C, LDC, c1, e_alpha); + store_expanded_m2n2(C + 4, LDC, c2, e_alpha); +} + +static inline void kernel_2x4(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x4x4_t c1, c2; + c1 = c2 = init_expanded_m2n2(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); sa += 8; + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); + float32x4_t b3 = vld1q_f32(sb + 8), b4 = vld1q_f32(sb + 12); sb += 16; + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = 
acc_expanded_m2n2(c2, a1, b2); + c1 = acc_expanded_m2n2(c1, a2, b3); + c2 = acc_expanded_m2n2(c2, a2, b4); + } + if (K) { + float32x4_t a1 = vld1q_f32(sa); + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a1, b2); + } + + float32x4x4_t e_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n2(C, LDC, c1, e_alpha); + store_expanded_m2n2(C + LDC * 4, LDC, c2, e_alpha); +} + +static inline void kernel_2x2(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x4x4_t c1, c2; + c1 = c2 = init_expanded_m2n2(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); sa += 8; + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8; + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b2); + } + c1.val[0] = vaddq_f32(c1.val[0], c2.val[0]); + c1.val[1] = vaddq_f32(c1.val[1], c2.val[1]); + c1.val[2] = vaddq_f32(c1.val[2], c2.val[2]); + c1.val[3] = vaddq_f32(c1.val[3], c2.val[3]); + if (K) { + float32x4_t a1 = vld1q_f32(sa); + float32x4_t b1 = vld1q_f32(sb); + c1 = acc_expanded_m2n2(c1, a1, b1); + } + + store_expanded_m2n2(C, LDC, c1, expand_alpha(alphar, alphai)); +} + +static inline float32x4x2_t acc_expanded_m2n1(float32x4x2_t acc, + float32x4_t a, float32x2_t b) { + + acc.val[0] = vfmaq_lane_f32(acc.val[0], a, b, 0); + acc.val[1] = vfmaq_lane_f32(acc.val[1], a, b, 1); + return acc; +} + +static inline void store_expanded_m2n1(float *C, + float32x4x2_t acc, float32x4x4_t expanded_alpha) { + + float32x4_t ld1 = vld1q_f32(C); + ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[0]); + acc.val[0] = vrev64q_f32(acc.val[0]); + ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[1]); + acc.val[1] = vrev64q_f32(acc.val[1]); + ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[2]); + ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[3]); + vst1q_f32(C, ld1); +} + +static inline float32x4x2_t init_expanded_m2n1() { + float32x4x2_t ret = {{ vdupq_n_f32(0), vdupq_n_f32(0) }}; + return ret; +} + +static inline void kernel_8x1(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K) { + + float32x4x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m2n1(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), + a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12), + a5 = vld1q_f32(sa + 16), a6 = vld1q_f32(sa + 20), + a7 = vld1q_f32(sa + 24), a8 = vld1q_f32(sa + 28); sa += 32; + float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2); sb += 4; + c1 = acc_expanded_m2n1(c1, a1, b1); + c2 = acc_expanded_m2n1(c2, a2, b1); + c3 = acc_expanded_m2n1(c3, a3, b1); + c4 = acc_expanded_m2n1(c4, a4, b1); + c1 = acc_expanded_m2n1(c1, a5, b2); + c2 = acc_expanded_m2n1(c2, a6, b2); + c3 = acc_expanded_m2n1(c3, a7, b2); + c4 = acc_expanded_m2n1(c4, a8, b2); + } + if (K) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), + a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); + float32x2_t b1 = vld1_f32(sb); + c1 = acc_expanded_m2n1(c1, a1, b1); + c2 = acc_expanded_m2n1(c2, a2, b1); + c3 = acc_expanded_m2n1(c3, a3, b1); + c4 = acc_expanded_m2n1(c4, a4, b1); + } + + float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n1(C, c1, expanded_alpha); + store_expanded_m2n1(C + 4, c2, expanded_alpha); + store_expanded_m2n1(C + 8, c3, expanded_alpha); + store_expanded_m2n1(C + 12, c4, expanded_alpha); +} + +static inline void kernel_4x1(const float *sa, 
const float *sb, float *C, + float alphar, float alphai, BLASLONG K) { + + float32x4x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m2n1(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), + a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; + float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2); sb += 4; + c1 = acc_expanded_m2n1(c1, a1, b1); + c2 = acc_expanded_m2n1(c2, a2, b1); + c3 = acc_expanded_m2n1(c3, a3, b2); + c4 = acc_expanded_m2n1(c4, a4, b2); + } + c1.val[0] = vaddq_f32(c1.val[0], c3.val[0]); + c1.val[1] = vaddq_f32(c1.val[1], c3.val[1]); + c2.val[0] = vaddq_f32(c2.val[0], c4.val[0]); + c2.val[1] = vaddq_f32(c2.val[1], c4.val[1]); + if (K) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x2_t b1 = vld1_f32(sb); + c1 = acc_expanded_m2n1(c1, a1, b1); + c2 = acc_expanded_m2n1(c2, a2, b1); + } + + float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n1(C, c1, expanded_alpha); + store_expanded_m2n1(C + 4, c2, expanded_alpha); +} + +static inline void kernel_2x1(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K) { + + float32x4x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m2n1(); + + for (; K > 3; K -= 4) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), + a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; + float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2), + b3 = vld1_f32(sb + 4), b4 = vld1_f32(sb + 6); sb += 8; + c1 = acc_expanded_m2n1(c1, a1, b1); + c2 = acc_expanded_m2n1(c2, a2, b2); + c3 = acc_expanded_m2n1(c3, a3, b3); + c4 = acc_expanded_m2n1(c4, a4, b4); + } + c1.val[0] = vaddq_f32(c1.val[0], c3.val[0]); + c1.val[1] = vaddq_f32(c1.val[1], c3.val[1]); + c2.val[0] = vaddq_f32(c2.val[0], c4.val[0]); + c2.val[1] = vaddq_f32(c2.val[1], c4.val[1]); + c1.val[0] = vaddq_f32(c1.val[0], c2.val[0]); + c1.val[1] = vaddq_f32(c1.val[1], c2.val[1]); + for (; K; K--) { + float32x4_t a1 = vld1q_f32(sa); sa += 4; + float32x2_t b1 = vld1_f32(sb); sb += 2; + c1 = acc_expanded_m2n1(c1, a1, b1); + } + + float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n1(C, c1, expanded_alpha); +} + +static inline float32x2x4_t expand_alpha_d(float alphar, float alphai) { + float32x2x4_t ret; + const float maskp[] = { -1, 1 }; + const float maskn[] = { 1, -1 }; + const float32x2_t vrevp = vld1_f32(maskp); + const float32x2_t vrevn = vld1_f32(maskn); +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ret.val[0] = vdup_n_f32(alphar); + ret.val[1] = vdup_n_f32(-alphai); + ret.val[2] = vmul_f32(ret.val[1], vrevn); + ret.val[3] = vmul_f32(ret.val[0], vrevp); +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + ret.val[0] = vdup_n_f32(alphar); + ret.val[1] = vdup_n_f32(alphai); + ret.val[2] = vmul_f32(ret.val[1], vrevp); + ret.val[3] = vmul_f32(ret.val[0], vrevn); +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + ret.val[2] = vdup_n_f32(alphai); + ret.val[3] = vdup_n_f32(alphar); + ret.val[0] = vmul_f32(ret.val[3], vrevn); + ret.val[1] = vmul_f32(ret.val[2], vrevp); +#else + ret.val[2] = vdup_n_f32(alphai); + ret.val[3] = vdup_n_f32(-alphar); + ret.val[0] = vmul_f32(ret.val[3], vrevp); + ret.val[1] = vmul_f32(ret.val[2], vrevn); +#endif + return ret; +} + +static inline float32x2x2_t acc_expanded_m1n1(float32x2x2_t acc, + float32x2_t a, float32x2_t b) { + + acc.val[0] = vfma_lane_f32(acc.val[0], a, b, 0); + acc.val[1] = vfma_lane_f32(acc.val[1], a, b, 1); + return acc; +} + +static 
inline void store_expanded_m1n1(float *C, + float32x2x2_t acc, float32x2x4_t expanded_alpha) { + + float32x2_t ld1 = vld1_f32(C); + ld1 = vfma_f32(ld1, acc.val[0], expanded_alpha.val[0]); + acc.val[0] = vrev64_f32(acc.val[0]); + ld1 = vfma_f32(ld1, acc.val[1], expanded_alpha.val[1]); + acc.val[1] = vrev64_f32(acc.val[1]); + ld1 = vfma_f32(ld1, acc.val[0], expanded_alpha.val[2]); + ld1 = vfma_f32(ld1, acc.val[1], expanded_alpha.val[3]); + vst1_f32(C, ld1); +} + +static inline float32x2x2_t init_expanded_m1n1() { + float32x2x2_t ret = {{ vdup_n_f32(0), vdup_n_f32(0) }}; + return ret; +} + +static inline void kernel_1x4(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m1n1(); + + for (; K; K--) { + float32x2_t a1 = vld1_f32(sa); sa += 2; + c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb)); + c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2)); + c3 = acc_expanded_m1n1(c3, a1, vld1_f32(sb + 4)); + c4 = acc_expanded_m1n1(c4, a1, vld1_f32(sb + 6)); + sb += 8; + } + + float32x2x4_t expanded_alpha = expand_alpha_d(alphar, alphai); + store_expanded_m1n1(C, c1, expanded_alpha); C += LDC * 2; + store_expanded_m1n1(C, c2, expanded_alpha); C += LDC * 2; + store_expanded_m1n1(C, c3, expanded_alpha); C += LDC * 2; + store_expanded_m1n1(C, c4, expanded_alpha); +} + +static inline void kernel_1x2(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m1n1(); + + for (; K > 1; K -= 2) { + float32x2_t a1 = vld1_f32(sa), a2 = vld1_f32(sa + 2); sa += 4; + c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb)); + c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2)); + c3 = acc_expanded_m1n1(c3, a2, vld1_f32(sb + 4)); + c4 = acc_expanded_m1n1(c4, a2, vld1_f32(sb + 6)); + sb += 8; + } + c1.val[0] = vadd_f32(c1.val[0], c3.val[0]); + c1.val[1] = vadd_f32(c1.val[1], c3.val[1]); + c2.val[0] = vadd_f32(c2.val[0], c4.val[0]); + c2.val[1] = vadd_f32(c2.val[1], c4.val[1]); + if (K) { + float32x2_t a1 = vld1_f32(sa); + c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb)); + c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2)); + } + + float32x2x4_t expanded_alpha = expand_alpha_d(alphar, alphai); + store_expanded_m1n1(C, c1, expanded_alpha); C += LDC * 2; + store_expanded_m1n1(C, c2, expanded_alpha); +} + +static inline void kernel_1x1(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K) { + + float32x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m1n1(); + + for (; K > 3; K -= 4) { + c1 = acc_expanded_m1n1(c1, vld1_f32(sa), vld1_f32(sb)); + c2 = acc_expanded_m1n1(c2, vld1_f32(sa + 2), vld1_f32(sb + 2)); + c3 = acc_expanded_m1n1(c3, vld1_f32(sa + 4), vld1_f32(sb + 4)); + c4 = acc_expanded_m1n1(c4, vld1_f32(sa + 6), vld1_f32(sb + 6)); + sa += 8; sb += 8; + } + c1.val[0] = vadd_f32(c1.val[0], c3.val[0]); + c1.val[1] = vadd_f32(c1.val[1], c3.val[1]); + c2.val[0] = vadd_f32(c2.val[0], c4.val[0]); + c2.val[1] = vadd_f32(c2.val[1], c4.val[1]); + c1.val[0] = vadd_f32(c1.val[0], c2.val[0]); + c1.val[1] = vadd_f32(c1.val[1], c2.val[1]); + for (; K; K--) { + c1 = acc_expanded_m1n1(c1, vld1_f32(sa), vld1_f32(sb)); + sa += 2; sb += 2; + } + + store_expanded_m1n1(C, c1, expand_alpha_d(alphar, alphai)); +} + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, + FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) { + + BLASLONG n_left = N; + for (; n_left >= 8; n_left -= 8) 
{ + const FLOAT *a_ = sa; + FLOAT *c1_ = C; + FLOAT *c2_ = C + LDC * 8; + const FLOAT *b1_ = sb; + const FLOAT *b2_ = sb + K * 8; + BLASLONG m_left = M; + for (; m_left >= 8; m_left -= 8) { + kernel_8x4(a_, b1_, c1_, alphar, alphai, K, LDC); + kernel_8x4(a_, b2_, c2_, alphar, alphai, K, LDC); + a_ += 16 * K; + c1_ += 16; + c2_ += 16; + } + if (m_left >= 4) { + m_left -= 4; + kernel_4x4(a_, b1_, c1_, alphar, alphai, K, LDC); + kernel_4x4(a_, b2_, c2_, alphar, alphai, K, LDC); + a_ += 8 * K; + c1_ += 8; + c2_ += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x4(a_, b1_, c1_, alphar, alphai, K, LDC); + kernel_2x4(a_, b2_, c2_, alphar, alphai, K, LDC); + a_ += 4 * K; + c1_ += 4; + c2_ += 4; + } + if (m_left) { + kernel_1x4(a_, b1_, c1_, alphar, alphai, K, LDC); + kernel_1x4(a_, b2_, c2_, alphar, alphai, K, LDC); + } + C += 16 * LDC; + sb += 16 * K; + } + + if (n_left >= 4) { + n_left -= 4; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + BLASLONG m_left = M; + for (; m_left >= 8; m_left -= 8) { + kernel_8x4(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 16 * K; + c_ += 16; + } + if (m_left >= 4) { + m_left -= 4; + kernel_4x4(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 8 * K; + c_ += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x4(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 4 * K; + c_ += 4; + } + if (m_left) { + kernel_1x4(a_, sb, c_, alphar, alphai, K, LDC); + } + C += 8 * LDC; + sb += 8 * K; + } + + if (n_left >= 2) { + n_left -= 2; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + BLASLONG m_left = M; + for (; m_left >= 8; m_left -= 8) { + kernel_8x2(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 16 * K; + c_ += 16; + } + if (m_left >= 4) { + m_left -= 4; + kernel_4x2(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 8 * K; + c_ += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x2(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 4 * K; + c_ += 4; + } + if (m_left) { + kernel_1x2(a_, sb, c_, alphar, alphai, K, LDC); + } + C += 4 * LDC; + sb += 4 * K; + } + + if (n_left) { + BLASLONG m_left = M; + for (; m_left >= 8; m_left -= 8) { + kernel_8x1(sa, sb, C, alphar, alphai, K); + sa += 16 * K; + C += 16; + } + if (m_left >= 4) { + m_left -= 4; + kernel_4x1(sa, sb, C, alphar, alphai, K); + sa += 8 * K; + C += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x1(sa, sb, C, alphar, alphai, K); + sa += 4 * K; + C += 4; + } + if (m_left) { + kernel_1x1(sa, sb, C, alphar, alphai, K); + } + } + return 0; +} + diff --git a/kernel/arm64/dgemm_kernel_4x4_cortexa53.c b/kernel/arm64/dgemm_kernel_4x4_cortexa53.c new file mode 100644 index 000000000..5a9d284df --- /dev/null +++ b/kernel/arm64/dgemm_kernel_4x4_cortexa53.c @@ -0,0 +1,890 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <arm_neon.h> + +/********************************************************** + * Function: dgemm_kernel_arm_cortex_a53_4x4_m4n12 + * Operation: C[4][12] += alpha * sa[4][K] * sb[K][12] + * Matrix orders: + * sa: column-major (leading dimension == 4) + * sb: 3 concatenated row-major 4-column submatrices + * C: column-major (leading dimension == LDC) + *********************************************************/ +static inline void dgemm_kernel_arm_cortex_a53_4x4_m4n12( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + /** prefetch 4x12 elements from matrix C for RW purpose */ + __asm__ __volatile__( + "mov x0,%[C]\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]\n\t" + ::[C]"r"(C), [LDC]"r"(LDC):"x0"); + + /** 3 pointers to 3 submatrices of sb respectively */ + const FLOAT *b1_ = sb; + const FLOAT *b2_ = sb + K * 4; + const FLOAT *b3_ = sb + K * 8; + + /** register mapping of 4x12 elements of C, row-id ==> coordinate-M, column-id ==> coordinate-N */ + /** v8.d[0] v10.d[0] v12.d[0] v14.d[0] v16.d[0] v18.d[0] v20.d[0] v22.d[0] v24.d[0] v26.d[0] v28.d[0] v30.d[0] */ + /** v8.d[1] v10.d[1] v12.d[1] v14.d[1] v16.d[1] v18.d[1] v20.d[1] v22.d[1] v24.d[1] v26.d[1] v28.d[1] v30.d[1] */ + /** v9.d[0] v11.d[0] v13.d[0] v15.d[0] v17.d[0] v19.d[0] v21.d[0] v23.d[0] v25.d[0] v27.d[0] v29.d[0] v31.d[0] */ + /** v9.d[1] v11.d[1] v13.d[1] v15.d[1] v17.d[1] v19.d[1] v21.d[1] v23.d[1] v25.d[1] v27.d[1] v29.d[1] v31.d[1] */ + + __asm__ __volatile__( + "cmp %[K],#0\n\t" + /** fill registers holding elements of C with 0.0 */ + "movi v8.16b,#0; movi v9.16b,#0; movi v10.16b,#0; movi v11.16b,#0\n\t" + "movi v12.16b,#0; movi v13.16b,#0; movi v14.16b,#0; movi v15.16b,#0\n\t" + "movi v16.16b,#0; movi v17.16b,#0; movi v18.16b,#0; movi v19.16b,#0\n\t" + "movi v20.16b,#0; movi 
v21.16b,#0; movi v22.16b,#0; movi v23.16b,#0\n\t" + "movi v24.16b,#0; movi v25.16b,#0; movi v26.16b,#0; movi v27.16b,#0\n\t" + "movi v28.16b,#0; movi v29.16b,#0; movi v30.16b,#0; movi v31.16b,#0\n\t" + "beq 4f; cmp %[K],#2\n\t" + /** register v0-v3 for loading A, v4-v7 for loading B, x0 for transporting data */ + "ldp q0,q1,[%[sa]]; ldp q4,q5,[%[b1_]]\n\t" + "ldr d6,[%[b2_]]; ldr x0,[%[b2_],#8]\n\t" + "blt 3f; beq 2f\n\t" + "1:\n\t" + /** main loop with unroll_k = 2, specially designed for cortex-A53 NEON pipeline */ + "ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t" + "fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t" + "fmla v9.2d,v1.2d,v4.d[0]; prfm pldl1keep,[%[sa],#128]\n\t" + "fmla v10.2d,v0.2d,v4.d[1]\n\t" + "ldr d2,[%[sa],#32]; fmov v7.d[1],x0\n\t" + "fmla v11.2d,v1.2d,v4.d[1]; ldr x0,[%[sa],#40]\n\t" + "fmla v12.2d,v0.2d,v5.d[0]\n\t" + "fmla v13.2d,v1.2d,v5.d[0]\n\t" + "ldr d4,[%[b3_]]; fmov v2.d[1],x0\n\t" + "fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t" + "fmla v15.2d,v1.2d,v5.d[1]\n\t" + "fmla v16.2d,v0.2d,v6.d[0]\n\t" + "ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t" + "fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t" + "fmla v18.2d,v0.2d,v6.d[1]\n\t" + "fmla v19.2d,v1.2d,v6.d[1]\n\t" + "ldr d3,[%[sa],#48]; fmov v5.d[1],x0\n\t" + "fmla v20.2d,v0.2d,v7.d[0]; ldr x0,[%[sa],#56]\n\t" + "fmla v21.2d,v1.2d,v7.d[0]; add %[sa],%[sa],#64\n\t" + "fmla v22.2d,v0.2d,v7.d[1]\n\t" + "ldr d6,[%[b1_],#32]; fmov v3.d[1],x0\n\t" + "fmla v23.2d,v1.2d,v7.d[1]; ldr x0,[%[b1_],#40]\n\t" + "fmla v24.2d,v0.2d,v4.d[0]; prfm pldl1keep,[%[b1_],#128]\n\t" + "fmla v25.2d,v1.2d,v4.d[0]\n\t" + "ldr d7,[%[b1_],#48]; fmov v6.d[1],x0\n\t" + "fmla v26.2d,v0.2d,v4.d[1]; ldr x0,[%[b1_],#56]\n\t" + "fmla v27.2d,v1.2d,v4.d[1]; add %[b1_],%[b1_],#64\n\t" + "fmla v28.2d,v0.2d,v5.d[0]\n\t" + "ldr d4,[%[b2_],#32]; fmov v7.d[1],x0\n\t" + "fmla v29.2d,v1.2d,v5.d[0]; ldr x0,[%[b2_],#40]\n\t" + "fmla v30.2d,v0.2d,v5.d[1]; prfm pldl1keep,[%[b2_],#128]\n\t" + "fmla v31.2d,v1.2d,v5.d[1]\n\t" + "ldr d0,[%[sa]]; fmov v4.d[1],x0\n\t" + "fmla v8.2d,v2.2d,v6.d[0]; ldr x0,[%[sa],#8]\n\t" + "fmla v9.2d,v3.2d,v6.d[0]\n\t" + "fmla v10.2d,v2.2d,v6.d[1]\n\t" + "ldr d5,[%[b2_],#48]; fmov v0.d[1],x0\n\t" + "fmla v11.2d,v3.2d,v6.d[1]; ldr x0,[%[b2_],#56]\n\t" + "fmla v12.2d,v2.2d,v7.d[0]; add %[b2_],%[b2_],#64\n\t" + "fmla v13.2d,v3.2d,v7.d[0]\n\t" + "ldr d6,[%[b3_],#32]; fmov v5.d[1],x0\n\t" + "fmla v14.2d,v2.2d,v7.d[1]; ldr x0,[%[b3_],#40]\n\t" + "fmla v15.2d,v3.2d,v7.d[1]; prfm pldl1keep,[%[b3_],#128]\n\t" + "fmla v16.2d,v2.2d,v4.d[0]\n\t" + "ldr d7,[%[b3_],#48]; fmov v6.d[1],x0\n\t" + "fmla v17.2d,v3.2d,v4.d[0]; ldr x0,[%[b3_],#56]\n\t" + "fmla v18.2d,v2.2d,v4.d[1]; add %[b3_],%[b3_],#64\n\t" + "fmla v19.2d,v3.2d,v4.d[1]\n\t" + "ldr d1,[%[sa],#16]; fmov v7.d[1],x0\n\t" + "fmla v20.2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#24]\n\t" + "fmla v21.2d,v3.2d,v5.d[0]\n\t" + "fmla v22.2d,v2.2d,v5.d[1]\n\t" + "ldr d4,[%[b1_]]; fmov v1.d[1],x0\n\t" + "fmla v23.2d,v3.2d,v5.d[1]; ldr x0,[%[b1_],#8]\n\t" + "fmla v24.2d,v2.2d,v6.d[0]\n\t" + "fmla v25.2d,v3.2d,v6.d[0]\n\t" + "ldr d5,[%[b1_],#16]; fmov v4.d[1],x0\n\t" + "fmla v26.2d,v2.2d,v6.d[1]; ldr x0,[%[b1_],#24]\n\t" + "fmla v27.2d,v3.2d,v6.d[1]; sub %[K],%[K],#2\n\t" + "fmla v28.2d,v2.2d,v7.d[0]\n\t" + "ldr d6,[%[b2_]]; fmov v5.d[1],x0\n\t" + "fmla v29.2d,v3.2d,v7.d[0]; ldr x0,[%[b2_],#8]\n\t" + "fmla v30.2d,v2.2d,v7.d[1]; cmp %[K],#2\n\t" + "fmla v31.2d,v3.2d,v7.d[1]\n\t" + "bgt 1b; blt 3f\n\t" + "2:\n\t" + /** tail part with k = 2 */ + "ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t" + "fmla 
v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t" + "fmla v9.2d,v1.2d,v4.d[0]; prfm pldl1keep,[%[sa],#128]\n\t" + "fmla v10.2d,v0.2d,v4.d[1]\n\t" + "ldr d2,[%[sa],#32]; fmov v7.d[1],x0\n\t" + "fmla v11.2d,v1.2d,v4.d[1]; ldr x0,[%[sa],#40]\n\t" + "fmla v12.2d,v0.2d,v5.d[0]\n\t" + "fmla v13.2d,v1.2d,v5.d[0]\n\t" + "ldr d4,[%[b3_]]; fmov v2.d[1],x0\n\t" + "fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t" + "fmla v15.2d,v1.2d,v5.d[1]\n\t" + "fmla v16.2d,v0.2d,v6.d[0]\n\t" + "ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t" + "fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t" + "fmla v18.2d,v0.2d,v6.d[1]\n\t" + "fmla v19.2d,v1.2d,v6.d[1]\n\t" + "ldr d3,[%[sa],#48]; fmov v5.d[1],x0\n\t" + "fmla v20.2d,v0.2d,v7.d[0]; ldr x0,[%[sa],#56]\n\t" + "fmla v21.2d,v1.2d,v7.d[0]; add %[sa],%[sa],#64\n\t" + "fmla v22.2d,v0.2d,v7.d[1]\n\t" + "ldr d6,[%[b1_],#32]; fmov v3.d[1],x0\n\t" + "fmla v23.2d,v1.2d,v7.d[1]; ldr x0,[%[b1_],#40]\n\t" + "fmla v24.2d,v0.2d,v4.d[0]\n\t" + "fmla v25.2d,v1.2d,v4.d[0]\n\t" + "ldr d7,[%[b1_],#48]; fmov v6.d[1],x0\n\t" + "fmla v26.2d,v0.2d,v4.d[1]; ldr x0,[%[b1_],#56]\n\t" + "fmla v27.2d,v1.2d,v4.d[1]; add %[b1_],%[b1_],#64\n\t" + "fmla v28.2d,v0.2d,v5.d[0]\n\t" + "ldr d4,[%[b2_],#32]; fmov v7.d[1],x0\n\t" + "fmla v29.2d,v1.2d,v5.d[0]; ldr x0,[%[b2_],#40]\n\t" + "fmla v30.2d,v0.2d,v5.d[1]\n\t" + "fmla v31.2d,v1.2d,v5.d[1]\n\t" + "fmov v4.d[1],x0\n\t" + "fmla v8.2d,v2.2d,v6.d[0]\n\t" + "fmla v9.2d,v3.2d,v6.d[0]\n\t" + "fmla v10.2d,v2.2d,v6.d[1]\n\t" + "ldr d5,[%[b2_],#48]\n\t" + "fmla v11.2d,v3.2d,v6.d[1]; ldr x0,[%[b2_],#56]\n\t" + "fmla v12.2d,v2.2d,v7.d[0]; add %[b2_],%[b2_],#64\n\t" + "fmla v13.2d,v3.2d,v7.d[0]\n\t" + "ldr d6,[%[b3_],#32]; fmov v5.d[1],x0\n\t" + "fmla v14.2d,v2.2d,v7.d[1]; ldr x0,[%[b3_],#40]\n\t" + "fmla v15.2d,v3.2d,v7.d[1]\n\t" + "fmla v16.2d,v2.2d,v4.d[0]\n\t" + "ldr d7,[%[b3_],#48]; fmov v6.d[1],x0\n\t" + "fmla v17.2d,v3.2d,v4.d[0]; ldr x0,[%[b3_],#56]\n\t" + "fmla v18.2d,v2.2d,v4.d[1]; add %[b3_],%[b3_],#64\n\t" + "fmla v19.2d,v3.2d,v4.d[1]\n\t" + "fmov v7.d[1],x0\n\t" + "fmla v20.2d,v2.2d,v5.d[0]\n\t" + "fmla v21.2d,v3.2d,v5.d[0]\n\t" + "fmla v22.2d,v2.2d,v5.d[1]\n\t" + "fmla v23.2d,v3.2d,v5.d[1]\n\t" + "fmla v24.2d,v2.2d,v6.d[0]\n\t" + "fmla v25.2d,v3.2d,v6.d[0]\n\t" + "fmla v26.2d,v2.2d,v6.d[1]\n\t" + "fmla v27.2d,v3.2d,v6.d[1]; sub %[K],%[K],#2\n\t" + "fmla v28.2d,v2.2d,v7.d[0]\n\t" + "fmla v29.2d,v3.2d,v7.d[0]\n\t" + "fmla v30.2d,v2.2d,v7.d[1]\n\t" + "fmla v31.2d,v3.2d,v7.d[1]\n\t" + "b 4f\n\t" + "3:\n\t" + /** tail part with k = 1 */ + "ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t" + "fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t" + "fmla v9.2d,v1.2d,v4.d[0]; add %[b2_],%[b2_],#32\n\t" + "fmla v10.2d,v0.2d,v4.d[1]\n\t" + "fmov v7.d[1],x0\n\t" + "fmla v11.2d,v1.2d,v4.d[1]; add %[sa],%[sa],#32\n\t" + "fmla v12.2d,v0.2d,v5.d[0]; add %[b1_],%[b1_],#32\n\t" + "fmla v13.2d,v1.2d,v5.d[0]; sub %[K],%[K],#1\n\t" + "ldr d4,[%[b3_]]\n\t" + "fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t" + "fmla v15.2d,v1.2d,v5.d[1]\n\t" + "fmla v16.2d,v0.2d,v6.d[0]\n\t" + "ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t" + "fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t" + "fmla v18.2d,v0.2d,v6.d[1]; add %[b3_],%[b3_],#32\n\t" + "fmla v19.2d,v1.2d,v6.d[1]\n\t" + "fmov v5.d[1],x0\n\t" + "fmla v20.2d,v0.2d,v7.d[0]\n\t" + "fmla v21.2d,v1.2d,v7.d[0]\n\t" + "fmla v22.2d,v0.2d,v7.d[1]\n\t" + "fmla v23.2d,v1.2d,v7.d[1]\n\t" + "fmla v24.2d,v0.2d,v4.d[0]\n\t" + "fmla v25.2d,v1.2d,v4.d[0]\n\t" + "fmla v26.2d,v0.2d,v4.d[1]\n\t" + "fmla v27.2d,v1.2d,v4.d[1]\n\t" + "fmla 
v28.2d,v0.2d,v5.d[0]\n\t" + "fmla v29.2d,v1.2d,v5.d[0]\n\t" + "fmla v30.2d,v0.2d,v5.d[1]\n\t" + "fmla v31.2d,v1.2d,v5.d[1]\n\t" + /** store 4x12 elements to C */ + "4:\n\t" + "ldr d0,%[alpha]; add x0,%[C],%[LDC],LSL #3\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" + "fmla v1.2d,v8.2d,v0.d[0]; fmla v2.2d,v9.2d,v0.d[0]\n\t" + "fmla v3.2d,v10.2d,v0.d[0]; fmla v4.2d,v11.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" + "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" + "fmla v1.2d,v12.2d,v0.d[0]; fmla v2.2d,v13.2d,v0.d[0]\n\t" + "fmla v3.2d,v14.2d,v0.d[0]; fmla v4.2d,v15.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" + "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" + "fmla v1.2d,v16.2d,v0.d[0]; fmla v2.2d,v17.2d,v0.d[0]\n\t" + "fmla v3.2d,v18.2d,v0.d[0]; fmla v4.2d,v19.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" + "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" + "fmla v1.2d,v20.2d,v0.d[0]; fmla v2.2d,v21.2d,v0.d[0]\n\t" + "fmla v3.2d,v22.2d,v0.d[0]; fmla v4.2d,v23.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" + "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" + "fmla v1.2d,v24.2d,v0.d[0]; fmla v2.2d,v25.2d,v0.d[0]\n\t" + "fmla v3.2d,v26.2d,v0.d[0]; fmla v4.2d,v27.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" + "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" + "fmla v1.2d,v28.2d,v0.d[0]; fmla v2.2d,v29.2d,v0.d[0]\n\t" + "fmla v3.2d,v30.2d,v0.d[0]; fmla v4.2d,v31.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; stp q3,q4,[x0]\n\t" + :[sa]"+r"(sa), [b1_]"+r"(b1_), [b2_]"+r"(b2_), [b3_]"+r"(b3_), [C]"+r"(C), [K]"+r"(K) + :[LDC]"r"(LDC), [alpha]"m"(alpha) + :"cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); +} + +/********************************************************** + * Operation: + C[0] += alpha * up[0]; C[1] += alpha * up[1]; + C[2] += alpha * down[0]; C[3] += alpha * down[1]; + *********************************************************/ +static inline void dgemm_store_m4n1(FLOAT *C, float64x2_t up, float64x2_t down, FLOAT alpha) { + float64x2_t t1 = vld1q_f64(C), t2 = vld1q_f64(C + 2); + t1 = vfmaq_n_f64(t1, up, alpha); + t2 = vfmaq_n_f64(t2, down, alpha); + vst1q_f64(C, t1); + vst1q_f64(C + 2, t2); +} + +/********************************************************** + * Function: dgemm_kernel_arm64_4x4_m4n8 + * Operation: C[4][8] += alpha * sa[4][K] * sb[K][8] + * Matrix orders: + * sa: column-major (leading dimension == 4) + * sb: 2 concatenated row-major 4-column submatrices + * C: column-major (leading dimension == LDC) + *********************************************************/ +static inline void dgemm_kernel_arm64_4x4_m4n8( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + const FLOAT *b1_ = sb; + const FLOAT *b2_ = sb + K * 4; + + /** register naming: c + m_id + n_id, m_id=1~2, n_id=1~8 */ + float64x2_t c11, c12, c13, c14, c15, c16, c17, c18; + float64x2_t c21, c22, c23, c24, c25, c26, c27, c28; + c11 = c12 = c13 = c14 = c15 = c16 = c17 = c18 = vdupq_n_f64(0); + c21 = c22 = c23 = c24 = c25 = c26 = c27 = c28 = vdupq_n_f64(0); + + for (; K; K--) { + float64x2_t 
a1 = vld1q_f64(sa); + float64x2_t a2 = vld1q_f64(sa + 2); sa += 4; + + float64x2_t b1 = vld1q_f64(b1_); + c11 = vfmaq_laneq_f64(c11, a1, b1, 0); + c21 = vfmaq_laneq_f64(c21, a2, b1, 0); + c12 = vfmaq_laneq_f64(c12, a1, b1, 1); + c22 = vfmaq_laneq_f64(c22, a2, b1, 1); + + float64x2_t b2 = vld1q_f64(b1_ + 2); b1_ += 4; + c13 = vfmaq_laneq_f64(c13, a1, b2, 0); + c23 = vfmaq_laneq_f64(c23, a2, b2, 0); + c14 = vfmaq_laneq_f64(c14, a1, b2, 1); + c24 = vfmaq_laneq_f64(c24, a2, b2, 1); + + float64x2_t b3 = vld1q_f64(b2_); + c15 = vfmaq_laneq_f64(c15, a1, b3, 0); + c25 = vfmaq_laneq_f64(c25, a2, b3, 0); + c16 = vfmaq_laneq_f64(c16, a1, b3, 1); + c26 = vfmaq_laneq_f64(c26, a2, b3, 1); + + float64x2_t b4 = vld1q_f64(b2_ + 2); b2_ += 4; + c17 = vfmaq_laneq_f64(c17, a1, b4, 0); + c27 = vfmaq_laneq_f64(c27, a2, b4, 0); + c18 = vfmaq_laneq_f64(c18, a1, b4, 1); + c28 = vfmaq_laneq_f64(c28, a2, b4, 1); + } + + dgemm_store_m4n1(C, c11, c21, alpha); C += LDC; + dgemm_store_m4n1(C, c12, c22, alpha); C += LDC; + dgemm_store_m4n1(C, c13, c23, alpha); C += LDC; + dgemm_store_m4n1(C, c14, c24, alpha); C += LDC; + dgemm_store_m4n1(C, c15, c25, alpha); C += LDC; + dgemm_store_m4n1(C, c16, c26, alpha); C += LDC; + dgemm_store_m4n1(C, c17, c27, alpha); C += LDC; + dgemm_store_m4n1(C, c18, c28, alpha); +} + +/********************************************************** + * Function: dgemm_kernel_arm64_4x4_m4n4 + * Operation: C[4][4] += alpha * sa[4][K] * sb[K][4] + * Matrix orders: + * sa: column-major (leading dimension == 4) + * sb: row-major (leading dimension == 4) + * C: column-major (leading dimension == LDC) + *********************************************************/ +static inline void dgemm_kernel_arm64_4x4_m4n4( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c11, c21, c12, c22, c13, c23, c14, c24; + c11 = c21 = c12 = c22 = c13 = c23 = c14 = c24 = vdupq_n_f64(0); + + for (; K; K--) { + float64x2_t a1 = vld1q_f64(sa); + float64x2_t a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1 = vld1q_f64(sb); + float64x2_t b2 = vld1q_f64(sb + 2); sb += 4; + c11 = vfmaq_laneq_f64(c11, a1, b1, 0); + c21 = vfmaq_laneq_f64(c21, a2, b1, 0); + c12 = vfmaq_laneq_f64(c12, a1, b1, 1); + c22 = vfmaq_laneq_f64(c22, a2, b1, 1); + c13 = vfmaq_laneq_f64(c13, a1, b2, 0); + c23 = vfmaq_laneq_f64(c23, a2, b2, 0); + c14 = vfmaq_laneq_f64(c14, a1, b2, 1); + c24 = vfmaq_laneq_f64(c24, a2, b2, 1); + } + + dgemm_store_m4n1(C, c11, c21, alpha); C += LDC; + dgemm_store_m4n1(C, c12, c22, alpha); C += LDC; + dgemm_store_m4n1(C, c13, c23, alpha); C += LDC; + dgemm_store_m4n1(C, c14, c24, alpha); +} + +static inline void dgemm_kernel_arm64_4x4_m4n2( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c11_1, c11_2, c21_1, c21_2, c12_1, c12_2, c22_1, c22_2; + c11_1 = c11_2 = c21_1 = c21_2 = c12_1 = c12_2 = c22_1 = c22_2 = vdupq_n_f64(0); + + for (; K > 1; K -= 2) { + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + float64x2_t a1_1 = vld1q_f64(sa), a2_1 = vld1q_f64(sa + 2), + a1_2 = vld1q_f64(sa + 4), a2_2 = vld1q_f64(sa + 6); sa += 8; + c11_1 = vfmaq_laneq_f64(c11_1, a1_1, b1, 0); + c21_1 = vfmaq_laneq_f64(c21_1, a2_1, b1, 0); + c12_1 = vfmaq_laneq_f64(c12_1, a1_1, b1, 1); + c22_1 = vfmaq_laneq_f64(c22_1, a2_1, b1, 1); + c11_2 = vfmaq_laneq_f64(c11_2, a1_2, b2, 0); + c21_2 = vfmaq_laneq_f64(c21_2, a2_2, b2, 0); + c12_2 = vfmaq_laneq_f64(c12_2, a1_2, b2, 1); + c22_2 = vfmaq_laneq_f64(c22_2, a2_2, b2, 1); + } + c11_1 = 
vaddq_f64(c11_1, c11_2); + c21_1 = vaddq_f64(c21_1, c21_2); + c12_1 = vaddq_f64(c12_1, c12_2); + c22_1 = vaddq_f64(c22_1, c22_2); + if (K) { + float64x2_t b1 = vld1q_f64(sb); sb += 2; + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + c11_1 = vfmaq_laneq_f64(c11_1, a1, b1, 0); + c21_1 = vfmaq_laneq_f64(c21_1, a2, b1, 0); + c12_1 = vfmaq_laneq_f64(c12_1, a1, b1, 1); + c22_1 = vfmaq_laneq_f64(c22_1, a2, b1, 1); + } + + dgemm_store_m4n1(C, c11_1, c21_1, alpha); C += LDC; + dgemm_store_m4n1(C, c12_1, c22_1, alpha); +} + +static inline void dgemm_kernel_arm64_4x4_m4n1( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c11_1, c11_2, c21_1, c21_2; + c11_1 = c11_2 = c21_1 = c21_2 = vdupq_n_f64(0); + + for (; K > 1; K -= 2) { + float64x2_t b1 = vld1q_f64(sb); sb += 2; + c11_1 = vfmaq_laneq_f64(c11_1, vld1q_f64(sa), b1, 0); + c21_1 = vfmaq_laneq_f64(c21_1, vld1q_f64(sa + 2), b1, 0); + c11_2 = vfmaq_laneq_f64(c11_2, vld1q_f64(sa + 4), b1, 1); + c21_2 = vfmaq_laneq_f64(c21_2, vld1q_f64(sa + 6), b1, 1); + sa += 8; + } + c11_1 = vaddq_f64(c11_1, c11_2); + c21_1 = vaddq_f64(c21_1, c21_2); + if (K) { + double b1 = *sb++; + c11_1 = vfmaq_n_f64(c11_1, vld1q_f64(sa), b1); + c21_1 = vfmaq_n_f64(c21_1, vld1q_f64(sa + 2), b1); + sa += 4; + } + + dgemm_store_m4n1(C, c11_1, c21_1, alpha); +} + +static inline void dgemm_kernel_arm64_4x4_m2n12( + const FLOAT *sa, const FLOAT *sb, FLOAT *c, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c01, c02, c03, c04, c11, c12, c13, c14, c21, c22, c23, c24; + c01 = c02 = c03 = c04 = c11 = c12 = c13 = c14 = + c21 = c22 = c23 = c24 = vdupq_n_f64(0); + + const FLOAT *b1_ = sb; + const FLOAT *b2_ = sb + 4 * K; + const FLOAT *b3_ = b2_ + 4 * K; + + for (; K; K--) { + const float64x2_t a1 = vld1q_f64(sa); sa += 2; + + float64x2_t b1 = vld1q_f64(b1_), b2 = vld1q_f64(b1_ + 2); b1_ += 4; + c01 = vfmaq_laneq_f64(c01, a1, b1, 0); + c02 = vfmaq_laneq_f64(c02, a1, b1, 1); + c03 = vfmaq_laneq_f64(c03, a1, b2, 0); + c04 = vfmaq_laneq_f64(c04, a1, b2, 1); + + b1 = vld1q_f64(b2_); b2 = vld1q_f64(b2_ + 2); b2_ += 4; + c11 = vfmaq_laneq_f64(c11, a1, b1, 0); + c12 = vfmaq_laneq_f64(c12, a1, b1, 1); + c13 = vfmaq_laneq_f64(c13, a1, b2, 0); + c14 = vfmaq_laneq_f64(c14, a1, b2, 1); + + b1 = vld1q_f64(b3_); b2 = vld1q_f64(b3_ + 2); b3_ += 4; + c21 = vfmaq_laneq_f64(c21, a1, b1, 0); + c22 = vfmaq_laneq_f64(c22, a1, b1, 1); + c23 = vfmaq_laneq_f64(c23, a1, b2, 0); + c24 = vfmaq_laneq_f64(c24, a1, b2, 1); + } + + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c01, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c02, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c03, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c04, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c11, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c12, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c13, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c14, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c21, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c22, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c23, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c24, alpha)); +} + +static inline void dgemm_kernel_arm64_4x4_m2n8( + const FLOAT *sa, const FLOAT *sb, FLOAT *c, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c01, c02, c03, c04, c11, c12, c13, c14; + c01 = c02 = c03 = c04 = c11 = c12 = c13 = c14 = 
vdupq_n_f64(0); + + const FLOAT *b1_ = sb; + const FLOAT *b2_ = sb + 4 * K; + + for (; K; K--) { + const float64x2_t a1 = vld1q_f64(sa); sa += 2; + + float64x2_t b1 = vld1q_f64(b1_), b2 = vld1q_f64(b1_ + 2); b1_ += 4; + c01 = vfmaq_laneq_f64(c01, a1, b1, 0); + c02 = vfmaq_laneq_f64(c02, a1, b1, 1); + c03 = vfmaq_laneq_f64(c03, a1, b2, 0); + c04 = vfmaq_laneq_f64(c04, a1, b2, 1); + + b1 = vld1q_f64(b2_); b2 = vld1q_f64(b2_ + 2); b2_ += 4; + c11 = vfmaq_laneq_f64(c11, a1, b1, 0); + c12 = vfmaq_laneq_f64(c12, a1, b1, 1); + c13 = vfmaq_laneq_f64(c13, a1, b2, 0); + c14 = vfmaq_laneq_f64(c14, a1, b2, 1); + } + + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c01, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c02, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c03, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c04, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c11, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c12, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c13, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c14, alpha)); +} + +static inline void dgemm_kernel_arm64_4x4_m2n4( + const FLOAT *sa, const FLOAT *sb, FLOAT *c, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1_1, c1_2, c2_1, c2_2, c3_1, c3_2, c4_1, c4_2; + c1_1 = c1_2 = c2_1 = c2_2 = c3_1 = c3_2 = c4_1 = c4_2 = vdupq_n_f64(0); + + for (; K > 1; K -= 2) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1_1 = vld1q_f64(sb), b2_1 = vld1q_f64(sb + 2); + float64x2_t b1_2 = vld1q_f64(sb + 4), b2_2 = vld1q_f64(sb + 6); sb += 8; + + c1_1 = vfmaq_laneq_f64(c1_1, a1, b1_1, 0); + c2_1 = vfmaq_laneq_f64(c2_1, a1, b1_1, 1); + c3_1 = vfmaq_laneq_f64(c3_1, a1, b2_1, 0); + c4_1 = vfmaq_laneq_f64(c4_1, a1, b2_1, 1); + + c1_2 = vfmaq_laneq_f64(c1_2, a2, b1_2, 0); + c2_2 = vfmaq_laneq_f64(c2_2, a2, b1_2, 1); + c3_2 = vfmaq_laneq_f64(c3_2, a2, b2_2, 0); + c4_2 = vfmaq_laneq_f64(c4_2, a2, b2_2, 1); + } + c1_1 = vaddq_f64(c1_1, c1_2); + c2_1 = vaddq_f64(c2_1, c2_2); + c3_1 = vaddq_f64(c3_1, c3_2); + c4_1 = vaddq_f64(c4_1, c4_2); + if (K) { + float64x2_t a1 = vld1q_f64(sa); sa += 2; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0); + c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1); + c3_1 = vfmaq_laneq_f64(c3_1, a1, b2, 0); + c4_1 = vfmaq_laneq_f64(c4_1, a1, b2, 1); + } + + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1_1, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c2_1, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c3_1, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c4_1, alpha)); +} + +static inline void dgemm_kernel_arm64_4x4_m2n2( + const FLOAT *sa, const FLOAT *sb, FLOAT *c, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1_1, c1_2, c2_1, c2_2; + c1_1 = c1_2 = c2_1 = c2_2 = vdupq_n_f64(0); + + for (; K > 1; K -= 2) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + + c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0); + c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1); + c1_2 = vfmaq_laneq_f64(c1_2, a2, b2, 0); + c2_2 = vfmaq_laneq_f64(c2_2, a2, b2, 1); + } + c1_1 = vaddq_f64(c1_1, c1_2); + c2_1 = vaddq_f64(c2_1, c2_2); + if (K) { + float64x2_t a1 = vld1q_f64(sa); sa += 2; + float64x2_t b1 = vld1q_f64(sb); sb += 2; + c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0); + c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1); + } + + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1_1, 
alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c2_1, alpha)); +} + +static inline void dgemm_kernel_arm64_4x4_m2n1( + const FLOAT *sa, const FLOAT *sb, FLOAT *c, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = vdupq_n_f64(0); + + for (; K > 3; K -= 4) { + float64x2_t b12 = vld1q_f64(sb), b34 = vld1q_f64(sb + 2); sb += 4; + c1 = vfmaq_laneq_f64(c1, vld1q_f64(sa), b12, 0); + c2 = vfmaq_laneq_f64(c2, vld1q_f64(sa + 2), b12, 1); + c3 = vfmaq_laneq_f64(c3, vld1q_f64(sa + 4), b34, 0); + c4 = vfmaq_laneq_f64(c4, vld1q_f64(sa + 6), b34, 1); + sa += 8; + } + c1 = vaddq_f64(c1, c2); + c3 = vaddq_f64(c3, c4); + c1 = vaddq_f64(c1, c3); + for (; K; K--) { + c1 = vfmaq_n_f64(c1, vld1q_f64(sa), *sb++); + sa += 2; + } + + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1, alpha)); +} + +static inline void dgemm_store_m1n2(double *C, float64x2_t vc, + double alpha, BLASLONG LDC) { + double c0 = vgetq_lane_f64(vc, 0); + double c1 = vgetq_lane_f64(vc, 1); + C[0] += c0 * alpha; + C[LDC] += c1 * alpha; +} + +static inline void dgemm_kernel_arm64_4x4_m1n12( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1, c2, c3, c4, c5, c6; + c1 = c2 = c3 = c4 = c5 = c6 = vdupq_n_f64(0); + + const double *b1_ = sb; + const double *b2_ = sb + 4 * K; + const double *b3_ = b2_ + 4 * K; + + for (; K; K--) { + const double a1 = *sa++; + c1 = vfmaq_n_f64(c1, vld1q_f64(b1_), a1); + c2 = vfmaq_n_f64(c2, vld1q_f64(b1_ + 2), a1); b1_ += 4; + c3 = vfmaq_n_f64(c3, vld1q_f64(b2_), a1); + c4 = vfmaq_n_f64(c4, vld1q_f64(b2_ + 2), a1); b2_ += 4; + c5 = vfmaq_n_f64(c5, vld1q_f64(b3_), a1); + c6 = vfmaq_n_f64(c6, vld1q_f64(b3_ + 2), a1); b3_ += 4; + } + + dgemm_store_m1n2(C, c1, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c2, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c3, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c4, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c5, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c6, alpha, LDC); +} + +static inline void dgemm_kernel_arm64_4x4_m1n8( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = vdupq_n_f64(0); + + const double *b1_ = sb; + const double *b2_ = sb + 4 * K; + + for (; K; K--) { + const double a1 = *sa++; + c1 = vfmaq_n_f64(c1, vld1q_f64(b1_), a1); + c2 = vfmaq_n_f64(c2, vld1q_f64(b1_ + 2), a1); b1_ += 4; + c3 = vfmaq_n_f64(c3, vld1q_f64(b2_), a1); + c4 = vfmaq_n_f64(c4, vld1q_f64(b2_ + 2), a1); b2_ += 4; + } + + dgemm_store_m1n2(C, c1, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c2, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c3, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c4, alpha, LDC); +} + +static inline void dgemm_kernel_arm64_4x4_m1n4( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1_1, c1_2, c2_1, c2_2; + c1_1 = c1_2 = c2_1 = c2_2 = vdupq_n_f64(0); + + for (; K > 1; K -= 2) { + float64x2_t a1 = vld1q_f64(sa); sa += 2; + c1_1 = vfmaq_laneq_f64(c1_1, vld1q_f64(sb), a1, 0); + c2_1 = vfmaq_laneq_f64(c2_1, vld1q_f64(sb + 2), a1, 0); + c1_2 = vfmaq_laneq_f64(c1_2, vld1q_f64(sb + 4), a1, 1); + c2_2 = vfmaq_laneq_f64(c2_2, vld1q_f64(sb + 6), a1, 1); sb += 8; + } + c1_1 = vaddq_f64(c1_1, c1_2); + c2_1 = vaddq_f64(c2_1, c2_2); + if (K) { + double a1 = *sa++; + c1_1 = vfmaq_n_f64(c1_1, vld1q_f64(sb), a1); + c2_1 = vfmaq_n_f64(c2_1, vld1q_f64(sb + 2), a1); + sb += 4; + } + + dgemm_store_m1n2(C, c1_1, 
alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c2_1, alpha, LDC); +} + +static inline void dgemm_kernel_arm64_4x4_m1n2( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = vdupq_n_f64(0); + + for (; K > 3; K -= 4) { + float64x2_t a12 = vld1q_f64(sa), a34 = vld1q_f64(sa + 2); sa += 4; + c1 = vfmaq_laneq_f64(c1, vld1q_f64(sb), a12, 0); + c2 = vfmaq_laneq_f64(c2, vld1q_f64(sb + 2), a12, 1); + c3 = vfmaq_laneq_f64(c3, vld1q_f64(sb + 4), a34, 0); + c4 = vfmaq_laneq_f64(c4, vld1q_f64(sb + 6), a34, 1); sb += 8; + } + c1 = vaddq_f64(c1, c2); + c3 = vaddq_f64(c3, c4); + c1 = vaddq_f64(c1, c3); + for (; K; K--) { + c1 = vfmaq_n_f64(c1, vld1q_f64(sb), *sa++); + sb += 2; + } + + dgemm_store_m1n2(C, c1, alpha, LDC); +} + +static inline void dgemm_kernel_arm64_4x4_m1n1( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = vdupq_n_f64(0); + + for (; K > 7; K -= 8) { + c1 = vfmaq_f64(c1, vld1q_f64(sb), vld1q_f64(sa)); + c2 = vfmaq_f64(c2, vld1q_f64(sb + 2), vld1q_f64(sa + 2)); + c3 = vfmaq_f64(c3, vld1q_f64(sb + 4), vld1q_f64(sa + 4)); + c4 = vfmaq_f64(c4, vld1q_f64(sb + 6), vld1q_f64(sa + 6)); + sa += 8; sb += 8; + } + c1 = vaddq_f64(c1, c2); + c3 = vaddq_f64(c3, c4); + c1 = vaddq_f64(c1, c3); + double cs1 = vpaddd_f64(c1); + for (; K; K--) { + cs1 += (*sa++) * (*sb++); + } + + C[0] += cs1 * alpha; +} + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, + FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) { + + for (; N >= 12; N -= 12) { + BLASLONG m_left = M; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + for (; m_left >= 4; m_left -= 4) { + dgemm_kernel_arm_cortex_a53_4x4_m4n12(a_, sb, c_, K, LDC, alpha); + c_ += 4; + a_ += 4 * K; + } + if (m_left >= 2) { + m_left -= 2; + dgemm_kernel_arm64_4x4_m2n12(a_, sb, c_, K, LDC, alpha); + c_ += 2; + a_ += 2 * K; + } + if (m_left) { + dgemm_kernel_arm64_4x4_m1n12(a_, sb, c_, K, LDC, alpha); + } + sb += 12 * K; + C += 12 * LDC; + } + + if (N >= 8) { + N -= 8; + BLASLONG m_left = M; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + for (; m_left >= 4; m_left -= 4) { + dgemm_kernel_arm64_4x4_m4n8(a_, sb, c_, K, LDC, alpha); + c_ += 4; + a_ += 4 * K; + } + if (m_left >= 2) { + m_left -= 2; + dgemm_kernel_arm64_4x4_m2n8(a_, sb, c_, K, LDC, alpha); + c_ += 2; + a_ += 2 * K; + } + if (m_left) { + dgemm_kernel_arm64_4x4_m1n8(a_, sb, c_, K, LDC, alpha); + } + sb += 8 * K; + C += 8 * LDC; + } else if (N >= 4) { + N -= 4; + BLASLONG m_left = M; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + for (; m_left >= 4; m_left -= 4) { + dgemm_kernel_arm64_4x4_m4n4(a_, sb, c_, K, LDC, alpha); + c_ += 4; + a_ += 4 * K; + } + if (m_left >= 2) { + m_left -= 2; + dgemm_kernel_arm64_4x4_m2n4(a_, sb, c_, K, LDC, alpha); + c_ += 2; + a_ += 2 * K; + } + if (m_left) { + dgemm_kernel_arm64_4x4_m1n4(a_, sb, c_, K, LDC, alpha); + } + sb += 4 * K; + C += 4 * LDC; + } + + if (N >= 2) { + N -= 2; + BLASLONG m_left = M; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + for (; m_left >= 4; m_left -= 4) { + dgemm_kernel_arm64_4x4_m4n2(a_, sb, c_, K, LDC, alpha); + c_ += 4; + a_ += 4 * K; + } + if (m_left >= 2) { + m_left -= 2; + dgemm_kernel_arm64_4x4_m2n2(a_, sb, c_, K, LDC, alpha); + c_ += 2; + a_ += 2 * K; + } + if (m_left) { + dgemm_kernel_arm64_4x4_m1n2(a_, sb, c_, K, LDC, alpha); + } + sb += 2 * K; + C += 2 * LDC; + } + + if (N) { + BLASLONG m_left = M; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + for (; m_left >= 4; m_left -= 4) { + 
dgemm_kernel_arm64_4x4_m4n1(a_, sb, c_, K, LDC, alpha); + c_ += 4; + a_ += 4 * K; + } + if (m_left >= 2) { + m_left -= 2; + dgemm_kernel_arm64_4x4_m2n1(a_, sb, c_, K, LDC, alpha); + c_ += 2; + a_ += 2 * K; + } + if (m_left) { + dgemm_kernel_arm64_4x4_m1n1(a_, sb, c_, K, LDC, alpha); + } + } + return 0; +} + diff --git a/kernel/arm64/dgemm_kernel_sve_v1x8.S b/kernel/arm64/dgemm_kernel_sve_v1x8.S new file mode 100644 index 000000000..bbbd0fd95 --- /dev/null +++ b/kernel/arm64/dgemm_kernel_sve_v1x8.S @@ -0,0 +1,874 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA x16 +#define alpha x17 + +#define alpha0 d10 +#define alphaZ z2.d + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0 +//v01 pA0_1 +//v02 ALPHA0 +//v03 +//v04 +//v05 +//v06 +//v07 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x8 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x8_I + ld1d z0.d, p1/z, [pA] + ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one + add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M1 + ld1d z1.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm 
PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M2 + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + fmla z16.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_E + fmla z16.d, p1/m, z1.d, z8.d + fmla z17.d, p1/m, z1.d, z9.d + fmla z18.d, p1/m, z1.d, z10.d + fmla z19.d, p1/m, z1.d, z11.d + fmla z20.d, p1/m, z1.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + fmla z22.d, p1/m, z1.d, z14.d + fmla z23.d, p1/m, z1.d, z15.d +.endm + +.macro KERNELv1x8_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + fmla z18.d, p1/m, z0.d, z10.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z19.d, p1/m, z0.d, z11.d + fmla z20.d, p1/m, z0.d, z12.d + fmla z21.d, p1/m, z0.d, z13.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.d, p1/m, z0.d, z14.d + fmla z23.d, p1/m, z0.d, z15.d + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z26.d, p1/z, [pCRow2] + fmla z26.d, p1/m, z18.d, alphaZ + st1d z26.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z27.d, p1/z, [pCRow1] + fmla z27.d, p1/m, z19.d, alphaZ + st1d z27.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z28.d, p1/z, [pCRow2] + fmla z28.d, p1/m, z20.d, alphaZ + st1d z28.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z29.d, p1/z, [pCRow1] + fmla z29.d, p1/m, z21.d, alphaZ + st1d z29.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z30.d, p1/z, [pCRow2] + fmla z30.d, p1/m, z22.d, alphaZ + st1d z30.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z31.d, p1/z, [pCRow1] + fmla z31.d, p1/m, z23.d, alphaZ + st1d z31.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x4_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + add pB, pB, 32 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + 
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z18.d, p1/m, z0.d, z10.d + fmla z19.d, p1/m, z0.d, z11.d + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z26.d, p1/z, [pCRow2] + fmla z26.d, p1/m, z18.d, alphaZ + st1d z26.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z27.d, p1/z, [pCRow1] + fmla z27.d, p1/m, z19.d, alphaZ + st1d z27.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z17.d, p1/m, z0.d, z9.d + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.d, #0 +.endm + +.macro KERNELv1x1_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + + add pB, pB, 8 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, d0 + dup alphaZ, alpha + + lsl LDC, LDC, #3 // ldc = ldc * 8 + ptrue p0.d // create true predicate + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Ldgemm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Ldgemm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L8_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. 
This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.d, counterI, origM + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Ldgemm_kernel_L8_Mv1_20: + + mov pB, origPB + INITv1x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? + blt .Ldgemm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldgemm_kernel_L8_Mv1_22a + + .align 5 +.Ldgemm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L8_Mv1_22 + + .align 5 +.Ldgemm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Ldgemm_kernel_L8_Mv1_44 + + .align 5 +.Ldgemm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Ldgemm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Ldgemm_kernel_L8_Mv1_44 + +.Ldgemm_kernel_L8_Mv1_40: + + INITv1x8 + +.Ldgemm_kernel_L8_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L8_Mv1_100 + + .align 5 +.Ldgemm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L8_Mv1_46 + +.Ldgemm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +.Ldgemm_kernel_L8_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + b.any .Ldgemm_kernel_L8_Mv1_20 + +.Ldgemm_kernel_L8_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 8 * 8 + + subs counterJ, counterJ , #1 // j-- + bgt .Ldgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ + + .align 5 +.Ldgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Ldgemm_kernel_L2_BEGIN + + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L4_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
+ ble .Ldgemm_kernel_L4_Mv1_44 + + .align 5 +.Ldgemm_kernel_L4_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L4_Mv1_22 + +.Ldgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L4_Mv1_100 + + .align 5 +.Ldgemm_kernel_L4_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L4_Mv1_46 + +.Ldgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Ldgemm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L4_Mv1_20 + + +.Ldgemm_kernel_L4_END: + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Ldgemm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Ldgemm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldgemm_kernel_L2_Mv1_20: + + mov pB, origPB + INITv1x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldgemm_kernel_L2_Mv1_44 + + .align 5 +.Ldgemm_kernel_L2_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L2_Mv1_22 + +.Ldgemm_kernel_L2_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L2_Mv1_100 + + .align 5 +.Ldgemm_kernel_L2_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L2_Mv1_46 + +.Ldgemm_kernel_L2_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x2 + +.Ldgemm_kernel_L2_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L2_Mv1_20 + + +.Ldgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Ldgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Ldgemm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldgemm_kernel_L1_Mv1_20: + + mov pB, origPB + INITv1x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
+ ble .Ldgemm_kernel_L1_Mv1_44 + + .align 5 +.Ldgemm_kernel_L1_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_22 + +.Ldgemm_kernel_L1_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L1_Mv1_100 + + .align 5 +.Ldgemm_kernel_L1_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_46 + +.Ldgemm_kernel_L1_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x1 + +.Ldgemm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L1_Mv1_20 + + +.Ldgemm_kernel_L1_END: + +/******************************************************************************/ + +.Ldgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dgemm_kernel_sve_v2x8.S b/kernel/arm64/dgemm_kernel_sve_v2x8.S new file mode 100644 index 000000000..023d5ba92 --- /dev/null +++ b/kernel/arm64/dgemm_kernel_sve_v2x8.S @@ -0,0 +1,1683 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +/* This is an SVE dgemm kernel with size 2*SVE_LEN x 8. +However, the data layout is the same as for the kernel 1*SVE_LEN x 8. +This means that we sweep two panels of packed A when iterating in a loop over K. 
+With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA1 x16 +#define pA2 x17 +#define alpha x18 +#define vec_len x19 +#define vec_lenx2 x20 + +#define alpha0 d10 +#define alphaZ z7.d + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA1 +// 17 pA1 +// 18 must save alpha +// 19 must save vec_len +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA10_0 +//v01 pA10_1 +//v02 pA20_0 +//v03 pA20_1 +//v04 +//v05 +//v06 +//v07 ALPHA0 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 +//v24 must save C8 +//v25 must save C9 +//v26 must save C10 +//v27 must save C11 +//v28 must save C12 +//v29 must save C13 +//v30 must save C14 +//v31 must save C15 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv2x8 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 + dup z24.d, #0 + dup z25.d, #0 + dup z26.d, #0 + dup z27.d, #0 + dup z28.d, #0 + dup z29.d, #0 + dup z30.d, #0 + dup z31.d, #0 +.endm + +.macro KERNELv2x8_I + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + ld1d z2.d, p0/z, [pA1, vec_len, lsl #3] + ld1d z3.d, p0/z, [pA2, vec_len, lsl #3] + add pA1, pA1, vec_len, lsl #4 // pA1 = pA1 + vec_len * 8 *2 + add pA2, pA2, vec_len, lsl #4 // pA1 = pA1 + vec_len * 8 *2 + + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z20.d, p0/m, z0.d, z10.d + fmla z21.d, p0/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z24.d, p0/m, z0.d, z12.d + fmla z25.d, p0/m, z1.d, z12.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z26.d, p0/m, z0.d, z13.d + fmla z27.d, p0/m, z1.d, z13.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + ld1rd z13.d, p0/z, [pB, 40] + fmla z28.d, p0/m, z0.d, z14.d + fmla 
z29.d, p0/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z30.d, p0/m, z0.d, z15.d + fmla z31.d, p0/m, z1.d, z15.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64] + + add pB, pB, 64 +.endm + +.macro KERNELv2x8_M1 + ld1d z2.d, p0/z, [pA1] + ld1d z3.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z20.d, p0/m, z0.d, z10.d + fmla z21.d, p0/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z24.d, p0/m, z0.d, z12.d + fmla z25.d, p0/m, z1.d, z12.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z26.d, p0/m, z0.d, z13.d + fmla z27.d, p0/m, z1.d, z13.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + ld1rd z13.d, p0/z, [pB, 40] + fmla z28.d, p0/m, z0.d, z14.d + fmla z29.d, p0/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z30.d, p0/m, z0.d, z15.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + fmla z31.d, p0/m, z1.d, z15.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv2x8_M2 + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 2 * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 2 * 8 + + fmla z16.d, p0/m, z2.d, z8.d + fmla z17.d, p0/m, z3.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z18.d, p0/m, z2.d, z9.d + fmla z19.d, p0/m, z3.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z20.d, p0/m, z2.d, z10.d + fmla z21.d, p0/m, z3.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z22.d, p0/m, z2.d, z11.d + fmla z23.d, p0/m, z3.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z24.d, p0/m, z2.d, z12.d + fmla z25.d, p0/m, z3.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z26.d, p0/m, z2.d, z13.d + fmla z27.d, p0/m, z3.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z28.d, p0/m, z2.d, z14.d + fmla z29.d, p0/m, z3.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z30.d, p0/m, z2.d, z15.d + fmla z31.d, p0/m, z3.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv2x8_E + fmla z16.d, p0/m, z2.d, z8.d + fmla z17.d, p0/m, z3.d, z8.d + fmla z18.d, p0/m, z2.d, z9.d + fmla z19.d, p0/m, z3.d, z9.d + fmla z20.d, p0/m, z2.d, z10.d + fmla z21.d, p0/m, z3.d, z10.d + fmla z22.d, p0/m, z2.d, z11.d + fmla z23.d, p0/m, z3.d, z11.d + fmla z24.d, p0/m, z2.d, z12.d + fmla z25.d, p0/m, z3.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z26.d, p0/m, z2.d, z13.d + fmla z27.d, p0/m, z3.d, z13.d + fmla z28.d, p0/m, z2.d, z14.d + fmla z29.d, p0/m, z3.d, z14.d + fmla z30.d, p0/m, z2.d, z15.d + fmla z31.d, p0/m, z3.d, z15.d +.endm + +.macro KERNELv2x8_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + fmla z20.d, p0/m, z0.d, z10.d 
+ prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z21.d, p0/m, z1.d, z10.d + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d + fmla z24.d, p0/m, z0.d, z12.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + fmla z25.d, p0/m, z1.d, z12.d + fmla z26.d, p0/m, z0.d, z13.d + fmla z27.d, p0/m, z1.d, z13.d + fmla z28.d, p0/m, z0.d, z14.d + fmla z29.d, p0/m, z1.d, z14.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z30.d, p0/m, z0.d, z15.d + fmla z31.d, p0/m, z1.d, z15.d +.endm + +.macro SAVEv2x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, #1, mul vl] + fmla z10.d, p0/m, z18.d, alphaZ + fmla z11.d, p0/m, z19.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z12.d, p0/z, [pCRow2] + ld1d z13.d, p0/z, [pCRow2, #1, mul vl] + fmla z12.d, p0/m, z20.d, alphaZ + fmla z13.d, p0/m, z21.d, alphaZ + st1d z12.d, p0, [pCRow2] + st1d z13.d, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z14.d, p0/z, [pCRow1] + ld1d z15.d, p0/z, [pCRow1, #1, mul vl] + fmla z14.d, p0/m, z22.d, alphaZ + fmla z15.d, p0/m, z23.d, alphaZ + st1d z14.d, p0, [pCRow1] + st1d z15.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z8.d, p0/z, [pCRow2] + ld1d z9.d, p0/z, [pCRow2, #1, mul vl] + fmla z8.d, p0/m, z24.d, alphaZ + fmla z9.d, p0/m, z25.d, alphaZ + st1d z8.d, p0, [pCRow2] + st1d z9.d, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, #1, mul vl] + fmla z10.d, p0/m, z26.d, alphaZ + fmla z11.d, p0/m, z27.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z12.d, p0/z, [pCRow2] + ld1d z13.d, p0/z, [pCRow2, #1, mul vl] + fmla z12.d, p0/m, z28.d, alphaZ + fmla z13.d, p0/m, z29.d, alphaZ + st1d z12.d, p0, [pCRow2] + st1d z13.d, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z14.d, p0/z, [pCRow1] + ld1d z15.d, p0/z, [pCRow1, #1, mul vl] + fmla z14.d, p0/m, z30.d, alphaZ + fmla z15.d, p0/m, z31.d, alphaZ + st1d z14.d, p0, [pCRow1] + st1d z15.d, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 + +.endm + +.macro INITv2x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv2x4_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + add pB, pB, 32 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + fmla z18.d, p0/m, z0.d, z9.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z19.d, p0/m, z1.d, z9.d + fmla z20.d, p0/m, z0.d, z10.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + fmla z21.d, p0/m, z1.d, z10.d + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d +.endm + +.macro SAVEv2x4 + + prfm 
PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, #1, mul vl] + fmla z10.d, p0/m, z18.d, alphaZ + fmla z11.d, p0/m, z19.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z12.d, p0/z, [pCRow2] + ld1d z13.d, p0/z, [pCRow2, #1, mul vl] + fmla z12.d, p0/m, z20.d, alphaZ + fmla z13.d, p0/m, z21.d, alphaZ + st1d z12.d, p0, [pCRow2] + st1d z13.d, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z14.d, p0/z, [pCRow1] + ld1d z15.d, p0/z, [pCRow1, #1, mul vl] + fmla z14.d, p0/m, z22.d, alphaZ + fmla z15.d, p0/m, z23.d, alphaZ + st1d z14.d, p0, [pCRow1] + st1d z15.d, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 + +.endm + +.macro INITv2x2 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv2x2_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] +.endm + +.macro SAVEv2x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, #1, mul vl] + fmla z10.d, p0/m, z18.d, alphaZ + fmla z11.d, p0/m, z19.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 +.endm + +.macro INITv2x1 + dup z16.d, #0 + dup z17.d, #0 +.endm + +.macro KERNELv2x1_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + ld1rd z8.d, p0/z, [pB] + + add pB, pB, 8 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] +.endm + +.macro SAVEv2x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 + +.endm + +.macro INITv1x8 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x8_I + ld1d z0.d, p1/z, [pA1] + ld1d z1.d, p1/z, [pA1, lanes, lsl #3] // next one + add pA1, pA1, lanes, lsl #4 // pA1 = pA1 + lanes * 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd 
z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M1 + ld1d z1.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M2 + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + fmla z16.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_E + fmla z16.d, p1/m, z1.d, z8.d + fmla z17.d, p1/m, z1.d, z9.d + fmla z18.d, p1/m, z1.d, z10.d + fmla z19.d, p1/m, z1.d, z11.d + fmla z20.d, p1/m, z1.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + fmla z22.d, p1/m, z1.d, z14.d + fmla z23.d, p1/m, z1.d, z15.d +.endm + +.macro KERNELv1x8_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + fmla z18.d, p1/m, z0.d, z10.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z19.d, p1/m, z0.d, z11.d + fmla z20.d, p1/m, z0.d, z12.d + fmla z21.d, p1/m, z0.d, z13.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.d, p1/m, z0.d, z14.d + fmla z23.d, p1/m, z0.d, z15.d + + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z26.d, p1/z, [pCRow2] + fmla z26.d, p1/m, z18.d, 
alphaZ + st1d z26.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z27.d, p1/z, [pCRow1] + fmla z27.d, p1/m, z19.d, alphaZ + st1d z27.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z28.d, p1/z, [pCRow2] + fmla z28.d, p1/m, z20.d, alphaZ + st1d z28.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z29.d, p1/z, [pCRow1] + fmla z29.d, p1/m, z21.d, alphaZ + st1d z29.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z30.d, p1/z, [pCRow2] + fmla z30.d, p1/m, z22.d, alphaZ + st1d z30.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z31.d, p1/z, [pCRow1] + fmla z31.d, p1/m, z23.d, alphaZ + st1d z31.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x4_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + add pB, pB, 32 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z18.d, p1/m, z0.d, z10.d + fmla z19.d, p1/m, z0.d, z11.d + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z26.d, p1/z, [pCRow2] + fmla z26.d, p1/m, z18.d, alphaZ + st1d z26.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z27.d, p1/z, [pCRow1] + fmla z27.d, p1/m, z19.d, alphaZ + st1d z27.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z17.d, p1/m, z0.d, z9.d + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.d, #0 +.endm + +.macro KERNELv1x1_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + + add pB, pB, 8 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + + 
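+
+/* Editor's note -- illustration only, not used by the build: a rough ACLE
+   intrinsics sketch of what the v2x8 macros above compute for one block of
+   2*SVE_LEN rows by 8 columns of C (the _I/_M1/_M2/_E variants pipeline the
+   same computation). It assumes <arm_sve.h>, an all-true predicate pg,
+   vl = svcntd(), and the packed layouts produced by dgemm_ncopy/tcopy_sve_v1.c
+   (A packed in vl-wide panels, B packed in rows of 8). Function and variable
+   names below are hypothetical.
+
+   void block_v2x8(long K, double alpha, const double *pA1, const double *pA2,
+                   const double *pB, double *C, long ldc, long vl, svbool_t pg)
+   {
+       svfloat64_t acc[16];
+       for (int i = 0; i < 16; i++) acc[i] = svdup_f64(0.0);        // INITv2x8
+       for (long k = 0; k < K; k++) {                               // KERNELv2x8_SUB
+           svfloat64_t a0 = svld1(pg, pA1 + k * vl);                // panel 1 (z0)
+           svfloat64_t a1 = svld1(pg, pA2 + k * vl);                // panel 2 (z1)
+           for (int j = 0; j < 8; j++) {
+               double b = pB[k * 8 + j];
+               acc[2 * j]     = svmla_x(pg, acc[2 * j],     a0, b); // z16, z18, ...
+               acc[2 * j + 1] = svmla_x(pg, acc[2 * j + 1], a1, b); // z17, z19, ...
+           }
+       }
+       for (int j = 0; j < 8; j++) {                                // SAVEv2x8: C += alpha * acc
+           double *c = C + j * ldc;
+           svst1(pg, c,      svmla_x(pg, svld1(pg, c),      acc[2 * j],     alpha));
+           svst1(pg, c + vl, svmla_x(pg, svld1(pg, c + vl), acc[2 * j + 1], alpha));
+       }
+   }
+*/
+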
+/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, d0 + dup alphaZ, alpha + cntd vec_len + lsl vec_lenx2, vec_len, #1 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + ptrue p0.d // create true predicate + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Ldgemm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Ldgemm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L8_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN + blt .Ldgemm_kernel_L8_Mv1_BEGIN + + mov counterI, origM + +/* Until we have at least 2*SVE_LEN iters left in M, we do them with V2*8 kernel */ + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // pA1 = start of A array + prfm PLDL1KEEP, [pA2] + + .align 5 +.Ldgemm_kernel_L8_Mv2_20: + + mov pB, origPB + INITv2x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? 
+ blt .Ldgemm_kernel_L8_Mv2_32 + + KERNELv2x8_I + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldgemm_kernel_L8_Mv2_22a + + .align 5 +.Ldgemm_kernel_L8_Mv2_22: + + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L8_Mv2_22 + + .align 5 +.Ldgemm_kernel_L8_Mv2_22a: + + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_E + + b .Ldgemm_kernel_L8_Mv2_44 + + .align 5 +.Ldgemm_kernel_L8_Mv2_32: + + tst counterL, #1 + ble .Ldgemm_kernel_L8_Mv2_40 + + KERNELv2x8_I + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_E + + + b .Ldgemm_kernel_L8_Mv2_44 + +.Ldgemm_kernel_L8_Mv2_40: + + INITv2x8 + +.Ldgemm_kernel_L8_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L8_Mv2_100 + + .align 5 +.Ldgemm_kernel_L8_Mv2_46: + + KERNELv2x8_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L8_Mv2_46 + +.Ldgemm_kernel_L8_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x8 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // + +.Ldgemm_kernel_L8_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L8_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L8_END + +////////////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x8 kernel. +.Ldgemm_kernel_L8_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Ldgemm_kernel_L8_Mv1_20: + + mov pB, origPB + INITv1x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? 
+ blt .Ldgemm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldgemm_kernel_L8_Mv1_22a + + .align 5 +.Ldgemm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L8_Mv1_22 + + .align 5 +.Ldgemm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Ldgemm_kernel_L8_Mv1_44 + + .align 5 +.Ldgemm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Ldgemm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Ldgemm_kernel_L8_Mv1_44 + +.Ldgemm_kernel_L8_Mv1_40: + + INITv1x8 + +.Ldgemm_kernel_L8_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L8_Mv1_100 + + .align 5 +.Ldgemm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L8_Mv1_46 + +.Ldgemm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +.Ldgemm_kernel_L8_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + b.any .Ldgemm_kernel_L8_Mv1_20 + +.Ldgemm_kernel_L8_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 8 * 8 + + subs counterJ, counterJ , #1 // j-- + bgt .Ldgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ + + .align 5 +.Ldgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Ldgemm_kernel_L2_BEGIN + + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L4_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Ldgemm_kernel_L4_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // pA1 = start of A array + + .align 5 +.Ldgemm_kernel_L4_Mv2_20: + + mov pB, origPB + INITv2x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
+ ble .Ldgemm_kernel_L4_Mv2_44 + + .align 5 +.Ldgemm_kernel_L4_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L4_Mv2_22 + +.Ldgemm_kernel_L4_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L4_Mv2_100 + + .align 5 +.Ldgemm_kernel_L4_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L4_Mv2_46 + +.Ldgemm_kernel_L4_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x4 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // + +.Ldgemm_kernel_L4_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L4_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L4_END + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x4 kernel. +.Ldgemm_kernel_L4_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Ldgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldgemm_kernel_L4_Mv1_44 + + .align 5 +.Ldgemm_kernel_L4_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L4_Mv1_22 + +.Ldgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L4_Mv1_100 + + .align 5 +.Ldgemm_kernel_L4_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L4_Mv1_46 + +.Ldgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Ldgemm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L4_Mv1_20 + + +.Ldgemm_kernel_L4_END: + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Ldgemm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Ldgemm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L2_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Ldgemm_kernel_L2_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // pA1 = start of A array + + .align 5 +.Ldgemm_kernel_L2_Mv2_20: + + mov pB, origPB + INITv2x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
+ ble .Ldgemm_kernel_L2_Mv2_44 + + .align 5 +.Ldgemm_kernel_L2_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L2_Mv2_22 + +.Ldgemm_kernel_L2_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L2_Mv2_100 + + .align 5 +.Ldgemm_kernel_L2_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L2_Mv2_46 + +.Ldgemm_kernel_L2_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x2 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // + +.Ldgemm_kernel_L2_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L2_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L2_END + + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x2 kernel. +.Ldgemm_kernel_L2_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldgemm_kernel_L2_Mv1_20: + + mov pB, origPB + INITv1x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldgemm_kernel_L2_Mv1_44 + + .align 5 +.Ldgemm_kernel_L2_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L2_Mv1_22 + +.Ldgemm_kernel_L2_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L2_Mv1_100 + + .align 5 +.Ldgemm_kernel_L2_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L2_Mv1_46 + +.Ldgemm_kernel_L2_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x2 + +.Ldgemm_kernel_L2_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L2_Mv1_20 + + +.Ldgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Ldgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Ldgemm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L1_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Ldgemm_kernel_L1_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // pA1 = start of A array + + + .align 5 +.Ldgemm_kernel_L1_Mv2_20: + + mov pB, origPB + INITv2x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
+ ble .Ldgemm_kernel_L1_Mv2_44 + + .align 5 +.Ldgemm_kernel_L1_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv2_22 + +.Ldgemm_kernel_L1_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L1_Mv2_100 + + .align 5 +.Ldgemm_kernel_L1_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv2_46 + +.Ldgemm_kernel_L1_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x1 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // + +.Ldgemm_kernel_L1_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L1_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L1_END + + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x1 kernel. +.Ldgemm_kernel_L1_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldgemm_kernel_L1_Mv1_20: + + mov pB, origPB + INITv1x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? + ble .Ldgemm_kernel_L1_Mv1_44 + + .align 5 +.Ldgemm_kernel_L1_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_22 + +.Ldgemm_kernel_L1_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L1_Mv1_100 + + .align 5 +.Ldgemm_kernel_L1_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_46 + +.Ldgemm_kernel_L1_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x1 + +.Ldgemm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L1_Mv1_20 + + +.Ldgemm_kernel_L1_END: + +/******************************************************************************/ + +.Ldgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dgemm_ncopy_sve_v1.c b/kernel/arm64/dgemm_ncopy_sve_v1.c new file mode 100644 index 000000000..1f812c775 --- /dev/null +++ b/kernel/arm64/dgemm_ncopy_sve_v1.c @@ -0,0 +1,79 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include <arm_sve.h>
+
+// TODO: write in assembly with proper unrolling of inner loop
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
+
+  BLASLONG j;
+  IFLOAT *aoffset, *aoffset1, *boffset;
+
+  svint64_t lda_vec = svindex_s64(0LL, lda);
+  uint64_t sve_size = svcntd();
+
+  aoffset = a;
+  boffset = b;
+
+  j = 0;
+  svbool_t pg = svwhilelt_b64(j, n);
+  uint64_t active = svcntp_b64(svptrue_b64(), pg);
+  do {
+
+    aoffset1 = aoffset;
+
+    uint64_t i_cnt = m;
+    while (i_cnt--) {
+      svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec);
+      svst1_f64(pg, (double *) boffset, a_vec);
+      aoffset1++;
+      boffset += active;
+    }
+    aoffset += sve_size * lda;
+
+    j += svcntd();
+    pg = svwhilelt_b64(j, n);
+    active = svcntp_b64(svptrue_b64(), pg);
+
+
+  } while (svptest_any(svptrue_b64(), pg));
+
+  return 0;
+}
diff --git a/kernel/arm64/dgemm_tcopy_sve_v1.c b/kernel/arm64/dgemm_tcopy_sve_v1.c
new file mode 100644
index 000000000..cb645a1b6
--- /dev/null
+++ b/kernel/arm64/dgemm_tcopy_sve_v1.c
@@ -0,0 +1,77 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED.
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + uint64_t sve_size = svcntd(); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { + + aoffset1 = aoffset; + + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64_t a_vec = svld1(pg, (double *)aoffset1); + svst1_f64(pg, (double *) boffset, a_vec); + aoffset1 += lda; + boffset += active; + } + aoffset += sve_size; + + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} diff --git a/kernel/arm64/dtrmm_kernel_sve_v1x8.S b/kernel/arm64/dtrmm_kernel_sve_v1x8.S new file mode 100644 index 000000000..1f8c9b20f --- /dev/null +++ b/kernel/arm64/dtrmm_kernel_sve_v1x8.S @@ -0,0 +1,1008 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA x16 +#define alpha x17 +//#define temp x18 +#define tempOffset x19 +#define tempK x20 +#define temp x21 + +#define alpha0 d10 +#define alphaZ z2.d + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0 +//v01 pA0_1 +//v02 ALPHA0 +//v03 +//v04 +//v05 +//v06 +//v07 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x8 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x8_I + ld1d z0.d, p1/z, [pA] + ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one + add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M1 + ld1d z1.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, 
p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M2 + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + fmla z16.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_E + fmla z16.d, p1/m, z1.d, z8.d + fmla z17.d, p1/m, z1.d, z9.d + fmla z18.d, p1/m, z1.d, z10.d + fmla z19.d, p1/m, z1.d, z11.d + fmla z20.d, p1/m, z1.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + fmla z22.d, p1/m, z1.d, z14.d + fmla z23.d, p1/m, z1.d, z15.d +.endm + +.macro KERNELv1x8_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + fmla z18.d, p1/m, z0.d, z10.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z19.d, p1/m, z0.d, z11.d + fmla z20.d, p1/m, z0.d, z12.d + fmla z21.d, p1/m, z0.d, z13.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.d, p1/m, z0.d, z14.d + fmla z23.d, p1/m, z0.d, z15.d + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.d, p1/m, z16.d, alphaZ + st1d z16.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z17.d, p1/m, z17.d, alphaZ + st1d z17.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z18.d, p1/m, z18.d, alphaZ + st1d z18.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z19.d, p1/m, z19.d, alphaZ + st1d z19.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z20.d, p1/m, z20.d, alphaZ + st1d z20.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z21.d, p1/m, z21.d, alphaZ + st1d z21.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z22.d, p1/m, z22.d, alphaZ + st1d z22.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z23.d, p1/m, z23.d, alphaZ + st1d z23.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x4_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + add pB, pB, 32 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z18.d, p1/m, z0.d, z10.d + fmla z19.d, p1/m, z0.d, z11.d + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, 
[pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.d, p1/m, z16.d, alphaZ + st1d z16.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z17.d, p1/m, z17.d, alphaZ + st1d z17.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z18.d, p1/m, z18.d, alphaZ + st1d z18.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z19.d, p1/m, z19.d, alphaZ + st1d z19.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z17.d, p1/m, z0.d, z9.d + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.d, p1/m, z16.d, alphaZ + st1d z16.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z17.d, p1/m, z17.d, alphaZ + st1d z17.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.d, #0 +.endm + +.macro KERNELv1x1_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + + add pB, pB, 8 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + fmul z16.d, p1/m, z16.d, alphaZ + st1d z16.d, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, d0 + dup alphaZ, alpha + + lsl LDC, LDC, #3 // ldc = ldc * 8 + ptrue p0.d // create true predicate + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Ldtrmm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Ldtrmm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Ldtrmm_kernel_L8_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. 
This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.d, counterI, origM + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Ldtrmm_kernel_L8_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempOffset, #6 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #8 +#endif + + INITv1x8 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? + blt .Ldtrmm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldtrmm_kernel_L8_Mv1_22a + + .align 5 +.Ldtrmm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L8_Mv1_22 + + .align 5 +.Ldtrmm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Ldtrmm_kernel_L8_Mv1_44 + + .align 5 +.Ldtrmm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Ldtrmm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Ldtrmm_kernel_L8_Mv1_44 + +.Ldtrmm_kernel_L8_Mv1_40: + + INITv1x8 + +.Ldtrmm_kernel_L8_Mv1_44: + + ands counterL , tempK, #7 + ble .Ldtrmm_kernel_L8_Mv1_100 + + .align 5 +.Ldtrmm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Ldtrmm_kernel_L8_Mv1_46 + +.Ldtrmm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #8 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempK, #6 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Ldtrmm_kernel_L8_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldtrmm_kernel_L8_Mv1_20 + +.Ldtrmm_kernel_L8_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 8 * 8 + +#if !defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Ldtrmm_kernel_L8_BEGIN + +/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ + + .align 5 +.Ldtrmm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Ldtrmm_kernel_L2_BEGIN + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA, origPA // pA = start of A array + +.Ldtrmm_kernel_L4_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldtrmm_kernel_L4_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #4 +#endif + + INITv1x4 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldtrmm_kernel_L4_Mv1_44 + + .align 5 +.Ldtrmm_kernel_L4_Mv1_22: + + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L4_Mv1_22 + +.Ldtrmm_kernel_L4_Mv1_44: + + ands counterL , tempK, #7 + ble .Ldtrmm_kernel_L4_Mv1_100 + + .align 5 +.Ldtrmm_kernel_L4_Mv1_46: + + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Ldtrmm_kernel_L4_Mv1_46 + +.Ldtrmm_kernel_L4_Mv1_100: + + SAVEv1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #4 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Ldtrmm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldtrmm_kernel_L4_Mv1_20 + + +.Ldtrmm_kernel_L4_END: + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Ldtrmm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Ldtrmm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Ldtrmm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldtrmm_kernel_L2_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #2 +#endif + + INITv1x2 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
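+	// counterL = tempK / 8: each pass of the unrolled loop below issues
+	// eight KERNELv1x2_SUB updates; the tempK % 8 remainder is drained
+	// one step at a time at .Ldtrmm_kernel_L2_Mv1_46.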
+ ble .Ldtrmm_kernel_L2_Mv1_44 + + .align 5 +.Ldtrmm_kernel_L2_Mv1_22: + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L2_Mv1_22 + +.Ldtrmm_kernel_L2_Mv1_44: + + ands counterL , tempK, #7 + ble .Ldtrmm_kernel_L2_Mv1_100 + + .align 5 +.Ldtrmm_kernel_L2_Mv1_46: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Ldtrmm_kernel_L2_Mv1_46 + +.Ldtrmm_kernel_L2_Mv1_100: + + SAVEv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #2 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + +.Ldtrmm_kernel_L2_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldtrmm_kernel_L2_Mv1_20 + + +.Ldtrmm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Ldtrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Ldtrmm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Ldtrmm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldtrmm_kernel_L1_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #1 +#endif + + INITv1x1 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
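+	// tempK is the effective inner-product length for this triangular
+	// block; the #if blocks above derive it from the running offset for
+	// the LEFT/TRANSA variants. Note that the SAVE macros in this file
+	// overwrite C (fmul by alpha, then store) rather than accumulating,
+	// which is the TRMM convention.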
+ ble .Ldtrmm_kernel_L1_Mv1_44 + + .align 5 +.Ldtrmm_kernel_L1_Mv1_22: + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L1_Mv1_22 + +.Ldtrmm_kernel_L1_Mv1_44: + + ands counterL , tempK, #7 + ble .Ldtrmm_kernel_L1_Mv1_100 + + .align 5 +.Ldtrmm_kernel_L1_Mv1_46: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L1_Mv1_46 + +.Ldtrmm_kernel_L1_Mv1_100: + + SAVEv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #1 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + + +.Ldtrmm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldtrmm_kernel_L1_Mv1_20 + + +.Ldtrmm_kernel_L1_END: + +/******************************************************************************/ + +.Ldtrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/sgemm_kernel_sve_v1x8.S b/kernel/arm64/sgemm_kernel_sve_v1x8.S new file mode 100644 index 000000000..88c74bc0f --- /dev/null +++ b/kernel/arm64/sgemm_kernel_sve_v1x8.S @@ -0,0 +1,874 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA x16 +#define alpha w17 + +#define alpha0 s10 +#define alphaZ z2.s + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0 +//v01 pA0_1 +//v02 ALPHA0 +//v03 +//v04 +//v05 +//v06 +//v07 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x8 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv1x8_I + ld1w z0.s, p1/z, [pA] + ld1w z1.s, p1/z, [pA, lanes, lsl #2] // next one + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M1 + ld1w z1.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm 
PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M2 + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + fmla z16.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_E + fmla z16.s, p1/m, z1.s, z8.s + fmla z17.s, p1/m, z1.s, z9.s + fmla z18.s, p1/m, z1.s, z10.s + fmla z19.s, p1/m, z1.s, z11.s + fmla z20.s, p1/m, z1.s, z12.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + fmla z22.s, p1/m, z1.s, z14.s + fmla z23.s, p1/m, z1.s, z15.s +.endm + +.macro KERNELv1x8_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + fmla z18.s, p1/m, z0.s, z10.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z19.s, p1/m, z0.s, z11.s + fmla z20.s, p1/m, z0.s, z12.s + fmla z21.s, p1/m, z0.s, z13.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.s, p1/m, z0.s, z14.s + fmla z23.s, p1/m, z0.s, z15.s + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z26.s, p1/z, [pCRow2] + fmla z26.s, p1/m, z18.s, alphaZ + st1w z26.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z27.s, p1/z, [pCRow1] + fmla z27.s, p1/m, z19.s, alphaZ + st1w z27.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z28.s, p1/z, [pCRow2] + fmla z28.s, p1/m, z20.s, alphaZ + st1w z28.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z29.s, p1/z, [pCRow1] + fmla z29.s, p1/m, z21.s, alphaZ + st1w z29.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z30.s, p1/z, [pCRow2] + fmla z30.s, p1/m, z22.s, alphaZ + st1w z30.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z31.s, p1/z, [pCRow1] + fmla z31.s, p1/m, z23.s, alphaZ + st1w z31.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x4_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + add pB, pB, 16 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + prfm 
PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z18.s, p1/m, z0.s, z10.s + fmla z19.s, p1/m, z0.s, z11.s + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z26.s, p1/z, [pCRow2] + fmla z26.s, p1/m, z18.s, alphaZ + st1w z26.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z27.s, p1/z, [pCRow1] + fmla z27.s, p1/m, z19.s, alphaZ + st1w z27.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.s, #0 + dup z17.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z17.s, p1/m, z0.s, z9.s + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.s, #0 +.endm + +.macro KERNELv1x1_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 8 + + ld1rw z8.s, p0/z, [pB] + + add pB, pB, 4 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, s0 + dup alphaZ, alpha + + lsl LDC, LDC, #2 // ldc = ldc * 4 + ptrue p0.s // create true predicate + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Ldgemm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Ldgemm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L8_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. 
This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.s, counterI, origM + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Ldgemm_kernel_L8_Mv1_20: + + mov pB, origPB + INITv1x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? + blt .Ldgemm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldgemm_kernel_L8_Mv1_22a + + .align 5 +.Ldgemm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L8_Mv1_22 + + .align 5 +.Ldgemm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Ldgemm_kernel_L8_Mv1_44 + + .align 5 +.Ldgemm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Ldgemm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Ldgemm_kernel_L8_Mv1_44 + +.Ldgemm_kernel_L8_Mv1_40: + + INITv1x8 + +.Ldgemm_kernel_L8_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L8_Mv1_100 + + .align 5 +.Ldgemm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L8_Mv1_46 + +.Ldgemm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +.Ldgemm_kernel_L8_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + b.any .Ldgemm_kernel_L8_Mv1_20 + +.Ldgemm_kernel_L8_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 8 * 4 + + subs counterJ, counterJ , #1 // j-- + bgt .Ldgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ + + .align 5 +.Ldgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Ldgemm_kernel_L2_BEGIN + + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L4_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Ldgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
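+	// K loop for the v1x4 block: eight KERNELv1x4_SUB steps per pass of
+	// .Ldgemm_kernel_L4_Mv1_22, with the K % 8 remainder handled at
+	// .Ldgemm_kernel_L4_Mv1_46. SAVEv1x4 then reloads C, scales the
+	// accumulators by alpha and adds (C += alpha*A*B), predicated by p1
+	// so the final partial vector of M uses the same path.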
+ ble .Ldgemm_kernel_L4_Mv1_44 + + .align 5 +.Ldgemm_kernel_L4_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L4_Mv1_22 + +.Ldgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L4_Mv1_100 + + .align 5 +.Ldgemm_kernel_L4_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L4_Mv1_46 + +.Ldgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Ldgemm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Ldgemm_kernel_L4_Mv1_20 + + +.Ldgemm_kernel_L4_END: + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 4 * 4 + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Ldgemm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Ldgemm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Ldgemm_kernel_L2_Mv1_20: + + mov pB, origPB + INITv1x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldgemm_kernel_L2_Mv1_44 + + .align 5 +.Ldgemm_kernel_L2_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L2_Mv1_22 + +.Ldgemm_kernel_L2_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L2_Mv1_100 + + .align 5 +.Ldgemm_kernel_L2_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L2_Mv1_46 + +.Ldgemm_kernel_L2_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x2 + +.Ldgemm_kernel_L2_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Ldgemm_kernel_L2_Mv1_20 + + +.Ldgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Ldgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Ldgemm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Ldgemm_kernel_L1_Mv1_20: + + mov pB, origPB + INITv1x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
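+	// The enclosing M loop is fully predicated: p1 = whilelt(counterI, origM)
+	// and lanes = cntp(p0, p1), so the last M % SVE_LEN rows are processed
+	// by the same code with inactive lanes masked off; incw / whilelt /
+	// b.any at .Ldgemm_kernel_L1_Mv1_END step to the next vector of M.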
+ ble .Ldgemm_kernel_L1_Mv1_44 + + .align 5 +.Ldgemm_kernel_L1_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_22 + +.Ldgemm_kernel_L1_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L1_Mv1_100 + + .align 5 +.Ldgemm_kernel_L1_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_46 + +.Ldgemm_kernel_L1_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x1 + +.Ldgemm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Ldgemm_kernel_L1_Mv1_20 + + +.Ldgemm_kernel_L1_END: + +/******************************************************************************/ + +.Ldgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/sgemm_kernel_sve_v2x8.S b/kernel/arm64/sgemm_kernel_sve_v2x8.S new file mode 100644 index 000000000..1cdd8253e --- /dev/null +++ b/kernel/arm64/sgemm_kernel_sve_v2x8.S @@ -0,0 +1,1683 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +/* This is an SVE sgemm kernel with size 2*SVE_LEN x 8. +However, the data layout is the same as for the kernel 1*SVE_LEN x 8. +This means that we sweep two panels of packed A when iterating in a loop over K. 
+With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA1 x16 +#define pA2 x17 +#define alpha w18 +#define vec_len x19 +#define vec_lenx2 x20 + +#define alpha0 s10 +#define alphaZ z7.s + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA1 +// 17 pA1 +// 18 must save alpha +// 19 must save vec_len +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA10_0 +//v01 pA10_1 +//v02 pA20_0 +//v03 pA20_1 +//v04 +//v05 +//v06 +//v07 ALPHA0 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 +//v24 must save C8 +//v25 must save C9 +//v26 must save C10 +//v27 must save C11 +//v28 must save C12 +//v29 must save C13 +//v30 must save C14 +//v31 must save C15 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv2x8 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 + dup z24.s, #0 + dup z25.s, #0 + dup z26.s, #0 + dup z27.s, #0 + dup z28.s, #0 + dup z29.s, #0 + dup z30.s, #0 + dup z31.s, #0 +.endm + +.macro KERNELv2x8_I + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + ld1w z2.s, p0/z, [pA1, vec_len, lsl #2] + ld1w z3.s, p0/z, [pA2, vec_len, lsl #2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 4 *2 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 4 *2 + + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z18.s, p0/m, z0.s, z9.s + fmla z19.s, p0/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z20.s, p0/m, z0.s, z10.s + fmla z21.s, p0/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z22.s, p0/m, z0.s, z11.s + fmla z23.s, p0/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z24.s, p0/m, z0.s, z12.s + fmla z25.s, p0/m, z1.s, z12.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z26.s, p0/m, z0.s, z13.s + fmla z27.s, p0/m, z1.s, z13.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + ld1rw z13.s, p0/z, [pB, 20] + fmla z28.s, p0/m, z0.s, z14.s + fmla z29.s, 
p0/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z30.s, p0/m, z0.s, z15.s + fmla z31.s, p0/m, z1.s, z15.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64] + + add pB, pB, 32 +.endm + +.macro KERNELv2x8_M1 + ld1w z2.s, p0/z, [pA1] + ld1w z3.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z18.s, p0/m, z0.s, z9.s + fmla z19.s, p0/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z20.s, p0/m, z0.s, z10.s + fmla z21.s, p0/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z22.s, p0/m, z0.s, z11.s + fmla z23.s, p0/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z24.s, p0/m, z0.s, z12.s + fmla z25.s, p0/m, z1.s, z12.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z26.s, p0/m, z0.s, z13.s + fmla z27.s, p0/m, z1.s, z13.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + ld1rw z13.s, p0/z, [pB, 20] + fmla z28.s, p0/m, z0.s, z14.s + fmla z29.s, p0/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z30.s, p0/m, z0.s, z15.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + fmla z31.s, p0/m, z1.s, z15.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv2x8_M2 + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 2 * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 2 * 4 + + fmla z16.s, p0/m, z2.s, z8.s + fmla z17.s, p0/m, z3.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z18.s, p0/m, z2.s, z9.s + fmla z19.s, p0/m, z3.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z20.s, p0/m, z2.s, z10.s + fmla z21.s, p0/m, z3.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z22.s, p0/m, z2.s, z11.s + fmla z23.s, p0/m, z3.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z24.s, p0/m, z2.s, z12.s + fmla z25.s, p0/m, z3.s, z12.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z26.s, p0/m, z2.s, z13.s + fmla z27.s, p0/m, z3.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z28.s, p0/m, z2.s, z14.s + fmla z29.s, p0/m, z3.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z30.s, p0/m, z2.s, z15.s + fmla z31.s, p0/m, z3.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv2x8_E + fmla z16.s, p0/m, z2.s, z8.s + fmla z17.s, p0/m, z3.s, z8.s + fmla z18.s, p0/m, z2.s, z9.s + fmla z19.s, p0/m, z3.s, z9.s + fmla z20.s, p0/m, z2.s, z10.s + fmla z21.s, p0/m, z3.s, z10.s + fmla z22.s, p0/m, z2.s, z11.s + fmla z23.s, p0/m, z3.s, z11.s + fmla z24.s, p0/m, z2.s, z12.s + fmla z25.s, p0/m, z3.s, z12.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z26.s, p0/m, z2.s, z13.s + fmla z27.s, p0/m, z3.s, z13.s + fmla z28.s, p0/m, z2.s, z14.s + fmla z29.s, p0/m, z3.s, z14.s + fmla z30.s, p0/m, z2.s, z15.s + fmla z31.s, p0/m, z3.s, z15.s +.endm + +.macro KERNELv2x8_SUB + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + fmla z18.s, p0/m, z0.s, z9.s + fmla z19.s, p0/m, z1.s, z9.s + fmla z20.s, p0/m, z0.s, z10.s + prfm 
PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z21.s, p0/m, z1.s, z10.s + fmla z22.s, p0/m, z0.s, z11.s + fmla z23.s, p0/m, z1.s, z11.s + fmla z24.s, p0/m, z0.s, z12.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + fmla z25.s, p0/m, z1.s, z12.s + fmla z26.s, p0/m, z0.s, z13.s + fmla z27.s, p0/m, z1.s, z13.s + fmla z28.s, p0/m, z0.s, z14.s + fmla z29.s, p0/m, z1.s, z14.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z30.s, p0/m, z0.s, z15.s + fmla z31.s, p0/m, z1.s, z15.s +.endm + +.macro SAVEv2x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z8.s, p0/z, [pCRow0] + ld1w z9.s, p0/z, [pCRow0, #1, mul vl] + fmla z8.s, p0/m, z16.s, alphaZ + fmla z9.s, p0/m, z17.s, alphaZ + st1w z8.s, p0, [pCRow0] + st1w z9.s, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z10.s, p0/z, [pCRow1] + ld1w z11.s, p0/z, [pCRow1, #1, mul vl] + fmla z10.s, p0/m, z18.s, alphaZ + fmla z11.s, p0/m, z19.s, alphaZ + st1w z10.s, p0, [pCRow1] + st1w z11.s, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z12.s, p0/z, [pCRow2] + ld1w z13.s, p0/z, [pCRow2, #1, mul vl] + fmla z12.s, p0/m, z20.s, alphaZ + fmla z13.s, p0/m, z21.s, alphaZ + st1w z12.s, p0, [pCRow2] + st1w z13.s, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z14.s, p0/z, [pCRow1] + ld1w z15.s, p0/z, [pCRow1, #1, mul vl] + fmla z14.s, p0/m, z22.s, alphaZ + fmla z15.s, p0/m, z23.s, alphaZ + st1w z14.s, p0, [pCRow1] + st1w z15.s, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z8.s, p0/z, [pCRow2] + ld1w z9.s, p0/z, [pCRow2, #1, mul vl] + fmla z8.s, p0/m, z24.s, alphaZ + fmla z9.s, p0/m, z25.s, alphaZ + st1w z8.s, p0, [pCRow2] + st1w z9.s, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z10.s, p0/z, [pCRow1] + ld1w z11.s, p0/z, [pCRow1, #1, mul vl] + fmla z10.s, p0/m, z26.s, alphaZ + fmla z11.s, p0/m, z27.s, alphaZ + st1w z10.s, p0, [pCRow1] + st1w z11.s, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z12.s, p0/z, [pCRow2] + ld1w z13.s, p0/z, [pCRow2, #1, mul vl] + fmla z12.s, p0/m, z28.s, alphaZ + fmla z13.s, p0/m, z29.s, alphaZ + st1w z12.s, p0, [pCRow2] + st1w z13.s, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z14.s, p0/z, [pCRow1] + ld1w z15.s, p0/z, [pCRow1, #1, mul vl] + fmla z14.s, p0/m, z30.s, alphaZ + fmla z15.s, p0/m, z31.s, alphaZ + st1w z14.s, p0, [pCRow1] + st1w z15.s, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #3 // pC = pC + vec_len * 4 * 2 + +.endm + +.macro INITv2x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv2x4_SUB + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + add pB, pB, 16 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + fmla z18.s, p0/m, z0.s, z9.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z19.s, p0/m, z1.s, z9.s + fmla z20.s, p0/m, z0.s, z10.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + fmla z21.s, p0/m, z1.s, z10.s + fmla z22.s, p0/m, z0.s, z11.s + fmla z23.s, p0/m, z1.s, z11.s +.endm + +.macro SAVEv2x4 + + prfm PLDL2KEEP, 
[pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z8.s, p0/z, [pCRow0] + ld1w z9.s, p0/z, [pCRow0, #1, mul vl] + fmla z8.s, p0/m, z16.s, alphaZ + fmla z9.s, p0/m, z17.s, alphaZ + st1w z8.s, p0, [pCRow0] + st1w z9.s, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z10.s, p0/z, [pCRow1] + ld1w z11.s, p0/z, [pCRow1, #1, mul vl] + fmla z10.s, p0/m, z18.s, alphaZ + fmla z11.s, p0/m, z19.s, alphaZ + st1w z10.s, p0, [pCRow1] + st1w z11.s, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z12.s, p0/z, [pCRow2] + ld1w z13.s, p0/z, [pCRow2, #1, mul vl] + fmla z12.s, p0/m, z20.s, alphaZ + fmla z13.s, p0/m, z21.s, alphaZ + st1w z12.s, p0, [pCRow2] + st1w z13.s, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z14.s, p0/z, [pCRow1] + ld1w z15.s, p0/z, [pCRow1, #1, mul vl] + fmla z14.s, p0/m, z22.s, alphaZ + fmla z15.s, p0/m, z23.s, alphaZ + st1w z14.s, p0, [pCRow1] + st1w z15.s, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #3 // pC = pC + vec_len * 4 * 2 + +.endm + +.macro INITv2x2 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv2x2_SUB + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z18.s, p0/m, z0.s, z9.s + fmla z19.s, p0/m, z1.s, z9.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] +.endm + +.macro SAVEv2x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z8.s, p0/z, [pCRow0] + ld1w z9.s, p0/z, [pCRow0, #1, mul vl] + fmla z8.s, p0/m, z16.s, alphaZ + fmla z9.s, p0/m, z17.s, alphaZ + st1w z8.s, p0, [pCRow0] + st1w z9.s, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z10.s, p0/z, [pCRow1] + ld1w z11.s, p0/z, [pCRow1, #1, mul vl] + fmla z10.s, p0/m, z18.s, alphaZ + fmla z11.s, p0/m, z19.s, alphaZ + st1w z10.s, p0, [pCRow1] + st1w z11.s, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + + + add pCRow0, pCRow0, vec_len, lsl #3 // pC = pC + vec_len * 4 * 2 +.endm + +.macro INITv2x1 + dup z16.s, #0 + dup z17.s, #0 +.endm + +.macro KERNELv2x1_SUB + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + + ld1rw z8.s, p0/z, [pB] + + add pB, pB, 4 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] +.endm + +.macro SAVEv2x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z8.s, p0/z, [pCRow0] + ld1w z9.s, p0/z, [pCRow0, #1, mul vl] + fmla z8.s, p0/m, z16.s, alphaZ + fmla z9.s, p0/m, z17.s, alphaZ + st1w z8.s, p0, [pCRow0] + st1w z9.s, p0, [pCRow0, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #3 // pC = pC + vec_len * 4 * 2 + +.endm + +.macro INITv1x8 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv1x8_I + ld1w z0.s, p1/z, [pA1] + ld1w z1.s, p1/z, [pA1, lanes, lsl #2] // next one + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 
16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M1 + ld1w z1.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M2 + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + fmla z16.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_E + fmla z16.s, p1/m, z1.s, z8.s + fmla z17.s, p1/m, z1.s, z9.s + fmla z18.s, p1/m, z1.s, z10.s + fmla z19.s, p1/m, z1.s, z11.s + fmla z20.s, p1/m, z1.s, z12.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + fmla z22.s, p1/m, z1.s, z14.s + fmla z23.s, p1/m, z1.s, z15.s +.endm + +.macro KERNELv1x8_SUB + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + fmla z18.s, p1/m, z0.s, z10.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z19.s, p1/m, z0.s, z11.s + fmla z20.s, p1/m, z0.s, z12.s + fmla z21.s, p1/m, z0.s, z13.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.s, p1/m, z0.s, z14.s + fmla z23.s, p1/m, z0.s, z15.s + + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z26.s, p1/z, [pCRow2] + fmla z26.s, p1/m, z18.s, alphaZ + st1w z26.s, 
p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z27.s, p1/z, [pCRow1] + fmla z27.s, p1/m, z19.s, alphaZ + st1w z27.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z28.s, p1/z, [pCRow2] + fmla z28.s, p1/m, z20.s, alphaZ + st1w z28.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z29.s, p1/z, [pCRow1] + fmla z29.s, p1/m, z21.s, alphaZ + st1w z29.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z30.s, p1/z, [pCRow2] + fmla z30.s, p1/m, z22.s, alphaZ + st1w z30.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z31.s, p1/z, [pCRow1] + fmla z31.s, p1/m, z23.s, alphaZ + st1w z31.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x4_SUB + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + add pB, pB, 16 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z18.s, p1/m, z0.s, z10.s + fmla z19.s, p1/m, z0.s, z11.s + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z26.s, p1/z, [pCRow2] + fmla z26.s, p1/m, z18.s, alphaZ + st1w z26.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z27.s, p1/z, [pCRow1] + fmla z27.s, p1/m, z19.s, alphaZ + st1w z27.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.s, #0 + dup z17.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z17.s, p1/m, z0.s, z9.s + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.s, #0 +.endm + +.macro KERNELv1x1_SUB + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + + add pB, pB, 4 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + + 
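+/* Informal sketch of the SAVEvRxN macros above ("acc" and "rows" are only
+   names for exposition, not symbols in this file): each macro performs a
+   predicated read-modify-write of N columns of C, roughly
+
+       for (col = 0; col < N; col++)
+           for (row = 0; row < rows; row++)        // rows = active M lanes
+               C[col * ldc + row] += alpha * acc[col][row];
+
+   In the multi-column variants, pCRow1 and pCRow2 alternate as column
+   pointers so the next column address and its prefetch are issued while the
+   current column is loaded, scaled by alphaZ via fmla, and stored back. */
+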
+/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, s0 + dup alphaZ, alpha + cntw vec_len + lsl vec_lenx2, vec_len, #1 + + lsl LDC, LDC, #2 // ldc = ldc * 8 + ptrue p0.s // create true predicate + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Lsgemm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Lsgemm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Lsgemm_kernel_L8_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN + blt .Lsgemm_kernel_L8_Mv1_BEGIN + + mov counterI, origM + +/* Until we have at least 2*SVE_LEN iters left in M, we do them with V2*8 kernel */ + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // pA1 = start of A array + prfm PLDL1KEEP, [pA2] + + .align 5 +.Lsgemm_kernel_L8_Mv2_20: + + mov pB, origPB + INITv2x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? 
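+	// The unrolled path below is software-pipelined (mirroring the v1x8
+	// macros above): KERNELv2x8_I issues the initial loads, M1/M2 alternate
+	// between the two in-flight register sets while fetching the next
+	// operands, and KERNELv2x8_E drains the pipeline without loading.
+	// It requires counterL >= 2, i.e. K >= 16; smaller K falls through to
+	// the _32/_40 paths below.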
+ blt .Lsgemm_kernel_L8_Mv2_32 + + KERNELv2x8_I + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lsgemm_kernel_L8_Mv2_22a + + .align 5 +.Lsgemm_kernel_L8_Mv2_22: + + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_Mv2_22 + + .align 5 +.Lsgemm_kernel_L8_Mv2_22a: + + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_E + + b .Lsgemm_kernel_L8_Mv2_44 + + .align 5 +.Lsgemm_kernel_L8_Mv2_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L8_Mv2_40 + + KERNELv2x8_I + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_E + + + b .Lsgemm_kernel_L8_Mv2_44 + +.Lsgemm_kernel_L8_Mv2_40: + + INITv2x8 + +.Lsgemm_kernel_L8_Mv2_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L8_Mv2_100 + + .align 5 +.Lsgemm_kernel_L8_Mv2_46: + + KERNELv2x8_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L8_Mv2_46 + +.Lsgemm_kernel_L8_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x8 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // + +.Lsgemm_kernel_L8_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Lsgemm_kernel_L8_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Lsgemm_kernel_L8_END + +////////////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x8 kernel. +.Lsgemm_kernel_L8_Mv1_BEGIN: + + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lsgemm_kernel_L8_Mv1_20: + + mov pB, origPB + INITv1x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? 
+ blt .Lsgemm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lsgemm_kernel_L8_Mv1_22a + + .align 5 +.Lsgemm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_Mv1_22 + + .align 5 +.Lsgemm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Lsgemm_kernel_L8_Mv1_44 + + .align 5 +.Lsgemm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Lsgemm_kernel_L8_Mv1_44 + +.Lsgemm_kernel_L8_Mv1_40: + + INITv1x8 + +.Lsgemm_kernel_L8_Mv1_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L8_Mv1_100 + + .align 5 +.Lsgemm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L8_Mv1_46 + +.Lsgemm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +.Lsgemm_kernel_L8_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + b.any .Lsgemm_kernel_L8_Mv1_20 + +.Lsgemm_kernel_L8_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 8 * 4 + + subs counterJ, counterJ , #1 // j-- + bgt .Lsgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ + + .align 5 +.Lsgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Lsgemm_kernel_L2_BEGIN + + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Lsgemm_kernel_L4_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Lsgemm_kernel_L4_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // pA1 = start of A array + + .align 5 +.Lsgemm_kernel_L4_Mv2_20: + + mov pB, origPB + INITv2x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
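+	// Unlike the N=8 blocks, these N=4/2/1 remainder paths are not
+	// software-pipelined: each pass of the loop below simply issues eight
+	// KERNELv2x4_SUB steps, and the K % 8 tail is handled one step at a
+	// time in the _46 loop.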
+ ble .Lsgemm_kernel_L4_Mv2_44 + + .align 5 +.Lsgemm_kernel_L4_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_Mv2_22 + +.Lsgemm_kernel_L4_Mv2_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L4_Mv2_100 + + .align 5 +.Lsgemm_kernel_L4_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L4_Mv2_46 + +.Lsgemm_kernel_L4_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x4 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // + +.Lsgemm_kernel_L4_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Lsgemm_kernel_L4_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Lsgemm_kernel_L4_END + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x4 kernel. +.Lsgemm_kernel_L4_Mv1_BEGIN: + + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lsgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Lsgemm_kernel_L4_Mv1_44 + + .align 5 +.Lsgemm_kernel_L4_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_Mv1_22 + +.Lsgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L4_Mv1_100 + + .align 5 +.Lsgemm_kernel_L4_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L4_Mv1_46 + +.Lsgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lsgemm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lsgemm_kernel_L4_Mv1_20 + + +.Lsgemm_kernel_L4_END: + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 4 * 4 + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Lsgemm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Lsgemm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Lsgemm_kernel_L2_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Lsgemm_kernel_L2_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // pA1 = start of A array + + .align 5 +.Lsgemm_kernel_L2_Mv2_20: + + mov pB, origPB + INITv2x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
+ ble .Lsgemm_kernel_L2_Mv2_44 + + .align 5 +.Lsgemm_kernel_L2_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_Mv2_22 + +.Lsgemm_kernel_L2_Mv2_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L2_Mv2_100 + + .align 5 +.Lsgemm_kernel_L2_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L2_Mv2_46 + +.Lsgemm_kernel_L2_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x2 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // + +.Lsgemm_kernel_L2_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Lsgemm_kernel_L2_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Lsgemm_kernel_L2_END + + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x2 kernel. +.Lsgemm_kernel_L2_Mv1_BEGIN: + + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Lsgemm_kernel_L2_Mv1_20: + + mov pB, origPB + INITv1x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Lsgemm_kernel_L2_Mv1_44 + + .align 5 +.Lsgemm_kernel_L2_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_Mv1_22 + +.Lsgemm_kernel_L2_Mv1_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L2_Mv1_100 + + .align 5 +.Lsgemm_kernel_L2_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L2_Mv1_46 + +.Lsgemm_kernel_L2_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x2 + +.Lsgemm_kernel_L2_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lsgemm_kernel_L2_Mv1_20 + + +.Lsgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Lsgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lsgemm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Lsgemm_kernel_L1_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Lsgemm_kernel_L1_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // pA1 = start of A array + + + .align 5 +.Lsgemm_kernel_L1_Mv2_20: + + mov pB, origPB + INITv2x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
+ ble .Lsgemm_kernel_L1_Mv2_44 + + .align 5 +.Lsgemm_kernel_L1_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_Mv2_22 + +.Lsgemm_kernel_L1_Mv2_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L1_Mv2_100 + + .align 5 +.Lsgemm_kernel_L1_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_Mv2_46 + +.Lsgemm_kernel_L1_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x1 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // + +.Lsgemm_kernel_L1_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Lsgemm_kernel_L1_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Lsgemm_kernel_L1_END + + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x1 kernel. +.Lsgemm_kernel_L1_Mv1_BEGIN: + + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Lsgemm_kernel_L1_Mv1_20: + + mov pB, origPB + INITv1x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? + ble .Lsgemm_kernel_L1_Mv1_44 + + .align 5 +.Lsgemm_kernel_L1_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_Mv1_22 + +.Lsgemm_kernel_L1_Mv1_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L1_Mv1_100 + + .align 5 +.Lsgemm_kernel_L1_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_Mv1_46 + +.Lsgemm_kernel_L1_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x1 + +.Lsgemm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lsgemm_kernel_L1_Mv1_20 + + +.Lsgemm_kernel_L1_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/sgemm_ncopy_sve_v1.c b/kernel/arm64/sgemm_ncopy_sve_v1.c new file mode 100644 index 000000000..1bc186335 --- /dev/null +++ b/kernel/arm64/sgemm_ncopy_sve_v1.c @@ -0,0 +1,78 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + svint32_t lda_vec = svindex_s32(0LL, lda); + uint32_t sve_size = svcntw(); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b32(j, n); + uint32_t active = svcntp_b32(svptrue_b32(), pg); + do { + + aoffset1 = aoffset; + + uint32_t i_cnt = m; + while (i_cnt--) { + svfloat32_t a_vec = svld1_gather_index(pg, (float *) aoffset1, lda_vec); + svst1_f32(pg, (float *) boffset, a_vec); + aoffset1++; + boffset += active; + } + aoffset += sve_size * lda; + + j += svcntw(); + pg = svwhilelt_b32(j, n); + active = svcntp_b32(svptrue_b32(), pg); + + } while (svptest_any(svptrue_b32(), pg)); + + return 0; +} diff --git a/kernel/arm64/sgemm_tcopy_sve_v1.c b/kernel/arm64/sgemm_tcopy_sve_v1.c new file mode 100644 index 000000000..9f8cf502a --- /dev/null +++ b/kernel/arm64/sgemm_tcopy_sve_v1.c @@ -0,0 +1,77 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + uint32_t sve_size = svcntw(); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b32(j, n); + uint32_t active = svcntp_b32(svptrue_b32(), pg); + do { + + aoffset1 = aoffset; + + uint32_t i_cnt = m; + while (i_cnt--) { + svfloat32_t a_vec = svld1(pg, (float *) aoffset1); + svst1_f32(pg, (float *) boffset, a_vec); + aoffset1 += lda; + boffset += active; + } + aoffset += sve_size; + + j += svcntw(); + pg = svwhilelt_b32(j, n); + active = svcntp_b32(svptrue_b32(), pg); + + } while (svptest_any(svptrue_b32(), pg)); + + return 0; +} diff --git a/kernel/arm64/strmm_kernel_sve_v1x8.S b/kernel/arm64/strmm_kernel_sve_v1x8.S new file mode 100644 index 000000000..3c45e3e29 --- /dev/null +++ b/kernel/arm64/strmm_kernel_sve_v1x8.S @@ -0,0 +1,1008 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA x16 +#define alpha w17 +//#define temp x18 +#define tempOffset x19 +#define tempK x20 +#define temp x21 + +#define alpha0 s10 +#define alphaZ z2.s + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0 +//v01 pA0_1 +//v02 ALPHA0 +//v03 +//v04 +//v05 +//v06 +//v07 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x8 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv1x8_I + ld1w z0.s, p1/z, [pA] + ld1w z1.s, p1/z, [pA, lanes, lsl #2] // next one + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M1 + ld1w z1.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, 
z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M2 + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + fmla z16.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_E + fmla z16.s, p1/m, z1.s, z8.s + fmla z17.s, p1/m, z1.s, z9.s + fmla z18.s, p1/m, z1.s, z10.s + fmla z19.s, p1/m, z1.s, z11.s + fmla z20.s, p1/m, z1.s, z12.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + fmla z22.s, p1/m, z1.s, z14.s + fmla z23.s, p1/m, z1.s, z15.s +.endm + +.macro KERNELv1x8_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + fmla z18.s, p1/m, z0.s, z10.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z19.s, p1/m, z0.s, z11.s + fmla z20.s, p1/m, z0.s, z12.s + fmla z21.s, p1/m, z0.s, z13.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.s, p1/m, z0.s, z14.s + fmla z23.s, p1/m, z0.s, z15.s + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.s, p1/m, z16.s, alphaZ + st1w z16.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z17.s, p1/m, z17.s, alphaZ + st1w z17.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z18.s, p1/m, z18.s, alphaZ + st1w z18.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z19.s, p1/m, z19.s, alphaZ + st1w z19.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z20.s, p1/m, z20.s, alphaZ + st1w z20.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z21.s, p1/m, z21.s, alphaZ + st1w z21.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z22.s, p1/m, z22.s, alphaZ + st1w z22.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z23.s, p1/m, z23.s, alphaZ + st1w z23.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x4_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + add pB, pB, 16 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z18.s, p1/m, z0.s, z10.s + fmla z19.s, p1/m, z0.s, z11.s + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, 
#C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.s, p1/m, z16.s, alphaZ + st1w z16.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z17.s, p1/m, z17.s, alphaZ + st1w z17.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z18.s, p1/m, z18.s, alphaZ + st1w z18.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z19.s, p1/m, z19.s, alphaZ + st1w z19.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.s, #0 + dup z17.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z17.s, p1/m, z0.s, z9.s + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.s, p1/m, z16.s, alphaZ + st1w z16.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z17.s, p1/m, z17.s, alphaZ + st1w z17.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.s, #0 +.endm + +.macro KERNELv1x1_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + + add pB, pB, 4 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + fmul z16.s, p1/m, z16.s, alphaZ + st1w z16.s, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, s0 + dup alphaZ, alpha + + lsl LDC, LDC, #2 // ldc = ldc * 8 + ptrue p0.s // create true predicate + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Lstrmm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Lstrmm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Lstrmm_kernel_L8_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. 
This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.s, counterI, origM + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lstrmm_kernel_L8_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #8 +#endif + + INITv1x8 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? + blt .Lstrmm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lstrmm_kernel_L8_Mv1_22a + + .align 5 +.Lstrmm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_Mv1_22 + + .align 5 +.Lstrmm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Lstrmm_kernel_L8_Mv1_44 + + .align 5 +.Lstrmm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Lstrmm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Lstrmm_kernel_L8_Mv1_44 + +.Lstrmm_kernel_L8_Mv1_40: + + INITv1x8 + +.Lstrmm_kernel_L8_Mv1_44: + + ands counterL , tempK, #7 + ble .Lstrmm_kernel_L8_Mv1_100 + + .align 5 +.Lstrmm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Lstrmm_kernel_L8_Mv1_46 + +.Lstrmm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #8 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lstrmm_kernel_L8_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lstrmm_kernel_L8_Mv1_20 + +.Lstrmm_kernel_L8_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 8 * 4 + +#if !defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Lstrmm_kernel_L8_BEGIN + +/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ + + .align 5 +.Lstrmm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Lstrmm_kernel_L2_BEGIN + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA, origPA // pA = start of A array + +.Lstrmm_kernel_L4_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Lstrmm_kernel_L4_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #4 +#endif + + INITv1x4 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Lstrmm_kernel_L4_Mv1_44 + + .align 5 +.Lstrmm_kernel_L4_Mv1_22: + + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_Mv1_22 + +.Lstrmm_kernel_L4_Mv1_44: + + ands counterL , tempK, #7 + ble .Lstrmm_kernel_L4_Mv1_100 + + .align 5 +.Lstrmm_kernel_L4_Mv1_46: + + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lstrmm_kernel_L4_Mv1_46 + +.Lstrmm_kernel_L4_Mv1_100: + + SAVEv1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #4 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lstrmm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lstrmm_kernel_L4_Mv1_20 + + +.Lstrmm_kernel_L4_END: + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 4 * 4 +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Lstrmm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Lstrmm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Lstrmm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Lstrmm_kernel_L2_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #2 +#endif + + INITv1x2 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
+ ble .Lstrmm_kernel_L2_Mv1_44 + + .align 5 +.Lstrmm_kernel_L2_Mv1_22: + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_Mv1_22 + +.Lstrmm_kernel_L2_Mv1_44: + + ands counterL , tempK, #7 + ble .Lstrmm_kernel_L2_Mv1_100 + + .align 5 +.Lstrmm_kernel_L2_Mv1_46: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Lstrmm_kernel_L2_Mv1_46 + +.Lstrmm_kernel_L2_Mv1_100: + + SAVEv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #2 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + +.Lstrmm_kernel_L2_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lstrmm_kernel_L2_Mv1_20 + + +.Lstrmm_kernel_L2_END: + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Lstrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lstrmm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Lstrmm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Lstrmm_kernel_L1_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempOffset, #2 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #1 +#endif + + INITv1x1 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
+ ble .Lstrmm_kernel_L1_Mv1_44 + + .align 5 +.Lstrmm_kernel_L1_Mv1_22: + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_Mv1_22 + +.Lstrmm_kernel_L1_Mv1_44: + + ands counterL , tempK, #7 + ble .Lstrmm_kernel_L1_Mv1_100 + + .align 5 +.Lstrmm_kernel_L1_Mv1_46: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_Mv1_46 + +.Lstrmm_kernel_L1_Mv1_100: + + SAVEv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #1 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + + +.Lstrmm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lstrmm_kernel_L1_Mv1_20 + + +.Lstrmm_kernel_L1_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/symm_lcopy_sve.c b/kernel/arm64/symm_lcopy_sve.c new file mode 100644 index 000000000..6ba4afc8b --- /dev/null +++ b/kernel/arm64/symm_lcopy_sve.c @@ -0,0 +1,143 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, offset; + +#if defined(DOUBLE) + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); + svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, one_vec); + + svst1(pg, b, data_vec); + + b += active; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + +#else + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t N = n; + int32_t j = 0; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); + svint32_t temp2 = svmla_z(pg, posY_vec, temp, lda); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec = svld1_gather_index(pg, a, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, one_vec); + + svst1(pg, b, data_vec); + + b += active; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/kernel/arm64/symm_ucopy_sve.c b/kernel/arm64/symm_ucopy_sve.c new file mode 100644 index 000000000..32da5bd16 --- /dev/null +++ b/kernel/arm64/symm_ucopy_sve.c @@ -0,0 +1,143 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, offset; + +#if defined(DOUBLE) + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); + svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda); + svint64_t gat_ind = svsel(cmp, temp2, temp1); + + i = m; + while (i>0) { + svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, one_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + + svst1(pg, b, data_vec); + + b += active; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + +#else + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t N = n; + int32_t j = 0; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t 
index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); + svint32_t temp2 = svmla_z(pg, posY_vec, temp, lda); + svint32_t gat_ind = svsel(cmp, temp2, temp1); + + i = m; + while (i>0) { + svfloat32_t data_vec = svld1_gather_index(pg, a, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, one_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + + svst1(pg, b, data_vec); + + b += active; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/kernel/arm64/trmm_lncopy_sve_v1.c b/kernel/arm64/trmm_lncopy_sve_v1.c new file mode 100644 index 000000000..918e945ac --- /dev/null +++ b/kernel/arm64/trmm_lncopy_sve_v1.c @@ -0,0 +1,136 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + js = 0; + FLOAT *ao; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posY + posX * lda; + } else { + ao = a + posX + posY * lda; + } + + i = 0; + do + { + if (X > posY) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); +#else + svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); +#endif + svst1(pn, b, aj_vec); + ao ++; + b += n_active; + X ++; + i ++; + } else + if (X < posY) { + ao += lda; + b += n_active; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = *(ao+k*lda+j); + } + b[temp++] = ONE; + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k <= j; k++) { + b[temp++] = *(ao+k*lda+j); + } + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + } + } +#endif + ao += n_active; + b += n_active*n_active; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} diff --git a/kernel/arm64/trmm_ltcopy_sve_v1.c b/kernel/arm64/trmm_ltcopy_sve_v1.c new file mode 100644 index 000000000..b76cc56de --- /dev/null +++ b/kernel/arm64/trmm_ltcopy_sve_v1.c @@ -0,0 +1,136 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT *ao; + js = 0; +#ifdef DOUBLE + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posY + posX * lda; + } else { + ao = a + posX + posY * lda; + } + + i = 0; + do + { + if (X > posY) { + ao ++; + b += n_active; + X ++; + i ++; + } else + if (X < posY) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1(pn, ao); +#else + svfloat32_t aj_vec = svld1(pn, ao); +#endif + svst1(pn, b, aj_vec); + ao += lda; + b += n_active; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + } + b[temp++] = ONE; + for (int k = j+1; k < n_active; k++) { + b[temp++] = *(ao+j*lda+k); + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + } + for (int k = j; k < n_active; k++) { + b[temp++] = *(ao+j*lda+k); + } + } +#endif + ao += n_active * lda; + b += n_active*n_active; + X += n_active; + i += n_active; + } + } while (i < m); + + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + + return 0; +} diff --git a/kernel/arm64/trmm_uncopy_sve_v1.c b/kernel/arm64/trmm_uncopy_sve_v1.c new file mode 100644 index 000000000..75fa163ae --- /dev/null +++ b/kernel/arm64/trmm_uncopy_sve_v1.c @@ -0,0 +1,136 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + js = 0; + FLOAT *ao; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posX + posY * lda; + } else { + ao = a + posY + posX * lda; + } + + i = 0; + do + { + if (X < posY) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); +#else + svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); +#endif + svst1(pn, b, aj_vec); + ao ++; + b += n_active; + X ++; + i ++; + } else + if (X > posY) { + ao += lda; + b += n_active; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. 
*/ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + } + b[temp++] = ONE; + for (int k = j+1; k < n_active; k++) { + b[temp++] = *(ao+k*lda+j); + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + } + for (int k = j; k < n_active; k++) { + b[temp++] = *(ao+k*lda+j); + } + } +#endif + ao += n_active; + b += n_active*n_active; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} diff --git a/kernel/arm64/trmm_utcopy_sve_v1.c b/kernel/arm64/trmm_utcopy_sve_v1.c new file mode 100644 index 000000000..36a03242a --- /dev/null +++ b/kernel/arm64/trmm_utcopy_sve_v1.c @@ -0,0 +1,134 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT *ao; + js = 0; +#ifdef DOUBLE + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posX + posY * lda; + } else { + ao = a + posY + posX * lda; + } + + i = 0; + do + { + if (X < posY) { + ao ++; + b += n_active; + X ++; + i ++; + } else + if (X > posY) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1(pn, ao); +#else + svfloat32_t aj_vec = svld1(pn, ao); +#endif + svst1(pn, b, aj_vec); + ao += lda; + b += n_active; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = *(ao+j*lda+k); + } + b[temp++] = ONE; + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k <= j; k++) { + b[temp++] = *(ao+j*lda+k); + } + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + } + } +#endif + ao += n_active * lda; + b += n_active*n_active; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} diff --git a/kernel/arm64/zgemm_kernel_4x4_cortexa53.c b/kernel/arm64/zgemm_kernel_4x4_cortexa53.c new file mode 100644 index 000000000..aa0f7d72d --- /dev/null +++ b/kernel/arm64/zgemm_kernel_4x4_cortexa53.c @@ -0,0 +1,736 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +/******************************************************************************* + The complex GEMM kernels in OpenBLAS use static configuration of conjugation +modes via specific macros: + + MACRO_NAME | conjugation on matrix A | conjugation on matrix B | + ---------- | ----------------------- | ----------------------- | + NN/NT/TN/TT | No | No | + NR/NC/TR/TC | No | Yes | + RN/RT/CN/CT | Yes | No | + RR/RC/CR/CC | Yes | Yes | + + "conjugation on matrix A" means the complex conjugates of elements from +matrix A are used for matmul (rather than the original elements). "conjugation +on matrix B" means the complex conjugate of each element from matrix B is taken +for matrix multiplication, respectively. + + Complex numbers in arrays or matrices are usually packed together as an +array of struct (without padding): + struct complex_number { + FLOAT real_part; + FLOAT imag_part; + }; + + For a double complex array ARR[] which is usually DEFINED AS AN ARRAY OF +DOUBLE, the real part of its Kth complex number can be accessed as +ARR[K * 2], the imaginary part of the Kth complex number is ARR[2 * K + 1]. + + This file uses 2 ways to vectorize matrix multiplication of complex numbers: + +(1) Expanded-form + + During accumulation along direction K: + + Σk(a[0][k].real b[k][n].real) + accumulate Σk(a[0][k].imag b[k][n].real) + -------------------> . + | * b[k][n].real . + | (broadcasted) . + a[0][k].real Σk(a[v-1][k].real b[k][n].real) + a[0][k].imag Σk(a[v-1][k].imag b[k][n].real) + . VECTOR I +(vec_a) . + . + a[v-1][k].real Σk(a[0][k].real b[k][n].imag) + a[v-1][k].imag Σk(a[0][k].imag b[k][n].imag) + | . + | accumulate . + -------------------> . + * b[k][n].imag Σk(a[v-1][k].real b[k][n].imag) + (broadcasted) Σk(a[v-1][k].imag b[k][n].imag) + VECTOR II + + After accumulation, prior to storage: + + -1 -Σk(a[0][k].imag b[k][n].imag) + 1 Σk(a[0][k].real b[k][n].imag) + . . + VECTOR II permute and multiply . to get . + . . + -1 -Σk(a[v-1][k].imag b[k][n].imag) + 1 Σk(a[v-1][k].real b[k][n].imag) + + then add with VECTOR I to get the result vector of elements of C. + + 2 vector registers are needed for every v elements of C, with +v == sizeof(vector) / sizeof(complex) + +(2) Contracted-form + + During accumulation along direction K: + + (the K coordinate is not shown, since the operation is identical for each k) + + (load vector in mem) (load vector in mem) + a[0].r a[0].i ... a[v-1].r a[v-1].i a[v].r a[v].i ... a[2v-1].r a[2v-1]i + | | + | unzip operation (or VLD2 in arm neon) | + ----------------------------------------------------- + | + | + -------------------------------------------------- + | | + | | + v v + a[0].real ... a[2v-1].real a[0].imag ... 
a[2v-1].imag + | | | | + | | * b[i].imag(broadcast) | | + * b[i].real | -----------------------------|---- | * b[i].real + (broadcast) | | | | (broadcast) + | ------------------------------ | | + + | - | * b[i].imag(broadcast) + | + | + v v v v + (accumulate) (accumulate) + c[0].real ... c[2v-1].real c[0].imag ... c[2v-1].imag + VECTOR_REAL VECTOR_IMAG + + After accumulation, VECTOR_REAL and VECTOR_IMAG are zipped (interleaved) +then stored to matrix C directly. + + For 2v elements of C, only 2 vector registers are needed, while +4 registers are required for expanded-form. +(v == sizeof(vector) / sizeof(complex)) + + For AArch64 zgemm, 4x4 kernel needs 32 128-bit NEON registers +to store elements of C when using expanded-form calculation, where +the register spilling will occur. So contracted-form operation is +selected for 4x4 kernel. As for all other combinations of unroll parameters +(2x4, 4x2, 2x2, and so on), expanded-form mode is used to bring more +NEON registers into usage to hide latency of multiply-add instructions. +******************************************************************************/ + +static inline float64x2_t set_f64x2(double lo, double hi) { + float64x2_t ret = vdupq_n_f64(0); + ret = vsetq_lane_f64(lo, ret, 0); + ret = vsetq_lane_f64(hi, ret, 1); + return ret; +} + +static inline float64x2x2_t expand_alpha(double alpha_r, double alpha_i) { + float64x2x2_t ret = {{ set_f64x2(alpha_r, alpha_i), set_f64x2(-alpha_i, alpha_r) }}; + return ret; +} + +/***************************************************************** + * operation: *c += alpha * c_value //complex multiplication + * expanded_alpha: { { alpha_r, alpha_i }, { -alpha_i, alpha_r } + * expanded_c: {{ arbr, aibr }, { arbi, aibi }} + ****************************************************************/ +static inline void store_1c(double *c, float64x2x2_t expanded_c, + float64x2x2_t expanded_alpha) { + float64x2_t ld = vld1q_f64(c); +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + double real = vgetq_lane_f64(expanded_c.val[0], 0) - vgetq_lane_f64(expanded_c.val[1], 1); + double imag = vgetq_lane_f64(expanded_c.val[0], 1) + vgetq_lane_f64(expanded_c.val[1], 0); +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + double real = vgetq_lane_f64(expanded_c.val[0], 0) + vgetq_lane_f64(expanded_c.val[1], 1); + double imag = vgetq_lane_f64(expanded_c.val[0], 1) - vgetq_lane_f64(expanded_c.val[1], 0); +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + double real = vgetq_lane_f64(expanded_c.val[0], 0) + vgetq_lane_f64(expanded_c.val[1], 1); + double imag = -vgetq_lane_f64(expanded_c.val[0], 1) + vgetq_lane_f64(expanded_c.val[1], 0); +#else + double real = vgetq_lane_f64(expanded_c.val[0], 0) - vgetq_lane_f64(expanded_c.val[1], 1); + double imag = -vgetq_lane_f64(expanded_c.val[0], 1) - vgetq_lane_f64(expanded_c.val[1], 0); +#endif + ld = vfmaq_n_f64(ld, expanded_alpha.val[0], real); + vst1q_f64(c, vfmaq_n_f64(ld, expanded_alpha.val[1], imag)); +} + +static inline void pref_c_4(const double *c) { + __asm__ __volatile__("prfm pstl1keep,[%0]; prfm pstl1keep,[%0,#56]\n\t"::"r"(c):); +} + +static inline float64x2x2_t add_ec(float64x2x2_t ec1, float64x2x2_t ec2) { + float64x2x2_t ret = {{ vaddq_f64(ec1.val[0], ec2.val[0]), + vaddq_f64(ec1.val[1], ec2.val[1]) }}; + return ret; +} + +static inline float64x2x2_t update_ec(float64x2x2_t ec, float64x2_t a, float64x2_t b) { + float64x2x2_t ret = {{ vfmaq_laneq_f64(ec.val[0], a, b, 0), vfmaq_laneq_f64(ec.val[1], a, b, 1) }}; + return ret; +} + 
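As a reading aid for the expanded-form scheme described above, the following minimal scalar sketch (illustrative only; the names vec1/vec2, a_r/a_i and so on are hypothetical and not part of the kernel) shows what one accumulator pair built up by update_ec() and the final alpha update applied by store_1c() compute for a single element of C in the NN case:

/* Scalar sketch of expanded-form complex accumulation for one element of C
 * (NN case, no conjugation). vec1/vec2 play the roles of VECTOR I/VECTOR II
 * above; in the kernel they are NEON registers updated by update_ec() and
 * combined with alpha by store_1c(). */
#include <stdio.h>

int main(void) {
    /* one column strip of A (length K = 2) and the matching b values */
    double a_r[2] = {1.0, 2.0}, a_i[2] = {0.5, -1.0};
    double b_r[2] = {3.0, 0.0}, b_i[2] = {1.0, 2.0};
    double alphar = 1.0, alphai = 0.0;
    double c_r = 0.0, c_i = 0.0;          /* element of C, updated in place */

    /* accumulation phase: two partial sums, as in VECTOR I / VECTOR II */
    double vec1_r = 0.0, vec1_i = 0.0;    /* sums of a[k] * b[k].real */
    double vec2_r = 0.0, vec2_i = 0.0;    /* sums of a[k] * b[k].imag */
    for (int k = 0; k < 2; k++) {
        vec1_r += a_r[k] * b_r[k];  vec1_i += a_i[k] * b_r[k];
        vec2_r += a_r[k] * b_i[k];  vec2_i += a_i[k] * b_i[k];
    }

    /* combination phase: real = vec1_r - vec2_i, imag = vec1_i + vec2_r,
     * then C += alpha * (real + i*imag), mirroring the NN path of store_1c() */
    double real = vec1_r - vec2_i;
    double imag = vec1_i + vec2_r;
    c_r += alphar * real - alphai * imag;
    c_i += alphar * imag + alphai * real;

    printf("C = (%g, %g)\n", c_r, c_i);
    return 0;
}

The contracted form described above differs only in that the running real parts and the running imaginary parts of C are kept in two separate vectors and interleaved once before the store, which is why it needs half as many accumulator registers per element of C.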
+static inline float64x2x2_t init() { + float64x2x2_t ret = {{ vdupq_n_f64(0), vdupq_n_f64(0) }}; + return ret; +} + +static inline void kernel_1x1(const double *sa, const double *sb, double *C, + BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + + for (; K > 3; K -= 4) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2), + a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2), + b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a2, b2); + c3 = update_ec(c3, a3, b3); + c4 = update_ec(c4, a4, b4); + } + c1 = add_ec(c1, c2); + c3 = add_ec(c3, c4); + c1 = add_ec(c1, c3); + for (; K; K--) { + c1 = update_ec(c1, vld1q_f64(sa), vld1q_f64(sb)); sa += 2; sb += 2; + } + store_1c(C, c1, expanded_alpha); +} + +static inline void kernel_2x1(const double *sa, const double *sb, double *C, + BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + + for (; K > 1; K -= 2) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2), + a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a2, b1); + c3 = update_ec(c3, a3, b2); + c4 = update_ec(c4, a4, b2); + } + c1 = add_ec(c1, c3); + c2 = add_ec(c2, c4); + if (K) { + float64x2_t b1 = vld1q_f64(sb); + c1 = update_ec(c1, vld1q_f64(sa), b1); + c2 = update_ec(c2, vld1q_f64(sa + 2), b1); + } + store_1c(C, c1, expanded_alpha); + store_1c(C + 2, c2, expanded_alpha); +} + +static inline void kernel_1x2(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + + for (; K > 1; K -= 2) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2), + b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a1, b2); + c3 = update_ec(c3, a2, b3); + c4 = update_ec(c4, a2, b4); + } + c1 = add_ec(c1, c3); + c2 = add_ec(c2, c4); + if (K) { + float64x2_t a1 = vld1q_f64(sa); + c1 = update_ec(c1, a1, vld1q_f64(sb)); + c2 = update_ec(c2, a1, vld1q_f64(sb + 2)); + } + store_1c(C, c1, expanded_alpha); + store_1c(C + LDC * 2, c2, expanded_alpha); +} + +static inline void kernel_2x2(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + + for (; K; K--) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a2, b1); + c3 = update_ec(c3, a1, b2); + c4 = update_ec(c4, a2, b2); + } + store_1c(C, c1, expanded_alpha); + store_1c(C + 2, c2, expanded_alpha); C += LDC * 2; + store_1c(C, c3, expanded_alpha); + store_1c(C + 2, c4, expanded_alpha); +} + +static inline void kernel_4x1(const double *sa, const double *sb, double *C, + BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t 
expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + pref_c_4(C); + + for (; K; K--) { + float64x2_t b1 = vld1q_f64(sb); sb += 2; + c1 = update_ec(c1, vld1q_f64(sa), b1); + c2 = update_ec(c2, vld1q_f64(sa + 2), b1); + c3 = update_ec(c3, vld1q_f64(sa + 4), b1); + c4 = update_ec(c4, vld1q_f64(sa + 6), b1); + sa += 8; + } + store_1c(C, c1, expanded_alpha); + store_1c(C + 2, c2, expanded_alpha); + store_1c(C + 4, c3, expanded_alpha); + store_1c(C + 6, c4, expanded_alpha); +} + +static inline void kernel_4x2(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4, c5, c6, c7, c8; + c1 = c2 = c3 = c4 = c5 = c6 = c7 = c8 = init(); + pref_c_4(C); + pref_c_4(C + LDC * 2); + + for (; K; K--) { + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2), + a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a2, b1); + c3 = update_ec(c3, a3, b1); + c4 = update_ec(c4, a4, b1); + c5 = update_ec(c5, a1, b2); + c6 = update_ec(c6, a2, b2); + c7 = update_ec(c7, a3, b2); + c8 = update_ec(c8, a4, b2); + } + store_1c(C, c1, expanded_alpha); + store_1c(C + 2, c2, expanded_alpha); + store_1c(C + 4, c3, expanded_alpha); + store_1c(C + 6, c4, expanded_alpha); C += LDC * 2; + store_1c(C, c5, expanded_alpha); + store_1c(C + 2, c6, expanded_alpha); + store_1c(C + 4, c7, expanded_alpha); + store_1c(C + 6, c8, expanded_alpha); +} + +static inline void kernel_1x4(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + + for (; K; K--) { + float64x2_t a1 = vld1q_f64(sa); sa += 2; + c1 = update_ec(c1, a1, vld1q_f64(sb)); + c2 = update_ec(c2, a1, vld1q_f64(sb + 2)); + c3 = update_ec(c3, a1, vld1q_f64(sb + 4)); + c4 = update_ec(c4, a1, vld1q_f64(sb + 6)); + sb += 8; + } + store_1c(C, c1, expanded_alpha); C += LDC * 2; + store_1c(C, c2, expanded_alpha); C += LDC * 2; + store_1c(C, c3, expanded_alpha); C += LDC * 2; + store_1c(C, c4, expanded_alpha); +} + +static inline void kernel_2x4(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4, c5, c6, c7, c8; + c1 = c2 = c3 = c4 = c5 = c6 = c7 = c8 = init(); + + for (; K; K--) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2), + b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a2, b1); + c3 = update_ec(c3, a1, b2); + c4 = update_ec(c4, a2, b2); + c5 = update_ec(c5, a1, b3); + c6 = update_ec(c6, a2, b3); + c7 = update_ec(c7, a1, b4); + c8 = update_ec(c8, a2, b4); + } + store_1c(C, c1, expanded_alpha); + store_1c(C + 2, c2, expanded_alpha); C += LDC * 2; + store_1c(C, c3, expanded_alpha); + store_1c(C + 2, c4, expanded_alpha); C += LDC * 2; + store_1c(C, c5, expanded_alpha); + store_1c(C + 2, c6, expanded_alpha); C += LDC * 2; + store_1c(C, c7, expanded_alpha); + store_1c(C + 2, c8, expanded_alpha); +} + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define FMLA_RI "fmla " 
+#define FMLA_IR "fmla " +#define FMLA_II "fmls " +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define FMLA_RI "fmls " +#define FMLA_IR "fmla " +#define FMLA_II "fmla " +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define FMLA_RI "fmla " +#define FMLA_IR "fmls " +#define FMLA_II "fmla " +#else +#define FMLA_RI "fmls " +#define FMLA_IR "fmls " +#define FMLA_II "fmls " +#endif +#define FMLA_RR "fmla " + +static inline void store_4c(double *C, float64x2_t up_r, float64x2_t up_i, + float64x2_t lo_r, float64x2_t lo_i, double alphar, double alphai) { + float64x2x2_t up = vld2q_f64(C), lo = vld2q_f64(C + 4); + up.val[0] = vfmaq_n_f64(up.val[0], up_r, alphar); + up.val[1] = vfmaq_n_f64(up.val[1], up_r, alphai); + lo.val[0] = vfmaq_n_f64(lo.val[0], lo_r, alphar); + lo.val[1] = vfmaq_n_f64(lo.val[1], lo_r, alphai); + up.val[0] = vfmsq_n_f64(up.val[0], up_i, alphai); + up.val[1] = vfmaq_n_f64(up.val[1], up_i, alphar); + lo.val[0] = vfmsq_n_f64(lo.val[0], lo_i, alphai); + lo.val[1] = vfmaq_n_f64(lo.val[1], lo_i, alphar); + vst2q_f64(C, up); + vst2q_f64(C + 4, lo); +} + +static inline void kernel_4x4(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + float64x2_t c1r, c1i, c2r, c2i; + float64x2_t c3r, c3i, c4r, c4i; + float64x2_t c5r, c5i, c6r, c6i; + float64x2_t c7r, c7i, c8r, c8i; + + const double *pref_ = C; + pref_c_4(pref_); pref_ += LDC * 2; + pref_c_4(pref_); pref_ += LDC * 2; + pref_c_4(pref_); pref_ += LDC * 2; + pref_c_4(pref_); + + __asm__ __volatile__( + "cmp %[K],#0\n\t" + "movi %[c1r].16b,#0; movi %[c1i].16b,#0; movi %[c2r].16b,#0; movi %[c2i].16b,#0\n\t" + "movi %[c3r].16b,#0; movi %[c3i].16b,#0; movi %[c4r].16b,#0; movi %[c4i].16b,#0\n\t" + "movi %[c5r].16b,#0; movi %[c5i].16b,#0; movi %[c6r].16b,#0; movi %[c6i].16b,#0\n\t" + "movi %[c7r].16b,#0; movi %[c7i].16b,#0; movi %[c8r].16b,#0; movi %[c8i].16b,#0\n\t" + "beq 4f; cmp %[K],#2\n\t" + "ld2 {v0.2d,v1.2d},[%[sa]],#32; ldp q4,q5,[%[sb]],#32\n\t" + "ld2 {v2.2d,v3.2d},[%[sa]],#32; ldr q6,[%[sb]]; ldr d7,[%[sb],#16]\n\t" + "ldr x0,[%[sb],#24]; add %[sb],%[sb],#32\n\t" + "beq 2f; blt 3f\n\t" + "1:\n\t" + "fmov v7.d[1],x0; ldr d8,[%[sa]]\n\t" + FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]; ldr x0,[%[sa],#16]\n\t" + FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t" + FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t" + "fmov v8.d[1],x0; ldr d9,[%[sa],#8]\n\t" + FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]; ldr x0,[%[sa],#24]\n\t" + FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t" + FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t" + "fmov v9.d[1],x0; ldr d10,[%[sa],#32]\n\t" + FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]; ldr x0,[%[sa],#48]\n\t" + FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t" + FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t" + "fmov v10.d[1],x0; ldr d11,[%[sa],#40]\n\t" + FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#56]\n\t" + FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t" + FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t" + "fmov v11.d[1],x0; ldr d12,[%[sb]]\n\t" + FMLA_II "%[c3r].2d,v1.2d,v5.d[1]; ldr x0,[%[sb],#8]\n\t" + FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t" + FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t" + "fmov v12.d[1],x0; ldr d13,[%[sb],#16]\n\t" + FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]; ldr x0,[%[sb],#24]\n\t" + FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t" + FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t" + "fmov v13.d[1],x0; ldr d14,[%[sb],#32]\n\t" + FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]; ldr x0,[%[sb],#40]\n\t" + FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t" + FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t" + "fmov v14.d[1],x0; ldr d15,[%[sb],#48]\n\t" + FMLA_II 
"%[c6r].2d,v3.2d,v6.d[1]; ldr x0,[%[sb],#56]\n\t" + FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t" + FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t" + "fmov v15.d[1],x0; ldr d4,[%[sb],#64]\n\t" + FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]; ldr x0,[%[sb],#72]\n\t" + FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t" + FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t" + "fmov v4.d[1],x0; ldr d5,[%[sb],#80]\n\t" + FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]; ldr x0,[%[sb],#88]\n\t" + FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t" + FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t" + "fmov v5.d[1],x0; ldr d0,[%[sa],#64]\n\t" + FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]; ldr x0,[%[sa],#80]\n\t" + FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]\n\t" + FMLA_RR "%[c1r].2d,v8.2d,v12.d[0]\n\t" + "fmov v0.d[1],x0; ldr d1,[%[sa],#72]\n\t" + FMLA_RR "%[c2r].2d,v10.2d,v12.d[0]; ldr x0,[%[sa],#88]\n\t" + FMLA_RI "%[c1i].2d,v8.2d,v12.d[1]\n\t" + FMLA_RI "%[c2i].2d,v10.2d,v12.d[1]\n\t" + "fmov v1.d[1],x0; ldr d2,[%[sa],#96]\n\t" + FMLA_II "%[c1r].2d,v9.2d,v12.d[1]; ldr x0,[%[sa],#112]\n\t" + FMLA_II "%[c2r].2d,v11.2d,v12.d[1]\n\t" + FMLA_IR "%[c1i].2d,v9.2d,v12.d[0]\n\t" + "fmov v2.d[1],x0; ldr d3,[%[sa],#104]\n\t" + FMLA_IR "%[c2i].2d,v11.2d,v12.d[0]; ldr x0,[%[sa],#120]\n\t" + FMLA_RR "%[c3r].2d,v8.2d,v13.d[0]\n\t" + FMLA_RR "%[c4r].2d,v10.2d,v13.d[0]\n\t" + "fmov v3.d[1],x0; ldr d6,[%[sb],#96]\n\t" + FMLA_RI "%[c3i].2d,v8.2d,v13.d[1]; ldr x0,[%[sb],#104]\n\t" + FMLA_RI "%[c4i].2d,v10.2d,v13.d[1]\n\t" + FMLA_II "%[c3r].2d,v9.2d,v13.d[1]\n\t" + "fmov v6.d[1],x0; ldr d7,[%[sb],#112]\n\t" + FMLA_II "%[c4r].2d,v11.2d,v13.d[1]; ldr x0,[%[sb],#120]\n\t" + FMLA_IR "%[c3i].2d,v9.2d,v13.d[0]\n\t" + FMLA_IR "%[c4i].2d,v11.2d,v13.d[0]; prfm pldl1keep,[%[sa],#256]\n\t" + FMLA_RR "%[c5r].2d,v8.2d,v14.d[0]\n\t" + FMLA_RR "%[c6r].2d,v10.2d,v14.d[0]; prfm pldl1keep,[%[sa],#320]\n\t" + FMLA_RI "%[c5i].2d,v8.2d,v14.d[1]\n\t" + FMLA_RI "%[c6i].2d,v10.2d,v14.d[1]; prfm pldl1keep,[%[sb],#256]\n\t" + FMLA_II "%[c5r].2d,v9.2d,v14.d[1]\n\t" + FMLA_II "%[c6r].2d,v11.2d,v14.d[1]; prfm pldl1keep,[%[sb],#320]\n\t" + FMLA_IR "%[c5i].2d,v9.2d,v14.d[0]\n\t" + FMLA_IR "%[c6i].2d,v11.2d,v14.d[0]; add %[sa],%[sa],#128\n\t" + FMLA_RR "%[c7r].2d,v8.2d,v15.d[0]\n\t" + FMLA_RR "%[c8r].2d,v10.2d,v15.d[0]; add %[sb],%[sb],#128\n\t" + FMLA_RI "%[c7i].2d,v8.2d,v15.d[1]\n\t" + FMLA_RI "%[c8i].2d,v10.2d,v15.d[1]; sub %[K],%[K],#2\n\t" + FMLA_II "%[c7r].2d,v9.2d,v15.d[1]\n\t" + FMLA_II "%[c8r].2d,v11.2d,v15.d[1]; cmp %[K],#2\n\t" + FMLA_IR "%[c7i].2d,v9.2d,v15.d[0]\n\t" + FMLA_IR "%[c8i].2d,v11.2d,v15.d[0]; bgt 1b; blt 3f\n\t" + "2:\n\t" + "fmov v7.d[1],x0; ldr d8,[%[sa]]\n\t" + FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]; ldr x0,[%[sa],#16]\n\t" + FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t" + FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t" + "fmov v8.d[1],x0; ldr d9,[%[sa],#8]\n\t" + FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]; ldr x0,[%[sa],#24]\n\t" + FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t" + FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t" + "fmov v9.d[1],x0; ldr d10,[%[sa],#32]\n\t" + FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]; ldr x0,[%[sa],#48]\n\t" + FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t" + FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t" + "fmov v10.d[1],x0; ldr d11,[%[sa],#40]\n\t" + FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#56]\n\t" + FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t" + FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t" + "fmov v11.d[1],x0; ldr d12,[%[sb]]\n\t" + FMLA_II "%[c3r].2d,v1.2d,v5.d[1]; ldr x0,[%[sb],#8]\n\t" + FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t" + FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t" + "fmov v12.d[1],x0; ldr d13,[%[sb],#16]\n\t" + FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]; ldr x0,[%[sb],#24]\n\t" + FMLA_RR 
"%[c5r].2d,v0.2d,v6.d[0]\n\t" + FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t" + "fmov v13.d[1],x0; ldr d14,[%[sb],#32]\n\t" + FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]; ldr x0,[%[sb],#40]\n\t" + FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t" + FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t" + "fmov v14.d[1],x0; ldr d15,[%[sb],#48]\n\t" + FMLA_II "%[c6r].2d,v3.2d,v6.d[1]; ldr x0,[%[sb],#56]\n\t" + FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t" + FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t" + "fmov v15.d[1],x0\n\t" + FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]\n\t" + FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t" + FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t" + FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]\n\t" + FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t" + FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t" + FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]\n\t" + FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]\n\t" + FMLA_RR "%[c1r].2d,v8.2d,v12.d[0]\n\t" + FMLA_RR "%[c2r].2d,v10.2d,v12.d[0]\n\t" + FMLA_RI "%[c1i].2d,v8.2d,v12.d[1]\n\t" + FMLA_RI "%[c2i].2d,v10.2d,v12.d[1]\n\t" + FMLA_II "%[c1r].2d,v9.2d,v12.d[1]\n\t" + FMLA_II "%[c2r].2d,v11.2d,v12.d[1]\n\t" + FMLA_IR "%[c1i].2d,v9.2d,v12.d[0]\n\t" + FMLA_IR "%[c2i].2d,v11.2d,v12.d[0]\n\t" + FMLA_RR "%[c3r].2d,v8.2d,v13.d[0]\n\t" + FMLA_RR "%[c4r].2d,v10.2d,v13.d[0]\n\t" + FMLA_RI "%[c3i].2d,v8.2d,v13.d[1]\n\t" + FMLA_RI "%[c4i].2d,v10.2d,v13.d[1]\n\t" + FMLA_II "%[c3r].2d,v9.2d,v13.d[1]\n\t" + FMLA_II "%[c4r].2d,v11.2d,v13.d[1]\n\t" + FMLA_IR "%[c3i].2d,v9.2d,v13.d[0]\n\t" + FMLA_IR "%[c4i].2d,v11.2d,v13.d[0]\n\t" + FMLA_RR "%[c5r].2d,v8.2d,v14.d[0]\n\t" + FMLA_RR "%[c6r].2d,v10.2d,v14.d[0]\n\t" + FMLA_RI "%[c5i].2d,v8.2d,v14.d[1]\n\t" + FMLA_RI "%[c6i].2d,v10.2d,v14.d[1]\n\t" + FMLA_II "%[c5r].2d,v9.2d,v14.d[1]\n\t" + FMLA_II "%[c6r].2d,v11.2d,v14.d[1]\n\t" + FMLA_IR "%[c5i].2d,v9.2d,v14.d[0]\n\t" + FMLA_IR "%[c6i].2d,v11.2d,v14.d[0]; add %[sa],%[sa],#64\n\t" + FMLA_RR "%[c7r].2d,v8.2d,v15.d[0]\n\t" + FMLA_RR "%[c8r].2d,v10.2d,v15.d[0]; add %[sb],%[sb],#64\n\t" + FMLA_RI "%[c7i].2d,v8.2d,v15.d[1]\n\t" + FMLA_RI "%[c8i].2d,v10.2d,v15.d[1]; sub %[K],%[K],#2\n\t" + FMLA_II "%[c7r].2d,v9.2d,v15.d[1]\n\t" + FMLA_II "%[c8r].2d,v11.2d,v15.d[1]\n\t" + FMLA_IR "%[c7i].2d,v9.2d,v15.d[0]\n\t" + FMLA_IR "%[c8i].2d,v11.2d,v15.d[0]; b 4f\n\t" + "3:\n\t" + "fmov v7.d[1],x0\n\t" + FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]\n\t" + FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t" + FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t" + FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]\n\t" + FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t" + FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t" + FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]\n\t" + FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t" + FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t" + FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]\n\t" + FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t" + FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t" + FMLA_II "%[c3r].2d,v1.2d,v5.d[1]\n\t" + FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t" + FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t" + FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]\n\t" + FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t" + FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t" + FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]\n\t" + FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t" + FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t" + FMLA_II "%[c6r].2d,v3.2d,v6.d[1]\n\t" + FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t" + FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t" + FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]\n\t" + FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t" + FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t" + FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]\n\t" + FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t" + FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t" + FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]\n\t" + FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]; sub %[K],%[K],#1\n\t" + "4:\n\t" + :[c1r]"=w"(c1r), 
[c1i]"=w"(c1i), [c2r]"=w"(c2r), [c2i]"=w"(c2i), + [c3r]"=w"(c3r), [c3i]"=w"(c3i), [c4r]"=w"(c4r), [c4i]"=w"(c4i), + [c5r]"=w"(c5r), [c5i]"=w"(c5i), [c6r]"=w"(c6r), [c6i]"=w"(c6i), + [c7r]"=w"(c7r), [c7i]"=w"(c7i), [c8r]"=w"(c8r), [c8i]"=w"(c8i), + [K]"+r"(K), [sa]"+r"(sa), [sb]"+r"(sb) + ::"cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"); + + store_4c(C, c1r, c1i, c2r, c2i, alphar, alphai); C += LDC * 2; + store_4c(C, c3r, c3i, c4r, c4i, alphar, alphai); C += LDC * 2; + store_4c(C, c5r, c5i, c6r, c6i, alphar, alphai); C += LDC * 2; + store_4c(C, c7r, c7i, c8r, c8i, alphar, alphai); +} + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, + FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) { + + BLASLONG n_left = N; + for (; n_left >= 4; n_left -= 4) { + const FLOAT *a_ = sa; + FLOAT *c_ = C; + BLASLONG m_left = M; + for (; m_left >= 4; m_left -= 4) { + kernel_4x4(a_, sb, c_, LDC, K, alphar, alphai); + a_ += 8 * K; + c_ += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x4(a_, sb, c_, LDC, K, alphar, alphai); + a_ += 4 * K; + c_ += 4; + } + if (m_left) { + kernel_1x4(a_, sb, c_, LDC, K, alphar, alphai); + } + sb += 8 * K; + C += 8 * LDC; + } + if (n_left >= 2) { + n_left -= 2; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + BLASLONG m_left = M; + for (; m_left >= 4; m_left -= 4) { + kernel_4x2(a_, sb, c_, LDC, K, alphar, alphai); + a_ += 8 * K; + c_ += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x2(a_, sb, c_, LDC, K, alphar, alphai); + a_ += 4 * K; + c_ += 4; + } + if (m_left) { + kernel_1x2(a_, sb, c_, LDC, K, alphar, alphai); + } + sb += 4 * K; + C += 4 * LDC; + } + if (n_left) { + const FLOAT *a_ = sa; + FLOAT *c_ = C; + BLASLONG m_left = M; + for (; m_left >= 4; m_left -= 4) { + kernel_4x1(a_, sb, c_, K, alphar, alphai); + a_ += 8 * K; + c_ += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x1(a_, sb, c_, K, alphar, alphai); + a_ += 4 * K; + c_ += 4; + } + if (m_left) { + kernel_1x1(a_, sb, c_, K, alphar, alphai); + } + } + return 0; +} + diff --git a/kernel/mips/KERNEL.generic b/kernel/mips/KERNEL.generic new file mode 100644 index 000000000..17f2ef976 --- /dev/null +++ b/kernel/mips/KERNEL.generic @@ -0,0 +1,160 @@ +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = 
../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Pure C for other kernels +SAMAXKERNEL = ../mips/amax.c +DAMAXKERNEL = ../mips/amax.c +CAMAXKERNEL = ../mips/zamax.c +ZAMAXKERNEL = ../mips/zamax.c + +SAMINKERNEL = ../mips/amin.c +DAMINKERNEL = ../mips/amin.c +CAMINKERNEL = ../mips/zamin.c +ZAMINKERNEL = ../mips/zamin.c + +SMAXKERNEL = ../mips/max.c +DMAXKERNEL = ../mips/max.c + +SMINKERNEL = ../mips/min.c +DMINKERNEL = ../mips/min.c + +ISAMAXKERNEL = ../mips/iamax.c +IDAMAXKERNEL = ../mips/iamax.c +ICAMAXKERNEL = ../mips/izamax.c +IZAMAXKERNEL = ../mips/izamax.c + +ISAMINKERNEL = ../mips/iamin.c +IDAMINKERNEL = ../mips/iamin.c +ICAMINKERNEL = ../mips/izamin.c +IZAMINKERNEL = ../mips/izamin.c + +ISMAXKERNEL = ../mips/imax.c +IDMAXKERNEL = ../mips/imax.c + +ISMINKERNEL = ../mips/imin.c +IDMINKERNEL = ../mips/imin.c + +SASUMKERNEL = ../mips/asum.c +DASUMKERNEL = ../mips/asum.c +CASUMKERNEL = ../mips/zasum.c +ZASUMKERNEL = ../mips/zasum.c + +SSUMKERNEL = ../mips/sum.c +DSUMKERNEL = ../mips/sum.c +CSUMKERNEL = ../mips/zsum.c +ZSUMKERNEL = ../mips/zsum.c + +SAXPYKERNEL = ../mips/axpy.c +DAXPYKERNEL = ../mips/axpy.c +CAXPYKERNEL = ../mips/zaxpy.c +ZAXPYKERNEL = ../mips/zaxpy.c + +SCOPYKERNEL = ../mips/copy.c +DCOPYKERNEL = ../mips/copy.c +CCOPYKERNEL = ../mips/zcopy.c +ZCOPYKERNEL = ../mips/zcopy.c + +SDOTKERNEL = ../mips/dot.c +DDOTKERNEL = ../mips/dot.c +CDOTKERNEL = ../mips/zdot.c +ZDOTKERNEL = ../mips/zdot.c + +SNRM2KERNEL = ../mips/nrm2.c +DNRM2KERNEL = ../mips/nrm2.c +CNRM2KERNEL = ../mips/znrm2.c +ZNRM2KERNEL = ../mips/znrm2.c + +SROTKERNEL = ../mips/rot.c +DROTKERNEL = ../mips/rot.c +CROTKERNEL = ../mips/zrot.c +ZROTKERNEL = ../mips/zrot.c + +SSCALKERNEL = ../mips/scal.c +DSCALKERNEL = ../mips/scal.c +CSCALKERNEL = ../mips/zscal.c +ZSCALKERNEL = ../mips/zscal.c + +SSWAPKERNEL = ../mips/swap.c +DSWAPKERNEL = ../mips/swap.c +CSWAPKERNEL = ../mips/zswap.c +ZSWAPKERNEL = ../mips/zswap.c + +SGEMVNKERNEL = ../mips/gemv_n.c +DGEMVNKERNEL = ../mips/gemv_n.c +CGEMVNKERNEL = ../mips/zgemv_n.c +ZGEMVNKERNEL = ../mips/zgemv_n.c + +SGEMVTKERNEL = ../mips/gemv_t.c +DGEMVTKERNEL = ../mips/gemv_t.c +CGEMVTKERNEL = ../mips/zgemv_t.c +ZGEMVTKERNEL = ../mips/zgemv_t.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +QSYMV_U_KERNEL = ../generic/symv_k.c +QSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c +XSYMV_U_KERNEL = ../generic/zsymv_k.c +XSYMV_L_KERNEL = ../generic/zsymv_k.c + +ZHEMV_U_KERNEL = ../generic/zhemv_k.c +ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 873653f1e..79d889fe0 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -1,7 
+1,6 @@ -ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) +ifeq ($(HAVE_GAS), 1) include $(KERNELDIR)/KERNEL.POWER8 else - #SGEMM_BETA = ../generic/gemm_beta.c #DGEMM_BETA = ../generic/gemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c @@ -33,6 +32,16 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +SGEMM_SMALL_M_PERMIT = gemm_small_kernel_permit_power10.c +SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_power10.c +SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_nn_power10.c +SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_power10.c +SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_nt_power10.c +SGEMM_SMALL_K_TN = sgemm_small_kernel_tn_power10.c +SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_tn_power10.c +SGEMM_SMALL_K_TT = sgemm_small_kernel_tt_power10.c +SGEMM_SMALL_K_B0_TT = sgemm_small_kernel_tt_power10.c + DGEMMKERNEL = dgemm_kernel_power10.c DGEMMINCOPY = DGEMMITCOPY = @@ -43,7 +52,18 @@ DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMM_SMALL_M_PERMIT = gemm_small_kernel_permit_power10.c +DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_power10.c +DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_nt_power10.c +DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_power10.c +DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_nn_power10.c +DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_power10.c +DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c +DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c +DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c + CGEMMKERNEL = cgemm_kernel_power10.S +#CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_4.c @@ -218,5 +238,4 @@ QCABS_KERNEL = ../generic/cabs.c #Dump kernel CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c - endif diff --git a/kernel/power/caxpy_microk_power10.c b/kernel/power/caxpy_microk_power10.c index 56a5ab47a..902eba82c 100644 --- a/kernel/power/caxpy_microk_power10.c +++ b/kernel/power/caxpy_microk_power10.c @@ -36,9 +36,12 @@ static void caxpy_kernel_8 (long n, float *x, float *y, #endif const float *mvecp = mvec; /* We have to load reverse mask for big endian. 
*/ - /* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */ - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; +#else __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; +#endif + long ytmp; __asm__ @@ -112,6 +115,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%4) \n\t" + "stxv 49, 16(%4) \n\t" + "stxv 50, 32(%4) \n\t" + "stxv 51, 48(%4) \n\t" + "stxv 34, 64(%4) \n\t" + "stxv 35, 80(%4) \n\t" + "stxv 38, 96(%4) \n\t" + "stxv 39, 112(%4) \n\t" +#else "stxv 49, 0(%4) \n\t" "stxv 48, 16(%4) \n\t" "stxv 51, 32(%4) \n\t" @@ -120,6 +133,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "stxv 34, 80(%4) \n\t" "stxv 39, 96(%4) \n\t" "stxv 38, 112(%4) \n\t" +#endif "addi %4, %4, 128 \n\t" "xxperm 52, 40, %x10 \n\t" // exchange real and imag part @@ -163,6 +177,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%4) \n\t" + "stxv 49, 16(%4) \n\t" + "stxv 50, 32(%4) \n\t" + "stxv 51, 48(%4) \n\t" + "stxv 34, 64(%4) \n\t" + "stxv 35, 80(%4) \n\t" + "stxv 38, 96(%4) \n\t" + "stxv 39, 112(%4) \n\t" +#else "stxv 49, 0(%4) \n\t" "stxv 48, 16(%4) \n\t" "stxv 51, 32(%4) \n\t" @@ -171,6 +195,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "stxv 34, 80(%4) \n\t" "stxv 39, 96(%4) \n\t" "stxv 38, 112(%4) \n\t" +#endif "#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n" : diff --git a/kernel/power/ccopy_microk_power10.c b/kernel/power/ccopy_microk_power10.c index 6c80f9cd4..f30e1fa09 100644 --- a/kernel/power/ccopy_microk_power10.c +++ b/kernel/power/ccopy_microk_power10.c @@ -46,7 +46,16 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) ".align 5 \n" "one%=: \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 32, 0(%3) \n\t" + "stxv 33, 16(%3) \n\t" + "stxv 34, 32(%3) \n\t" + "stxv 35, 48(%3) \n\t" + "stxv 36, 64(%3) \n\t" + "stxv 37, 80(%3) \n\t" + "stxv 38, 96(%3) \n\t" + "stxv 39, 112(%3) \n\t" +#else "stxv 33, 0(%3) \n\t" "stxv 32, 16(%3) \n\t" "stxv 35, 32(%3) \n\t" @@ -55,11 +64,21 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) "stxv 36, 80(%3) \n\t" "stxv 39, 96(%3) \n\t" "stxv 38, 112(%3) \n\t" +#endif "lxvp 32, 0(%2) \n\t" "lxvp 34, 32(%2) \n\t" "lxvp 36, 64(%2) \n\t" "lxvp 38, 96(%2) \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 40, 128(%3) \n\t" + "stxv 41, 144(%3) \n\t" + "stxv 42, 160(%3) \n\t" + "stxv 43, 176(%3) \n\t" + "stxv 44, 192(%3) \n\t" + "stxv 45, 208(%3) \n\t" + "stxv 46, 224(%3) \n\t" + "stxv 47, 240(%3) \n\t" +#else "stxv 41, 128(%3) \n\t" "stxv 40, 144(%3) \n\t" "stxv 43, 160(%3) \n\t" @@ -68,6 +87,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) "stxv 44, 208(%3) \n\t" "stxv 47, 224(%3) \n\t" "stxv 46, 240(%3) \n\t" +#endif "lxvp 40, 128(%2) \n\t" "lxvp 42, 160(%2) \n\t" "lxvp 44, 192(%2) \n\t" @@ -81,7 +101,24 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) "bgt one%= \n" "two%=: \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 32, 0(%3) \n\t" + "stxv 33, 16(%3) \n\t" + "stxv 34, 32(%3) \n\t" + "stxv 35, 48(%3) \n\t" + "stxv 36, 64(%3) \n\t" + "stxv 37, 80(%3) \n\t" + "stxv 38, 96(%3) \n\t" + "stxv 39, 112(%3) \n\t" + "stxv 40, 128(%3) \n\t" + "stxv 41, 144(%3) \n\t" + "stxv 42, 160(%3) \n\t" + "stxv 43, 
176(%3) \n\t" + "stxv 44, 192(%3) \n\t" + "stxv 45, 208(%3) \n\t" + "stxv 46, 224(%3) \n\t" + "stxv 47, 240(%3) \n\t" +#else "stxv 33, 0(%3) \n\t" "stxv 32, 16(%3) \n\t" "stxv 35, 32(%3) \n\t" @@ -98,7 +135,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) "stxv 44, 208(%3) \n\t" "stxv 47, 224(%3) \n\t" "stxv 46, 240(%3) \n\t" - +#endif "#n=%1 x=%4=%2 y=%0=%3" : "=m" (*y), diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c index b9e2d2ce5..c53fe0c02 100644 --- a/kernel/power/cdot.c +++ b/kernel/power/cdot.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #include "common.h" -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) #include "cdot_microk_power10.c" #else #ifndef HAVE_KERNEL_8 @@ -120,7 +120,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA if ((inc_x == 1) && (inc_y == 1)) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) BLASLONG n1 = n & -16; #else BLASLONG n1 = n & -8; diff --git a/kernel/power/cdot_microk_power10.c b/kernel/power/cdot_microk_power10.c index 399f2b180..9d42559c9 100644 --- a/kernel/power/cdot_microk_power10.c +++ b/kernel/power/cdot_microk_power10.c @@ -29,7 +29,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void cdot_kernel_8 (long n, float *x, float *y, float *dot) { +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + __vector unsigned char mask = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}; +#else __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; +#endif __asm__ ( "dcbt 0, %2 \n\t" @@ -153,7 +157,11 @@ static void cdot_kernel_8 (long n, float *x, float *y, float *dot) "xxswapd 33, 34 \n\t" "xvaddsp 35, 35, 32 \n\t" "xvaddsp 34, 34, 33 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xxpermdi 34, 35, 34, 0 \n\t" +#else "xxpermdi 34, 34, 35, 2 \n\t" +#endif "stxv 34, 0(%6) \n\t" "#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6" diff --git a/kernel/power/cgemm_kernel_power10.S b/kernel/power/cgemm_kernel_power10.S index e04f948dd..fbd22aaad 100644 --- a/kernel/power/cgemm_kernel_power10.S +++ b/kernel/power/cgemm_kernel_power10.S @@ -76,11 +76,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cgemm_macros_power10.S" +#if (_AIX) +.set perm_const1, 0x0405060700010203 +.set perm_const2, 0x0c0d0e0f08090a0b +.set save_permute_12, 0x1011121300010203 +.set save_permute_11, 0x18191a1b08090a0b +#else .equ perm_const1, 0x0405060700010203 .equ perm_const2, 0x0c0d0e0f08090a0b .equ save_permute_12, 0x0c0d0e0f1c1d1e1f .equ save_permute_11, 0x0405060714151617 - +#endif #ifndef NEEDPARAM @@ -172,24 +178,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/*load reverse permute mask for big endian uint128 = 0xc0d0e0f08090a0b0405060700010203 */ - +#if (_AIX) + lis T2, (perm_const2>>48 & 0xFFFF) + lis T1, (perm_const1>>48 & 0xFFFF) + lis T3, (save_permute_12>>48 & 0xFFFF) + lis T4, (save_permute_11>>48 & 0xFFFF) + + ori T2, T2, (perm_const2>>32 & 0xFFFF) + ori T1, T1, (perm_const1>>32 & 0xFFFF) + ori T3, T3, (save_permute_12>>32 & 0xFFFF) + ori T4, T4, (save_permute_11>>32 & 0xFFFF) +#else lis T2, perm_const2@highest lis T1, perm_const1@highest lis T3, save_permute_12@highest lis T4, save_permute_11@highest - ori T2, T2, perm_const2@higher ori T1, T1, perm_const1@higher ori T3, T3, save_permute_12@higher ori T4, T4, save_permute_11@higher - +#endif rldicr T2, T2, 32, 31 rldicr T1, T1, 32, 31 rldicr T3, T3, 32, 31 rldicr T4, T4, 32, 31 +#if (_AIX) + oris T2, T2, (perm_const2>>16 & 0xFFFF) + oris T1, T1, (perm_const1>>16 & 0xFFFF) + oris T3, T3, (save_permute_12>>16 & 0xFFFF) + oris T4, T4, (save_permute_11>>16 & 0xFFFF) + + ori T2, T2, (perm_const2 & 0xFFFF) + ori T1, T1, (perm_const1 & 0xFFFF) + ori T3, T3, (save_permute_12 & 0xFFFF) + ori T4, T4, (save_permute_11 & 0xFFFF) +#else oris T2, T2, perm_const2@h oris T1, T1, perm_const1@h oris T3, T3, save_permute_12@h @@ -200,7 +226,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ori T1, T1, perm_const1@l ori T3, T3, save_permute_12@l ori T4, T4, save_permute_11@l - +#endif li r0,0 li PRE,512 diff --git a/kernel/power/cgemm_macros_power10.S b/kernel/power/cgemm_macros_power10.S index b66e93405..f75bf5dad 100644 --- a/kernel/power/cgemm_macros_power10.S +++ b/kernel/power/cgemm_macros_power10.S @@ -218,6 +218,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 3, 36, 34 + xvf32gerpp 2, 37, 34 + xvf32gerpp 1, 32, 34 + xvf32gerpp 0, 33, 34 + xvf32gerpp 7, 36, 35 + xvf32gerpp 6, 37, 35 + xvf32gerpp 5, 32, 35 + xvf32gerpp 4, 33, 35 +#else xvf32gerpp 3, 36, 35 xvf32gerpp 2, 37, 35 xvf32gerpp 1, 32, 35 @@ -226,6 +236,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf32gerpp 6, 37, 34 xvf32gerpp 5, 32, 34 xvf32gerpp 4, 33, 34 +#endif .endm .macro LOAD4x8_2 @@ -255,6 +266,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + xvf32gerpp 3, 36, 34 + xvf32gerpp 2, 37, 34 + xvf32gerpp 1, 32, 34 + xvf32gerpp 0, 33, 34 + xvf32gerpp 7, 36, 35 + xvf32gerpp 6, 37, 35 + xvf32gerpp 5, 32, 35 + xvf32gerpp 4, 33, 35 +#else xvf32gerpp 3, 36, 35 xvf32gerpp 2, 37, 35 xvf32gerpp 1, 32, 35 @@ -263,11 +284,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf32gerpp 6, 37, 34 xvf32gerpp 5, 32, 34 xvf32gerpp 4, 33, 34 +#endif .if \Complete==0 lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) .endif +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + xvf32gerpp 3, 42, 38 + xvf32gerpp 2, 43, 38 + xvf32gerpp 1, 40, 38 + xvf32gerpp 0, 41, 38 + xvf32gerpp 7, 42, 39 + xvf32gerpp 6, 43, 39 + xvf32gerpp 5, 40, 39 + xvf32gerpp 4, 41, 39 +#else xvf32gerpp 3, 42, 39 xvf32gerpp 2, 43, 39 xvf32gerpp 1, 40, 39 @@ -276,6 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
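The #if (_AIX) block above replaces the @highest/@higher/@h/@l operators, which the AIX assembler does not accept on these symbols, with explicit shift-and-mask expressions fed to the same lis/ori/rldicr/oris/ori sequence. A small C sketch of that sequence, using one constant from the hunk, just to show that the pieces reassemble the original 64-bit value (function and variable names are illustrative):

#include <assert.h>
#include <stdint.h>

/* Mirrors the instruction sequence: lis, ori, rldicr 32,31, oris, ori. */
static uint64_t materialize64(uint64_t c)
{
    uint64_t r;
    r  = ((c >> 48) & 0xFFFF) << 16;   /* lis    T, (c >> 48 & 0xFFFF)    */
    r |=  (c >> 32) & 0xFFFF;          /* ori    T, T, (c >> 32 & 0xFFFF) */
    r <<= 32;                          /* rldicr T, T, 32, 31             */
    r |= ((c >> 16) & 0xFFFF) << 16;   /* oris   T, T, (c >> 16 & 0xFFFF) */
    r |=   c & 0xFFFF;                 /* ori    T, T, (c & 0xFFFF)       */
    return r;
}

int main(void)
{
    assert(materialize64(0x0405060700010203ULL) == 0x0405060700010203ULL);
    return 0;
}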
xvf32gerpp 6, 43, 38 xvf32gerpp 5, 40, 38 xvf32gerpp 4, 41, 38 +#endif .if \Complete==0 lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG) lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) @@ -393,22 +426,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs5, vs4, vs12, 1 + xxpermdi vs7, vs6, vs14, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs5, vs12, vs4, 2 xxpermdi vs7, vs14, vs6, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 +#endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs13, vs12, vs4, 1 + xxpermdi vs15, vs14, vs6, 1 +#else xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 +#endif xvaddsp vs26, vs26, vs7 xvaddsp vs27, vs27, vs5 xvaddsp vs28, vs28, vs11 xvaddsp vs29, vs29, vs9 xvaddsp vs30, vs30, vs15 xvaddsp vs31, vs31, vs13 +#else +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + xxpermdi vs25, vs0, vs8, 1 + xxpermdi vs24, vs2, vs10, 1 + xxpermdi vs27, vs4, vs12, 1 + xxpermdi vs26, vs6, vs14, 1 + xxpermdi vs29, vs8, vs0, 1 + xxpermdi vs28, vs10, vs2, 1 + xxpermdi vs31, vs12, vs4, 1 + xxpermdi vs30, vs14, vs6, 1 #else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 @@ -418,6 +475,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs28, vs2, vs10, 2 xxpermdi vs31, vs4, vs12, 2 xxpermdi vs30, vs6, vs14, 2 +#endif #endif stxvp vs24, 0(CO) MULT_APLHA_PART1 vs48, vs56, vs0, vs1 @@ -443,22 +501,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs5, vs4, vs12, 1 + xxpermdi vs7, vs6, vs14, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs5, vs12, vs4, 2 xxpermdi vs7, vs14, vs6, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 +#endif xvaddsp vs32, vs32, vs3 xvaddsp vs33, vs33, vs1 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs13, vs12, vs4, 1 + xxpermdi vs15, vs14, vs6, 1 +#else xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 +#endif xvaddsp vs40, vs40, vs7 xvaddsp vs41, vs41, vs5 xvaddsp vs34, vs34, vs11 xvaddsp vs35, vs35, vs9 xvaddsp vs42, vs42, vs15 xvaddsp vs43, vs43, vs13 +#else +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + xxpermdi vs33, vs0, vs8, 1 + xxpermdi vs32, vs2, vs10, 1 + xxpermdi vs41, vs4, vs12, 1 + xxpermdi vs40, vs6, vs14, 1 + xxpermdi vs35, vs8, vs0, 1 + xxpermdi vs34, vs10, vs2, 1 + xxpermdi vs43, vs12, vs4, 1 + xxpermdi vs42, vs14, vs6, 1 #else xxpermdi vs33, vs8, vs0, 2 xxpermdi vs32, vs10, vs2, 2 @@ -468,6 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs34, vs2, vs10, 2 xxpermdi vs43, vs4, vs12, 2 xxpermdi vs42, vs6, vs14, 2 +#endif #endif stxvp vs32, 0(T2) stxvp vs40, 32(T2) @@ -510,10 +593,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
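Most of the endian-conditional blocks in these save paths swap the xxpermdi source operands and, where both result halves move, flip the 2-bit selector between 2 and 1. A small C model of the selector, under the usual reading that its two bits pick one doubleword from each source, may help when reviewing the swaps (the vsr_t struct and the values are illustrative, not how the kernel holds data):

#include <stdint.h>
#include <stdio.h>

/* Model of xxpermdi XT,XA,XB,DM: the high DM bit selects the doubleword
   taken from XA, the low bit the one taken from XB. */
typedef struct { uint64_t dw[2]; } vsr_t;

static vsr_t xxpermdi_model(vsr_t xa, vsr_t xb, int dm)
{
    vsr_t xt;
    xt.dw[0] = xa.dw[(dm >> 1) & 1];
    xt.dw[1] = xb.dw[dm & 1];
    return xt;
}

int main(void)
{
    vsr_t v0 = {{ 0x00AA, 0x00BB }};   /* stands in for vs0 */
    vsr_t v8 = {{ 0x08AA, 0x08BB }};   /* stands in for vs8 */
    /* little-endian branch: xxpermdi vs1, vs8, vs0, 2
       big-endian branch:    xxpermdi vs1, vs0, vs8, 1
       Both pick the same two doublewords; only their placement differs,
       matching the byte order expected by the following stxv/stxvp. */
    vsr_t le = xxpermdi_model(v8, v0, 2);
    vsr_t be = xxpermdi_model(v0, v8, 1);
    printf("LE: %llx %llx\n", (unsigned long long)le.dw[0], (unsigned long long)le.dw[1]);
    printf("BE: %llx %llx\n", (unsigned long long)be.dw[0], (unsigned long long)be.dw[1]);
    return 0;
}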
.if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 3, 32, 35 + xvf32gerpp 2, 33, 35 + xvf32gerpp 1, 32, 34 + xvf32gerpp 0, 33, 34 +#else xvf32gerpp 3, 32, 34 xvf32gerpp 2, 33, 34 xvf32gerpp 1, 32, 35 xvf32gerpp 0, 33, 35 +#endif .endm .macro LOAD4x4_2 @@ -541,18 +631,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 3, 32, 35 + xvf32gerpp 2, 33, 35 + xvf32gerpp 1, 32, 34 + xvf32gerpp 0, 33, 34 +#else xvf32gerpp 3, 32, 34 xvf32gerpp 2, 33, 34 xvf32gerpp 1, 32, 35 xvf32gerpp 0, 33, 35 +#endif .if \Complete==0 lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 3, 36, 39 + xvf32gerpp 2, 37, 39 + xvf32gerpp 1, 36, 38 + xvf32gerpp 0, 37, 38 +#else xvf32gerpp 3, 36, 38 xvf32gerpp 2, 37, 38 xvf32gerpp 1, 36, 39 xvf32gerpp 0, 37, 39 +#endif .if \Complete==0 lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) @@ -606,6 +710,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 + xxpermdi vs5, vs4, vs12, 1 + xxpermdi vs7, vs6, vs14, 1 + xxpermdi vs13, vs12, vs4, 1 + xxpermdi vs15, vs14, vs6, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs9, vs0, vs8, 2 @@ -614,6 +728,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs7, vs14, vs6, 2 xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 +#endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 xvaddsp vs26, vs26, vs11 @@ -622,6 +737,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvaddsp vs29, vs29, vs5 xvaddsp vs30, vs30, vs15 xvaddsp vs31, vs31, vs13 +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs25, vs0, vs8, 1 + xxpermdi vs24, vs2, vs10, 1 + xxpermdi vs27, vs8, vs0, 1 + xxpermdi vs26, vs10, vs2, 1 + xxpermdi vs29, vs4, vs12, 1 + xxpermdi vs28, vs6, vs14, 1 + xxpermdi vs31, vs12, vs4, 1 + xxpermdi vs30, vs14, vs6, 1 #else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 @@ -631,6 +756,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs28, vs14, vs6, 2 xxpermdi vs31, vs4, vs12, 2 xxpermdi vs30, vs6, vs14, 2 +#endif #endif stxvp vs24, 0(CO) stxvp vs26, 0(T1) @@ -672,8 +798,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 1, 35, 32 + xvf32gerpp 0, 34, 32 +#else xvf32gerpp 1, 34, 32 xvf32gerpp 0, 35, 32 +#endif .endm .macro LOAD4x2_2 @@ -700,13 +831,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
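All of the KERNEL macros in this file feed xvf32gerpp, the MMA rank-1 update that accumulates a 4x4 single-precision tile per step; the big-endian branches essentially re-pair which half of an lxvp-loaded register pair goes with which accumulator. A minimal C sketch of the same outer-product pattern using the GCC MMA builtins, in case it helps to see the dataflow outside of assembly (requires -mcpu=power10; the function, its arguments and the row store order are illustrative, not the OpenBLAS kernel interface):

#include <altivec.h>

typedef __vector unsigned char vec_t;

/* One 4x4 tile of C += A*B, one rank-1 update per k step. */
static void sgemm_4x4_tile(float *a, float *b, float *c,
                           long k, long lda, long ldb, long ldc)
{
    __vector_quad acc;
    __builtin_mma_xxsetaccz(&acc);                 /* zero the accumulator */

    for (long i = 0; i < k; i++) {
        vector float va = vec_xl(0, a + i * lda);  /* 4 floats of A        */
        vector float vb = vec_xl(0, b + i * ldb);  /* 4 floats of B        */
        __builtin_mma_xvf32gerpp(&acc, (vec_t)va, (vec_t)vb);
    }

    vector float rows[4];
    __builtin_mma_disassemble_acc((void *)rows, &acc);
    for (int r = 0; r < 4; r++)
        vec_xst(rows[r], 0, c + r * ldc);          /* one row of the tile  */
}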
.endm .macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 1, 35, 32 + xvf32gerpp 0, 34, 32 +#else xvf32gerpp 1, 34, 33 xvf32gerpp 0, 35, 33 +#endif .if \Complete==0 lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 1, 37, 33 + xvf32gerpp 0, 36, 33 +#else xvf32gerpp 1, 36, 32 xvf32gerpp 0, 37, 32 +#endif .if \Complete==0 lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) @@ -757,19 +898,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR1 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 0 + xxpermdi vs9, vs2, vs10, 0 + xxpermdi vs3, vs8, vs0, 3 + xxpermdi vs11, vs10, vs2, 3 +#else xxpermdi vs1, vs8, vs0, 0 xxpermdi vs9, vs10, vs2, 0 xxpermdi vs3, vs0, vs8, 3 xxpermdi vs11, vs2, vs10, 3 +#endif xvaddsp vs24, vs24, vs1 xvaddsp vs26, vs26, vs9 xvaddsp vs25, vs25, vs3 xvaddsp vs27, vs27, vs11 +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs24, vs0, vs8, 0 + xxpermdi vs26, vs2, vs10, 0 + xxpermdi vs25, vs8, vs0, 3 + xxpermdi vs27, vs10, vs2, 3 #else xxpermdi vs24, vs8, vs0, 0 xxpermdi vs26, vs10, vs2, 0 xxpermdi vs25, vs0, vs8, 3 xxpermdi vs27, vs2, vs10, 3 +#endif #endif stxv vs24, 0(CO) stxv vs25, 0(T1) @@ -811,8 +966,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 34, 32 + xvf32gerpp 1, 35, 32 +#else xvf32gerpp 0, 35, 32 xvf32gerpp 1, 34, 32 +#endif .endm .macro LOAD4x1_2 @@ -822,8 +982,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD4x1_2O OffsetA, OffsetB lxv vs32, (\OffsetA)(AO) vspltisb v6, 0 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs33, vs32, vs38, 2 + xxpermdi vs32, vs32, vs38, 0 +#else xxpermdi vs33, vs32, vs38, 0 xxpermdi vs32, vs32, vs38, 2 +#endif lxvp vs34, (0+\OffsetB)(BO) lxvp vs36, (32+\OffsetB)(BO) .endm @@ -842,18 +1007,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 34, 32 + xvf32gerpp 1, 35, 32 +#else xvf32gerpp 0, 35, 32 xvf32gerpp 1, 34, 32 +#endif .if \Complete==0 lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 36, 33 + xvf32gerpp 1, 37, 33 +#else xvf32gerpp 0, 37, 33 xvf32gerpp 1, 36, 33 +#endif .if \Complete==0 lxv vs32, DISP2(\Index, \OffsetA)(\AREG) lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs33, vs32, vs38, 2 + xxpermdi vs32, vs32, vs38, 0 +#else xxpermdi vs33, vs32, vs38, 0 xxpermdi vs32, vs32, vs38, 2 +#endif .endif .if \IsLast==1 .if \Complete==1 @@ -1001,19 +1181,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 2, 37, 34 + xvf32gerpp 3, 36, 34 + xvf32gerpp 0, 33, 34 + xvf32gerpp 1, 32, 34 +#else xvf32gerpp 2, 37, 35 xvf32gerpp 3, 36, 35 xvf32gerpp 0, 33, 35 xvf32gerpp 1, 32, 35 +#endif .if \Complete==0 lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 2, 41, 35 + xvf32gerpp 3, 40, 35 + xvf32gerpp 0, 39, 35 + xvf32gerpp 1, 38, 35 +#else xvf32gerpp 2, 41, 34 xvf32gerpp 3, 40, 34 xvf32gerpp 0, 39, 34 xvf32gerpp 1, 38, 34 +#endif .if \Complete==0 lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) @@ -1068,16 +1262,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs5, vs4, vs12, 1 + xxpermdi vs7, vs6, vs14, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs5, vs12, vs4, 2 xxpermdi vs7, vs14, vs6, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 +#endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs13, vs12, vs4, 1 + xxpermdi vs15, vs14, vs6, 1 +#else xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 +#endif xvaddsp vs26, vs26, vs7 xvaddsp vs27, vs27, vs5 xvaddsp vs28, vs28, vs11 @@ -1085,6 +1293,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvaddsp vs30, vs30, vs15 xvaddsp vs31, vs31, vs13 #else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs25, vs0, vs8, 1 + xxpermdi vs24, vs2, vs10, 1 + xxpermdi vs27, vs4, vs12, 1 + xxpermdi vs26, vs6, vs14, 1 + xxpermdi vs29, vs8, vs0, 1 + xxpermdi vs28, vs10, vs2, 1 + xxpermdi vs31, vs12, vs4, 1 + xxpermdi vs30, vs14, vs6, 1 +#else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 xxpermdi vs27, vs12, vs4, 2 @@ -1093,6 +1311,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs28, vs2, vs10, 2 xxpermdi vs31, vs4, vs12, 2 xxpermdi vs30, vs6, vs14, 2 +#endif #endif stxvp vs24, 0(CO) stxvp vs26, 32(CO) @@ -1161,13 +1380,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 33, 34 + xvf32gerpp 1, 32, 34 +#else xvf32gerpp 0, 33, 35 xvf32gerpp 1, 32, 35 +#endif .if \Complete==0 lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 37, 35 + xvf32gerpp 1, 36, 35 +#else xvf32gerpp 0, 37, 34 xvf32gerpp 1, 36, 34 +#endif + .if \Complete==0 lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) @@ -1206,19 +1436,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
RECONSTRUCT_PAIR1 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 +#endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 xvaddsp vs26, vs26, vs11 xvaddsp vs27, vs27, vs9 +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs25, vs0, vs8, 1 + xxpermdi vs24, vs2, vs10, 1 + xxpermdi vs27, vs8, vs0, 1 + xxpermdi vs26, vs10, vs2, 1 #else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 xxpermdi vs27, vs0, vs8, 2 xxpermdi vs26, vs2, vs10, 2 +#endif #endif stxvp vs24, 0(CO) stxvp vs26, 0(T1) @@ -1330,13 +1574,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxperm vs8, vs9, save_permute_1 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 0 + xxpermdi vs9, vs8, vs0, 3 +#else xxpermdi vs1, vs8, vs0, 0 xxpermdi vs9, vs0, vs8, 3 +#endif xvaddsp vs24, vs24, vs1 xvaddsp vs26, vs26, vs9 +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs24, vs0, vs8, 0 + xxpermdi vs26, vs8, vs0, 3 #else xxpermdi vs24, vs8, vs0, 0 xxpermdi vs26, vs0, vs8, 3 +#endif #endif stxv vs24, 0(CO) stxv vs26, 0(T1) @@ -1528,8 +1782,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs32, (0+\OffsetA)(AO) lxvp vs36, (32+\OffsetA)(AO) vspltisb v10, 0 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs35, vs34, vs42, 2 + xxpermdi vs34, vs34, vs42, 0 +#else xxpermdi vs35, vs34, vs42, 0 xxpermdi vs34, vs34, vs42, 2 +#endif lxvp vs38, (64+\OffsetA)(AO) lxvp vs40, (64+32+\OffsetA)(AO) .endm @@ -1567,8 +1826,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf32gerpp 3, 35, 40 .if \Complete==0 lxv vs34, DISP2(\Index, \OffsetB)(\BREG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs35, vs34, vs42, 2 + xxpermdi vs34, vs34, vs42, 0 +#else xxpermdi vs35, vs34, vs42, 0 xxpermdi vs34, vs34, vs42, 2 +#endif lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG) .endif .if \IsLast==1 @@ -1634,10 +1898,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MULT_APLHA_PART2 vs34, vs42, vs4, vs5 MULT_APLHA_PART2 vs35, vs43, vs6, vs7 /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxperm vs0, vs1, save_permute_1 + xxperm vs2, vs3, save_permute_1 + xxperm vs4, vs5, save_permute_1 + xxperm vs6, vs7, save_permute_1 +#else xxperm vs0, vs1, vs28 xxperm vs2, vs3, vs28 xxperm vs4, vs5, vs28 xxperm vs6, vs7, vs28 +#endif #ifndef TRMMKERNEL /* add */ xvaddsp vs24, vs24, vs2 @@ -1648,10 +1919,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvp vs26, 32(CO) #else /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxv vs2, 0(CO) + stxv vs0, 16(CO) + stxv vs6, 32(CO) + stxv vs4, 48(CO) +#else stxv vs0, 0(CO) stxv vs2, 16(CO) stxv vs4, 32(CO) stxv vs6, 48(CO) +#endif #endif addi CO, CO, 64 .endm @@ -1701,8 +1979,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxv vs34, (\OffsetB)(BO) lxvp vs32, (0+\OffsetA)(AO) vspltisb v6, 0 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs35, vs34, vs38, 2 + xxpermdi vs34, vs34, vs38, 0 +#else xxpermdi vs35, vs34, vs38, 0 xxpermdi vs34, vs34, vs38, 2 +#endif lxvp vs36, (32+\OffsetA)(AO) .endm @@ -1729,8 +2012,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf32gerpp 1, 35, 36 .if \Complete==0 lxv vs34, DISP2(\Index, \OffsetB)(\BREG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs35, vs34, vs38, 2 + xxpermdi vs34, vs34, vs38, 0 +#else xxpermdi vs35, vs34, vs38, 0 xxpermdi vs34, vs34, vs38, 2 +#endif lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) .endif .if \IsLast==1 @@ -1775,8 +2063,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MULT_APLHA_PART2 vs32, vs40, vs0, vs1 MULT_APLHA_PART2 vs33, vs41, vs2, vs3 /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxperm vs0, vs1, save_permute_1 + xxperm vs2, vs3, save_permute_1 +#else xxperm vs0, vs1, vs28 xxperm vs2, vs3, vs28 +#endif #ifndef TRMMKERNEL /* add */ xvaddsp vs24, vs24, vs2 @@ -1784,8 +2077,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvp vs24, 0(CO) #else /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxv vs2, 0(CO) + stxv vs0, 16(CO) +#else stxv vs0, 0(CO) stxv vs2, 16(CO) +#endif #endif addi CO, CO, 32 .endm @@ -1904,7 +2202,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MULT_APLHA_PART1 vs32, vs40, vs0, vs1 MULT_APLHA_PART2 vs32, vs40, vs0, vs1 /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxperm vs0, vs1, save_permute_1 +#else xxperm vs0, vs1, vs28 +#endif #ifndef TRMMKERNEL /* add */ xvaddsp vs24, vs24, vs0 @@ -2018,7 +2320,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MULT_APLHA_PART1 vs32, vs40, vs37, vs1 MULT_APLHA_PART2 vs32, vs40, vs37, vs1 /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxperm vs37, vs1, save_permute_1 +#else xxperm vs37, vs1, vs28 +#endif #ifndef TRMMKERNEL /* add */ xvaddsp vs36, vs36, vs37 diff --git a/kernel/power/cscal_microk_power10.c b/kernel/power/cscal_microk_power10.c index 70b50809e..d6a91f079 100644 --- a/kernel/power/cscal_microk_power10.c +++ b/kernel/power/cscal_microk_power10.c @@ -30,7 +30,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i) { __vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i}; +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + __vector unsigned char mask = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; +#else __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; +#endif __asm__ ( "dcbt 0, %2 \n\t" diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c index c2fde1c44..4d9b9ccd6 100644 --- a/kernel/power/cswap.c +++ b/kernel/power/cswap.c @@ -39,10 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
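The cscal change above is the same mask fix: the constant t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i} together with the byte-swap permute lets the complex scale be expressed with multiplies and fused multiply-adds on swapped lanes. As a scalar reference, this is the value the kernel has to produce for each element (illustrative code, matching the usual cscal definition):

/* Reference complex scale: x[i] *= (alpha_r + i*alpha_i). */
static void cscal_ref(long n, float *x, float alpha_r, float alpha_i)
{
    for (long i = 0; i < n; i++) {
        float re = x[2 * i], im = x[2 * i + 1];
        x[2 * i]     = alpha_r * re - alpha_i * im;   /* real part */
        x[2 * i + 1] = alpha_r * im + alpha_i * re;   /* imag part */
    }
}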
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "cswap_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "cswap_microk_power10.c" #elif defined(POWER10) -#include "cswap_microk_power8.c" +#include "cswap_microk_power10.c" #endif #endif diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index 35390dd24..9ed0af767 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -49,14 +49,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "dasum_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "dasum_microk_power10.c" #elif defined(POWER10) -#include "dasum_microk_power8.c" +#include "dasum_microk_power10.c" #endif #endif - #ifndef HAVE_KERNEL_16 static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) @@ -114,7 +111,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( inc_x == 1 ) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 32) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/dgemm_small_kernel_nn_power10.c b/kernel/power/dgemm_small_kernel_nn_power10.c new file mode 100644 index 000000000..ecdc3e5c6 --- /dev/null +++ b/kernel/power/dgemm_small_kernel_nn_power10.c @@ -0,0 +1,923 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; + +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif + +#if !defined(B0) +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_2x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#else + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], 
valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_2x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#if (defined(__GNUC__) && (__GNUC__ == 10)) +#if defined(_AIX) +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1); +#else +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_assemble_pair(&pair, (vec_t)v1, (vec_t)v0); +#endif +#else +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_build_pair(&pair, (vec_t)v0, (vec_t)v1); +#endif + +#define LOAD_A_1x8(K, M) \ + ra0 = vec_xl(0, A+((K)*lda)+M+0); \ + ra1 = vec_xl(0, A+((K)*lda)+M+2); \ + ra2 = vec_xl(0, A+((K)*lda)+M+4); \ + ra3 = vec_xl(0, A+((K)*lda)+M+6); + +#define LOAD_A_1x4(K, M) \ + ra0 = vec_xl(0, A+((K)*lda)+M+0); \ + ra1 = vec_xl(0, A+((K)*lda)+M+2); \ + +#define LOAD_A_1x2(K, M) \ + ra0 = vec_xl(0, A+((K)*lda)+M+0); + +#define LOAD_A_1x1(K, M) \ + ra0 = vec_splats(A[((K)*lda)+M+0]); + +#define LOAD_BTP_8x2(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeh(rb2, rb3); \ + LOAD_PAIR(pb0, t0, t1); \ + t0 = vec_mergel(rb0, rb1); \ + t1 = vec_mergel(rb2, rb3); \ + LOAD_PAIR(pb2, t0, t1); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K+0); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K+0); \ + rb6 = vec_xl(0, B+(N+6)*ldb+K+0); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K+0); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergeh(rb6, rb7); \ + LOAD_PAIR(pb1, t0, t1); \ + t0 = vec_mergel(rb4, rb5); \ + t1 = vec_mergel(rb6, rb7); \ + LOAD_PAIR(pb3, t0, t1); + +#define LOAD_BTP_8x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ + LOAD_PAIR(pb0, rb0, rb1); \ + rb2 = vec_xor(rb2, rb2); \ + rb2 = vec_insert(B[(N+4)*ldb+K], rb2, 0); \ + rb2 = vec_insert(B[(N+5)*ldb+K], rb2, 1); \ + rb3 = vec_xor(rb3, rb3); \ + rb3 = vec_insert(B[(N+6)*ldb+K], rb3, 0); \ + rb3 = vec_insert(B[(N+7)*ldb+K], rb3, 1); \ + LOAD_PAIR(pb1, rb2, rb3); + +#define LOAD_BTP_4x2(N, K) \ + rb0 = vec_xl(0, 
B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeh(rb2, rb3); \ + LOAD_PAIR(pb0, t0, t1); \ + t0 = vec_mergel(rb0, rb1); \ + t1 = vec_mergel(rb2, rb3); \ + LOAD_PAIR(pb1, t0, t1); + +#define LOAD_BTP_4x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ + LOAD_PAIR(pb0, rb0, rb1); + +#define LOAD_BTP_2x2(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); \ + t1 = vec_mergel(rb0, rb1); \ + __builtin_vsx_assemble_pair(&pb1, (vec_t)t1, (vec_t)t1); + +#define LOAD_BTP_2x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)rb0, (vec_t)rb0); + +#define LOAD_B_1x1(N, K) \ + rb0 = vec_splats(B[((N)*ldb)+K]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ + __builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ + __builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ + __builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ + __builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_B(pb0, pb1, offset) \ + *((__vector_pair *)(void *)(packB+(k*8)+0+offset)) = pb0; \ + *((__vector_pair *)(void *)(packB+(k*8)+4+offset)) = pb1; + +#define LOAD_PACKED_B(pb0, pb1, offset) \ + pb0 = *((__vector_pair *)((void *)(packB+(k*8)+0+offset))); \ + pb1 = *((__vector_pair *)((void *)(packB+(k*8)+4+offset))); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k2 = K & ~1; + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 32 && N >= 32 && K 
>= 32) ? 1 : 0; +#else + int has_packing = 0; +#endif + + double *packB; + if (has_packing) packB = (double *)malloc(K*8*sizeof(double)); + + vector double valpha = vec_splats(alpha); +#if !defined(B0) + vector double vbeta = vec_splats(beta); +#endif + + for (n = 0; n < n8; n += 8) { + for (m = 0; m < m8; m += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + if (has_packing) { + if (m == 0) { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + PACK_B(pb0, pb1, 0); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + PACK_B(pb2, pb3, 8); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + PACK_B(pb0, pb1, 0); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x8(k+1, m); + LOAD_PACKED_B(pb2, pb3, 8); + KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc4, n+0, m+4); + SAVE_4x2_ACC(&acc6, n+0, m+6); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + SAVE_4x2_ACC(&acc5, n+4, m+4); + SAVE_4x2_ACC(&acc7, n+4, m+6); + } + + for (; m < m4; m += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + if (!has_packing) { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_4ACC(pb2, pb3, pb2, pb3, ra0, ra0, ra1, ra1); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); + LOAD_A_1x4(k+1, m); + LOAD_PACKED_B(pb2, pb3, 8); + KERNEL_MMA_4ACC(pb2, pb3, pb2, pb3, ra0, ra0, ra1, ra1); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, 
ra1, ra1); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + } + + for (; m < m2; m += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + if (!has_packing) { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_A_1x2(k+1, m); + LOAD_PACKED_B(pb2, pb3, 8); + KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; m < M; m++) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + if (!has_packing) { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_A_1x1(k+1, m); + LOAD_PACKED_B(pb2, pb3, 8); + KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x1_ACC(&acc0, n+0, m+0); + SAVE_4x1_ACC(&acc1, n+4, m+0); + } + } + + for (; n < n4; n += 4) { + for (m = 0; m < m8; m += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra0, ra1, ra2, ra3); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc2, n+0, m+4); + SAVE_4x2_ACC(&acc3, n+0, m+6); + } + + for (; m < m4; m += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + LOAD_A_1x4(k+1, 
m); + KERNEL_MMA_2ACC(pb1, pb1, ra0, ra1); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + } + + for (; m < m2; m += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; m < M; m++) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x1_ACC(&acc0, n, m); + } + } + + for (; n < n2; n += 2) { + for (m = 0; m < m8; m += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra0, ra1, ra2, ra3); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + SAVE_2x2_ACC(&acc2, n+0, m+4); + SAVE_2x2_ACC(&acc3, n+0, m+6); + } + + for (; m < m4; m += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_2ACC(pb1, pb1, ra0, ra1); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + } + + for (; m < m2; m += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + } + + for (; m < M; m++) 
{ + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x1_ACC(&acc0, n+0, m+0); + } + } + + for (; n < N; n++) { + for (m = 0; m < m8; m += 8) { + vector double result = ((vector double){0.,0.}); + vector double result1 = ((vector double){0.,0.}); + vector double result2 = ((vector double){0.,0.}); + vector double result3 = ((vector double){0.,0.}); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + SAVE_1x4_VSR(result2, n, m+4); + SAVE_1x4_VSR(result3, n, m+6); + } + + for (; m < m4; m += 4) { + vector double result = ((vector double){0.,0.}); + vector double result1 = ((vector double){0.,0.}); + + register vector double ra0, ra1; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + } + + for (; m < m2; m += 2) { + vector double result = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + } + + for (; m < M; m++) { + FLOAT result = 0.0; + + for (k = 0; k < K; k++) { + result += A[m+k*lda] * B[n*ldb+k]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if (has_packing) free(packB); + + return 0; +} diff --git a/kernel/power/dgemm_small_kernel_nt_power10.c b/kernel/power/dgemm_small_kernel_nt_power10.c new file mode 100644 index 000000000..7cc8c9f6c --- /dev/null +++ b/kernel/power/dgemm_small_kernel_nt_power10.c @@ -0,0 +1,581 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
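The NN small kernel that ends above packs B (K*8 doubles) only when it is built with GCC and M, N and K are all at least 32, and it falls back to a plain scalar loop for the leftover single-row, single-column tail. A standalone scalar reference with the same indexing as that tail loop can be handy when checking the MMA paths (the function name and the non-B0 beta form are the only assumptions here):

/* One element of C with the tail loop's indexing: A(m,k) at A[m + k*lda],
   B(k,n) at B[n*ldb + k]; the non-B0 form applies beta to the old C value. */
static double dgemm_nn_ref_elem(const double *A, long lda,
                                const double *B, long ldb,
                                long K, long m, long n,
                                double alpha, double beta, double c_mn)
{
    double acc = 0.0;
    for (long k = 0; k < K; k++)
        acc += A[m + k * lda] * B[n * ldb + k];
    return alpha * acc + beta * c_mn;
}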
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; + +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif + +#if !defined(B0) +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; + +#else + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; + +#endif + 
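The two SAVE macro families above differ only in whether beta participates: the B0 build overwrites C with alpha times the accumulator, while the general build loads C, scales it by beta and folds the accumulator in with a fused multiply-add. A minimal sketch of that save step for one two-element piece of a column of C (names are illustrative; the real macros work on rows obtained from __builtin_mma_disassemble_acc):

#include <altivec.h>

/* Save one 2-wide column slice of results into C. */
static inline void save_col(vector double acc_col, double *cptr,
                            vector double valpha, vector double vbeta,
                            int b0)
{
    if (b0) {
        vec_xst(vec_mul(acc_col, valpha), 0, cptr);            /* C = alpha*acc          */
    } else {
        vector double rc = vec_mul(vec_xl(0, cptr), vbeta);    /* beta*C                 */
        vec_xst(vec_madd(acc_col, valpha, rc), 0, cptr);       /* C = alpha*acc + beta*C */
    }
}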
+#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#define LOAD_A_1x8(K, M) \ + ra0 = vec_xl(0, A+(K*lda)+M+0); \ + ra1 = vec_xl(0, A+(K*lda)+M+2); \ + ra2 = vec_xl(0, A+(K*lda)+M+4); \ + ra3 = vec_xl(0, A+(K*lda)+M+6); + +#define LOAD_A_1x4(K, M) \ + ra0 = vec_xl(0, A+(K*lda)+M+0); \ + ra1 = vec_xl(0, A+(K*lda)+M+2); + +#define LOAD_A_1x2(K, M) ra0 = vec_xl(0, A+(K*lda)+M); + +#define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M]); + +#define LOAD_BP_1x8(K, N) \ + pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \ + pb1 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+4])); + +#define LOAD_BP_1x4(K, N) \ + pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); + +#define LOAD_BP_1x2(K, N) \ + t0 = vec_xl(0, B+(K*ldb)+N); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); + +#define LOAD_B_1x8(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+2); \ + rb2 = vec_xl(0, B+(K*ldb)+N+4); \ + rb3 = vec_xl(0, B+(K*ldb)+N+6); \ + +#define LOAD_B_1x4(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+2); + +#define LOAD_B_1x2(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); + +#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[K*ldb+N]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ + __builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ + __builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ + __builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ + __builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG 
m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + vector double valpha = vec_splats(alpha); +#if !defined(B0) + vector double vbeta = vec_splats(beta); +#endif + + for (m = 0; m < m8; m += 8) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector double ra0, ra1, ra2, ra3; + __vector_pair pb0, pb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc4, n+0, m+4); + SAVE_4x2_ACC(&acc6, n+0, m+6); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + SAVE_4x2_ACC(&acc5, n+4, m+4); + SAVE_4x2_ACC(&acc7, n+4, m+6); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BP_1x4(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc2, n+0, m+4); + SAVE_4x2_ACC(&acc3, n+0, m+6); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double t0; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BP_1x2(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + SAVE_2x2_ACC(&acc2, n+0, m+4); + SAVE_2x2_ACC(&acc3, n+0, m+6); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + register vector double result2 = ((vector double){0.,0.}); + register vector double result3 = ((vector double){0.,0.}); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + SAVE_1x4_VSR(result2, n, m+4); + SAVE_1x4_VSR(result3, n, m+6); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1; + __vector_pair pb0, pb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BP_1x8(k, n); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BP_1x4(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double 
rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + register vector double t0; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BP_1x2(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + + register vector double ra0, ra1; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0; + __vector_pair pb0, pb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BP_1x8(k, n); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BP_1x4(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double t0; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BP_1x2(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + } + } + + for (; m < M; m++) { + for (n = 0; n < n8; n += 8) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + register vector double result2 = ((vector double){0.,0.}); + register vector double result3 = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x8(k, n); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + + SAVE_4x1_VSR(result, n, m); + SAVE_4x1_VSR(result1, n+2, m); + SAVE_4x1_VSR(result2, n+4, m); + SAVE_4x1_VSR(result3, n+6, m); + } + + for (; n < n4; n += 4) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x4(k, n); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n, m); + SAVE_4x1_VSR(result1, n+2, m); + } + + for 
(; n < n2; n += 2) { + register vector double result = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n, m); + } + + for (; n < N; n++) { + FLOAT result = 0.0; + + for (k = 0; k < K; k++) { + result += A[k*lda+m] * B[k*ldb+n]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + return 0; +} diff --git a/kernel/power/dgemm_small_kernel_tn_power10.c b/kernel/power/dgemm_small_kernel_tn_power10.c new file mode 100644 index 000000000..93a942b02 --- /dev/null +++ b/kernel/power/dgemm_small_kernel_tn_power10.c @@ -0,0 +1,882 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; + +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif + +#if !defined(B0) +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_2x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#else + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], 
valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_2x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#if (defined(__GNUC__) && (__GNUC__ == 10)) +#if defined(_AIX) +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1); +#else +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_assemble_pair(&pair, (vec_t)v1, (vec_t)v0); +#endif +#else +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_build_pair(&pair, (vec_t)v0, (vec_t)v1); +#endif + +#define LOAD_AT_8x2(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ + ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; \ + ra2 = vec_xl(0, A+(M+2)*lda+K+0); \ + ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ + t0 = vec_mergeh(ra2, ra3); \ + t1 = vec_mergel(ra2, ra3); \ + ra2 = t0; \ + ra3 = t1; \ + ra4 = vec_xl(0, A+(M+4)*lda+K+0); \ + ra5 = vec_xl(0, A+(M+5)*lda+K+0); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergel(ra4, ra5); \ + ra4 = t0; \ + ra5 = t1; \ + ra6 = vec_xl(0, A+(M+6)*lda+K+0); \ + ra7 = vec_xl(0, A+(M+7)*lda+K+0); \ + t0 = vec_mergeh(ra6, ra7); \ + t1 = vec_mergel(ra6, ra7); \ + ra6 = t0; \ + ra7 = t1; + +#define LOAD_AT_8x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \ + ra2 = vec_xor(ra2, ra2); \ + ra2 = vec_insert(A[(M+4)*lda+K], ra2, 0); \ + ra2 = vec_insert(A[(M+5)*lda+K], ra2, 1); \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(A[(M+6)*lda+K], ra3, 0); \ + ra3 = vec_insert(A[(M+7)*lda+K], ra3, 1); \ + +#define LOAD_AT_4x2(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ + ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ + ra2 = vec_xl(0, A+(M+2)*lda+K+0); \ + ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergeh(ra2, ra3); \ + t2 = vec_mergel(ra0, ra1); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = t0; \ + ra1 = t2; \ + ra2 = t1; \ + ra3 = t3; + +#define LOAD_AT_4x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra1 = vec_xor(ra1, 
ra1); \ + ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \ + +#define LOAD_AT_2x2(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ + ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; + +#define LOAD_AT_2x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); + +#define LOAD_A_1x1(K, M) \ + ra0 = vec_splats(A[((M+0)*lda)+K+0]); + +#define LOAD_BTP_8x2(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeh(rb2, rb3); \ + LOAD_PAIR(pb0, t0, t1); \ + t0 = vec_mergel(rb0, rb1); \ + t1 = vec_mergel(rb2, rb3); \ + LOAD_PAIR(pb2, t0, t1); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K+0); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K+0); \ + rb6 = vec_xl(0, B+(N+6)*ldb+K+0); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K+0); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergeh(rb6, rb7); \ + LOAD_PAIR(pb1, t0, t1); \ + t0 = vec_mergel(rb4, rb5); \ + t1 = vec_mergel(rb6, rb7); \ + LOAD_PAIR(pb3, t0, t1); + +#define LOAD_BTP_8x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ + LOAD_PAIR(pb0, rb0, rb1); \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+4)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+5)*ldb+K], rb0, 1); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 1); \ + LOAD_PAIR(pb1, rb0, rb1); + +#define LOAD_BTP_4x2(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeh(rb2, rb3); \ + LOAD_PAIR(pb0, t0, t1); \ + t0 = vec_mergel(rb0, rb1); \ + t1 = vec_mergel(rb2, rb3); \ + LOAD_PAIR(pb1, t0, t1); + +#define LOAD_BTP_4x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ + LOAD_PAIR(pb0, rb0, rb1); + +#define LOAD_BTP_2x2(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); \ + t1 = vec_mergel(rb0, rb1); \ + __builtin_vsx_assemble_pair(&pb1, (vec_t)t1, (vec_t)t1); + +#define LOAD_BTP_2x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)rb0, (vec_t)rb0); + +#define LOAD_B_1x1(N, K) rb0 = vec_splats(B[((N)*ldb)+K]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ + __builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ + __builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ + __builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ + __builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, 
b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); + +#define KERNEL_MMA_1ACC_(acc, b0, a0) \ + __builtin_mma_xvf64gerpp(&acc, b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k2 = K & ~1; + + vector double valpha = vec_splats(alpha); +#if !defined(B0) + vector double vbeta = vec_splats(beta); +#endif + + for (m = 0; m < m8; m += 8) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_8ACC(pb0, pb0, pb0, pb0, pb1, pb1, pb1, pb1, + ra0, ra2, ra4, ra6, ra0, ra2, ra4, ra6); + KERNEL_MMA_8ACC(pb2, pb2, pb2, pb2, pb3, pb3, pb3, pb3, + ra1, ra3, ra5, ra7, ra1, ra3, ra5, ra7); + } + // workaround to avoid register spilling + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_1ACC_(acc0, pb0, ra0); + KERNEL_MMA_1ACC_(acc1, pb0, ra1); + LOAD_AT_4x1(m+4, k); + KERNEL_MMA_1ACC_(acc2, pb0, ra0); + KERNEL_MMA_1ACC_(acc3, pb0, ra1); + LOAD_AT_4x1(m, k); + LOAD_BTP_4x1(n+4, k); + KERNEL_MMA_1ACC_(acc4, pb0, ra0); + KERNEL_MMA_1ACC_(acc5, pb0, ra1); + LOAD_AT_4x1(m+4, k); + KERNEL_MMA_1ACC_(acc6, pb0, ra0); + KERNEL_MMA_1ACC_(acc7, pb0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+4); + SAVE_4x2_ACC(&acc4, n+4, m+0); + SAVE_4x2_ACC(&acc6, n+4, m+4); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc3, n+0, m+6); + SAVE_4x2_ACC(&acc5, n+4, m+2); + SAVE_4x2_ACC(&acc7, n+4, m+6); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra1, ra3, ra5, ra7); + } + for 
(; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+4); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc3, n+0, m+6); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc2, n+0, m+4); + SAVE_2x2_ACC(&acc1, n+0, m+2); + SAVE_2x2_ACC(&acc3, n+0, m+6); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + register vector double result2 = ((vector double){0.,0.}); + register vector double result3 = ((vector double){0.,0.}); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + SAVE_1x4_VSR(result2, n, m+4); + SAVE_1x4_VSR(result3, n, m+6); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1, t2, t3; + + __vector_pair pb0, pb1, pb2, pb3; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb1, pb1, ra0, ra2, ra0, ra2); + KERNEL_MMA_4ACC(pb2, pb2, pb3, pb3, ra1, ra3, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb1, pb1, ra0, ra1, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc2, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1, t2, t3; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); + KERNEL_MMA_2ACC(pb1, pb1, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1; + register vector double t0, t1, t2, t3; + + __vector_pair 
pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); + KERNEL_MMA_2ACC(pb1, pb1, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + + register vector double ra0, ra1; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + KERNEL_MMA_2ACC(pb2, pb3, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0, ra1; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + KERNEL_MMA_1ACC(pb1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0, ra1; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + KERNEL_MMA_1ACC(pb1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + + register vector double ra0, ra1; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + } + } + + for (; m < M; m++) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); + } + for (; k < K; k++) { + 
LOAD_A_1x1(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x1_ACC(&acc0, n+0, m+0); + SAVE_4x1_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x1_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x1_ACC(&acc0, n+0, m+0); + } + + for (; n < N; n++) { + FLOAT result = 0.0; + + for (k = 0; k < K; k++) { + result += A[m*lda+k] * B[n*ldb+k]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + return 0; +} diff --git a/kernel/power/dgemm_small_kernel_tt_power10.c b/kernel/power/dgemm_small_kernel_tt_power10.c new file mode 100644 index 000000000..b47b6201f --- /dev/null +++ b/kernel/power/dgemm_small_kernel_tt_power10.c @@ -0,0 +1,829 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; + +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif + +#if !defined(B0) +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; + +#else + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#define LOAD_AT_8x2(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ + ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ + ra2 = vec_xl(0, 
A+(M+2)*lda+K+0); \ + ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergeh(ra2, ra3); \ + t2 = vec_mergel(ra0, ra1); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = t0; \ + ra1 = t2; \ + ra2 = t1; \ + ra3 = t3; \ + ra4 = vec_xl(0, A+(M+4)*lda+K+0); \ + ra5 = vec_xl(0, A+(M+5)*lda+K+0); \ + ra6 = vec_xl(0, A+(M+6)*lda+K+0); \ + ra7 = vec_xl(0, A+(M+7)*lda+K+0); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergeh(ra6, ra7); \ + t2 = vec_mergel(ra4, ra5); \ + t3 = vec_mergel(ra6, ra7); \ + ra4 = t0; \ + ra5 = t2; \ + ra6 = t1; \ + ra7 = t3; + +#define LOAD_AT_8x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \ + ra2 = vec_xor(ra2, ra2); \ + ra2 = vec_insert(A[(M+4)*lda+K], ra2, 0); \ + ra2 = vec_insert(A[(M+5)*lda+K], ra2, 1); \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(A[(M+6)*lda+K], ra3, 0); \ + ra3 = vec_insert(A[(M+7)*lda+K], ra3, 1); \ + +#define LOAD_AT_4x2(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ + ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ + ra2 = vec_xl(0, A+(M+2)*lda+K+0); \ + ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergeh(ra2, ra3); \ + t2 = vec_mergel(ra0, ra1); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = t0; \ + ra1 = t2; \ + ra2 = t1; \ + ra3 = t3; + +#define LOAD_AT_4x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \ + +#define LOAD_AT_2x2(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ + ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; + +#define LOAD_AT_2x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); + +#define LOAD_A_1x1(M, K) ra0 = vec_splats(A[(M)*lda+K]); + +#define LOAD_BP_1x8(K, N) \ + pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \ + pb1 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+4])); + +#define LOAD_BP_1x4(K, N) \ + pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); + +#define LOAD_BP_1x2(K, N) \ + t0 = vec_xl(0, B+((K)*ldb)+N); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); + +#define LOAD_B_1x8(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+2); \ + rb2 = vec_xl(0, B+(K*ldb)+N+4); \ + rb3 = vec_xl(0, B+(K*ldb)+N+6); \ + +#define LOAD_B_1x4(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+2); + +#define LOAD_B_1x2(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); + +#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[(K)*ldb+N]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ + __builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ + __builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ + __builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ + __builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); 
\ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_A(ra0, ra1, ra2, ra3, offset) \ + vec_xst(ra0, 0, packA+(k*8)+0+offset); \ + vec_xst(ra1, 0, packA+(k*8)+2+offset); \ + vec_xst(ra2, 0, packA+(k*8)+4+offset); \ + vec_xst(ra3, 0, packA+(k*8)+6+offset); + +#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \ + ra0 = vec_xl(0, packA+(k*8)+0+offset); \ + ra1 = vec_xl(0, packA+(k*8)+2+offset); \ + ra2 = vec_xl(0, packA+(k*8)+4+offset); \ + ra3 = vec_xl(0, packA+(k*8)+6+offset); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k2 = K & ~1; + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 
1 : 0; +#else + int has_packing = 0; +#endif + + double *packA; + if (has_packing) packA = (double *)malloc(K*8*sizeof(double)); + + vector double valpha = vec_splats(alpha); +#if !defined(B0) + vector double vbeta = vec_splats(beta); +#endif + + for (m = 0; m < m8; m += 8) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double t0, t1, t2, t3; + + __vector_pair pb0, pb1; + + if (has_packing) { + if (n == 0) { + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6); + PACK_A(ra0, ra2, ra4, ra6, 0); + LOAD_BP_1x8(k+1, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7); + PACK_A(ra1, ra3, ra5, ra7, 8); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + PACK_A(ra0, ra1, ra2, ra3, 0); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8); + LOAD_BP_1x8(k+1, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6); + LOAD_BP_1x8(k+1, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc4, n+0, m+4); + SAVE_4x2_ACC(&acc6, n+0, m+6); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + SAVE_4x2_ACC(&acc5, n+4, m+4); + SAVE_4x2_ACC(&acc7, n+4, m+6); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double t0, t1, t2, t3; + + __vector_pair pb0; + + if (!has_packing) { + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + LOAD_BP_1x4(k+1, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BP_1x4(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8); + LOAD_BP_1x4(k+1, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BP_1x4(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, 
ra2, ra3); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc2, n+0, m+4); + SAVE_4x2_ACC(&acc3, n+0, m+6); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double t0, t1, t2, t3; + + __vector_pair pb0; + + if (!has_packing) { + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + LOAD_BP_1x2(k+1, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BP_1x2(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8); + LOAD_BP_1x2(k+1, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BP_1x2(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + SAVE_2x2_ACC(&acc2, n+0, m+4); + SAVE_2x2_ACC(&acc3, n+0, m+6); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + register vector double result2 = ((vector double){0.,0.}); + register vector double result3 = ((vector double){0.,0.}); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0; + + if (!has_packing) { + for (k = 0; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } else { + for (k = 0; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + SAVE_1x4_VSR(result2, n, m+4); + SAVE_1x4_VSR(result3, n, m+6); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double t0, t1, t2, t3; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra2, ra2); + LOAD_BP_1x8(k+1, n); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra1, ra1, ra3, ra3); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double t0, t1, t2, t3; + + __vector_pair pb0; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); + LOAD_BP_1x4(k+1, n); + KERNEL_MMA_2ACC(pb0, pb0, ra1, ra3); + } + for (; k < K; k++) { + 
LOAD_AT_4x1(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double t0, t1, t2, t3; + + __vector_pair pb0; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); + LOAD_BP_1x2(k+1, n); + KERNEL_MMA_2ACC(pb0, pb0, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + + register vector double ra0, ra1; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_BP_1x8(k+1, n); + KERNEL_MMA_2ACC(pb0, pb1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0, ra1; + register vector double t0, t1; + + __vector_pair pb0; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_BP_1x4(k+1, n); + KERNEL_MMA_1ACC(pb0, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0, ra1; + register vector double t0, t1; + + __vector_pair pb0; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_BP_1x2(k+1, n); + KERNEL_MMA_1ACC(pb0, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + } + } + + for (; m < M; m++) { + for (n = 0; n < n8; n += 8) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 
= ((vector double){0.,0.}); + register vector double result2 = ((vector double){0.,0.}); + register vector double result3 = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + + SAVE_4x1_VSR(result, n, m); + SAVE_4x1_VSR(result1, n+2, m); + SAVE_4x1_VSR(result2, n+4, m); + SAVE_4x1_VSR(result3, n+6, m); + } + + for (; n < n4; n += 4) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n, m); + SAVE_4x1_VSR(result1, n+2, m); + } + + for (; n < n2; n += 2) { + register vector double result = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n, m); + } + + for (; n < N; n++) { + FLOAT result = 0.0; + + for (k = 0; k < K; k++) { + result += A[m*lda+k] * B[k*ldb+n]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if(has_packing) free(packA); + + return 0; +} diff --git a/kernel/power/dgemv_n_microk_power10.c b/kernel/power/dgemv_n_microk_power10.c index e47de2cb5..65743731e 100644 --- a/kernel/power/dgemv_n_microk_power10.c +++ b/kernel/power/dgemv_n_microk_power10.c @@ -40,18 +40,27 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y XXSPLTD_S(32,%x9,0) // alpha, alpha "sldi %6, %13, 3 \n\t" // lda * sizeof (double) - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmuldp 34, 40, 32 \n\t" // x0 * alpha, x1 * alpha + "xvmuldp 35, 41, 32 \n\t" // x2 * alpha, x3 * alpha +#else "xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha "xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha +#endif "add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda "add %6, %6, %6 \n\t" // 2 * lda - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha + XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha +#else XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha - +#endif "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda @@ -286,6 +295,16 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y "add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda "add %10, %10, %10 \n\t" // 2 * lda +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha + XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha + XXSPLTD_S(48,39,0) // x6 * alpha, x6 * alpha + XXSPLTD_S(49,39,1) // x7 * alpha, x7 * alpha + XXSPLTD_S(39,38,1) // x5 * alpha, x5 * alpha + XXSPLTD_S(38,38,0) // x4 * alpha, x4 * alpha +#else XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha @@ -294,6 +313,7 @@ static void 
dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha +#endif "add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda "add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda @@ -319,30 +339,69 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y "one%=: \n\t" "lxvp 36, 0( %2) \n\t" // y0, y1 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" +#else "xvmaddadp 36, 40, 34 \n\t" "xvmaddadp 37, 41, 34 \n\t" +#endif "lxvpx 40, %3, %11 \n\t" // a0[0], a0[1] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 42, 33 \n\t" + "xvmaddadp 37, 43, 33 \n\t" +#else "xvmaddadp 36, 42, 35 \n\t" "xvmaddadp 37, 43, 35 \n\t" +#endif "lxvpx 42, %4, %11 \n\t" // a1[0], a1[1] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 44, 34 \n\t" + "xvmaddadp 37, 45, 34 \n\t" +#else "xvmaddadp 36, 44, 32 \n\t" "xvmaddadp 37, 45, 32 \n\t" +#endif "lxvpx 44, %5, %11 \n\t" // a2[0], a2[1] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 46, 35 \n\t" + "xvmaddadp 37, 47, 35 \n\t" +#else "xvmaddadp 36, 46, 33 \n\t" "xvmaddadp 37, 47, 33 \n\t" +#endif "lxvpx 46, %6, %11 \n\t" // a3[0], a3[1] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 50, 38 \n\t" + "xvmaddadp 37, 51, 38 \n\t" +#else "xvmaddadp 36, 50, 48 \n\t" "xvmaddadp 37, 51, 48 \n\t" +#endif "lxvpx 50, %7, %11 \n\t" // a4[0] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 52, 39 \n\t" + "xvmaddadp 37, 53, 39 \n\t" +#else "xvmaddadp 36, 52, 49 \n\t" "xvmaddadp 37, 53, 49 \n\t" +#endif "lxvpx 52, %8, %11 \n\t" // a5[0] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 54, 48 \n\t" + "xvmaddadp 37, 55, 48 \n\t" +#else "xvmaddadp 36, 54, 38 \n\t" "xvmaddadp 37, 55, 38 \n\t" +#endif "lxvpx 54, %9, %11 \n\t" // a6[0] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 56, 49 \n\t" + "xvmaddadp 37, 57, 49 \n\t" +#else "xvmaddadp 36, 56, 39 \n\t" "xvmaddadp 37, 57, 39 \n\t" +#endif "lxvpx 56, %10, %11 \n\t" // a7[0] "addi %11, %11, 32 \n\t" @@ -355,6 +414,24 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y "two%=: \n\t" "lxvp 36, 0( %2) \n\t" // y0, y1 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + "xvmaddadp 36, 42, 33 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + "xvmaddadp 36, 44, 34 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + "xvmaddadp 36, 46, 35 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + "xvmaddadp 36, 50, 38 \n\t" + "xvmaddadp 37, 51, 38 \n\t" + "xvmaddadp 36, 52, 39 \n\t" + "xvmaddadp 37, 53, 39 \n\t" + "xvmaddadp 36, 54, 48 \n\t" + "xvmaddadp 37, 55, 48 \n\t" + "xvmaddadp 36, 56, 49 \n\t" + "xvmaddadp 37, 57, 49 \n\t" +#else "xvmaddadp 36, 40, 34 \n\t" "xvmaddadp 37, 41, 34 \n\t" "xvmaddadp 36, 42, 35 \n\t" @@ -371,6 +448,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y "xvmaddadp 37, 55, 38 \n\t" "xvmaddadp 36, 56, 39 \n\t" "xvmaddadp 37, 57, 39 \n\t" +#endif "stxvp 36, 0( %2) \n\t" // y0, y1 : diff --git a/kernel/power/dgemv_t_power10.c b/kernel/power/dgemv_t_power10.c index 3db4d5785..899b2a04b 100644 --- a/kernel/power/dgemv_t_power10.c +++ b/kernel/power/dgemv_t_power10.c @@ -279,34 +279,58 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "lxvp 40, 32(%[y]) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + 
XXMRGHD_S(42,34,35) + XXMRGLD_S(43,34,35) + XXMRGHD_S(44,4,5) + XXMRGLD_S(45,4,5) +#else XXMRGLD_S(42,35,34) XXMRGHD_S(43,35,34) XXMRGLD_S(44,5,4) XXMRGHD_S(45,5,4) +#endif "xvadddp 42,42,43 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXMRGHD_S(46,6,7) + XXMRGLD_S(47,6,7) +#else XXMRGLD_S(46,7,6) XXMRGHD_S(47,7,6) - +#endif "xvadddp 44,44,45 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXMRGHD_S(48,8,9) + XXMRGLD_S(49,8,9) +#else XXMRGLD_S(48,9,8) XXMRGHD_S(49,9,8) - +#endif "xvadddp 46,46,47 \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 38,42,36 \n\t" + "xvmaddadp 39,44,36 \n\t" +#else "xvmaddadp 39,42,36 \n\t" "xvmaddadp 38,44,36 \n\t" - +#endif "xvadddp 48,48,49 \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 41,48,36 \n\t" +#else "xvmaddadp 41,46,36 \n\t" - +#endif "stxvp 38, 0(%[y]) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 40,46,36 \n\t" +#else "xvmaddadp 40,48,36 \n\t" +#endif "stxvp 40, 32(%[y]) \n\t" : [memy] "+m" (*(double (*)[8])y), diff --git a/kernel/power/drot.c b/kernel/power/drot.c index 30c7411cc..2aa0b8055 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "drot_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "drot_microk_power10.c" #elif defined(POWER10) -#include "drot_microk_power8.c" +#include "drot_microk_power10.c" #endif #endif @@ -117,7 +115,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index 32c39a8f4..96c4e51bc 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "dscal_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "dscal_microk_power10.c" #elif defined(POWER10) -#include "dscal_microk_power8.c" +#include "dscal_microk_power10.c" #endif #endif @@ -104,7 +102,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; @@ -138,7 +136,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index 12476965b..9e6229c6a 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
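The drot.c and dscal.c hunks above and the dswap.c and sasum.c hunks around this point all make the same change: the little-endian-only condition is dropped both from the microkernel include selection and from the "n >= ..." fast paths, so the POWER10 microkernels are now used on big-endian builds as well. The test being removed here (and the one added throughout the dgemv assembly above) is a compile-time check on macros predefined by GCC and Clang; a minimal, self-contained sketch of that dispatch (illustration only, not part of the patch):

/* Illustration only: the same compile-time byte-order test the kernels in
   this patch use to choose between their BE and LE code paths. */
#include <stdio.h>

int main(void)
{
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    puts("big-endian build: BE operand ordering selected at compile time");
#else
    puts("little-endian build: LE operand ordering selected at compile time");
#endif
    return 0;
}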
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "dswap_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "swap_microk_power10.c" #elif defined(POWER10) -#include "dswap_microk_power8.c" +#include "swap_microk_power10.c" #endif #endif @@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if ( (inc_x == 1) && (inc_y == 1 )) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/gemm_small_kernel_permit_power10.c b/kernel/power/gemm_small_kernel_permit_power10.c new file mode 100644 index 000000000..9b38e457b --- /dev/null +++ b/kernel/power/gemm_small_kernel_permit_power10.c @@ -0,0 +1,84 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + double MNK = (double) M * (double) N * (double) K; + +#if defined(DOUBLE) // dgemm + + // gcc11 (minor <= 2) has an issue when multiple assemble_pairs are used. This + // issue affects both dgemm_nn and dgemm_tn. +#if (defined(__GNUC__) && (__GNUC__ == 11 && __GNUC_MINOR__ <= 2)) + if (!transb) + return 0; +#endif + + if (MNK <= 54.0*54.0*54.0) + return 1; + +#else // sgemm + +#if defined(__GNUC__) && defined(__clang__) + // clang generates code with register spilling for the region of code with + // packing, thus, we had to disable this optimization for clang. Given that + // the packing on-demand used in this work is one of the reasons that lead the + // small kernels to outperform the normal flow (when MNK increases), with it + // disabled we had to reduce the MNK inputs used by the code generated by clang. 
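The cutoffs that follow turn this reasoning into a plain size gate: the small kernel is approved only while M*N*K stays below a compiler-dependent limit (84*84*84 under clang, with a tighter 64*64*64 and K >= 4 window for the transposed-A case that depends on packing on demand; 100*100*100 under gcc), and when more than one CPU is available the limit tightens further to 64*64*64 because the threaded path catches up. A rough, self-contained illustration of such a gate (hypothetical helper, not OpenBLAS code; the 54*54*54 figure mirrors only the DOUBLE branch above):

/* Hypothetical sketch of an M*N*K size gate, for illustration only. */
#include <stdio.h>

static int small_kernel_permitted(long M, long N, long K)
{
    double MNK = (double)M * (double)N * (double)K;
    /* below the cutoff, run the small MMA kernel directly;
       above it, fall back to the regular blocked GEMM path */
    return MNK <= 54.0 * 54.0 * 54.0;
}

int main(void)
{
    printf("32x32x32    -> %d\n", small_kernel_permitted(32, 32, 32));    /* 1: small kernel */
    printf("128x128x128 -> %d\n", small_kernel_permitted(128, 128, 128)); /* 0: normal path  */
    return 0;
}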
+ if (MNK > 84.0*84.0*84.0) + return 0; + + if (transa && !transb) { + // sgemm_tn works better when packing on-demand is used + if (MNK <= 64.0*64.0*64.0 && K >= 4) + return 1; + else + return 0; + } + +#else // gcc + + if (MNK > 100.0*100.0*100.0) + return 0; + +#endif + + // Multi-threading execution outperforms (or approaches) the execution of the + // small kernel. + if (num_cpu_avail(3) > 1) { + if (MNK <= 64.0*64.0*64.0) + return 1; + } else { + return 1; + } + +#endif + + return 0; +} diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c index 991d27508..af692a7fa 100644 --- a/kernel/power/sasum.c +++ b/kernel/power/sasum.c @@ -49,10 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "sasum_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "sasum_microk_power10.c" #elif defined(POWER10) -#include "sasum_microk_power8.c" +#include "sasum_microk_power10.c" #endif #endif @@ -114,7 +112,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( inc_x == 1 ) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/sgemm_small_kernel_nn_power10.c b/kernel/power/sgemm_small_kernel_nn_power10.c new file mode 100644 index 000000000..59222a436 --- /dev/null +++ b/kernel/power/sgemm_small_kernel_nn_power10.c @@ -0,0 +1,1563 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; + +#if !defined(B0) +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_insert(C[(N+1)*ldc+M+0], rc0, 2); \ + rc0 = vec_insert(C[(N+1)*ldc+M+1], rc0, 3); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; \ + C[(N+2)*ldc+M] = (C[(N+2)*ldc+M] * beta) + result[2]; \ + C[(N+3)*ldc+M] = (C[(N+3)*ldc+M] * beta) + result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; + +#else + +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + 
vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; \ + C[(N+2)*ldc+M] = result[2]; \ + C[(N+3)*ldc+M] = result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() \ + __builtin_mma_xxsetaccz(&acc0); + +#define LOAD_A_1x16(K, M) \ + ra0 = vec_xl(0, A+((K)*lda)+M+0); \ + ra1 = vec_xl(0, A+((K)*lda)+M+4); \ + ra2 = vec_xl(0, A+((K)*lda)+M+8); \ + ra3 = vec_xl(0, A+((K)*lda)+M+12); + +#define LOAD_A_1x8(K, M) \ + ra0 = vec_xl(0, A+((K)*lda)+M+0); \ + ra1 = vec_xl(0, A+((K)*lda)+M+4); + +#define LOAD_A_1x4(K, M) ra0 = vec_xl(0, A+((K)*lda)+M); + +#define LOAD_A_2x2(K, M) \ + ra0 = vec_splats(A[K*lda+M]); \ + ra0 = vec_insert(A[K*lda+M+1], ra0, 1); \ + ra0 = vec_insert(A[K*lda+M+1], ra0, 3); + +#define LOAD_A_1x2(K, M) ra0 = vec_xl_len(A+((K)*lda)+M, 8); + +#define LOAD_A_1x1(K, M) ra0 = vec_splats(A[(K)*lda+M]); + +#define LOAD_BT_16x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 0b11); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergel(rb4, rb5); \ + rb6 = 
vec_xl(0, B+(N+6)*ldb+K); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K); \ + t2 = vec_mergeh(rb6, rb7); \ + t3 = vec_mergel(rb6, rb7); \ + rb4 = vec_xxpermdi(t0, t2, 0b00); \ + rb5 = vec_xxpermdi(t0, t2, 0b11); \ + rb6 = vec_xxpermdi(t1, t3, 0b00); \ + rb7 = vec_xxpermdi(t1, t3, 0b11); \ + rb8 = vec_xl(0, B+(N+8)*ldb+K); \ + rb9 = vec_xl(0, B+(N+9)*ldb+K); \ + t0 = vec_mergeh(rb8, rb9); \ + t1 = vec_mergel(rb8, rb9); \ + rb10 = vec_xl(0, B+(N+10)*ldb+K); \ + rb11 = vec_xl(0, B+(N+11)*ldb+K); \ + t2 = vec_mergeh(rb10, rb11); \ + t3 = vec_mergel(rb10, rb11); \ + rb8 = vec_xxpermdi(t0, t2, 0b00); \ + rb9 = vec_xxpermdi(t0, t2, 0b11); \ + rb10 = vec_xxpermdi(t1, t3, 0b00); \ + rb11 = vec_xxpermdi(t1, t3, 0b11); \ + rb12 = vec_xl(0, B+(N+12)*ldb+K); \ + rb13 = vec_xl(0, B+(N+13)*ldb+K); \ + t0 = vec_mergeh(rb12, rb13); \ + t1 = vec_mergel(rb12, rb13); \ + rb14 = vec_xl(0, B+(N+14)*ldb+K); \ + rb15 = vec_xl(0, B+(N+15)*ldb+K); \ + t2 = vec_mergeh(rb14, rb15); \ + t3 = vec_mergel(rb14, rb15); \ + rb12 = vec_xxpermdi(t0, t2, 0b00); \ + rb13 = vec_xxpermdi(t0, t2, 0b11); \ + rb14 = vec_xxpermdi(t1, t3, 0b00); \ + rb15 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_16x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); \ + rb4 = vec_xl_len(B+(N+4)*ldb+K, 8); \ + rb5 = vec_xl_len(B+(N+5)*ldb+K, 8); \ + t0 = vec_mergeh(rb4, rb5); \ + rb6 = vec_xl_len(B+(N+6)*ldb+K, 8); \ + rb7 = vec_xl_len(B+(N+7)*ldb+K, 8); \ + t1 = vec_mergeh(rb6, rb7); \ + rb2 = vec_xxpermdi(t0, t1, 0b00); \ + rb3 = vec_xxpermdi(t0, t1, 0b11); \ + rb8 = vec_xl_len(B+(N+8)*ldb+K, 8); \ + rb9 = vec_xl_len(B+(N+9)*ldb+K, 8); \ + t0 = vec_mergeh(rb8, rb9); \ + rb10 = vec_xl_len(B+(N+10)*ldb+K, 8); \ + rb11 = vec_xl_len(B+(N+11)*ldb+K, 8); \ + t1 = vec_mergeh(rb10, rb11); \ + rb4 = vec_xxpermdi(t0, t1, 0b00); \ + rb5 = vec_xxpermdi(t0, t1, 0b11); \ + rb12 = vec_xl_len(B+(N+12)*ldb+K, 8); \ + rb13 = vec_xl_len(B+(N+13)*ldb+K, 8); \ + t0 = vec_mergeh(rb12, rb13); \ + rb14 = vec_xl_len(B+(N+14)*ldb+K, 8); \ + rb15 = vec_xl_len(B+(N+15)*ldb+K, 8); \ + t1 = vec_mergeh(rb14, rb15); \ + rb6 = vec_xxpermdi(t0, t1, 0b00); \ + rb7 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_BT_16x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+4)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+5)*ldb+K], rb1, 1); \ + rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 2); \ + rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 3); \ + rb2 = vec_xor(rb2, rb2); \ + rb2 = vec_insert(B[(N+8)*ldb+K], rb2, 0); \ + rb2 = vec_insert(B[(N+9)*ldb+K], rb2, 1); \ + rb2 = vec_insert(B[(N+10)*ldb+K], rb2, 2); \ + rb2 = vec_insert(B[(N+11)*ldb+K], rb2, 3); \ + rb3 = vec_xor(rb3, rb3); \ + rb3 = vec_insert(B[(N+12)*ldb+K], rb3, 0); \ + rb3 = vec_insert(B[(N+13)*ldb+K], rb3, 1); \ + rb3 = vec_insert(B[(N+14)*ldb+K], rb3, 2); \ + rb3 = vec_insert(B[(N+15)*ldb+K], rb3, 3); + +#define LOAD_BT_8x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = 
vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 0b11); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergel(rb4, rb5); \ + rb6 = vec_xl(0, B+(N+6)*ldb+K); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K); \ + t2 = vec_mergeh(rb6, rb7); \ + t3 = vec_mergel(rb6, rb7); \ + rb4 = vec_xxpermdi(t0, t2, 0b00); \ + rb5 = vec_xxpermdi(t0, t2, 0b11); \ + rb6 = vec_xxpermdi(t1, t3, 0b00); \ + rb7 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_8x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); \ + rb4 = vec_xl_len(B+(N+4)*ldb+K, 8); \ + rb5 = vec_xl_len(B+(N+5)*ldb+K, 8); \ + t0 = vec_mergeh(rb4, rb5); \ + rb6 = vec_xl_len(B+(N+6)*ldb+K, 8); \ + rb7 = vec_xl_len(B+(N+7)*ldb+K, 8); \ + t1 = vec_mergeh(rb6, rb7); \ + rb2 = vec_xxpermdi(t0, t1, 0b00); \ + rb3 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_BT_8x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+4)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+5)*ldb+K], rb1, 1); \ + rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 2); \ + rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 3); + +#define LOAD_BT_4x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_4x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_BT_4x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); + +#define LOAD_BT_2x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeo(rb0, rb1); \ + t2 = vec_mergel(rb0, rb1); \ + rb0 = t0; \ + rb1 = t1; \ + rb2 = t2; \ + rb3 = vec_xor(rb3, rb3); \ + rb3 = vec_insert(vec_extract(t2,2), rb3, 0); \ + rb3 = vec_insert(vec_extract(t2,3), rb3, 1); + +#define LOAD_BT_2x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergee(rb0, rb1); \ + t1 = vec_mergeo(rb0, rb1); \ + rb0 = t0; \ + rb1 = t1; + +#define LOAD_BT_2x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); + +#define LOAD_B_2x2(N, K) \ + rb0 = vec_splats(B[(N+0)*ldb+K]); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+1)*ldb+K], 
rb0, 3); + +#define LOAD_B_2x1(N, K) \ + rb0 = vec_insert(B[(n+0)*ldb+k], rb0, 0); \ + rb0 = vec_insert(B[(n+1)*ldb+k], rb0, 1); + +#define LOAD_B_1x1(N, K) rb0 = vec_splats(B[(N)*ldb+K]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); \ + __builtin_mma_xvf32gerpp(&acc4, (vec_t)b4, (vec_t)a4); \ + __builtin_mma_xvf32gerpp(&acc5, (vec_t)b5, (vec_t)a5); \ + __builtin_mma_xvf32gerpp(&acc6, (vec_t)b6, (vec_t)a6); \ + __builtin_mma_xvf32gerpp(&acc7, (vec_t)b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_B(rb0, rb1, rb2, rb3, offset) \ + vec_xst(rb0, 0, packB+(k*16)+0+offset); \ + vec_xst(rb1, 0, packB+(k*16)+4+offset); \ + vec_xst(rb2, 0, packB+(k*16)+8+offset); \ + vec_xst(rb3, 0, packB+(k*16)+12+offset); + +#define LOAD_PACKED_B(rb0, rb1, rb2, rb3, offset) \ + rb0 = vec_xl(0, packB+(k*16)+0+offset); \ + rb1 = vec_xl(0, packB+(k*16)+4+offset); \ + rb2 = vec_xl(0, packB+(k*16)+8+offset); \ + rb3 = vec_xl(0, packB+(k*16)+12+offset); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n16 = N & ~15; + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k4 = K & ~3; + BLASLONG k2 = K & ~1; + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 
1 : 0; +#else + int has_packing = 0; +#endif + + float *packB; + if (has_packing) packB = (float *)malloc(K*16*sizeof(float)); + + vector float valpha = vec_splats(alpha); +#if !defined(B0) + vector float vbeta = vec_splats(beta); +#endif + + for (n = 0; n < n16; n += 16) { + for (m = 0; m < m8; m += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + if (has_packing) { + if (m == 0) { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_BT_16x4(n, k); + KERNEL_MMA_8ACC(rb0, rb4, rb8, rb12, rb0, rb4, rb8, rb12, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb0, rb4, rb8, rb12, 0); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(rb1, rb5, rb9, rb13, rb1, rb5, rb9, rb13, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb1, rb5, rb9, rb13, 16); + LOAD_A_1x8(k+2, m); + KERNEL_MMA_8ACC(rb2, rb6, rb10, rb14, rb2, rb6, rb10, rb14, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb2, rb6, rb10, rb14, 32); + LOAD_A_1x8(k+3, m); + KERNEL_MMA_8ACC(rb3, rb7, rb11, rb15, rb3, rb7, rb11, rb15, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb3, rb7, rb11, rb15, 48); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BT_16x2(n, k); + KERNEL_MMA_8ACC(rb0, rb2, rb4, rb6, rb0, rb2, rb4, rb6, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb0, rb2, rb4, rb6, 0); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(rb1, rb3, rb5, rb7, rb1, rb3, rb5, rb7, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb1, rb3, rb5, rb7, 16); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BT_16x1(n, k); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb0, rb1, rb2, rb3, 0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_PACKED_B(rb0, rb4, rb8, rb12, 0); + KERNEL_MMA_8ACC(rb0, rb4, rb8, rb12, rb0, rb4, rb8, rb12, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+1, m); + LOAD_PACKED_B(rb1, rb5, rb9, rb13, 16); + KERNEL_MMA_8ACC(rb1, rb5, rb9, rb13, rb1, rb5, rb9, rb13, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+2, m); + LOAD_PACKED_B(rb2, rb6, rb10, rb14, 32); + KERNEL_MMA_8ACC(rb2, rb6, rb10, rb14, rb2, rb6, rb10, rb14, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+3, m); + LOAD_PACKED_B(rb3, rb7, rb11, rb15, 48); + KERNEL_MMA_8ACC(rb3, rb7, rb11, rb15, rb3, rb7, rb11, rb15, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_PACKED_B(rb0, rb2, rb4, rb6, 0); + KERNEL_MMA_8ACC(rb0, rb2, rb4, rb6, rb0, rb2, rb4, rb6, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+1, m); + LOAD_PACKED_B(rb1, rb3, rb5, rb7, 16); + KERNEL_MMA_8ACC(rb1, rb3, rb5, rb7, rb1, rb3, rb5, rb7, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_PACKED_B(rb0, rb1, rb2, rb3, 0); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_BT_16x4(n, k); + KERNEL_MMA_8ACC(rb0, rb4, rb8, rb12, rb0, rb4, rb8, rb12, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(rb1, rb5, rb9, rb13, rb1, rb5, rb9, rb13, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+2, m); + KERNEL_MMA_8ACC(rb2, rb6, rb10, rb14, rb2, rb6, rb10, rb14, + ra0, 
ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+3, m); + KERNEL_MMA_8ACC(rb3, rb7, rb11, rb15, rb3, rb7, rb11, rb15, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BT_16x2(n, k); + KERNEL_MMA_8ACC(rb0, rb2, rb4, rb6, rb0, rb2, rb4, rb6, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(rb1, rb3, rb5, rb7, rb1, rb3, rb5, rb7, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BT_16x1(n, k); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc3, n+12, m+0); + SAVE_4x4_ACC(&acc4, n+0, m+4); + SAVE_4x4_ACC(&acc5, n+4, m+4); + SAVE_4x4_ACC(&acc6, n+8, m+4); + SAVE_4x4_ACC(&acc7, n+12, m+4); + } + + for (; m < m4; m += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x4(k, m); + LOAD_BT_16x4(n, k); + KERNEL_MMA_4ACC(rb0, rb4, rb8, rb12, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_4ACC(rb1, rb5, rb9, rb13, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+2, m); + KERNEL_MMA_4ACC(rb2, rb6, rb10, rb14, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+3, m); + KERNEL_MMA_4ACC(rb3, rb7, rb11, rb15, ra0, ra0, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BT_16x2(n, k); + KERNEL_MMA_4ACC(rb0, rb2, rb4, rb6, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_4ACC(rb1, rb3, rb5, rb7, ra0, ra0, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BT_16x1(n, k); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x4(k, m); + LOAD_PACKED_B(rb0, rb4, rb8, rb12, 0); + KERNEL_MMA_4ACC(rb0, rb4, rb8, rb12, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+1, m); + LOAD_PACKED_B(rb1, rb5, rb9, rb13, 16); + KERNEL_MMA_4ACC(rb1, rb5, rb9, rb13, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+2, m); + LOAD_PACKED_B(rb2, rb6, rb10, rb14, 32); + KERNEL_MMA_4ACC(rb2, rb6, rb10, rb14, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+3, m); + LOAD_PACKED_B(rb3, rb7, rb11, rb15, 48); + KERNEL_MMA_4ACC(rb3, rb7, rb11, rb15, ra0, ra0, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_PACKED_B(rb0, rb2, rb4, rb6, 0); + KERNEL_MMA_4ACC(rb0, rb2, rb4, rb6, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+1, m); + LOAD_PACKED_B(rb1, rb3, rb5, rb7, 16); + KERNEL_MMA_4ACC(rb1, rb3, rb5, rb7, ra0, ra0, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_PACKED_B(rb0, rb1, rb2, rb3, 0); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc3, n+12, m+0); + } + + for (; m < m2; m += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x2(k, m); + 
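+ /* LOAD_BT_16x4 gathers a 16 (n) x 4 (k) tile of B and transposes it on the fly with vec_mergeh/vec_mergel/vec_xxpermdi, so each rbX holds four consecutive n values at a fixed k */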
LOAD_BT_16x4(n, k); + KERNEL_MMA_4ACC(rb0, rb4, rb8, rb12, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_4ACC(rb1, rb5, rb9, rb13, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+2, m); + KERNEL_MMA_4ACC(rb2, rb6, rb10, rb14, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+3, m); + KERNEL_MMA_4ACC(rb3, rb7, rb11, rb15, ra0, ra0, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BT_16x2(n, k); + KERNEL_MMA_4ACC(rb0, rb2, rb4, rb6, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_4ACC(rb1, rb3, rb5, rb7, ra0, ra0, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BT_16x1(n, k); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x2(k, m); + LOAD_PACKED_B(rb0, rb4, rb8, rb12, 0); + KERNEL_MMA_4ACC(rb0, rb4, rb8, rb12, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+1, m); + LOAD_PACKED_B(rb1, rb5, rb9, rb13, 16); + KERNEL_MMA_4ACC(rb1, rb5, rb9, rb13, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+2, m); + LOAD_PACKED_B(rb2, rb6, rb10, rb14, 32); + KERNEL_MMA_4ACC(rb2, rb6, rb10, rb14, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+3, m); + LOAD_PACKED_B(rb3, rb7, rb11, rb15, 48); + KERNEL_MMA_4ACC(rb3, rb7, rb11, rb15, ra0, ra0, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_PACKED_B(rb0, rb2, rb4, rb6, 0); + KERNEL_MMA_4ACC(rb0, rb2, rb4, rb6, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+1, m); + LOAD_PACKED_B(rb1, rb3, rb5, rb7, 16); + KERNEL_MMA_4ACC(rb1, rb3, rb5, rb7, ra0, ra0, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_PACKED_B(rb0, rb1, rb2, rb3, 0); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc2, n+8, m+0); + SAVE_4x2_ACC(&acc3, n+12, m+0); + } + + for (; m < M; m++) { + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + vector float result3 = ((vector float){0.,0.,0.,0.}); + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x1(k, m); + LOAD_BT_16x4(n, k); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb4, rb8, rb12); + LOAD_A_1x1(k+1, m); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb1, rb5, rb9, rb13); + LOAD_A_1x1(k+2, m); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb2, rb6, rb10, rb14); + LOAD_A_1x1(k+3, m); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb3, rb7, rb11, rb15); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BT_16x2(n, k); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb2, rb4, rb6); + LOAD_A_1x1(k+1, m); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb1, rb3, rb5, rb7); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BT_16x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x1(k, m); + LOAD_PACKED_B(rb0, rb4, rb8, rb12, 0); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb4, rb8, rb12); + LOAD_A_1x1(k+1, m); + LOAD_PACKED_B(rb1, rb5, rb9, rb13, 16); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb1, rb5, rb9, rb13); + LOAD_A_1x1(k+2, m); + LOAD_PACKED_B(rb2, rb6, rb10, rb14, 32); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb2, rb6, rb10, rb14); + LOAD_A_1x1(k+3, m); + LOAD_PACKED_B(rb3, rb7, rb11, rb15, 48); + 
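+ /* packed path: rb3/rb7/rb11/rb15 were just reloaded from packB, which the m == 0 pass of this n block filled with the already-transposed B tile */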
KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb3, rb7, rb11, rb15); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_PACKED_B(rb0, rb2, rb4, rb6, 0); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb2, rb4, rb6); + LOAD_A_1x1(k+1, m); + LOAD_PACKED_B(rb1, rb3, rb5, rb7, 16); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb1, rb3, rb5, rb7); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_PACKED_B(rb0, rb1, rb2, rb3, 0); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + } + + SAVE_4x1_VSR(result, n+0, m); + SAVE_4x1_VSR(result1, n+4, m); + SAVE_4x1_VSR(result2, n+8, m); + SAVE_4x1_VSR(result3, n+12, m); + } + } + + for (; n < n8; n += 8) { + for (m = 0; m < m16; m += 16) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x16(k, m); + LOAD_BT_8x4(n, k); + KERNEL_MMA_8ACC(rb0, rb4, rb0, rb4, rb0, rb4, rb0, rb4, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x16(k+1, m); + KERNEL_MMA_8ACC(rb1, rb5, rb1, rb5, rb1, rb5, rb1, rb5, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x16(k+2, m); + KERNEL_MMA_8ACC(rb2, rb6, rb2, rb6, rb2, rb6, rb2, rb6, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x16(k+3, m); + KERNEL_MMA_8ACC(rb3, rb7, rb3, rb7, rb3, rb7, rb3, rb7, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_A_1x16(k, m); + LOAD_BT_8x2(n, k); + KERNEL_MMA_8ACC(rb0, rb2, rb0, rb2, rb0, rb2, rb0, rb2, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x16(k+1, m); + KERNEL_MMA_8ACC(rb1, rb3, rb1, rb3, rb1, rb3, rb1, rb3, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + for (; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_BT_8x1(n, k); + KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc2, n+0, m+4); + SAVE_4x4_ACC(&acc4, n+0, m+8); + SAVE_4x4_ACC(&acc6, n+0, m+12); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + SAVE_4x4_ACC(&acc5, n+4, m+8); + SAVE_4x4_ACC(&acc7, n+4, m+12); + } + + for (; m < m8; m += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_BT_8x4(n, k); + KERNEL_MMA_4ACC(rb0, rb4, rb0, rb4, ra0, ra0, ra1, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_4ACC(rb1, rb5, rb1, rb5, ra0, ra0, ra1, ra1); + LOAD_A_1x8(k+2, m); + KERNEL_MMA_4ACC(rb2, rb6, rb2, rb6, ra0, ra0, ra1, ra1); + LOAD_A_1x8(k+3, m); + KERNEL_MMA_4ACC(rb3, rb7, rb3, rb7, ra0, ra0, ra1, ra1); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BT_8x2(n, k); + KERNEL_MMA_4ACC(rb0, rb2, rb0, rb2, ra0, ra0, ra1, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_4ACC(rb1, rb3, rb1, rb3, ra0, ra0, ra1, ra1); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BT_8x1(n, k); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra1, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc2, n+0, m+4); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + } + + for (; m < m4; m += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + 
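+ /* 4 (m) x 8 (n) tile: acc0 and acc1 are MMA accumulators, each producing a 4x4 block of C (stored below by SAVE_4x4_ACC) */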
register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x4(k, m); + LOAD_BT_8x4(n, k); + KERNEL_MMA_2ACC(rb0, rb4, ra0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_2ACC(rb1, rb5, ra0, ra0); + LOAD_A_1x4(k+2, m); + KERNEL_MMA_2ACC(rb2, rb6, ra0, ra0); + LOAD_A_1x4(k+3, m); + KERNEL_MMA_2ACC(rb3, rb7, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BT_8x2(n, k); + KERNEL_MMA_2ACC(rb0, rb2, ra0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_2ACC(rb1, rb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BT_8x1(n, k); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + } + + for (; m < m2; m += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x2(k, m); + LOAD_BT_8x4(n, k); + KERNEL_MMA_2ACC(rb0, rb4, ra0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_2ACC(rb1, rb5, ra0, ra0); + LOAD_A_1x2(k+2, m); + KERNEL_MMA_2ACC(rb2, rb6, ra0, ra0); + LOAD_A_1x2(k+3, m); + KERNEL_MMA_2ACC(rb3, rb7, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BT_8x2(n, k); + KERNEL_MMA_2ACC(rb0, rb2, ra0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_2ACC(rb1, rb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BT_8x1(n, k); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; m < M; m++) { + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x1(k, m); + LOAD_BT_8x4(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb4); + LOAD_A_1x1(k+1, m); + KERNEL_VMADD_2VSR(ra0, ra0, rb1, rb5); + LOAD_A_1x1(k+2, m); + KERNEL_VMADD_2VSR(ra0, ra0, rb2, rb6); + LOAD_A_1x1(k+3, m); + KERNEL_VMADD_2VSR(ra0, ra0, rb3, rb7); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BT_8x2(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb2); + LOAD_A_1x1(k+1, m); + KERNEL_VMADD_2VSR(ra0, ra0, rb1, rb3); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BT_8x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n+0, m); + SAVE_4x1_VSR(result1, n+4, m); + } + } + + for (; n < n4; n += 4) { + for (m = 0; m < m16; m += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x16(k, m); + LOAD_BT_4x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+1, m); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+2, m); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+3, m); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra0, ra1, ra2, ra3); + } + for (; k < k2; k += 2) { + LOAD_A_1x16(k, m); + LOAD_BT_4x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+1, m); + 
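+ /* the same transposed B vector rb1 (k+1) feeds all four accumulators; ra0..ra3 cover the 16 rows of A in 4-wide slices */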
KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra0, ra1, ra2, ra3); + } + for (; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_BT_4x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+0, m+8); + SAVE_4x4_ACC(&acc3, n+0, m+12); + } + + for (; m < m8; m += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_BT_4x4(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_2ACC(rb1, rb1, ra0, ra1); + LOAD_A_1x8(k+2, m); + KERNEL_MMA_2ACC(rb2, rb2, ra0, ra1); + LOAD_A_1x8(k+3, m); + KERNEL_MMA_2ACC(rb3, rb3, ra0, ra1); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BT_4x2(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_2ACC(rb1, rb1, ra0, ra1); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BT_4x1(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + } + + for (; m < m4; m += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x4(k, m); + LOAD_BT_4x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + LOAD_A_1x4(k+2, m); + KERNEL_MMA_1ACC(rb2, ra0); + LOAD_A_1x4(k+3, m); + KERNEL_MMA_1ACC(rb3, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BT_4x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BT_4x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n, m); + } + + for (; m < m2; m += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x2(k, m); + LOAD_BT_4x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + LOAD_A_1x2(k+2, m); + KERNEL_MMA_1ACC(rb2, ra0); + LOAD_A_1x2(k+3, m); + KERNEL_MMA_1ACC(rb3, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BT_4x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BT_4x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; m < M; m++) { + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x1(k, m); + LOAD_BT_4x4(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_A_1x1(k+1, m); + KERNEL_VMADD_1VSR(ra0, rb1); + LOAD_A_1x1(k+2, m); + KERNEL_VMADD_1VSR(ra0, rb2); + LOAD_A_1x1(k+3, m); + KERNEL_VMADD_1VSR(ra0, rb3); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BT_4x2(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_A_1x1(k+1, m); + 
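+ /* leftover single row of A: ra0 is the broadcast A element for k+1 and rb1 the matching transposed B values; accumulate with plain vec_madd instead of MMA */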
KERNEL_VMADD_1VSR(ra0, rb1); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BT_4x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n+0, m); + } + } + + for (; n < n2; n += 2) { + for (m = 0; m < m16; m += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x16(k, m); + LOAD_BT_2x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+1, m); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+2, m); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+3, m); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra0, ra1, ra2, ra3); + } + for (; k < k2; k += 2) { + LOAD_A_1x16(k, m); + LOAD_BT_2x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+1, m); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra0, ra1, ra2, ra3); + } + for (; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_BT_2x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + SAVE_2x4_ACC(&acc2, n, m+8); + SAVE_2x4_ACC(&acc3, n, m+12); + } + + for (; m < m8; m += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_BT_2x4(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_2ACC(rb1, rb1, ra0, ra1); + LOAD_A_1x8(k+2, m); + KERNEL_MMA_2ACC(rb2, rb2, ra0, ra1); + LOAD_A_1x8(k+3, m); + KERNEL_MMA_2ACC(rb3, rb3, ra0, ra1); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BT_2x2(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_2ACC(rb1, rb1, ra0, ra1); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BT_2x1(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + } + + for (; m < m4; m += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x4(k, m); + LOAD_BT_2x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + LOAD_A_1x4(k+2, m); + KERNEL_MMA_1ACC(rb2, ra0); + LOAD_A_1x4(k+3, m); + KERNEL_MMA_1ACC(rb3, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BT_2x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BT_2x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m); + } + + for (; m < m2; m += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_2x2(k, m); + LOAD_B_2x2(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_2x2_VSR(result, n, m); + } + + for (; m < M; m++) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector 
float rb0 = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_2x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_2x1_VSR(result, n, m); + } + } + + for (; n < N; n++) { + for (m = 0; m < m16; m += 16) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + vector float result3 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + SAVE_1x4_VSR(result2, n, m+8); + SAVE_1x4_VSR(result3, n, m+12); + } + + for (; m < m8; m += 8) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0, ra1; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + } + + for (; m < m4; m += 4) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + } + + for (; m < m2; m += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x2_VSR(result, n, m); + } + + for (; m < M; m++) { + FLOAT result = 0.0f; + + for (k = 0; k < K; k++) { + result += A[m+k*lda] * B[n*ldb+k]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if (has_packing) free (packB); + + return 0; +} diff --git a/kernel/power/sgemm_small_kernel_nt_power10.c b/kernel/power/sgemm_small_kernel_nt_power10.c new file mode 100644 index 000000000..20d3c6b0e --- /dev/null +++ b/kernel/power/sgemm_small_kernel_nt_power10.c @@ -0,0 +1,887 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; + +#if !defined(B0) +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_insert(C[(N+1)*ldc+M+0], rc0, 2); \ + rc0 = vec_insert(C[(N+1)*ldc+M+1], rc0, 3); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, 
C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; \ + C[(N+2)*ldc+M] = (C[(N+2)*ldc+M] * beta) + result[2]; \ + C[(N+3)*ldc+M] = (C[(N+3)*ldc+M] * beta) + result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; + +#else + +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; \ + C[(N+2)*ldc+M] = result[2]; \ + C[(N+3)*ldc+M] = result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#define LOAD_A_1x16(K, M) \ + ra0 = vec_xl(0, A+(K*lda)+M+0); \ + ra1 = vec_xl(0, A+(K*lda)+M+4); \ + ra2 = vec_xl(0, A+(K*lda)+M+8); \ + ra3 = vec_xl(0, A+(K*lda)+M+12); + +#define LOAD_A_1x8(K, M) \ + ra0 = vec_xl(0, A+(K*lda)+M+0); \ + ra1 = vec_xl(0, A+(K*lda)+M+4); + +#define LOAD_A_1x4(K, M) ra0 = vec_xl(0, A+(K*lda)+M); + +#define LOAD_A_2x2(K, M) \ + ra0 = vec_splats(A[K*lda+M+0]); \ + ra0 = vec_insert(A[K*lda+M+1], ra0, 1); \ + ra0 = vec_insert(A[K*lda+M+1], 
ra0, 3); + +#define LOAD_A_1x2(K, M) ra0 = vec_xl_len(A+(K*lda)+M, 8); + +#define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M+0]); + +#define LOAD_B_1x16(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+4); \ + rb2 = vec_xl(0, B+(K*ldb)+N+8); \ + rb3 = vec_xl(0, B+(K*ldb)+N+12); + +#define LOAD_B_1x8(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+4); + +#define LOAD_B_1x4(K, N) rb0 = vec_xl(0, B+(K*ldb)+N); + +#define LOAD_B_2x2(K, N) \ + rb0 = vec_splats(B[K*ldb+N]); \ + rb0 = vec_insert(B[K*ldb+N+1], rb0, 2); \ + rb0 = vec_insert(B[K*ldb+N+1], rb0, 3); + +#define LOAD_B_1x2(K, N) rb0 = vec_xl_len(B+(K*ldb)+N, 8); + +#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[K*ldb+N]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); \ + __builtin_mma_xvf32gerpp(&acc4, (vec_t)b4, (vec_t)a4); \ + __builtin_mma_xvf32gerpp(&acc5, (vec_t)b5, (vec_t)a5); \ + __builtin_mma_xvf32gerpp(&acc6, (vec_t)b6, (vec_t)a6); \ + __builtin_mma_xvf32gerpp(&acc7, (vec_t)b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_A(ra0, ra1, ra2, ra3, offset) \ + vec_xst(ra0, 0, packA+(k*16)+0+offset); \ + vec_xst(ra1, 0, packA+(k*16)+4+offset); \ + vec_xst(ra2, 0, packA+(k*16)+8+offset); \ + vec_xst(ra3, 0, packA+(k*16)+12+offset); + +#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \ + ra0 = vec_xl(0, packA+(k*16)+0+offset); \ + ra1 = vec_xl(0, packA+(k*16)+4+offset); \ + ra2 = vec_xl(0, packA+(k*16)+8+offset); \ + ra3 = vec_xl(0, packA+(k*16)+12+offset); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n16 = N & ~15; + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + vector float valpha = vec_splats(alpha); +#if !defined(B0) + vector float vbeta = vec_splats(beta); +#endif + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 40 && N >= 40 && K >= 40) ? 
1 : 0; +#else + int has_packing = 0; +#endif + + float *packA; + if (has_packing) packA = (float *)malloc(K*16*sizeof(float)); + + for (m = 0; m < m16; m += 16) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1; + + if (has_packing) { + if (n == 0) { + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + PACK_A(ra0, ra1, ra2, ra3, 0); + } + } else { + for (k = 0; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + } else { + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc2, n+0, m+4); + SAVE_4x4_ACC(&acc4, n+0, m+8); + SAVE_4x4_ACC(&acc6, n+0, m+12); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + SAVE_4x4_ACC(&acc5, n+4, m+8); + SAVE_4x4_ACC(&acc7, n+4, m+12); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + + if (!has_packing) { + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+0, m+8); + SAVE_4x4_ACC(&acc3, n+0, m+12); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + + if (!has_packing) { + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + SAVE_2x4_ACC(&acc2, n, m+8); + SAVE_2x4_ACC(&acc3, n, m+12); + } + + for (; n < N; n++) { + vector float result = ((vector float){0., 0., 0., 0.}); + vector float result1 = ((vector float){0., 0., 0., 0.}); + vector float result2 = ((vector float){0., 0., 0., 0.}); + vector float result3 = ((vector float){0., 0., 0., 0.}); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + + if (!has_packing) { + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } else { + for (k = 0; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + 
SAVE_1x4_VSR(result2, n, m+8); + SAVE_1x4_VSR(result3, n, m+12); + } + } + + for (; m < m8; m += 8) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x16(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc4, n+0, m+4); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc5, n+4, m+4); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc6, n+8, m+4); + SAVE_4x4_ACC(&acc3, n+12, m+0); + SAVE_4x4_ACC(&acc7, n+12, m+4); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x8(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra1, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc2, n+0, m+4); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x4(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x2(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + } + + for (; n < N; n++) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0, ra1; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc3, n+12, m+0); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0; + register vector float rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + } + + for (; 
n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x2(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc2, n+8, m+0); + SAVE_4x2_ACC(&acc3, n+12, m+0); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0; + register vector float rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_2x2(k, m); + LOAD_B_2x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_2x2_VSR(result, n, m); + } + + for (; n < N; n++) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x2_VSR(result, n, m); + } + } + + for (; m < M; m++) { + for (n = 0; n < n16; n += 16) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + vector float result3 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x16(k, n); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + + SAVE_4x1_VSR(result, n+0, m); + 
SAVE_4x1_VSR(result1, n+4, m); + SAVE_4x1_VSR(result2, n+8, m); + SAVE_4x1_VSR(result3, n+12, m); + } + + for (; n < n8; n += 8) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x8(k, n); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n+0, m); + SAVE_4x1_VSR(result1, n+4, m); + } + + for (; n < n4; n += 4) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x4(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n+0, m); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_2x1_VSR(result, n+0, m); + } + + for (; n < N; n++) { + FLOAT result = 0.0f; + + for (k = 0; k < K; k++) { + result += A[k*lda+m] * B[k*ldb+n]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if (has_packing) free (packA); + + return 0; +} diff --git a/kernel/power/sgemm_small_kernel_tn_power10.c b/kernel/power/sgemm_small_kernel_tn_power10.c new file mode 100644 index 000000000..64ecddbba --- /dev/null +++ b/kernel/power/sgemm_small_kernel_tn_power10.c @@ -0,0 +1,1678 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; + +#if !defined(B0) +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_insert(C[(N+1)*ldc+M+0], rc0, 2); \ + rc0 = vec_insert(C[(N+1)*ldc+M+1], rc0, 3); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; \ + C[(N+2)*ldc+M] = (C[(N+2)*ldc+M] * beta) + result[2]; \ + C[(N+3)*ldc+M] = (C[(N+3)*ldc+M] * beta) + result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; + +#else + +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + 
vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; \ + C[(N+2)*ldc+M] = result[2]; \ + C[(N+3)*ldc+M] = result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#define LOAD_AT_16x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); \ + ra4 = vec_xl(0, A+(M+4)*lda+K); \ + ra5 = vec_xl(0, A+(M+5)*lda+K); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergel(ra4, ra5); \ + ra6 = vec_xl(0, A+(M+6)*lda+K); \ + ra7 = vec_xl(0, A+(M+7)*lda+K); \ + t2 = vec_mergeh(ra6, ra7); \ + t3 = vec_mergel(ra6, ra7); \ + ra4 = vec_xxpermdi(t0, t2, 0b00); \ + ra5 = vec_xxpermdi(t0, t2, 0b11); \ + ra6 = vec_xxpermdi(t1, t3, 0b00); \ + ra7 = vec_xxpermdi(t1, t3, 0b11); \ + ra8 = vec_xl(0, A+(M+8)*lda+K); \ + ra9 = vec_xl(0, A+(M+9)*lda+K); \ + t0 = vec_mergeh(ra8, ra9); \ + t1 = vec_mergel(ra8, ra9); \ + ra10 = vec_xl(0, A+(M+10)*lda+K); \ + ra11 = vec_xl(0, A+(M+11)*lda+K); \ + t2 = vec_mergeh(ra10, ra11); \ + t3 = vec_mergel(ra10, ra11); \ + ra8 = vec_xxpermdi(t0, t2, 0b00); \ + ra9 = vec_xxpermdi(t0, t2, 0b11); \ + 
ra10 = vec_xxpermdi(t1, t3, 0b00); \ + ra11 = vec_xxpermdi(t1, t3, 0b11); \ + ra12 = vec_xl(0, A+(M+12)*lda+K); \ + ra13 = vec_xl(0, A+(M+13)*lda+K); \ + t0 = vec_mergeh(ra12, ra13); \ + t1 = vec_mergel(ra12, ra13); \ + ra14 = vec_xl(0, A+(M+14)*lda+K); \ + ra15 = vec_xl(0, A+(M+15)*lda+K); \ + t2 = vec_mergeh(ra14, ra15); \ + t3 = vec_mergel(ra14, ra15); \ + ra12 = vec_xxpermdi(t0, t2, 0b00); \ + ra13 = vec_xxpermdi(t0, t2, 0b11); \ + ra14 = vec_xxpermdi(t1, t3, 0b00); \ + ra15 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_16x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); \ + ra4 = vec_xl_len(A+(M+4)*lda+K, 8); \ + ra5 = vec_xl_len(A+(M+5)*lda+K, 8); \ + t0 = vec_mergeh(ra4, ra5); \ + ra6 = vec_xl_len(A+(M+6)*lda+K, 8); \ + ra7 = vec_xl_len(A+(M+7)*lda+K, 8); \ + t1 = vec_mergeh(ra6, ra7); \ + ra2 = vec_xxpermdi(t0, t1, 0b00); \ + ra3 = vec_xxpermdi(t0, t1, 0b11); \ + ra8 = vec_xl_len(A+(M+8)*lda+K, 8); \ + ra9 = vec_xl_len(A+(M+9)*lda+K, 8); \ + t0 = vec_mergeh(ra8, ra9); \ + ra10 = vec_xl_len(A+(M+10)*lda+K, 8); \ + ra11 = vec_xl_len(A+(M+11)*lda+K, 8); \ + t1 = vec_mergeh(ra10, ra11); \ + ra4 = vec_xxpermdi(t0, t1, 0b00); \ + ra5 = vec_xxpermdi(t0, t1, 0b11); \ + ra12 = vec_xl_len(A+(M+12)*lda+K, 8); \ + ra13 = vec_xl_len(A+(M+13)*lda+K, 8); \ + t0 = vec_mergeh(ra12, ra13); \ + ra14 = vec_xl_len(A+(M+14)*lda+K, 8); \ + ra15 = vec_xl_len(A+(M+15)*lda+K, 8); \ + t1 = vec_mergeh(ra14, ra15); \ + ra6 = vec_xxpermdi(t0, t1, 0b00); \ + ra7 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_16x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+4)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+5)*lda+K], ra1, 1); \ + ra1 = vec_insert(A[(M+6)*lda+K], ra1, 2); \ + ra1 = vec_insert(A[(M+7)*lda+K], ra1, 3); \ + ra2 = vec_xor(ra2, ra2); \ + ra2 = vec_insert(A[(M+8)*lda+K], ra2, 0); \ + ra2 = vec_insert(A[(M+9)*lda+K], ra2, 1); \ + ra2 = vec_insert(A[(M+10)*lda+K], ra2, 2); \ + ra2 = vec_insert(A[(M+11)*lda+K], ra2, 3); \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(A[(M+12)*lda+K], ra3, 0); \ + ra3 = vec_insert(A[(M+13)*lda+K], ra3, 1); \ + ra3 = vec_insert(A[(M+14)*lda+K], ra3, 2); \ + ra3 = vec_insert(A[(M+15)*lda+K], ra3, 3); + +#define LOAD_AT_8x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); \ + ra4 = vec_xl(0, A+(M+4)*lda+K); \ + ra5 = vec_xl(0, A+(M+5)*lda+K); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergel(ra4, ra5); \ + ra6 = vec_xl(0, A+(M+6)*lda+K); \ + ra7 = vec_xl(0, A+(M+7)*lda+K); \ + t2 = vec_mergeh(ra6, ra7); \ + t3 = vec_mergel(ra6, ra7); \ + ra4 = vec_xxpermdi(t0, t2, 0b00); \ + ra5 = vec_xxpermdi(t0, t2, 0b11); \ + ra6 = vec_xxpermdi(t1, t3, 0b00); \ + ra7 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_8x2(M, K) \ + ra0 = 
vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); \ + ra4 = vec_xl_len(A+(M+4)*lda+K, 8); \ + ra5 = vec_xl_len(A+(M+5)*lda+K, 8); \ + t0 = vec_mergeh(ra4, ra5); \ + ra6 = vec_xl_len(A+(M+6)*lda+K, 8); \ + ra7 = vec_xl_len(A+(M+7)*lda+K, 8); \ + t1 = vec_mergeh(ra6, ra7); \ + ra2 = vec_xxpermdi(t0, t1, 0b00); \ + ra3 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_8x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+4)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+5)*lda+K], ra1, 1); \ + ra1 = vec_insert(A[(M+6)*lda+K], ra1, 2); \ + ra1 = vec_insert(A[(M+7)*lda+K], ra1, 3); + +#define LOAD_AT_4x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_4x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_4x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); + +#define LOAD_AT_2x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergeo(ra0, ra1); \ + t2 = vec_mergel(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; \ + ra2 = t2; \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(vec_extract(t2, 2), ra3, 0); \ + ra3 = vec_insert(vec_extract(t2, 3), ra3, 1); + +#define LOAD_AT_2x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergee(ra0, ra1); \ + t1 = vec_mergeo(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; + +#define LOAD_AT_2x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); + +#define LOAD_A_2x2(M, K) \ + ra0 = vec_splats(A[(M+0)*lda+K]); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 3); + +#define LOAD_A_2x1(M, K) \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); + +#define LOAD_A_1x1(M, K) ra0 = vec_splats(A[(M)*lda+K]); + +#define LOAD_BT_16x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 
0b11); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergel(rb4, rb5); \ + rb6 = vec_xl(0, B+(N+6)*ldb+K); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K); \ + t2 = vec_mergeh(rb6, rb7); \ + t3 = vec_mergel(rb6, rb7); \ + rb4 = vec_xxpermdi(t0, t2, 0b00); \ + rb5 = vec_xxpermdi(t0, t2, 0b11); \ + rb6 = vec_xxpermdi(t1, t3, 0b00); \ + rb7 = vec_xxpermdi(t1, t3, 0b11); \ + rb8 = vec_xl(0, B+(N+8)*ldb+K); \ + rb9 = vec_xl(0, B+(N+9)*ldb+K); \ + t0 = vec_mergeh(rb8, rb9); \ + t1 = vec_mergel(rb8, rb9); \ + rb10 = vec_xl(0, B+(N+10)*ldb+K); \ + rb11 = vec_xl(0, B+(N+11)*ldb+K); \ + t2 = vec_mergeh(rb10, rb11); \ + t3 = vec_mergel(rb10, rb11); \ + rb8 = vec_xxpermdi(t0, t2, 0b00); \ + rb9 = vec_xxpermdi(t0, t2, 0b11); \ + rb10 = vec_xxpermdi(t1, t3, 0b00); \ + rb11 = vec_xxpermdi(t1, t3, 0b11); \ + rb12 = vec_xl(0, B+(N+12)*ldb+K); \ + rb13 = vec_xl(0, B+(N+13)*ldb+K); \ + t0 = vec_mergeh(rb12, rb13); \ + t1 = vec_mergel(rb12, rb13); \ + rb14 = vec_xl(0, B+(N+14)*ldb+K); \ + rb15 = vec_xl(0, B+(N+15)*ldb+K); \ + t2 = vec_mergeh(rb14, rb15); \ + t3 = vec_mergel(rb14, rb15); \ + rb12 = vec_xxpermdi(t0, t2, 0b00); \ + rb13 = vec_xxpermdi(t0, t2, 0b11); \ + rb14 = vec_xxpermdi(t1, t3, 0b00); \ + rb15 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_16x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); \ + rb4 = vec_xl_len(B+(N+4)*ldb+K, 8); \ + rb5 = vec_xl_len(B+(N+5)*ldb+K, 8); \ + rb6 = vec_xl_len(B+(N+6)*ldb+K, 8); \ + rb7 = vec_xl_len(B+(N+7)*ldb+K, 8); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergeh(rb6, rb7); \ + rb2 = vec_xxpermdi(t0, t1, 0b00); \ + rb3 = vec_xxpermdi(t0, t1, 0b11); \ + rb8 = vec_xl_len(B+(N+8)*ldb+K, 8); \ + rb9 = vec_xl_len(B+(N+9)*ldb+K, 8); \ + rb10 = vec_xl_len(B+(N+10)*ldb+K, 8); \ + rb11 = vec_xl_len(B+(N+11)*ldb+K, 8); \ + t0 = vec_mergeh(rb8, rb9); \ + t1 = vec_mergeh(rb10, rb11); \ + rb4 = vec_xxpermdi(t0, t1, 0b00); \ + rb5 = vec_xxpermdi(t0, t1, 0b11); \ + rb12 = vec_xl_len(B+(N+12)*ldb+K, 8); \ + rb13 = vec_xl_len(B+(N+13)*ldb+K, 8); \ + rb14 = vec_xl_len(B+(N+14)*ldb+K, 8); \ + rb15 = vec_xl_len(B+(N+15)*ldb+K, 8); \ + t0 = vec_mergeh(rb12, rb13); \ + t1 = vec_mergeh(rb14, rb15); \ + rb6 = vec_xxpermdi(t0, t1, 0b00); \ + rb7 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_BT_16x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+4)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+5)*ldb+K], rb1, 1); \ + rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 2); \ + rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 3); \ + rb2 = vec_xor(rb2, rb2); \ + rb2 = vec_insert(B[(N+8)*ldb+K], rb2, 0); \ + rb2 = vec_insert(B[(N+9)*ldb+K], rb2, 1); \ + rb2 = vec_insert(B[(N+10)*ldb+K], rb2, 2); \ + rb2 = vec_insert(B[(N+11)*ldb+K], rb2, 3); \ + rb3 = vec_xor(rb3, rb3); \ + rb3 = vec_insert(B[(N+12)*ldb+K], rb3, 0); \ + rb3 = vec_insert(B[(N+13)*ldb+K], rb3, 1); \ + rb3 = vec_insert(B[(N+14)*ldb+K], rb3, 2); \ + rb3 = vec_insert(B[(N+15)*ldb+K], rb3, 3); + +#define LOAD_BT_8x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, 
rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 0b11); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergel(rb4, rb5); \ + rb6 = vec_xl(0, B+(N+6)*ldb+K); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K); \ + t2 = vec_mergeh(rb6, rb7); \ + t3 = vec_mergel(rb6, rb7); \ + rb4 = vec_xxpermdi(t0, t2, 0b00); \ + rb5 = vec_xxpermdi(t0, t2, 0b11); \ + rb6 = vec_xxpermdi(t1, t3, 0b00); \ + rb7 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_8x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); \ + rb4 = vec_xl_len(B+(N+4)*ldb+K, 8); \ + rb5 = vec_xl_len(B+(N+5)*ldb+K, 8); \ + t0 = vec_mergeh(rb4, rb5); \ + rb6 = vec_xl_len(B+(N+6)*ldb+K, 8); \ + rb7 = vec_xl_len(B+(N+7)*ldb+K, 8); \ + t1 = vec_mergeh(rb6, rb7); \ + rb2 = vec_xxpermdi(t0, t1, 0b00); \ + rb3 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_BT_8x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+4)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+5)*ldb+K], rb1, 1); \ + rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 2); \ + rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 3); + +#define LOAD_BT_4x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_4x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_BT_4x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); + +#define LOAD_BT_2x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeo(rb0, rb1); \ + t2 = vec_mergel(rb0, rb1); \ + rb0 = t0; \ + rb1 = t1; \ + rb2 = t2; \ + rb3 = vec_xor(rb3, rb3); \ + rb3 = vec_insert(vec_extract(t2,2), rb3, 0); \ + rb3 = vec_insert(vec_extract(t2,3), rb3, 1); + +#define LOAD_BT_2x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergee(rb0, rb1); \ + t1 = vec_mergeo(rb0, rb1); \ + rb0 = t0; \ + rb1 = t1; + +#define LOAD_BT_2x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); + 
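+/* Descriptive note on the macros above: the LOAD_AT_* and LOAD_BT_* macros
+ * fetch short rows of the transposed operands and transpose them in-register
+ * (vec_mergeh/vec_mergel, vec_mergee/vec_mergeo and vec_xxpermdi), so that
+ * each resulting vector holds a single k column across consecutive m (or n)
+ * indices.  Those vectors feed the __builtin_mma_xvf32gerpp rank-1 updates
+ * issued by the KERNEL_MMA_* macros defined below, while the scalar
+ * LOAD_A_* / LOAD_B_* variants that follow cover the broadcast and edge
+ * cases used by the vec_madd (non-MMA) fallback paths. */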
+#define LOAD_B_2x2(N, K) \ + rb0 = vec_splats(B[(N+0)*ldb+K]); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 3); + +#define LOAD_B_2x1(N, K) \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); + +#define LOAD_B_1x1(N, K) rb0 = vec_splats(B[(N)*ldb+K]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); \ + __builtin_mma_xvf32gerpp(&acc4, (vec_t)b4, (vec_t)a4); \ + __builtin_mma_xvf32gerpp(&acc5, (vec_t)b5, (vec_t)a5); \ + __builtin_mma_xvf32gerpp(&acc6, (vec_t)b6, (vec_t)a6); \ + __builtin_mma_xvf32gerpp(&acc7, (vec_t)b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_A(ra0, ra1, ra2, ra3, offset) \ + vec_xst(ra0, 0, packA+(k*16)+0+offset); \ + vec_xst(ra1, 0, packA+(k*16)+4+offset); \ + vec_xst(ra2, 0, packA+(k*16)+8+offset); \ + vec_xst(ra3, 0, packA+(k*16)+12+offset); + +#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \ + ra0 = vec_xl(0, packA+(k*16)+0+offset); \ + ra1 = vec_xl(0, packA+(k*16)+4+offset); \ + ra2 = vec_xl(0, packA+(k*16)+8+offset); \ + ra3 = vec_xl(0, packA+(k*16)+12+offset); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n16 = N & ~15; + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k4 = K & ~3; + BLASLONG k2 = K & ~1; + + vector float valpha = vec_splats(alpha); +#if !defined(B0) + vector float vbeta = vec_splats(beta); +#endif + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 
1 : 0; +#else + int has_packing = 0; +#endif + + float *packA; + if (has_packing) packA = (float *)malloc(K*16*sizeof(float)); + + for (m = 0; m < m16; m += 16) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + if (has_packing) { + if (n == 0) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_BT_8x4(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb4, rb4, rb0, rb0, rb4, rb4, + ra0, ra4, ra0, ra4, ra8, ra12, ra8, ra12); + PACK_A(ra0, ra4, ra8, ra12, 0); + KERNEL_MMA_8ACC(rb1, rb1, rb5, rb5, rb1, rb1, rb5, rb5, + ra1, ra5, ra1, ra5, ra9, ra13, ra9, ra13); + PACK_A(ra1, ra5, ra9, ra13, 16); + KERNEL_MMA_8ACC(rb2, rb2, rb6, rb6, rb2, rb2, rb6, rb6, + ra2, ra6, ra2, ra6, ra10, ra14, ra10, ra14); + PACK_A(ra2, ra6, ra10, ra14, 32); + KERNEL_MMA_8ACC(rb3, rb3, rb7, rb7, rb3, rb3, rb7, rb7, + ra3, ra7, ra3, ra7, ra11, ra15, ra11, ra15); + PACK_A(ra3, ra7, ra11, ra15, 48); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_BT_8x2(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb2, rb2, rb0, rb0, rb2, rb2, + ra0, ra2, ra0, ra2, ra4, ra6, ra4, ra6); + PACK_A(ra0, ra2, ra4, ra6, 0); + KERNEL_MMA_8ACC(rb1, rb1, rb3, rb3, rb1, rb1, rb3, rb3, + ra1, ra3, ra1, ra3, ra5, ra7, ra5, ra7); + PACK_A(ra1, ra3, ra5, ra7, 16); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb1, rb1, rb0, rb0, rb1, rb1, + ra0, ra1, ra0, ra1, ra2, ra3, ra2, ra3); + PACK_A(ra0, ra1, ra2, ra3, 0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_BT_8x4(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb4, rb4, rb0, rb0, rb4, rb4, + ra0, ra4, ra0, ra4, ra8, ra12, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + KERNEL_MMA_8ACC(rb1, rb1, rb5, rb5, rb1, rb1, rb5, rb5, + ra1, ra5, ra1, ra5, ra9, ra13, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + KERNEL_MMA_8ACC(rb2, rb2, rb6, rb6, rb2, rb2, rb6, rb6, + ra2, ra6, ra2, ra6, ra10, ra14, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + KERNEL_MMA_8ACC(rb3, rb3, rb7, rb7, rb3, rb3, rb7, rb7, + ra3, ra7, ra3, ra7, ra11, ra15, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BT_8x2(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb2, rb2, rb0, rb0, rb2, rb2, + ra0, ra2, ra0, ra2, ra4, ra6, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + KERNEL_MMA_8ACC(rb1, rb1, rb3, rb3, rb1, rb1, rb3, rb3, + ra1, ra3, ra1, ra3, ra5, ra7, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BT_8x1(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb1, rb1, rb0, rb0, rb1, rb1, + ra0, ra1, ra0, ra1, ra2, ra3, ra2, ra3); + } + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_BT_8x4(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb4, rb4, rb0, rb0, rb4, rb4, + ra0, ra4, ra0, ra4, ra8, ra12, ra8, ra12); + KERNEL_MMA_8ACC(rb1, rb1, rb5, rb5, rb1, rb1, rb5, rb5, + ra1, ra5, ra1, ra5, ra9, ra13, ra9, ra13); + KERNEL_MMA_8ACC(rb2, rb2, rb6, rb6, rb2, rb2, rb6, rb6, + ra2, ra6, ra2, ra6, ra10, ra14, ra10, ra14); + KERNEL_MMA_8ACC(rb3, rb3, rb7, rb7, rb3, rb3, rb7, rb7, + ra3, ra7, ra3, ra7, ra11, ra15, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_BT_8x2(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb2, rb2, rb0, rb0, rb2, rb2, + ra0, ra2, ra0, ra2, 
ra4, ra6, ra4, ra6); + KERNEL_MMA_8ACC(rb1, rb1, rb3, rb3, rb1, rb1, rb3, rb3, + ra1, ra3, ra1, ra3, ra5, ra7, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb1, rb1, rb0, rb0, rb1, rb1, + ra0, ra1, ra0, ra1, ra2, ra3, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc4, n+0, m+8); + SAVE_4x4_ACC(&acc5, n+0, m+12); + SAVE_4x4_ACC(&acc2, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + SAVE_4x4_ACC(&acc6, n+4, m+8); + SAVE_4x4_ACC(&acc7, n+4, m+12); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_BT_4x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra5, ra9, ra13); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra2, ra6, ra10, ra14); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_BT_4x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_BT_4x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_BT_4x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra5, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra2, ra6, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BT_4x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BT_4x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+0, m+8); + SAVE_4x4_ACC(&acc3, n+0, m+12); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_BT_2x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra5, ra9, ra13); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra2, ra6, ra10, ra14); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_BT_2x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + 
LOAD_AT_16x1(m, k); + LOAD_BT_2x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_BT_2x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra5, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra2, ra6, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BT_2x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BT_2x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n+0, m+0); + SAVE_2x4_ACC(&acc1, n+0, m+4); + SAVE_2x4_ACC(&acc2, n+0, m+8); + SAVE_2x4_ACC(&acc3, n+0, m+12); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + vector float result3 = ((vector float){0.,0.,0.,0.}); + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra4, ra8, ra12, rb0, rb0, rb0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_4VSR(ra1, ra5, ra9, ra13, rb0, rb0, rb0, rb0); + LOAD_B_1x1(n, k+2); + KERNEL_VMADD_4VSR(ra2, ra6, ra10, ra14, rb0, rb0, rb0, rb0); + LOAD_B_1x1(n, k+3); + KERNEL_VMADD_4VSR(ra3, ra7, ra11, ra15, rb0, rb0, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra2, ra4, ra6, rb0, rb0, rb0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_4VSR(ra1, ra3, ra5, ra7, rb0, rb0, rb0, rb0); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra4, ra8, ra12, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_4VSR(ra1, ra5, ra9, ra13, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x1(n, k+2); + KERNEL_VMADD_4VSR(ra2, ra6, ra10, ra14, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + LOAD_B_1x1(n, k+3); + KERNEL_VMADD_4VSR(ra3, ra7, ra11, ra15, rb0, rb0, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra2, ra4, ra6, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_4VSR(ra1, ra3, ra5, ra7, rb0, rb0, rb0, rb0); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+4); + SAVE_1x4_VSR(result2, n, m+8); + 
SAVE_1x4_VSR(result3, n, m+12); + } + } + + for (; m < m8; m += 8) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_BT_16x4(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb4, rb4, rb8, rb8, rb12, rb12, + ra0, ra4, ra0, ra4, ra0, ra4, ra0, ra4); + KERNEL_MMA_8ACC(rb1, rb1, rb5, rb5, rb9, rb9, rb13, rb13, + ra1, ra5, ra1, ra5, ra1, ra5, ra1, ra5); + KERNEL_MMA_8ACC(rb2, rb2, rb6, rb6, rb10, rb10, rb14, rb14, + ra2, ra6, ra2, ra6, ra2, ra6, ra2, ra6); + KERNEL_MMA_8ACC(rb3, rb3, rb7, rb7, rb11, rb11, rb15, rb15, + ra3, ra7, ra3, ra7, ra3, ra7, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BT_16x2(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb2, rb2, rb4, rb4, rb6, rb6, + ra0, ra2, ra0, ra2, ra0, ra2, ra0, ra2); + KERNEL_MMA_8ACC(rb1, rb1, rb3, rb3, rb5, rb5, rb7, rb7, + ra1, ra3, ra1, ra3, ra1, ra3, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BT_16x1(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb1, rb1, rb2, rb2, rb3, rb3, + ra0, ra1, ra0, ra1, ra0, ra1, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + SAVE_4x4_ACC(&acc4, n+8, m+0); + SAVE_4x4_ACC(&acc5, n+8, m+4); + SAVE_4x4_ACC(&acc6, n+12, m+0); + SAVE_4x4_ACC(&acc7, n+12, m+4); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_BT_8x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb4, rb4, ra0, ra4, ra0, ra4); + KERNEL_MMA_4ACC(rb1, rb1, rb5, rb5, ra1, ra5, ra1, ra5); + KERNEL_MMA_4ACC(rb2, rb2, rb6, rb6, ra2, ra6, ra2, ra6); + KERNEL_MMA_4ACC(rb3, rb3, rb7, rb7, ra3, ra7, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BT_8x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb2, rb2, ra0, ra2, ra0, ra2); + KERNEL_MMA_4ACC(rb1, rb1, rb3, rb3, ra1, ra3, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb1, rb1, ra0, ra1, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_BT_4x4(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra4); + KERNEL_MMA_2ACC(rb1, rb1, ra1, ra5); + KERNEL_MMA_2ACC(rb2, rb2, ra2, ra6); + KERNEL_MMA_2ACC(rb3, rb3, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BT_4x2(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra2); + KERNEL_MMA_2ACC(rb1, rb1, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BT_4x1(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + 
register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_BT_2x4(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra4); + KERNEL_MMA_2ACC(rb1, rb1, ra1, ra5); + KERNEL_MMA_2ACC(rb2, rb2, ra2, ra6); + KERNEL_MMA_2ACC(rb3, rb3, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BT_2x2(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra2); + KERNEL_MMA_2ACC(rb1, rb1, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BT_2x1(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra4, rb0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_2VSR(ra1, ra5, rb0, rb0); + LOAD_B_1x1(n, k+2); + KERNEL_VMADD_2VSR(ra2, ra6, rb0, rb0); + LOAD_B_1x1(n, k+3); + KERNEL_VMADD_2VSR(ra3, ra7, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra2, rb0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_2VSR(ra1, ra3, rb0, rb0); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_BT_16x4(n, k); + KERNEL_MMA_4ACC(rb0, rb4, rb8, rb12, ra0, ra0, ra0, ra0); + KERNEL_MMA_4ACC(rb1, rb5, rb9, rb13, ra1, ra1, ra1, ra1); + KERNEL_MMA_4ACC(rb2, rb6, rb10, rb14, ra2, ra2, ra2, ra2); + KERNEL_MMA_4ACC(rb3, rb7, rb11, rb15, ra3, ra3, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BT_16x2(n, k); + KERNEL_MMA_4ACC(rb0, rb2, rb4, rb6, ra0, ra0, ra0, ra0); + KERNEL_MMA_4ACC(rb1, rb3, rb5, rb7, ra1, ra1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BT_16x1(n, k); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc3, n+12, m+0); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_BT_8x4(n, k); + KERNEL_MMA_2ACC(rb0, rb4, ra0, ra0); + KERNEL_MMA_2ACC(rb1, rb5, ra1, ra1); 
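The __vector_quad accumulators driving these loops follow a fixed lifecycle: zero them with xxsetaccz, update them with xvf32gerpp (a rank-1, outer-product accumulate of one 4-float vector of B against one 4-float vector of A), then disassemble them into four vector-float rows for the SAVE macros. A single-accumulator sketch of that lifecycle, using the same GCC built-ins but generic 4-wide inputs (hypothetical names; it assumes GCC with -mcpu=power10 -mmma), is:

#include <altivec.h>

typedef __vector unsigned char vec_t;

/* c[i][j] accumulates sum_k b[k][i] * a[k][j], i.e. the same B-first,
   A-second operand order the kernel's KERNEL_MMA_* macros use.        */
static void mma_tile_sketch(const float (*a)[4], const float (*b)[4],
                            float c[4][4], long K)
{
  __vector_quad acc;
  vector float rows[4];

  __builtin_mma_xxsetaccz(&acc);                       /* zero the tile  */
  for (long k = 0; k < K; k++) {
    vector float va = vec_xl(0, a[k]);                 /* 4 floats of A  */
    vector float vb = vec_xl(0, b[k]);                 /* 4 floats of B  */
    __builtin_mma_xvf32gerpp(&acc, (vec_t)vb, (vec_t)va);
  }
  __builtin_mma_disassemble_acc((void *)rows, &acc);   /* 4x4 result     */
  for (int i = 0; i < 4; i++)
    vec_xst(rows[i], 0, c[i]);
}

The kernels above simply run two, four or eight such accumulators side by side per k iteration to cover the larger tiles.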
+ KERNEL_MMA_2ACC(rb2, rb6, ra2, ra2); + KERNEL_MMA_2ACC(rb3, rb7, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BT_8x2(n, k); + KERNEL_MMA_2ACC(rb0, rb2, ra0, ra0); + KERNEL_MMA_2ACC(rb1, rb3, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_BT_4x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + KERNEL_MMA_1ACC(rb2, ra2); + KERNEL_MMA_1ACC(rb3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BT_4x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BT_4x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_BT_2x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + KERNEL_MMA_1ACC(rb2, ra2); + KERNEL_MMA_1ACC(rb3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BT_2x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BT_2x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_1VSR(ra1, rb0); + LOAD_B_1x1(n, k+2); + KERNEL_VMADD_1VSR(ra2, rb0); + LOAD_B_1x1(n, k+3); + KERNEL_VMADD_1VSR(ra3, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_1VSR(ra1, rb0); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_2x4(m, k); + LOAD_BT_8x4(n, k); + KERNEL_MMA_2ACC(rb0, rb4, ra0, ra0); + KERNEL_MMA_2ACC(rb1, rb5, ra1, ra1); + KERNEL_MMA_2ACC(rb2, rb6, ra2, ra2); + KERNEL_MMA_2ACC(rb3, rb7, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BT_8x2(n, k); + KERNEL_MMA_2ACC(rb0, rb2, ra0, ra0); + KERNEL_MMA_2ACC(rb1, rb3, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if 
!defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_2x4(m, k); + LOAD_BT_4x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + KERNEL_MMA_1ACC(rb2, ra2); + KERNEL_MMA_1ACC(rb3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BT_4x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BT_4x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_2x2(m, k); + LOAD_B_2x2(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_2x2_VSR(result, n, m); + } + + for (; n < N; n++) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0 = ((vector float){0.,0.,0.,0.}); + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_2x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x2_VSR(result, n, m); + } + } + + for (; m < M; m++) { + for (n = 0; n < n8; n += 8) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x1(m, k); + LOAD_BT_8x4(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb4); + LOAD_A_1x1(m, k+1); + KERNEL_VMADD_2VSR(ra0, ra0, rb1, rb5); + LOAD_A_1x1(m, k+2); + KERNEL_VMADD_2VSR(ra0, ra0, rb2, rb6); + LOAD_A_1x1(m, k+3); + KERNEL_VMADD_2VSR(ra0, ra0, rb3, rb7); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(m, k); + LOAD_BT_8x2(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb2); + LOAD_A_1x1(m, k+1); + KERNEL_VMADD_2VSR(ra0, ra0, rb1, rb3); + } + for (; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n, m); + SAVE_4x1_VSR(result1, n+4, m); + } + + for (; n < n4; n += 4) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x1(m, k); + LOAD_BT_4x4(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_A_1x1(m, k+1); + KERNEL_VMADD_1VSR(ra0, rb1); + LOAD_A_1x1(m, k+2); + KERNEL_VMADD_1VSR(ra0, rb2); + LOAD_A_1x1(m, k+3); + KERNEL_VMADD_1VSR(ra0, rb3); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(m, k); + LOAD_BT_4x2(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_A_1x1(m, k+1); + KERNEL_VMADD_1VSR(ra0, rb1); + } + for (; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_BT_4x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n, m); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0 = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + 
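All of the SAVE_* macros used in these blocks defer the alpha/beta handling to store time: when B0 is defined the kernel was specialised for beta == 0 and C is only written, otherwise each row of C is reloaded, scaled by beta, and the accumulated product is folded in with a fused multiply-add. Per element of C the two variants reduce to the following sketch (illustrative helper, not part of the kernel):

/* Scalar model of what SAVE_*_ACC / SAVE_*_VSR compute for one element. */
static inline float save_element_sketch(float c_old, float acc,
                                        float alpha, float beta)
{
#if !defined(B0)
  return c_old * beta + acc * alpha;   /* general case: C = beta*C + alpha*A*B */
#else
  (void)c_old; (void)beta;             /* beta == 0 build: C is write-only     */
  return acc * alpha;
#endif
}

The vector versions do the same with vec_mul and vec_madd, switching to vec_xl_len/vec_xst_len when a row of the tile is narrower than four floats.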
LOAD_B_2x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_2x1_VSR(result, n, m); + } + + for (; n < N; n++) { + FLOAT result = 0.0f; + + for (k = 0; k < K; k++) { + result += A[m*lda+k] * B[n*ldb+k]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if (has_packing) free (packA); + + return 0; +} diff --git a/kernel/power/sgemm_small_kernel_tt_power10.c b/kernel/power/sgemm_small_kernel_tt_power10.c new file mode 100644 index 000000000..71bc7b937 --- /dev/null +++ b/kernel/power/sgemm_small_kernel_tt_power10.c @@ -0,0 +1,1559 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; + +#if !defined(B0) +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_insert(C[(N+1)*ldc+M+0], rc0, 2); \ + rc0 = vec_insert(C[(N+1)*ldc+M+1], rc0, 3); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; \ + C[(N+2)*ldc+M] = (C[(N+2)*ldc+M] * beta) + result[2]; \ + C[(N+3)*ldc+M] = (C[(N+3)*ldc+M] * beta) + result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; + +#else + +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + 
vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; \ + C[(N+2)*ldc+M] = result[2]; \ + C[(N+3)*ldc+M] = result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#define LOAD_AT_16x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); \ + ra4 = vec_xl(0, A+(M+4)*lda+K); \ + ra5 = vec_xl(0, A+(M+5)*lda+K); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergel(ra4, ra5); \ + ra6 = vec_xl(0, A+(M+6)*lda+K); \ + ra7 = vec_xl(0, A+(M+7)*lda+K); \ + t2 = vec_mergeh(ra6, ra7); \ + t3 = vec_mergel(ra6, ra7); \ + ra4 = vec_xxpermdi(t0, t2, 0b00); \ + ra5 = vec_xxpermdi(t0, t2, 0b11); \ + ra6 = vec_xxpermdi(t1, t3, 0b00); \ + ra7 = vec_xxpermdi(t1, t3, 0b11); \ + ra8 = vec_xl(0, A+(M+8)*lda+K); \ + ra9 = vec_xl(0, A+(M+9)*lda+K); \ + t0 = vec_mergeh(ra8, ra9); \ + t1 = vec_mergel(ra8, ra9); \ + ra10 = vec_xl(0, A+(M+10)*lda+K); \ + ra11 = vec_xl(0, A+(M+11)*lda+K); \ + t2 = vec_mergeh(ra10, ra11); \ + t3 = vec_mergel(ra10, ra11); \ + ra8 = vec_xxpermdi(t0, t2, 0b00); \ + ra9 = vec_xxpermdi(t0, t2, 0b11); \ + 
ra10 = vec_xxpermdi(t1, t3, 0b00); \ + ra11 = vec_xxpermdi(t1, t3, 0b11); \ + ra12 = vec_xl(0, A+(M+12)*lda+K); \ + ra13 = vec_xl(0, A+(M+13)*lda+K); \ + t0 = vec_mergeh(ra12, ra13); \ + t1 = vec_mergel(ra12, ra13); \ + ra14 = vec_xl(0, A+(M+14)*lda+K); \ + ra15 = vec_xl(0, A+(M+15)*lda+K); \ + t2 = vec_mergeh(ra14, ra15); \ + t3 = vec_mergel(ra14, ra15); \ + ra12 = vec_xxpermdi(t0, t2, 0b00); \ + ra13 = vec_xxpermdi(t0, t2, 0b11); \ + ra14 = vec_xxpermdi(t1, t3, 0b00); \ + ra15 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_16x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); \ + ra4 = vec_xl_len(A+(M+4)*lda+K, 8); \ + ra5 = vec_xl_len(A+(M+5)*lda+K, 8); \ + t0 = vec_mergeh(ra4, ra5); \ + ra6 = vec_xl_len(A+(M+6)*lda+K, 8); \ + ra7 = vec_xl_len(A+(M+7)*lda+K, 8); \ + t1 = vec_mergeh(ra6, ra7); \ + ra2 = vec_xxpermdi(t0, t1, 0b00); \ + ra3 = vec_xxpermdi(t0, t1, 0b11); \ + ra8 = vec_xl_len(A+(M+8)*lda+K, 8); \ + ra9 = vec_xl_len(A+(M+9)*lda+K, 8); \ + t0 = vec_mergeh(ra8, ra9); \ + ra10 = vec_xl_len(A+(M+10)*lda+K, 8); \ + ra11 = vec_xl_len(A+(M+11)*lda+K, 8); \ + t1 = vec_mergeh(ra10, ra11); \ + ra4 = vec_xxpermdi(t0, t1, 0b00); \ + ra5 = vec_xxpermdi(t0, t1, 0b11); \ + ra12 = vec_xl_len(A+(M+12)*lda+K, 8); \ + ra13 = vec_xl_len(A+(M+13)*lda+K, 8); \ + t0 = vec_mergeh(ra12, ra13); \ + ra14 = vec_xl_len(A+(M+14)*lda+K, 8); \ + ra15 = vec_xl_len(A+(M+15)*lda+K, 8); \ + t1 = vec_mergeh(ra14, ra15); \ + ra6 = vec_xxpermdi(t0, t1, 0b00); \ + ra7 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_16x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+4)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+5)*lda+K], ra1, 1); \ + ra1 = vec_insert(A[(M+6)*lda+K], ra1, 2); \ + ra1 = vec_insert(A[(M+7)*lda+K], ra1, 3); \ + ra2 = vec_xor(ra2, ra2); \ + ra2 = vec_insert(A[(M+8)*lda+K], ra2, 0); \ + ra2 = vec_insert(A[(M+9)*lda+K], ra2, 1); \ + ra2 = vec_insert(A[(M+10)*lda+K], ra2, 2); \ + ra2 = vec_insert(A[(M+11)*lda+K], ra2, 3); \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(A[(M+12)*lda+K], ra3, 0); \ + ra3 = vec_insert(A[(M+13)*lda+K], ra3, 1); \ + ra3 = vec_insert(A[(M+14)*lda+K], ra3, 2); \ + ra3 = vec_insert(A[(M+15)*lda+K], ra3, 3); + +#define LOAD_AT_8x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); \ + ra4 = vec_xl(0, A+(M+4)*lda+K); \ + ra5 = vec_xl(0, A+(M+5)*lda+K); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergel(ra4, ra5); \ + ra6 = vec_xl(0, A+(M+6)*lda+K); \ + ra7 = vec_xl(0, A+(M+7)*lda+K); \ + t2 = vec_mergeh(ra6, ra7); \ + t3 = vec_mergel(ra6, ra7); \ + ra4 = vec_xxpermdi(t0, t2, 0b00); \ + ra5 = vec_xxpermdi(t0, t2, 0b11); \ + ra6 = vec_xxpermdi(t1, t3, 0b00); \ + ra7 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_8x2(M, K) \ + ra0 = 
vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); \ + ra4 = vec_xl_len(A+(M+4)*lda+K, 8); \ + ra5 = vec_xl_len(A+(M+5)*lda+K, 8); \ + t0 = vec_mergeh(ra4, ra5); \ + ra6 = vec_xl_len(A+(M+6)*lda+K, 8); \ + ra7 = vec_xl_len(A+(M+7)*lda+K, 8); \ + t1 = vec_mergeh(ra6, ra7); \ + ra2 = vec_xxpermdi(t0, t1, 0b00); \ + ra3 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_8x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+4)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+5)*lda+K], ra1, 1); \ + ra1 = vec_insert(A[(M+6)*lda+K], ra1, 2); \ + ra1 = vec_insert(A[(M+7)*lda+K], ra1, 3); + +#define LOAD_AT_4x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_4x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_4x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); + +#define LOAD_AT_2x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergeo(ra0, ra1); \ + t2 = vec_mergel(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; \ + ra2 = t2; \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(vec_extract(t2,2), ra3, 0); \ + ra3 = vec_insert(vec_extract(t2,3), ra3, 1); + +#define LOAD_AT_2x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergee(ra0, ra1); \ + t1 = vec_mergeo(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; + +#define LOAD_AT_2x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); + +#define LOAD_A_2x2(M, K) \ + ra0 = vec_splats(A[M*lda+K]); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 3); + +#define LOAD_A_1x1(M, K) ra0 = vec_splats(A[M*lda+K]); + +#define LOAD_B_1x16(K, N) \ + rb0 = vec_xl(0, B+((K)*ldb)+N+0); \ + rb1 = vec_xl(0, B+((K)*ldb)+N+4); \ + rb2 = vec_xl(0, B+((K)*ldb)+N+8); \ + rb3 = vec_xl(0, B+((K)*ldb)+N+12); + +#define LOAD_B_1x8(K, N) \ + rb0 = vec_xl(0, B+((K)*ldb)+N+0); \ + rb1 = vec_xl(0, B+((K)*ldb)+N+4); + +#define LOAD_B_1x4(K, N) rb0 = vec_xl(0, B+((K)*ldb)+N); + +#define LOAD_B_1x2(K, N) rb0 = vec_xl_len(B+((K)*ldb)+N, 8); + +#define LOAD_B_2x2(K, N) \ + rb0 = vec_splats(B[K*ldb+N]); \ + rb0 = vec_insert(B[K*ldb+N+1], rb0, 2); \ + rb0 = vec_insert(B[K*ldb+N+1], rb0, 3); + +#define 
LOAD_B_1x1(K, N) rb0 = vec_splats(B[(K)*ldb+N]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); \ + __builtin_mma_xvf32gerpp(&acc4, (vec_t)b4, (vec_t)a4); \ + __builtin_mma_xvf32gerpp(&acc5, (vec_t)b5, (vec_t)a5); \ + __builtin_mma_xvf32gerpp(&acc6, (vec_t)b6, (vec_t)a6); \ + __builtin_mma_xvf32gerpp(&acc7, (vec_t)b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_A(ra0, ra1, ra2, ra3, offset) \ + vec_xst(ra0, 0, packA+(k*16)+0+offset); \ + vec_xst(ra1, 0, packA+(k*16)+4+offset); \ + vec_xst(ra2, 0, packA+(k*16)+8+offset); \ + vec_xst(ra3, 0, packA+(k*16)+12+offset); + +#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \ + ra0 = vec_xl(0, packA+(k*16)+0+offset); \ + ra1 = vec_xl(0, packA+(k*16)+4+offset); \ + ra2 = vec_xl(0, packA+(k*16)+8+offset); \ + ra3 = vec_xl(0, packA+(k*16)+12+offset); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n16 = N & ~15; + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k4 = K & ~3; + BLASLONG k2 = K & ~1; + + vector float valpha = vec_splats(alpha); +#if !defined(B0) + vector float vbeta = vec_splats(beta); +#endif + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 
1 : 0; +#else + int has_packing = 0; +#endif + + float *packA; + if (has_packing) packA = (float *)malloc(K*16*sizeof(float)); + + for (m = 0; m < m16; m += 16) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0, rb1; + register vector float t0, t1, t2, t3; + + if (has_packing) { + if (n == 0) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra4, ra8, ra12, ra0, ra4, ra8, ra12); + PACK_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra5, ra9, ra13, ra1, ra5, ra9, ra13); + PACK_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra2, ra6, ra10, ra14, ra2, ra6, ra10, ra14); + PACK_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra3, ra7, ra11, ra15, ra3, ra7, ra11, ra15); + PACK_A(ra3, ra7, ra11, ra15, 48); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra2, ra4, ra6, ra0, ra2, ra4, ra6); + PACK_A(ra0, ra2, ra4, ra6, 0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra3, ra5, ra7, ra1, ra3, ra5, ra7); + PACK_A(ra1, ra3, ra5, ra7, 16); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra1, ra2, ra3, ra0, ra1, ra2, ra3); + PACK_A(ra0, ra1, ra2, ra3, 0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra4, ra8, ra12, ra0, ra4, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra5, ra9, ra13, ra1, ra5, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra2, ra6, ra10, ra14, ra2, ra6, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra3, ra7, ra11, ra15, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra2, ra4, ra6, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra3, ra5, ra7, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra1, ra2, ra3, ra0, ra1, ra2, ra3); + } + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra4, ra8, ra12, ra0, ra4, ra8, ra12); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra5, ra9, ra13, ra1, ra5, ra9, ra13); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra2, ra6, ra10, ra14, ra2, ra6, ra10, ra14); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, 
rb1, + ra3, ra7, ra11, ra15, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra2, ra4, ra6, ra0, ra2, ra4, ra6); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra3, ra5, ra7, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra1, ra2, ra3, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+0, m+8); + SAVE_4x4_ACC(&acc3, n+0, m+12); + SAVE_4x4_ACC(&acc4, n+4, m+0); + SAVE_4x4_ACC(&acc5, n+4, m+4); + SAVE_4x4_ACC(&acc6, n+4, m+8); + SAVE_4x4_ACC(&acc7, n+4, m+12); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra5, ra9, ra13); + LOAD_B_1x4(k+2, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra2, ra6, ra10, ra14); + LOAD_B_1x4(k+3, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra5, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x4(k+2, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra2, ra6, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + LOAD_B_1x4(k+3, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+0, m+8); + SAVE_4x4_ACC(&acc3, n+0, m+12); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_B_1x2(k+1, n); + 
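The LOAD_AT_* macros these loops rely on transpose a row-major tile of A into k-major vectors entirely in registers, using merge-high/merge-low followed by xxpermdi. Pulled out of the kernel, the 4x4 pattern looks like this (generic arguments; the element ordering is whatever the kernel's own LOAD_AT_4x4 sequence produces, since it is the same instruction sequence):

#include <altivec.h>

/* In-register transpose of a 4x4 float tile: out[k] gathers element k of
   each input row, the per-k A operand the MMA loops expect.             */
static void transpose4x4_sketch(const float in[4][4], vector float out[4])
{
  vector float r0 = vec_xl(0, in[0]);
  vector float r1 = vec_xl(0, in[1]);
  vector float r2 = vec_xl(0, in[2]);
  vector float r3 = vec_xl(0, in[3]);
  vector float t0 = vec_mergeh(r0, r1);   /* interleave one half of r0,r1 */
  vector float t1 = vec_mergel(r0, r1);   /* interleave the other half    */
  vector float t2 = vec_mergeh(r2, r3);
  vector float t3 = vec_mergel(r2, r3);
  out[0] = vec_xxpermdi(t0, t2, 0b00);    /* recombine doubleword halves  */
  out[1] = vec_xxpermdi(t0, t2, 0b11);
  out[2] = vec_xxpermdi(t1, t3, 0b00);
  out[3] = vec_xxpermdi(t1, t3, 0b11);
}

The 8x and 16x variants apply the same sequence to additional groups of four rows, and the x2/x1 variants fall back to vec_xl_len and vec_insert.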
KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra5, ra9, ra13); + LOAD_B_1x2(k+2, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra2, ra6, ra10, ra14); + LOAD_B_1x2(k+3, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra5, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x2(k+2, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra2, ra6, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + LOAD_B_1x2(k+3, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_B_1x2(k, n); + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_B_1x2(k+1, n); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + SAVE_2x4_ACC(&acc2, n, m+8); + SAVE_2x4_ACC(&acc3, n, m+12); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + vector float result3 = ((vector float){0.,0.,0.,0.}); + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra4, ra8, ra12, rb0, rb0, rb0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_4VSR(ra1, ra5, ra9, ra13, rb0, rb0, rb0, rb0); + LOAD_B_1x1(k+2, n); + KERNEL_VMADD_4VSR(ra2, ra6, ra10, ra14, rb0, rb0, rb0, rb0); + LOAD_B_1x1(k+3, n); + KERNEL_VMADD_4VSR(ra3, ra7, ra11, ra15, rb0, rb0, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra2, ra4, ra6, rb0, rb0, rb0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_4VSR(ra1, ra3, ra5, ra7, rb0, rb0, rb0, rb0); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra4, ra8, ra12, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_4VSR(ra1, ra5, ra9, ra13, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x1(k+2, n); + KERNEL_VMADD_4VSR(ra2, ra6, ra10, ra14, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + LOAD_B_1x1(k+3, n); + KERNEL_VMADD_4VSR(ra3, ra7, ra11, ra15, rb0, rb0, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, 
ra6, 0); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra2, ra4, ra6, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_4VSR(ra1, ra3, ra5, ra7, rb0, rb0, rb0, rb0); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + SAVE_1x4_VSR(result2, n, m+8); + SAVE_1x4_VSR(result3, n, m+12); + } + } + + for (; m < m8; m += 8) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra4, ra4, ra4, ra4); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra1, ra1, ra1, ra1, ra5, ra5, ra5, ra5); + LOAD_B_1x16(k+2, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra2, ra2, ra2, ra2, ra6, ra6, ra6, ra6); + LOAD_B_1x16(k+3, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra3, ra3, ra3, ra3, ra7, ra7, ra7, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra2, ra2, ra2, ra2); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra1, ra1, ra1, ra1, ra3, ra3, ra3, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc4, n+0, m+4); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc5, n+4, m+4); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc6, n+8, m+4); + SAVE_4x4_ACC(&acc3, n+12, m+0); + SAVE_4x4_ACC(&acc7, n+12, m+4); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra4, ra4); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra1, ra1, ra5, ra5); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra2, ra2, ra6, ra6); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra3, ra3, ra7, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra2, ra2); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra1, ra1, ra3, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra1, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc2, n+0, m+4); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register 
vector float rb0; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra4); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_2ACC(rb0, rb0, ra1, ra5); + LOAD_B_1x4(k+2, n); + KERNEL_MMA_2ACC(rb0, rb0, ra2, ra6); + LOAD_B_1x4(k+3, n); + KERNEL_MMA_2ACC(rb0, rb0, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra2); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_2ACC(rb0, rb0, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra4); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_2ACC(rb0, rb0, ra1, ra5); + LOAD_B_1x2(k+2, n); + KERNEL_MMA_2ACC(rb0, rb0, ra2, ra6); + LOAD_B_1x2(k+3, n); + KERNEL_MMA_2ACC(rb0, rb0, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra2); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_2ACC(rb0, rb0, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra4, rb0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_2VSR(ra1, ra5, rb0, rb0); + LOAD_B_1x1(k+2, n); + KERNEL_VMADD_2VSR(ra2, ra6, rb0, rb0); + LOAD_B_1x1(k+3, n); + KERNEL_VMADD_2VSR(ra3, ra7, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra2, rb0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_2VSR(ra1, ra3, rb0, rb0); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra1, ra1, ra1, ra1); + LOAD_B_1x16(k+2, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra2, ra2, ra2, ra2); + LOAD_B_1x16(k+3, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra3, ra3, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra1, 
ra1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc3, n+12, m+0); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_2ACC(rb0, rb1, ra1, ra1); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_2ACC(rb0, rb1, ra2, ra2); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_2ACC(rb0, rb1, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_2ACC(rb0, rb1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + LOAD_B_1x4(k+2, n); + KERNEL_MMA_1ACC(rb0, ra2); + LOAD_B_1x4(k+3, n); + KERNEL_MMA_1ACC(rb0, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + LOAD_B_1x2(k+2, n); + KERNEL_MMA_1ACC(rb0, ra2); + LOAD_B_1x2(k+3, n); + KERNEL_MMA_1ACC(rb0, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_1VSR(ra1, rb0); + LOAD_B_1x1(k+2, n); + KERNEL_VMADD_1VSR(ra2, rb0); + LOAD_B_1x1(k+3, n); + KERNEL_VMADD_1VSR(ra3, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_1VSR(ra1, rb0); + } + for (; k < K; k++) 
{ + LOAD_AT_4x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_2x4(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra1, ra1, ra1, ra1); + LOAD_B_1x16(k+2, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra2, ra2, ra2, ra2); + LOAD_B_1x16(k+3, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra3, ra3, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra1, ra1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc2, n+8, m+0); + SAVE_4x2_ACC(&acc3, n+12, m+0); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_2x4(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_2ACC(rb0, rb1, ra1, ra1); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_2ACC(rb0, rb1, ra2, ra2); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_2ACC(rb0, rb1, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_2ACC(rb0, rb1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_2x4(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + LOAD_B_1x4(k+2, n); + KERNEL_MMA_1ACC(rb0, ra2); + LOAD_B_1x4(k+3, n); + KERNEL_MMA_1ACC(rb0, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_2x2(m, k); + LOAD_B_2x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_2x2_VSR(result, n, m); + } + + for (; n < N; n++) { + vector float result = ((vector float){0.,0.,0.,0.}); + register vector float ra0; + 
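When only two rows or two columns remain, the kernels switch to the length-controlled VSX accesses vec_xl_len and vec_xst_len, which touch exactly the requested number of bytes (8 bytes here, i.e. a pair of floats), so the tails never read or write past the end of a row. A minimal standalone use of the same pair of built-ins (hypothetical axpy-style helper; assumes POWER9 or newer):

#include <altivec.h>

/* c[0..1] += alpha * a[0..1], touching only 8 bytes of each array. */
static void axpy2_sketch(float *c, float *a, float alpha)
{
  vector float va = vec_xl_len(a, 8);            /* loads 2 floats  */
  vector float vc = vec_xl_len(c, 8);
  vc = vec_madd(va, vec_splats(alpha), vc);
  vec_xst_len(vc, c, 8);                         /* stores 2 floats */
}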
register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + ra0 = vec_insert(A[(m+1)*lda+k], ra0, 1); + ra0 = vec_insert(A[(m+1)*lda+k], ra0, 3); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x2_VSR(result, n, m); + } + } + + for (; m < M; m++) { + for (n = 0; n < n16; n += 16) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + vector float result3 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x16(k, n); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + + SAVE_4x1_VSR(result, n+0, m); + SAVE_4x1_VSR(result1, n+4, m); + SAVE_4x1_VSR(result2, n+8, m); + SAVE_4x1_VSR(result3, n+12, m); + } + + for (; n < n8; n += 8) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n+0, m); + SAVE_4x1_VSR(result1, n+4, m); + } + + for (; n < n4; n += 4) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n, m); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_2x1_VSR(result, n, m); + } + + for (; n < N; n++) { + FLOAT result = 0.0f; + + for (k = 0; k < K; k++) { + result += A[m*lda+k] * B[k*ldb+n]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if (has_packing) free (packA); + + return 0; +} diff --git a/kernel/power/srot.c b/kernel/power/srot.c index 5a0d4b12e..3e4f93e2a 100644 --- a/kernel/power/srot.c +++ b/kernel/power/srot.c @@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "srot_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "srot_microk_power10.c" #elif defined(POWER10) -#include "srot_microk_power8.c" +#include "srot_microk_power10.c" #endif #endif @@ -119,7 +117,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index 9ae9ccab8..65572a8c1 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
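The new sgemm kernel above drives the POWER10 MMA unit through the INIT_*ACC / KERNEL_MMA_* / SAVE_*_ACC macros: each __vector_quad accumulator holds a 4x4 FP32 tile that is updated with one outer-product instruction per k step, and the m4/m2/m1 and n16/n8/n4/n2/n1 loops are tail handling around that single pattern. A minimal standalone sketch of the accumulate-and-disassemble idea (illustrative only, not the kernel's actual macros; alpha/beta handling omitted; assumes a compiler with MMA support such as GCC 10+ built with -mcpu=power10 -mmma):

#include <altivec.h>

typedef __vector unsigned char vec_t;

/* c (row-major 4x4) += sum over k of a[4*k..4*k+3] outer b[4*k..4*k+3] */
static void mma_tile_4x4(int K, const float *a, const float *b, float *c)
{
    __vector_quad acc;
    __builtin_mma_xxsetaccz(&acc);                         /* zero the accumulator   */
    for (int k = 0; k < K; k++) {
        vec_t ra = (vec_t)vec_xl(0, (float *)(a + 4 * k)); /* 4 packed A elements    */
        vec_t rb = (vec_t)vec_xl(0, (float *)(b + 4 * k)); /* 4 packed B elements    */
        __builtin_mma_xvf32gerpp(&acc, ra, rb);            /* acc[i][j] += a[i]*b[j] */
    }
    __vector float rows[4];
    __builtin_mma_disassemble_acc(rows, &acc);             /* spill the 4 tile rows  */
    /* The row-to-vector ordering of the disassembled accumulator is one of the
     * endian-sensitive details the hand-written kernels handle explicitly. */
    for (int i = 0; i < 4; i++)
        vec_xst(vec_xl(0, c + 4 * i) + rows[i], 0, c + 4 * i);
}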
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "sscal_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "sscal_microk_power10.c" #elif defined(POWER10) -#include "sscal_microk_power8.c" +#include "sscal_microk_power10.c" #endif #endif @@ -106,7 +104,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; @@ -140,7 +138,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index 955ed02f0..dd249fd36 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "sswap_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "swap_microk_power10.c" #elif defined(POWER10) -#include "sswap_microk_power8.c" +#include "swap_microk_power10.c" #endif #endif @@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if ( (inc_x == 1) && (inc_y == 1 )) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 64 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/zaxpy_microk_power10.c b/kernel/power/zaxpy_microk_power10.c index 8e593bbfa..b03508b09 100644 --- a/kernel/power/zaxpy_microk_power10.c +++ b/kernel/power/zaxpy_microk_power10.c @@ -30,9 +30,17 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, double alpha_r, double alpha_i) { #if !defined(CONJ) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + static const double mvec[2] = { -1.0, 1.0 }; +#else + static const double mvec[2] = { 1.0, -1.0 }; +#endif +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) static const double mvec[2] = { 1.0, -1.0 }; #else static const double mvec[2] = { -1.0, 1.0 }; +#endif #endif const double *mvecp = mvec; diff --git a/kernel/power/zgemm_kernel_power10.S b/kernel/power/zgemm_kernel_power10.S index fca389e69..afee8f183 100644 --- a/kernel/power/zgemm_kernel_power10.S +++ b/kernel/power/zgemm_kernel_power10.S @@ -147,13 +147,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std r0, FLINK_SAVE(SP) -#if defined(linux) || defined(__FreeBSD__) +#if defined(linux) || defined(__FreeBSD__) || defined(_AIX) ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #ifdef TRMMKERNEL -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__) || defined(_AIX)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #endif diff --git a/kernel/power/zgemm_macros_power10.S b/kernel/power/zgemm_macros_power10.S index 42f9c5ad4..e5e5ec0e6 100644 --- a/kernel/power/zgemm_macros_power10.S +++ b/kernel/power/zgemm_macros_power10.S @@ -41,23 +41,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
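Most of the kernel/power changes in this patch follow one theme: the POWER10 micro-kernels used to be gated to little-endian builds, and the relaxed #if guards plus endian-specific constants now let big-endian builds use them as well. The zaxpy_microk_power10.c hunk above is the simplest case: the mvec constants carry the minus sign of the complex multiply, and which lane needs it depends on where lxv places the real and imaginary doublewords, hence the __BYTE_ORDER__ selection. For reference, the arithmetic the non-CONJ vector code implements, as a plain scalar sketch (illustrative only, not part of the patch):

/* y += alpha * x for n double-complex elements stored {re, im}. */
static void zaxpy_ref(long n, const double *x, double *y,
                      double alpha_r, double alpha_i)
{
    for (long i = 0; i < 2 * n; i += 2) {
        double xr = x[i], xi = x[i + 1];
        y[i]     += alpha_r * xr - alpha_i * xi;   /* real part      */
        y[i + 1] += alpha_r * xi + alpha_i * xr;   /* imaginary part */
    }
}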
#ifndef TRMMKERNEL lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd \VS_OUT1,\VS_TEMP1,\VS_TEMP2 + xxmrgld \VS_OUT2,\VS_TEMP1,\VS_TEMP2 +#else xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2 xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 +#endif #endif .endm /*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ .macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ + xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ +#else xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ +#endif .endm /*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ .macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ + xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ +#else xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ +#endif .endm /* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ @@ -103,8 +118,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd \VSOUT1,\VSIN1,\VSIN2 + xxmrgld \VSOUT2,\VSIN1,\VSIN2 +#else xxmrghd \VSOUT1,\VSIN2,\VSIN1 xxmrgld \VSOUT2,\VSIN2,\VSIN1 +#endif .endm @@ -186,15 +206,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35 #ifndef TRMMKERNEL lxv vs50, (\LOFFSET)(\BASE_REG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd vs46,vs50,vs50 + xxmrgld vs47,vs50,vs50 +#else xxmrgld vs46,vs50,vs50 xxmrghd vs47,vs50,vs50 +#endif #endif RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs36,vs37 AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 MULT_APLHA_PART1 vs34,vs36, vs46,vs47 MULT_APLHA_PART2 vs34,vs36, vs46,vs47 UNPACK_FOR_STORE vs46,vs47,vs39,vs41 +#if (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) xxmrghd vs39,vs47,vs46 +#endif stxv vs39, (\LOFFSET)(\BASE_REG) .endm @@ -232,6 +259,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs36, vs48 + xvf64gerpp 3, vs38, vs48 + xvf64gerpp 4, vs32, vs49 + xvf64gerpp 5, vs34, vs49 + xvf64gerpp 6, vs36, vs49 + xvf64gerpp 7, vs38, vs49 +#else xvf64gerpp 0, vs32, vs49 xvf64gerpp 1, vs34, vs49 xvf64gerpp 2, vs36, vs49 @@ -240,11 +277,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvf64gerpp 5, vs34, vs48 xvf64gerpp 6, vs36, vs48 xvf64gerpp 7, vs38, vs48 +#endif lxvp vs32, DISP16(\Index, 256)(AO) // load real,imag from A lxvp vs34, DISP16(\Index, 288)(AO) // load real,imag from A lxvp vs36, DISP16(\Index, 320)(AO) // load real,imag from A lxvp vs38, DISP16(\Index, 352)(AO) // load real,imag from A lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs40, vs50 + xvf64gerpp 1, vs42, vs50 + xvf64gerpp 2, vs44, vs50 + xvf64gerpp 3, vs46, vs50 + xvf64gerpp 4, vs40, vs51 + xvf64gerpp 5, vs42, vs51 + xvf64gerpp 6, vs44, vs51 + xvf64gerpp 7, vs46, vs51 +#else xvf64gerpp 0, vs40, vs51 xvf64gerpp 1, vs42, vs51 xvf64gerpp 2, vs44, vs51 @@ -253,6 +301,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf64gerpp 5, vs42, vs50 xvf64gerpp 6, vs44, vs50 xvf64gerpp 7, vs46, vs50 +#endif .if \IsLast==1 addi AO, AO, DISP16(\Index,256) addi BO, BO, DISP4(\Index,64) @@ -261,6 +310,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD_END_2x8 OffsetA,OffsetB +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs36, vs48 + xvf64gerpp 3, vs38, vs48 + xvf64gerpp 4, vs32, vs49 + xvf64gerpp 5, vs34, vs49 + xvf64gerpp 6, vs36, vs49 + xvf64gerpp 7, vs38, vs49 +#else xvf64gerpp 0, vs32, vs49 xvf64gerpp 1, vs34, vs49 xvf64gerpp 2, vs36, vs49 @@ -269,6 +328,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf64gerpp 5, vs34, vs48 xvf64gerpp 6, vs36, vs48 xvf64gerpp 7, vs38, vs48 +#endif addi BO, BO, \OffsetB addi AO, AO, \OffsetA .endm @@ -305,7 +365,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs45, vs12, vs13, 0b10 xxpermdi vs46, vs14, vs15, 0b01 xxpermdi vs47, vs14, vs15, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 + xxlor vs8, vs40, vs40 + xxlor vs9, vs41, vs41 + xxlor vs10, vs42, vs42 + xxlor vs11, vs43, vs43 + xxlor vs12, vs44, vs44 + xxlor vs13, vs45, vs45 + xxlor vs14, vs46, vs46 + xxlor vs15, vs47, vs47 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -322,7 +399,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs15, vs45, vs45 xxlor vs12, vs46, vs46 xxlor vs13, vs47, vs47 - +#endif xxpermdi vs32, vs16, vs17, 0b01 xxpermdi vs33, vs16, vs17, 0b10 xxpermdi vs34, vs18, vs19, 0b01 @@ -339,7 +416,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs45, vs28, vs29, 0b10 xxpermdi vs46, vs30, vs31, 0b01 xxpermdi vs47, vs30, vs31, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs16, vs32, vs32 + xxlor vs17, vs33, vs33 + xxlor vs18, vs34, vs34 + xxlor vs19, vs35, vs35 + xxlor vs20, vs36, vs36 + xxlor vs21, vs37, vs37 + xxlor vs22, vs38, vs38 + xxlor vs23, vs39, vs39 + xxlor vs24, vs40, vs40 + xxlor vs25, vs41, vs41 + xxlor vs26, vs42, vs42 + xxlor vs27, vs43, vs43 + xxlor vs28, vs44, vs44 + xxlor vs29, vs45, vs45 + xxlor vs30, vs46, vs46 + xxlor vs31, vs47, vs47 +#else xxlor vs18, vs32, vs32 xxlor vs19, vs33, vs33 xxlor vs16, vs34, vs34 @@ -356,7 +450,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
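The zgemm_macros_power10.S changes are the most invasive part of the big-endian work because this is hand-written VSX/MMA assembly: xxmrghd/xxmrgld always pair the architecturally high/low doublewords, and the two halves of an lxvp pair land in swapped registers, so every merge and every xvf64gerpp operand pair needs a __BYTE_ORDER__ variant. Code written with the AltiVec C intrinsics avoids these guards because the compiler presents an endian-neutral element order; a small sketch of the packing step that RESULT_INTO_REALREAL_IMAGEIMAGE performs, in intrinsics (illustrative, assuming the default bi-endian vec_mergeh/vec_mergel semantics of GCC/clang):

#include <altivec.h>

/* Pack two {real-product, imag-product} vectors into one vector of the two
 * real products and one vector of the two imaginary products.
 * vec_mergeh/vec_mergel follow array element order on both endiannesses, so
 * no byte-order guard is needed here, unlike raw xxmrghd/xxmrgld. */
static inline void pack_realreal_imagimag(vector double p0, vector double p1,
                                          vector double *rr, vector double *ii)
{
    *rr = vec_mergeh(p0, p1);   /* {p0[0], p1[0]}: the two real products      */
    *ii = vec_mergel(p0, p1);   /* {p0[1], p1[1]}: the two imaginary products */
}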
xxlor vs31, vs45, vs45 xxlor vs28, vs46, vs46 xxlor vs29, vs47, vs47 - +#endif SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0 SAVE8 vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0 addi CO, CO, 128 @@ -388,17 +482,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs34, vs49 - xvf64gerpp 2, vs32, vs48 - xvf64gerpp 3, vs34, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs32, vs49 + xvf64gerpp 3, vs34, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs32, vs48 + xvf64gerpp 3, vs34, vs48 +#endif lxvp vs32, DISP8(\Index, 128)(AO) // load real,imag from A lxvp vs34, DISP8(\Index, 160)(AO) // load real,imag from A lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B - xvf64gerpp 0, vs40, vs51 - xvf64gerpp 1, vs42, vs51 - xvf64gerpp 2, vs40, vs50 - xvf64gerpp 3, vs42, vs50 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs40, vs50 + xvf64gerpp 1, vs42, vs50 + xvf64gerpp 2, vs40, vs51 + xvf64gerpp 3, vs42, vs51 +#else + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs42, vs51 + xvf64gerpp 2, vs40, vs50 + xvf64gerpp 3, vs42, vs50 +#endif .if \IsLast==1 addi AO, AO, DISP8(\Index,128) addi BO, BO, DISP4(\Index,64) @@ -407,10 +515,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD_END_2x4 OffsetA, OffsetB - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs34, vs49 - xvf64gerpp 2, vs32, vs48 - xvf64gerpp 3, vs34, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs32, vs49 + xvf64gerpp 3, vs34, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs32, vs48 + xvf64gerpp 3, vs34, vs48 +#endif addi BO, BO, \OffsetB addi AO, AO, \OffsetA .endm @@ -443,7 +558,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs45, vs12, vs13, 0b10 xxpermdi vs46, vs14, vs15, 0b01 xxpermdi vs47, vs14, vs15, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 + xxlor vs8, vs40, vs40 + xxlor vs9, vs41, vs41 + xxlor vs10, vs42, vs42 + xxlor vs11, vs43, vs43 + xxlor vs12, vs44, vs44 + xxlor vs13, vs45, vs45 + xxlor vs14, vs46, vs46 + xxlor vs15, vs47, vs47 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -460,7 +592,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs15, vs45, vs45 xxlor vs12, vs46, vs46 xxlor vs13, vs47, vs47 - +#endif SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0 SAVE4 vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0 addi CO, CO, 64 @@ -488,12 +620,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL2x2_2 Index, IsLast lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs32, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs32, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs32, vs48 +#endif lxvp vs32, DISP4(\Index, 64)(AO) // load real,imag from A lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B - xvf64gerpp 0, vs40, vs51 - xvf64gerpp 1, vs40, vs50 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs40, vs50 + xvf64gerpp 1, vs40, vs51 +#else + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs40, vs50 +#endif .if \IsLast==1 addi AO, AO, DISP4(\Index,64) addi BO, BO, DISP4(\Index,64) @@ -502,8 +644,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD_END_2x2 OffsetA,OffsetB - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs32, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs32, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs32, vs48 +#endif addi BO, BO, \OffsetB addi AO, AO, \OffsetA .endm @@ -526,7 +673,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs37, vs4, vs5, 0b10 xxpermdi vs38, vs6, vs7, 0b01 xxpermdi vs39, vs6, vs7, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -535,7 +691,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs7, vs37, vs37 xxlor vs4, vs38, vs38 xxlor vs5, vs39, vs39 - +#endif SAVE2 vs0,vs1,vs2,vs3,CO,0 SAVE2 vs4,vs5,vs6,vs7,T1,0 addi CO, CO, 32 @@ -702,14 +858,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs44, DISP16(\Index, 192)(AO) // load real,imag from A lxvp vs46, DISP16(\Index, 224)(AO) // load real,imag from A lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs34, vs49 - xvf64gerpp 2, vs36, vs49 - xvf64gerpp 3, vs38, vs49 - xvf64gerpp 0, vs40, vs48 - xvf64gerpp 1, vs42, vs48 - xvf64gerpp 2, vs44, vs48 - xvf64gerpp 3, vs46, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs36, vs48 + xvf64gerpp 3, vs38, vs48 + xvf64gerpp 0, vs40, vs49 + xvf64gerpp 1, vs42, vs49 + xvf64gerpp 2, vs44, vs49 + xvf64gerpp 3, vs46, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs36, vs49 + xvf64gerpp 3, vs38, vs49 + xvf64gerpp 0, vs40, vs48 + xvf64gerpp 1, vs42, vs48 + xvf64gerpp 2, vs44, vs48 + xvf64gerpp 3, vs46, vs48 +#endif .if \IsLast==1 addi AO, AO, DISP16(\Index,256) addi BO, BO, DISP2(\Index,32) @@ -758,7 +925,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxpermdi vs45, vs12, vs13, 0b10 xxpermdi vs46, vs14, vs15, 0b01 xxpermdi vs47, vs14, vs15, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 + xxlor vs8, vs40, vs40 + xxlor vs9, vs41, vs41 + xxlor vs10, vs42, vs42 + xxlor vs11, vs43, vs43 + xxlor vs12, vs44, vs44 + xxlor vs13, vs45, vs45 + xxlor vs14, vs46, vs46 + xxlor vs15, vs47, vs47 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -775,7 +959,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs15, vs45, vs45 xxlor vs12, vs46, vs46 xxlor vs13, vs47, vs47 - +#endif SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0 addi CO, CO, 128 .endm @@ -799,10 +983,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs34, vs49 - xvf64gerpp 0, vs40, vs48 - xvf64gerpp 1, vs42, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 0, vs40, vs49 + xvf64gerpp 1, vs42, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 0, vs40, vs48 + xvf64gerpp 1, vs42, vs48 +#endif .if \IsLast==1 addi AO, AO, DISP8(\Index,128) addi BO, BO, DISP2(\Index,32) @@ -837,7 +1028,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs37, vs4, vs5, 0b10 xxpermdi vs38, vs6, vs7, 0b01 xxpermdi vs39, vs6, vs7, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -846,7 +1046,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs7, vs37, vs37 xxlor vs4, vs38, vs38 xxlor vs5, vs39, vs39 - +#endif SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0 addi CO, CO, 64 .endm @@ -867,8 +1067,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs32, DISP4(\Index, 0)(AO) // load real,imag from A lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 0, vs40, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 0, vs40, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 0, vs40, vs48 +#endif .if \IsLast==1 addi AO, AO, DISP4(\Index,64) addi BO, BO, DISP2(\Index,32) @@ -896,11 +1101,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxpermdi vs33, vs0, vs1, 0b10 xxpermdi vs34, vs2, vs3, 0b01 xxpermdi vs35, vs2, vs3, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 xxlor vs1, vs35, vs35 +#endif SAVE2 vs0,vs1,vs2,vs3,CO,0 addi CO, CO, 32 diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c index d3bf60ca7..e42eafaba 100644 --- a/kernel/power/zgemv_t_4.c +++ b/kernel/power/zgemv_t_4.c @@ -43,7 +43,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #elif HAVE_KERNEL_4x4_VEC -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index 59ddc149f..0068138e8 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -43,16 +43,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) #include "zscal_microk_power8.c" #endif -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#elif defined(POWER10) #if defined(DOUBLE) #include "zscal_microk_power10.c" #else #include "cscal_microk_power10.c" #endif -#elif defined(POWER10) -#if defined(DOUBLE) -#include "zscal_microk_power8.c" -#endif #endif #endif diff --git a/kernel/power/zscal_microk_power10.c b/kernel/power/zscal_microk_power10.c index 15b8323f4..af99b8648 100644 --- a/kernel/power/zscal_microk_power10.c +++ b/kernel/power/zscal_microk_power10.c @@ -42,7 +42,11 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "xsnegdp 33, %x10 \n\t" // -alpha_i XXSPLTD_S(32,%x9,0) // alpha_r , alpha_r +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXMRGHD_S(33,33, %x10) // -alpha_i , alpha_i +#else XXMRGHD_S(33,%x10, 33) // -alpha_i , alpha_i +#endif "lxvp 40, 0(%2) \n\t" "lxvp 42, 32(%2) \n\t" @@ -97,10 +101,17 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "xvadddp 49, 49, 39 \n\t" "xvadddp 50, 50, %x3 \n\t" "xvadddp 51, 51, %x4 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%2) \n\t" + "stxv 49, 16(%2) \n\t" + "stxv 50, 32(%2) \n\t" + "stxv 51, 48(%2) \n\t" +#else "stxv 49, 0(%2) \n\t" "stxv 48, 16(%2) \n\t" "stxv 51, 32(%2) \n\t" "stxv 50, 48(%2) \n\t" +#endif "xvadddp 34, 34, %x5 \n\t" @@ -109,12 +120,17 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "xvadddp 36, 36, %x7 \n\t" "xvadddp 37, 37, %x8 \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 34, 64(%2) \n\t" + "stxv 35, 80(%2) \n\t" + "stxv 36, 96(%2) \n\t" + "stxv 37, 112(%2) \n\t" +#else "stxv 35, 64(%2) \n\t" "stxv 34, 80(%2) \n\t" "stxv 37, 96(%2) \n\t" "stxv 36, 112(%2) \n\t" - +#endif "addi %2, %2, 128 \n\t" "addic. 
%1, %1, -8 \n\t" @@ -155,23 +171,34 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "xvadddp 50, 50, %x3 \n\t" "xvadddp 51, 51, %x4 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%2) \n\t" + "stxv 49, 16(%2) \n\t" + "stxv 50, 32(%2) \n\t" + "stxv 51, 48(%2) \n\t" +#else "stxv 49, 0(%2) \n\t" "stxv 48, 16(%2) \n\t" "stxv 51, 32(%2) \n\t" "stxv 50, 48(%2) \n\t" - +#endif "xvadddp 34, 34, %x5 \n\t" "xvadddp 35, 35, %x6 \n\t" "xvadddp 36, 36, %x7 \n\t" "xvadddp 37, 37, %x8 \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 34, 64(%2) \n\t" + "stxv 35, 80(%2) \n\t" + "stxv 36, 96(%2) \n\t" + "stxv 37, 112(%2) \n\t" +#else "stxv 35, 64(%2) \n\t" "stxv 34, 80(%2) \n\t" "stxv 37, 96(%2) \n\t" "stxv 36, 112(%2) \n\t" - +#endif "#n=%1 x=%0=%2 alpha=(%9,%10) \n" : "+m" (*x), diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c index 908802b71..fe7871852 100644 --- a/kernel/power/zswap.c +++ b/kernel/power/zswap.c @@ -39,10 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "zswap_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#elif defined(POWER10) #include "cswap_microk_power10.c" -#elif defined(POWER10) -#include "zswap_microk_power8.c" #endif #endif diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index 02a76f168..64323aa3a 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -35,6 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define VSETVL_MAX vsetvlmax_e64m1() @@ -44,6 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
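Two of the complex-scal changes share the same underlying semantics: the POWER10 zscal_microk asm now picks the XXMRGHD lane order and the stxv store order by byte order, and the RISC-V zscal_vector.c change (continued just below) drops the memset shortcut for the alpha == 0 case, presumably because a flat memset over n*2 contiguous elements only matches the strided layout when inc_x == 1, in favour of the same strided vector stores the other branches use. For reference, the scalar semantics both kernels implement (a sketch in double precision, not part of the patch):

/* x[i] *= (alpha_r + I*alpha_i) for n complex elements with stride inc_x.
 * Note the alpha == 0 branch still has to honour the stride. */
static void zscal_ref(long n, double alpha_r, double alpha_i,
                      double *x, long inc_x)
{
    long step = 2 * inc_x;
    if (alpha_r == 0.0 && alpha_i == 0.0) {
        for (long i = 0; i < n * step; i += step)
            x[i] = x[i + 1] = 0.0;
        return;
    }
    for (long i = 0; i < n * step; i += step) {
        double xr = x[i], xi = x[i + 1];
        x[i]     = alpha_r * xr - alpha_i * xi;
        x[i + 1] = alpha_r * xi + alpha_i * xr;
    }
}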
#define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -58,7 +60,26 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F unsigned int gvl = 0; FLOAT_V_T vt, v0, v1; if(da_r == 0.0 && da_i == 0.0){ - memset(&x[0], 0, n * 2 * sizeof(FLOAT)); + gvl = VSETVL(n); + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG inc_xv = inc_x * 2 * gvl; + vt = VFMVVF_FLOAT(0.0, gvl); + for(i=0,j=0; i < n/(gvl*2); i++){ + VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); + VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); + VSSEV_FLOAT(&x[ix+inc_xv], stride_x, vt, gvl); + VSSEV_FLOAT(&x[ix+inc_xv+1], stride_x, vt, gvl); + + j += gvl*2; + ix += inc_xv*2; + } + for(; j < n; ){ + gvl = VSETVL(n-j); + VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); + VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); + j += gvl; + ix += inc_x * 2 * gvl; + } }else if(da_r == 0.0){ gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 19b7b5f0b..fe796be64 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -1518,7 +1518,7 @@ static void init_parameter(void) { #endif #endif -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) #ifdef DEBUG fprintf(stderr, "SkylakeX\n"); diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S b/kernel/x86/trsm_kernel_LN_2x4_penryn.S index fde9eba8e..0d71201d6 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S index fddf7560f..e775b4d76 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S index 33afd2a61..d3d110811 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index b05bd6ee5..e56a768db 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ 
b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S index f960559a6..85a29ce57 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S index cf842c9b5..5c128d7a4 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S index 63c44c27a..73174e424 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S index 4cb01e50a..ebe83ff40 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S index 09d5d8e43..b26ffb473 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S index 7d129e54c..c2c7caadc 100644 --- a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S +++ 
b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S index d33599317..42526135c 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS new file mode 100644 index 000000000..88f574668 --- /dev/null +++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS @@ -0,0 +1,14 @@ +include $(KERNELDIR)/KERNEL.COOPERLAKE + +SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_spr.c + +SBGEMM_BETA = sgemm_beta_skylakex.c +SBGEMMKERNEL = sbgemm_kernel_16x16_spr.c +SBGEMMINCOPY = sbgemm_ncopy_16_cooperlake.c +SBGEMMITCOPY = sbgemm_tcopy_16_cooperlake.c +SBGEMMONCOPY = sbgemm_oncopy_16_spr.c +SBGEMMOTCOPY = sbgemm_otcopy_16_spr.c +SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) +SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) +SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) +SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 6b4961bc2..d2d7de42a 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -20,6 +20,7 @@ SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_tn_skylakex.c SGEMM_SMALL_K_TT = sgemm_small_kernel_tt_skylakex.c SGEMM_SMALL_K_B0_TT = sgemm_small_kernel_tt_skylakex.c +ifndef DYNAMIC_ARCH DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c DGEMMINCOPY = ../generic/gemm_ncopy_16.c @@ -27,6 +28,11 @@ DGEMMITCOPY = dgemm_tcopy_16_skylakex.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +else +DGEMMKERNEL = dgemm_kernel_4x8_skylakex_2.c +DGEMMONCOPY = dgemm_ncopy_8_skylakex.c +DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c +endif DGEMM_SMALL_M_PERMIT = dgemm_small_kernel_permit_skylakex.c DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_skylakex.c DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_nn_skylakex.c diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c index c19b98f02..7270a98bc 100644 --- a/kernel/x86_64/caxpy.c +++ b/kernel/x86_64/caxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "caxpy_microk_steamroller-2.c" #elif defined(BULLDOZER) #include "caxpy_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) #include "caxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "caxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index 654cd351a..264776239 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "cdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "cdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "cdot_microk_sandy-2.c" diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c index 0ed02b8d8..3ca173c20 100644 --- a/kernel/x86_64/cgemv_n_4.c +++ b/kernel/x86_64/cgemv_n_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "cgemv_n_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_n_microk_bulldozer-4.c" diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c index c2903b11f..3187e196c 100644 --- a/kernel/x86_64/cgemv_t_4.c +++ b/kernel/x86_64/cgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "cgemv_t_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_t_microk_bulldozer-4.c" diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index 6d75358a6..dc3f688c6 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "cscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "cscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index 26437012c..2796b8270 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "daxpy_microk_piledriver-2.c" #elif defined(HASWELL) || defined(ZEN) #include "daxpy_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "daxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "daxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index e4b6622e6..5d0c32234 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ddot_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "ddot_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "ddot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "ddot_microk_sandy-2.c" diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index da68db0cd..f883d4f26 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dgemv_n_microk_nehalem-4.c" #elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dgemv_n_microk_haswell-4.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "dgemv_n_microk_skylakex-4.c" #endif diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index a3bf28dc8..9688c6bf3 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "dgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index d1270d20b..05c5c7f16 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dscal_microk_sandy-2.c" #elif defined(HASWELL) || defined(ZEN) #include "dscal_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "dscal_microk_skylakex-2.c" #endif diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index 573377ee0..590776005 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dsymv_L_microk_bulldozer-2.c" #elif defined(HASWELL) || defined(ZEN) #include "dsymv_L_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "dsymv_L_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c index 530ac8b1d..f196aa364 100644 --- a/kernel/x86_64/dsymv_U.c +++ b/kernel/x86_64/dsymv_U.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_U_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "dsymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index 7b2845636..ff911c52b 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "saxpy_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "saxpy_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "saxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "saxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/sbdot.c b/kernel/x86_64/sbdot.c index ef14fd618..a4e60b7c4 100644 --- a/kernel/x86_64/sbdot.c +++ b/kernel/x86_64/sbdot.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(COOPERLAKE) +#if defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) #include "sbdot_microk_cooperlake.c" #endif diff --git a/kernel/x86_64/sbgemm_kernel_16x16_spr.c b/kernel/x86_64/sbgemm_kernel_16x16_spr.c new file mode 100644 index 000000000..955db3163 --- /dev/null +++ b/kernel/x86_64/sbgemm_kernel_16x16_spr.c @@ -0,0 +1,50 @@ +/*************************************************************************** + * Copyright (c) 2021, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * *****************************************************************************/ + +#include "common.h" + +#define ALPHA_ONE +#include "sbgemm_kernel_16x16_spr_tmpl.c" +#undef ALPHA_ONE +#include "sbgemm_kernel_16x16_spr_tmpl.c" + + +int CNAME (BLASLONG im, BLASLONG in, BLASLONG k, FLOAT alpha, IFLOAT * iA, IFLOAT * iB, FLOAT * C, BLASLONG ldc) +{ + /* transport to Row Major matrix for AMX requirement */ + BLASLONG m, n; + IFLOAT *A, *B; + m = in; + n = im; + A = iB; + B = iA; + + if (alpha == 1.0f) + return sbgemm_kernel_spr_alpha_one(m, n, k, alpha, A, B, C, ldc); + else + return sbgemm_kernel_spr_alpha(m, n, k, alpha, A, B, C, ldc); +} diff --git a/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c b/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c new file mode 100644 index 000000000..90e0a32c7 --- /dev/null +++ b/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c @@ -0,0 +1,530 @@ +/*************************************************************************** + * Copyright (c) 2021, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * *****************************************************************************/ + +#include +#include +#include "common.h" + +#ifndef SBGEMM_KERNEL_SPR +#define SBGEMM_KERNEL_SPR +typedef struct { + char palette_id; + char start_row; + char dummy0[14]; // bytes 2-15 reserved, must be zero + short tile_colsb[8]; + char dummy1[16]; // bytes 32-47 reserved, must be zero + char tile_rows[8]; + char dummy2[16]; // bytes 56-63 reserved, must be zero +} tilecfg; + +/* tile0/tile1 -- A (m x 2k) + * tile2/tile3 -- B (2k x n) + * tile4-7 -- C (m x n) + */ +#define TCONF(cfg, m, n, k2) \ + memset(&cfg, 0, sizeof(tilecfg)); \ + cfg.palette_id = 1; \ + cfg.tile_rows[0] = m; \ + cfg.tile_rows[1] = m; \ + cfg.tile_rows[2] = k2>>1; \ + cfg.tile_rows[3] = k2>>1; \ + cfg.tile_rows[4] = m; \ + cfg.tile_rows[5] = m; \ + cfg.tile_rows[6] = m; \ + cfg.tile_rows[7] = m; \ + cfg.tile_colsb[0] = k2<<1; \ + cfg.tile_colsb[1] = k2<<1; \ + cfg.tile_colsb[2] = n * 4; \ + cfg.tile_colsb[3] = n * 4; \ + cfg.tile_colsb[4] = n * 4; \ + cfg.tile_colsb[5] = n * 4; \ + cfg.tile_colsb[6] = n * 4; \ + cfg.tile_colsb[7] = n * 4; \ + _tile_loadconfig(&cfg); + +/* CONFIG for handling k2 and odd tail at the same time + * tile0 -- A (m x 2k) + * tile1 -- A (m x 1) + * tile2 -- B (2k x n) + * tile3 -- B (1 x n) + * tile4 -- C (m x n) + */ +#define TCONF_TAIL(cfg, m, n, k2) \ + memset(&cfg, 0, sizeof(tilecfg)); \ + cfg.palette_id = 1; \ + cfg.tile_rows[0] = m; \ + cfg.tile_rows[1] = m; \ + cfg.tile_rows[2] = k2>>1; \ + cfg.tile_rows[3] = 1; \ + cfg.tile_rows[4] = m; \ + cfg.tile_colsb[0] = k2<<1; \ + cfg.tile_colsb[1] = 4; \ + cfg.tile_colsb[2] = n * 4; \ + cfg.tile_colsb[3] = n * 4; \ + cfg.tile_colsb[4] = n * 4; \ + _tile_loadconfig(&cfg); + +#define T_A0 0 +#define T_A1 1 +#define T_B0 2 +#define T_B1 3 +#define T_C00 4 +#define T_C01 5 +#define T_C10 6 +#define T_C11 7 + +// FIXME: gcc11 seem have problem in tile load/store address calc, +// need to multiply with element size (2 or 4) here. 
+#define LOAD_A(M, N) _tile_loadd(T_A##M, ptr_a##M, lda * 2) +#define LOAD_A_TAIL(M, N) {\ + __m256i ymm = _mm256_loadu_epi16(ptr_a##M); \ + __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ + _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ + _tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \ +} +#define MASK_LOAD_A_TAIL(M, N) {\ + __m256i ymm = _mm256_maskz_loadu_epi16(amask, ptr_a##M); \ + __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ + _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ + _tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \ +} +#define LOAD_B(M, N) _tile_loadd(T_B##N, ptr_b##N, ldb * 2) +#define LOAD_B_TAIL(M, N) {\ + __m256i ymm = _mm256_loadu_epi16(ptr_b##N); \ + __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ + _mm512_storeu_epi16(tail_b + 16 * N, zmm); \ + _tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \ +} +#define MASK_LOAD_B_TAIL(M, N) {\ + __m256i ymm = _mm256_maskz_loadu_epi16(bmask, ptr_b##N); \ + __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ + _mm512_storeu_epi16(tail_b + 16 * N, zmm); \ + _tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \ +} + +#define MATMUL(M, N) _tile_dpbf16ps(T_C##M##N, T_A##M, T_B##N) +#define MATMUL_TAIL(M, N) _tile_dpbf16ps(T_C00, T_A##M, T_B##N) +#define STORE_C(M, N) _tile_stored(T_C##M##N, ptr_c##M##N, ldc * 4) +#define LOAD_C_F(M, N) _tile_loadd(T_C##M##N, ptr_c##M##N, ldc * 4) + +#endif // end of SBGEMM_KERNEL_SPR + +#ifdef ALPHA_ONE +#undef LOAD_C +#define LOAD_C(M, N) _tile_loadd(T_C##M##N, ptr_c##M##N, ldc * 4) +#else +#undef LOAD_C +#define LOAD_C(M, N) _tile_zero(T_C##M##N) +#define ALPHA_STORE(N) \ + __m512 zmm_d##N = _mm512_loadu_ps(dst##N + noffset); \ + __m512 zmm_s##N = _mm512_loadu_ps(src##N + noffset); \ + zmm_d##N = _mm512_fmadd_ps(alpha_512, zmm_s##N, zmm_d##N); \ + _mm512_storeu_ps(dst##N + noffset, zmm_d##N); +#define MASK_APLPHA_STORE(N) \ + __m512 zmm_d##N = _mm512_maskz_loadu_ps(mask, dst##N + noffset); \ + __m512 zmm_s##N = _mm512_maskz_loadu_ps(mask, src##N + noffset); \ + zmm_d##N = _mm512_fmadd_ps(alpha_512, zmm_s##N, zmm_d##N); \ + _mm512_mask_storeu_ps(dst##N + noffset, mask, zmm_d##N); +#endif // end of ALPHA_ONE + + +#ifdef ALPHA_ONE +int sbgemm_kernel_spr_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +#else +int sbgemm_kernel_spr_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +#endif +{ + /* Row Major matrix for AMX requirement */ + IFLOAT *ptr_a = A, *ptr_b = B; + IFLOAT *ptr_b0, *ptr_b1; + IFLOAT *ptr_a0, *ptr_a1; + FLOAT *ptr_c = C; + FLOAT *ptr_c00, *ptr_c01, *ptr_c10, *ptr_c11; + + BLASLONG lda, ldb; + BLASLONG m_count = m; + BLASLONG n_count, k_count; + +#ifndef ALPHA_ONE + // make sure each row is 64 bytes aligned + BLASLONG cn = (n & 31) ? 
(n & ~31) + 32 : n; + FLOAT *raw_tmp_c; + if (k < 32) { + // only need to zero buff in this situation + raw_tmp_c = (FLOAT *)calloc(1, sizeof(FLOAT) * m * cn + 64); + } else { + raw_tmp_c = (FLOAT *)malloc(sizeof(FLOAT) * m * cn + 64); + } + // align buf to 64 byte boundary + FLOAT *tmp_c = (FLOAT *)(((uintptr_t) raw_tmp_c + 63) & ~(uintptr_t)63); + ptr_c = tmp_c; + BLASLONG ldc_o = ldc; + ldc = cn; +#endif + IFLOAT tail_a[32 * 2] __attribute__ ((aligned (64))); + IFLOAT tail_b[32 * 2] __attribute__ ((aligned (64))); + tilecfg cfg; + + if (k > 31) { + for (; m_count > 31; m_count -= 32) { + ptr_b = B; + + ptr_c00 = ptr_c; + ptr_c01 = ptr_c00 + 16; + ptr_c10 = ptr_c + 16 * ldc; + ptr_c11 = ptr_c10 + 16; + ptr_c += 32 * ldc; + n_count = n; + TCONF(cfg, 16, 16, 32); + for (; n_count > 31; n_count -= 32) { + ptr_a0 = ptr_a; + ptr_a1 = ptr_a + 16 * k; + + ptr_b0 = ptr_b; + ptr_b1 = ptr_b + 16 * k; + ptr_b += 32 * k; + + lda = 32; + ldb = 32; + LOAD_C(0, 0); LOAD_C(0, 1); + LOAD_C(1, 0); LOAD_C(1, 1); + k_count = k; + for (; k_count > 31; k_count -= 32) { + LOAD_A(0, x); LOAD_A(1, x); + ptr_a0 += 16 * 32; + ptr_a1 += 16 * 32; + LOAD_B(x, 0); LOAD_B(x, 1); + ptr_b0 += 16 * 32; + ptr_b1 += 16 * 32; + + MATMUL(0, 0); MATMUL(0, 1); + MATMUL(1, 0); MATMUL(1, 1); + } + STORE_C(0, 0); STORE_C(0, 1); + STORE_C(1, 0); STORE_C(1, 1); + ptr_c00 += 32; + ptr_c01 += 32; + ptr_c10 += 32; + ptr_c11 += 32; + } + for (; n_count > 0; n_count -= 16) { + int tail_n = (n_count > 16) ? 16: n_count; + ptr_a0 = ptr_a; + ptr_a1 = ptr_a + 16 * k; + + ptr_b0 = ptr_b; + ptr_b += tail_n * k; + + lda = 32; + ldb = 2 * tail_n; + TCONF(cfg, 16, tail_n, 32); + LOAD_C(0, 0); + LOAD_C(1, 0); + k_count = k; + for (; k_count > 31; k_count -= 32) { + LOAD_A(0, x); LOAD_A(1, x); + ptr_a0 += 16 * 32; + ptr_a1 += 16 * 32; + LOAD_B(x, 0); + ptr_b0 += tail_n * 32; + + MATMUL(0, 0); + MATMUL(1, 0); + } + STORE_C(0, 0); + STORE_C(1, 0); + ptr_c00 += tail_n; + ptr_c10 += tail_n; + } + ptr_a += 32 * k; + } + for (; m_count > 0; m_count -= 16) { + // process at most 16 m at a time + int tail_m = (m_count > 16) ? 16: m_count; + + ptr_b = B; + + ptr_c00 = ptr_c; + ptr_c01 = ptr_c00 + 16; + ptr_c += tail_m * ldc; + n_count = n; + TCONF(cfg, tail_m, 16, 32); + for (; n_count > 31; n_count -= 32) { + ptr_a0 = ptr_a; + + ptr_b0 = ptr_b; + ptr_b1 = ptr_b + 16 * k; + ptr_b += 32 * k; + + lda = 32; + ldb = 32; + LOAD_C(0, 0); LOAD_C(0, 1); + k_count = k; + for (; k_count > 31; k_count -= 32) { + LOAD_A(0, x); + ptr_a0 += tail_m * 32; + LOAD_B(x, 0); LOAD_B(x, 1); + ptr_b0 += 16 * 32; + ptr_b1 += 16 * 32; + + MATMUL(0, 0); MATMUL(0, 1); + } + STORE_C(0, 0); STORE_C(0, 1); + ptr_c00 += 32; + ptr_c01 += 32; + } + for (; n_count > 0; n_count -= 16) { + int tail_n = (n_count > 16) ? 16: n_count; + ptr_a0 = ptr_a; + + ptr_b0 = ptr_b; + ptr_b += tail_n * k; + + lda = 32; + ldb = 2 * tail_n; + TCONF(cfg, tail_m, tail_n, 32); + LOAD_C(0, 0); + k_count = k; + for (; k_count > 31; k_count -= 32) { + LOAD_A(0, x); + ptr_a0 += tail_m * 32; + LOAD_B(x, 0); + ptr_b0 += tail_n * 32; + + MATMUL(0, 0); + } + STORE_C(0, 0); + ptr_c00 += tail_n; + } + ptr_a += tail_m * k; + } + } + + // process for k < 32 + BLASLONG k32 = k & ~31; + BLASLONG k2 = k & ~1; + if (k32 != k) { + int remain_k2 = k2 - k32; + m_count = m; + ptr_a = A; +#ifndef ALPHA_ONE + ptr_c = tmp_c; +#else + ptr_c = C; +#endif + if (remain_k2 > 0 && k2 != k) { // k%32 = 2x + 1 (x != 0) + for (; m_count > 0; m_count -= 16) { + int tail_m = (m_count > 16) ? 
16: m_count; + __mmask16 amask = (1UL << tail_m) - 1; + + ptr_a0 = ptr_a + tail_m * k32; + ptr_a1 = ptr_a + tail_m * k2; + ptr_a += tail_m * k; + ptr_b = B; + ptr_c00 = ptr_c; + ptr_c += tail_m * ldc; + n_count = n; + lda = remain_k2; + ldb = 32; + if (n_count > 15) { + TCONF_TAIL(cfg, tail_m, 16, remain_k2); + LOAD_A(0, x); MASK_LOAD_A_TAIL(1, x); + for (; n_count > 15; n_count -= 16) { + ptr_b0 = ptr_b + 16 * k32; + ptr_b1 = ptr_b + 16 * k2; + LOAD_C_F(0, 0); + LOAD_B(x, 0); LOAD_B_TAIL(x, 1); + MATMUL(0, 0); MATMUL_TAIL(1, 1); + STORE_C(0, 0); + ptr_b += 16 * k; + ptr_c00 += 16; + } + } + if (n_count > 0) { + int tail_n = (n_count > 16) ? 16: n_count; + __mmask16 bmask = (1UL << tail_n) - 1; + ptr_b0 = ptr_b + tail_n * k32; + ptr_b1 = ptr_b + tail_n * k2; + ldb = 2 * tail_n; + TCONF_TAIL(cfg, tail_m, tail_n, remain_k2); + LOAD_C_F(0, 0); + LOAD_A(0, x); MASK_LOAD_A_TAIL(1, x); + LOAD_B(x, 0); MASK_LOAD_B_TAIL(x, 1); + MATMUL(0, 0); MATMUL_TAIL(1, 1); + STORE_C(0, 0); + } + } + + } else if (remain_k2 > 0) { // k%32 = 2x + for (; m_count > 0; m_count -= 16) { + int tail_m = (m_count > 16) ? 16: m_count; + + ptr_a0 = ptr_a + tail_m * k32; + ptr_a += tail_m * k; + ptr_b = B; + ptr_c00 = ptr_c; + ptr_c += tail_m * ldc; + n_count = n; + lda = remain_k2; + ldb = 32; + if (n_count > 15) { + TCONF(cfg, tail_m, 16, remain_k2); + LOAD_A(0, x); + for (; n_count > 15; n_count -= 16) { + ptr_b0 = ptr_b + 16 * k32; + LOAD_C_F(0, 0); + LOAD_B(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + ptr_b += 16 * k; + ptr_c00 += 16; + } + } + if (n_count > 0) { + int tail_n = (n_count > 16) ? 16: n_count; + ptr_b0 = ptr_b + tail_n * k32; + ldb = 2 * tail_n; + TCONF(cfg, tail_m, tail_n, remain_k2); + LOAD_C_F(0, 0); + LOAD_A(0, x); + LOAD_B(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + } + } + } else { // k%32 = 1 + for (; m_count > 0; m_count -= 16) { + int tail_m = (m_count > 16) ? 16: m_count; + __mmask16 amask = (1UL << tail_m) - 1; + + ptr_a0 = ptr_a + tail_m * k2; + ptr_a += tail_m * k; + ptr_b = B; + ptr_c00 = ptr_c; + ptr_c += tail_m * ldc; + n_count = n; + if (n_count > 15) { + TCONF(cfg, tail_m, 16, 2); + MASK_LOAD_A_TAIL(0, x); + for (; n_count > 15; n_count -= 16) { + ptr_b0 = ptr_b + 16 * k2; + LOAD_C_F(0, 0); + LOAD_B_TAIL(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + ptr_b += 16 * k; + ptr_c00 += 16; + } + } + if (n_count > 0) { + int tail_n = (n_count > 16) ? 
16: n_count; + __mmask16 bmask = (1UL << tail_n) - 1; + ptr_b0 = ptr_b + tail_n * k2; + TCONF(cfg, tail_m, tail_n, 2); + LOAD_C_F(0, 0); + MASK_LOAD_A_TAIL(0, x); + MASK_LOAD_B_TAIL(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + } + } + + } + } +#ifndef ALPHA_ONE + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); + BLASLONG n16 = n & ~15; + BLASLONG noffset; + FLOAT *src0, *src1, *src2, *src3; + FLOAT *dst0, *dst1, *dst2, *dst3; + FLOAT *src = tmp_c; + FLOAT *dst = C; + m_count = m; + for (; m_count > 3; m_count -= 4) { + src0 = src; + src1 = src0 + ldc; + src2 = src1 + ldc; + src3 = src2 + ldc; + src += 4 * ldc; + + dst0 = dst; + dst1 = dst0 + ldc_o; + dst2 = dst1 + ldc_o; + dst3 = dst2 + ldc_o; + dst += 4 * ldc_o; + + noffset = 0; + for (; noffset < n16; noffset += 16) { + ALPHA_STORE(0); + ALPHA_STORE(1); + ALPHA_STORE(2); + ALPHA_STORE(3); + } + if (noffset < n) { + __mmask16 mask = (1UL << (n - noffset)) - 1; + MASK_APLPHA_STORE(0); + MASK_APLPHA_STORE(1); + MASK_APLPHA_STORE(2); + MASK_APLPHA_STORE(3); + } + } + for (; m_count > 1; m_count -= 2) { + src0 = src; + src1 = src0 + ldc; + src += 2 * ldc; + + dst0 = dst; + dst1 = dst0 + ldc_o; + dst += 2 * ldc_o; + + noffset = 0; + for (; noffset < n16; noffset += 16) { + ALPHA_STORE(0); + ALPHA_STORE(1); + } + if (noffset < n) { + __mmask16 mask = (1UL << (n - noffset)) - 1; + MASK_APLPHA_STORE(0); + MASK_APLPHA_STORE(1); + } + } + for (; m_count > 0; m_count -= 1) { + src0 = src; + dst0 = dst; + noffset = 0; + for (; noffset < n16; noffset += 16) { + ALPHA_STORE(0); + } + if (noffset < n) { + __mmask16 mask = (1UL << (n - noffset)) - 1; + MASK_APLPHA_STORE(0); + } + } + free(raw_tmp_c); +#endif + return 0; +} diff --git a/kernel/x86_64/sbgemm_oncopy_16_spr.c b/kernel/x86_64/sbgemm_oncopy_16_spr.c new file mode 100644 index 000000000..ccb00ada1 --- /dev/null +++ b/kernel/x86_64/sbgemm_oncopy_16_spr.c @@ -0,0 +1,128 @@ +/*************************************************************************** + * Copyright (c) 2021, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include +#include "common.h" + +typedef struct { + char palette_id; + char start_row; + char dummy0[14]; // bytes 2-15 reserved, must be zero + short tile_colsb[8]; + char dummy1[16]; // bytes 32-47 reserved, must be zero + char tile_rows[8]; + char dummy2[16]; // bytes 56-63 reserved, must be zero +} tilecfg; + +#define T_16x32 0 +#define T_16xm 1 +#define T_nx32 2 +#define T_nxm 3 + +#define TCONF(cfg, m, n) \ + memset(&cfg, 0, sizeof(tilecfg)); \ + cfg.palette_id = 1; \ + cfg.tile_rows[T_16x32] = 16; \ + cfg.tile_colsb[T_16x32] = 64; \ + if (m) { \ + cfg.tile_rows[T_16xm] = 16; \ + cfg.tile_colsb[T_16xm] = m * 2; \ + } \ + if (n) { \ + cfg.tile_rows[T_nx32] = n; \ + cfg.tile_colsb[T_nx32] = 64; \ + } \ + if (m && n) { \ + cfg.tile_rows[T_nxm] = n; \ + cfg.tile_colsb[T_nxm] = m * 2; \ + } \ + _tile_loadconfig(&cfg); + + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + BLASLONG i, j; + IFLOAT *aoffset, *boffset; + IFLOAT *aoffset0; + + aoffset = a; + boffset = b; + + BLASLONG n16 = n & ~15; + BLASLONG m32 = m & ~31; + BLASLONG m2 = m & ~1; + + BLASLONG tail_m = m2 - m32; + BLASLONG tail_n = n - n16; + tilecfg cfg; + TCONF(cfg, tail_m, tail_n); + + for (j = 0; j < n16; j += 16) { + aoffset0 = aoffset; + for (i = 0; i < m32; i += 32) { + _tile_loadd(T_16x32, aoffset0, lda * 2); + _tile_stored(T_16x32, boffset, 32 * 2); + aoffset0 += 32; + boffset += 32 * 16; + } + if (i < m2) { + _tile_loadd(T_16xm, aoffset0, lda * 2); + _tile_stored(T_16xm, boffset, tail_m * 2); + aoffset0 += tail_m; + boffset += tail_m * 16; + i = m2; + } + if (i < m) { + /* the tail odd k should put alone */ + for (int ii = 0; ii < 16; ii++) { + *(boffset + ii) = *(aoffset0 + lda * ii); + } + boffset += 16; + } + aoffset += 16 * lda; + } + if (j < n) { + aoffset0 = aoffset; + for (i = 0; i < m32; i += 32) { + _tile_loadd(T_nx32, aoffset0, lda * 2); + _tile_stored(T_nx32, boffset, 32 * 2); + aoffset0 += 32; + boffset += 32 * tail_n; + } + if (i < m2) { + _tile_loadd(T_nxm, aoffset0, lda * 2); + _tile_stored(T_nxm, boffset, tail_m * 2); + aoffset0 += tail_m; + boffset += tail_m * tail_n; + } + if (i < m) { + for (int ii = 0; ii < tail_n; ii++) { + *(boffset + ii) = *(aoffset0 + lda * ii); + } + } + } + return 0; +} diff --git a/kernel/x86_64/sbgemm_otcopy_16_spr.c b/kernel/x86_64/sbgemm_otcopy_16_spr.c new file mode 100644 index 000000000..b5d5d38fb --- /dev/null +++ b/kernel/x86_64/sbgemm_otcopy_16_spr.c @@ -0,0 +1,302 @@ +/*************************************************************************** + * Copyright (c) 2021, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include +#include "common.h" + +#define LOAD_A_8VEC(aptr) \ + r0 = _mm256_loadu_si256((__m256i *)(aptr + lda*0)); \ + r1 = _mm256_loadu_si256((__m256i *)(aptr + lda*1)); \ + r2 = _mm256_loadu_si256((__m256i *)(aptr + lda*2)); \ + r3 = _mm256_loadu_si256((__m256i *)(aptr + lda*3)); \ + r4 = _mm256_loadu_si256((__m256i *)(aptr + lda*4)); \ + r5 = _mm256_loadu_si256((__m256i *)(aptr + lda*5)); \ + r6 = _mm256_loadu_si256((__m256i *)(aptr + lda*6)); \ + r7 = _mm256_loadu_si256((__m256i *)(aptr + lda*7)); + +#define MASK_LOAD_A_8VEC(aptr) \ + r0 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*0)); \ + r1 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*1)); \ + r2 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*2)); \ + r3 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*3)); \ + r4 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*4)); \ + r5 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*5)); \ + r6 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*6)); \ + r7 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*7)); + +#define SWITCH_LOAD_A_8VEC(aptr, cond) \ + switch((cond)) { \ + case 8: r7 = _mm256_loadu_si256((__m256i *)(aptr + lda*7)); \ + case 7: r6 = _mm256_loadu_si256((__m256i *)(aptr + lda*6)); \ + case 6: r5 = _mm256_loadu_si256((__m256i *)(aptr + lda*5)); \ + case 5: r4 = _mm256_loadu_si256((__m256i *)(aptr + lda*4)); \ + case 4: r3 = _mm256_loadu_si256((__m256i *)(aptr + lda*3)); \ + case 3: r2 = _mm256_loadu_si256((__m256i *)(aptr + lda*2)); \ + case 2: r1 = _mm256_loadu_si256((__m256i *)(aptr + lda*1)); \ + case 1: r0 = _mm256_loadu_si256((__m256i *)(aptr + lda*0)); \ + } + +#define SWITCH_MASK_LOAD_A_8VEC(aptr, cond) \ + switch((cond)) { \ + case 8: r7 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*7)); \ + case 7: r6 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*6)); \ + case 6: r5 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*5)); \ + case 5: r4 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*4)); \ + case 4: r3 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*3)); \ + case 3: r2 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*2)); \ + case 2: r1 = _mm256_maskz_loadu_epi16(nmask, 
(__m256i *)(aptr + lda*1)); \ + case 1: r0 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*0)); \ + } + +#define REORDER_8x16(t0, t1, t2, t3, t4, t5, t6, t7) \ + t0 = _mm256_unpacklo_epi16(r0, r1); \ + t1 = _mm256_unpackhi_epi16(r0, r1); \ + t2 = _mm256_unpacklo_epi16(r2, r3); \ + t3 = _mm256_unpackhi_epi16(r2, r3); \ + t4 = _mm256_unpacklo_epi16(r4, r5); \ + t5 = _mm256_unpackhi_epi16(r4, r5); \ + t6 = _mm256_unpacklo_epi16(r6, r7); \ + t7 = _mm256_unpackhi_epi16(r6, r7); \ + r0 = _mm256_unpacklo_epi32(t0, t2); \ + r1 = _mm256_unpacklo_epi32(t1, t3); \ + r2 = _mm256_unpacklo_epi32(t4, t6); \ + r3 = _mm256_unpacklo_epi32(t5, t7); \ + r4 = _mm256_unpackhi_epi32(t0, t2); \ + r5 = _mm256_unpackhi_epi32(t1, t3); \ + r6 = _mm256_unpackhi_epi32(t4, t6); \ + r7 = _mm256_unpackhi_epi32(t5, t7); \ + t0 = _mm256_unpacklo_epi64(r0, r2); \ + t1 = _mm256_unpackhi_epi64(r0, r2); \ + t2 = _mm256_unpacklo_epi64(r4, r6); \ + t3 = _mm256_unpackhi_epi64(r4, r6); \ + t4 = _mm256_unpacklo_epi64(r1, r3); \ + t5 = _mm256_unpackhi_epi64(r1, r3); \ + t6 = _mm256_unpacklo_epi64(r5, r7); \ + t7 = _mm256_unpackhi_epi64(r5, r7); + +#define STORE_256_LO(x) \ + v = _mm256_permute2x128_si256(t0##x, t1##x, 0x20); \ + _mm256_storeu_si256((__m256i *)(boffset + x*32), v); + +#define STORE_256_HI(x) \ + v = _mm256_permute2x128_si256(t0##x, t1##x, 0x31); \ + _mm256_storeu_si256((__m256i *)(boffset + (x + 8)*32), v); + +#define MASK_STORE_256_LO(x) \ + v = _mm256_permute2x128_si256(t0##x, t1##x, 0x20); \ + _mm256_mask_storeu_epi16(boffset + x*m_load, mmask, v); + +#define MASK_STORE_256_HI(x) \ + v = _mm256_permute2x128_si256(t0##x, t1##x, 0x31); \ + _mm256_mask_storeu_epi16(boffset + (x + 8)*m_load, mmask, v); + +#define STORE_256(x, y) {\ + __m256i v; \ + if (x == 0) { STORE_256_LO(y); } \ + else { STORE_256_HI(y); } \ +} + +#define MASK_STORE_256(x, y) {\ + __m256i v; \ + if (x == 0) { MASK_STORE_256_LO(y); } \ + else { MASK_STORE_256_HI(y); } \ +} + +#define SWITCH_STORE_16x(cond, func) \ + switch((cond)) {\ + case 15: func(1, 6); \ + case 14: func(1, 5); \ + case 13: func(1, 4); \ + case 12: func(1, 3); \ + case 11: func(1, 2); \ + case 10: func(1, 1); \ + case 9: func(1, 0); \ + case 8: func(0, 7); \ + case 7: func(0, 6); \ + case 6: func(0, 5); \ + case 5: func(0, 4); \ + case 4: func(0, 3); \ + case 3: func(0, 2); \ + case 2: func(0, 1); \ + case 1: func(0, 0); \ + } + + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + IFLOAT *aoffset, *boffset; + IFLOAT *aoffset00, *aoffset01, *aoffset10, *aoffset11; + IFLOAT *boffset0; + + __m256i r0, r1, r2, r3, r4, r5, r6, r7; + __m256i t00, t01, t02, t03, t04, t05, t06, t07; + __m256i t10, t11, t12, t13, t14, t15, t16, t17; + + aoffset = a; + boffset = b; + BLASLONG n_count = n; + BLASLONG m_count = m; + for (; n_count > 15; n_count -= 16) { + aoffset00 = aoffset; + aoffset01 = aoffset00 + 8 * lda; + aoffset10 = aoffset01 + 8 * lda; + aoffset11 = aoffset10 + 8 * lda; + aoffset += 16; + m_count = m; + for (; m_count > 31; m_count -= 32) { + // first 16 rows + LOAD_A_8VEC(aoffset00); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + LOAD_A_8VEC(aoffset01); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + STORE_256(0, 0); STORE_256(0, 1); STORE_256(0, 2); STORE_256(0, 3); + STORE_256(0, 4); STORE_256(0, 5); STORE_256(0, 6); STORE_256(0, 7); + STORE_256(1, 0); STORE_256(1, 1); STORE_256(1, 2); STORE_256(1, 3); + STORE_256(1, 4); STORE_256(1, 5); STORE_256(1, 6); STORE_256(1, 7); + // last 16 rows + boffset += 16; + 
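// rows 16-31 of this block: same 8x16 transpose, filling elements 16..31 of each 32-element packed row (hence the boffset += 16 just above) +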
LOAD_A_8VEC(aoffset10); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + LOAD_A_8VEC(aoffset11); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + STORE_256(0, 0); STORE_256(0, 1); STORE_256(0, 2); STORE_256(0, 3); + STORE_256(0, 4); STORE_256(0, 5); STORE_256(0, 6); STORE_256(0, 7); + STORE_256(1, 0); STORE_256(1, 1); STORE_256(1, 2); STORE_256(1, 3); + STORE_256(1, 4); STORE_256(1, 5); STORE_256(1, 6); STORE_256(1, 7); + aoffset00 += 32 * lda; + aoffset01 += 32 * lda; + aoffset10 += 32 * lda; + aoffset11 += 32 * lda; + boffset += 31 * 16; + } + if (m_count > 1) { + int m_load = m_count & ~1; + m_count -= m_load; + __mmask16 mmask; + SWITCH_LOAD_A_8VEC(aoffset00, m_load > 8 ? 8: m_load); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + if (m_load > 8) { + SWITCH_LOAD_A_8VEC(aoffset01, m_load > 16 ? 8: m_load - 8); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + } + int this_load = m_load > 16 ? 16 : m_load; + mmask = (1UL << this_load) - 1; + MASK_STORE_256(0, 0); MASK_STORE_256(0, 1); MASK_STORE_256(0, 2); MASK_STORE_256(0, 3); + MASK_STORE_256(0, 4); MASK_STORE_256(0, 5); MASK_STORE_256(0, 6); MASK_STORE_256(0, 7); + MASK_STORE_256(1, 0); MASK_STORE_256(1, 1); MASK_STORE_256(1, 2); MASK_STORE_256(1, 3); + MASK_STORE_256(1, 4); MASK_STORE_256(1, 5); MASK_STORE_256(1, 6); MASK_STORE_256(1, 7); + boffset0 = boffset; + if (m_load > 16) { + boffset += this_load; + SWITCH_LOAD_A_8VEC(aoffset10, m_load > 24 ? 8: m_load - 16); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + if (m_load > 24) { + SWITCH_LOAD_A_8VEC(aoffset11, m_load - 24); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + } + this_load = m_load - 16; + mmask = (1UL << this_load) - 1; + MASK_STORE_256(0, 0); MASK_STORE_256(0, 1); MASK_STORE_256(0, 2); MASK_STORE_256(0, 3); + MASK_STORE_256(0, 4); MASK_STORE_256(0, 5); MASK_STORE_256(0, 6); MASK_STORE_256(0, 7); + MASK_STORE_256(1, 0); MASK_STORE_256(1, 1); MASK_STORE_256(1, 2); MASK_STORE_256(1, 3); + MASK_STORE_256(1, 4); MASK_STORE_256(1, 5); MASK_STORE_256(1, 6); MASK_STORE_256(1, 7); + } + boffset = boffset0 + 16 * m_load; + aoffset00 += m_load * lda; + } + if (m_count > 0) { + // just copy lask K to B directly + r0 = _mm256_loadu_si256((__m256i *)(aoffset00)); + _mm256_storeu_si256((__m256i *)(boffset), r0); + boffset += 16; + } + } + if (n_count > 0) { + __mmask16 nmask = (1UL << n_count) - 1; + aoffset00 = aoffset; + aoffset01 = aoffset00 + 8 * lda; + aoffset10 = aoffset01 + 8 * lda; + aoffset11 = aoffset10 + 8 * lda; + m_count = m; + for (; m_count > 31; m_count -= 32) { + // first 16 rows + MASK_LOAD_A_8VEC(aoffset00); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + MASK_LOAD_A_8VEC(aoffset01); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + SWITCH_STORE_16x(n_count, STORE_256); + // last 16 rows + boffset0 = boffset; + boffset += 16; + MASK_LOAD_A_8VEC(aoffset10); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + MASK_LOAD_A_8VEC(aoffset11); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + SWITCH_STORE_16x(n_count, STORE_256); + aoffset00 += 32 * lda; + aoffset01 += 32 * lda; + aoffset10 += 32 * lda; + aoffset11 += 32 * lda; + boffset = 32 * n_count + boffset0; + } + if (m_count > 1) { + int m_load = m_count & ~1; + m_count -= m_load; + __mmask16 mmask; + SWITCH_MASK_LOAD_A_8VEC(aoffset00, m_load > 8 ? 8: m_load); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + if (m_load > 8) { + SWITCH_MASK_LOAD_A_8VEC(aoffset01, m_load > 16 ? 
8: m_load - 8); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + } + int this_load = m_load > 16 ? 16 : m_load; + mmask = (1UL << this_load) - 1; + SWITCH_STORE_16x(n_count, MASK_STORE_256); + boffset0 = boffset; + if (m_load > 16) { + boffset += this_load; + SWITCH_MASK_LOAD_A_8VEC(aoffset10, m_load > 24 ? 8: m_load - 16); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + if (m_load > 24) { + SWITCH_MASK_LOAD_A_8VEC(aoffset11, m_load - 24); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + } + this_load = m_load - 16; + mmask = (1UL << this_load) - 1; + SWITCH_STORE_16x(n_count, MASK_STORE_256); + } + boffset = boffset0 + n_count * m_load; + aoffset00 += m_load * lda; + } + if (m_count > 0) { + // just copy lask K to B directly + r0 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aoffset00)); + _mm256_mask_storeu_epi16((__m256i *)(boffset), nmask, r0); + boffset += 16; + } + } + return 0; +} diff --git a/kernel/x86_64/sbgemm_small_kernel_permit_spr.c b/kernel/x86_64/sbgemm_small_kernel_permit_spr.c new file mode 100644 index 000000000..98d8ca06a --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_permit_spr.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#include "sbgemm_block_microk_cooperlake.c" +// Define micro kernels for ALPHA not ONE scenarios +#undef ONE_ALPHA +#include "sbgemm_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA as ONE scenarios +#define ONE_ALPHA 1 +#include "sbgemm_microk_cooperlake_template.c" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + return 0; +} diff --git a/kernel/x86_64/sbgemv_n.c b/kernel/x86_64/sbgemv_n.c index 18e64dc3f..08ccace61 100644 --- a/kernel/x86_64/sbgemv_n.c +++ b/kernel/x86_64/sbgemv_n.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined (COOPERLAKE) +#if defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "sbgemv_n_microk_cooperlake.c" #endif diff --git a/kernel/x86_64/sbgemv_t.c b/kernel/x86_64/sbgemv_t.c index 22b099116..51ea0d937 100644 --- a/kernel/x86_64/sbgemv_t.c +++ b/kernel/x86_64/sbgemv_t.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined (COOPERLAKE) +#if defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "sbgemv_t_microk_cooperlake.c" #endif diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index e816c67e9..a0acea9d1 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sdot_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "sdot_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "sdot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "sdot_microk_sandy-2.c" diff --git a/kernel/x86_64/sgemm_direct_skylakex.c b/kernel/x86_64/sgemm_direct_skylakex.c index 2588289d1..badeb0fbf 100644 --- a/kernel/x86_64/sgemm_direct_skylakex.c +++ b/kernel/x86_64/sgemm_direct_skylakex.c @@ -1,7 +1,7 @@ /* the direct sgemm code written by Arjan van der Ven */ #include "common.h" -#if defined(SKYLAKEX) || defined (COOPERLAKE) +#if defined(SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index e0778006f..621ddc622 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_n_microk_sandy-4.c" #elif defined(HASWELL) || defined(ZEN) #include "sgemv_n_microk_haswell-4.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "sgemv_n_microk_haswell-4.c" #include "sgemv_n_microk_skylakex-8.c" #endif diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index a36c8ace9..0be2c7e97 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_sandy-4.c" #elif defined(HASWELL) || defined(ZEN) #include "sgemv_t_microk_haswell-4.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "sgemv_t_microk_haswell-4.c" #include "sgemv_t_microk_skylakex.c" #endif diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index c9d698eb7..29d6a9958 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_L_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "ssymv_L_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index 4d8aac1ab..02bbc1c64 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ssymv_U_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_U_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "ssymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index fea4fc746..55780734f 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index b853ef365..77331d95f 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index bad367e91..b61182303 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index 147201751..99bc07d50 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/tobf16.c b/kernel/x86_64/tobf16.c index 3d1796621..a88fdcc2e 100644 --- 
a/kernel/x86_64/tobf16.c +++ b/kernel/x86_64/tobf16.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif -#if defined(COOPERLAKE) +#if defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) #if defined(DOUBLE) #include "dtobf16_microk_cooperlake.c" #elif defined(SINGLE) diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c index 25e9f6d42..8786870bd 100644 --- a/kernel/x86_64/zaxpy.c +++ b/kernel/x86_64/zaxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zaxpy_microk_bulldozer-2.c" #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zaxpy_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zaxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zaxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index 1bc785ac1..50c8a2678 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "zdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zdot_microk_sandy-2.c" diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index 1f9d41859..2d6866a78 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zgemv_n_microk_haswell-4.c" #elif defined(SANDYBRIDGE) #include "zgemv_n_microk_sandy-4.c" diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 34f28b224..c2791e0f3 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zgemv_t_microk_bulldozer-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 09a702a81..3744c98bb 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "zscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index 83ed41ba1..df190c64c 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index 7ed2faf0f..bfe0cf7ee 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) @@ -451,7 +451,6 @@ #endif MOVDDUP(4 * SIZE, A1, a1) - MOVDDUP(6 * SIZE, A2, a2) movsd 0 * SIZE(YY), yy1 movhpd 1 * SIZE(YY), yy1 @@ -471,7 +470,9 @@ subq IS, I subq $2, I sarq $2, I - jle .L15 + jle .L14 + + MOVDDUP(6 * SIZE - (4 * SIZE), A2, a2) ALIGN_3 .L12: @@ -632,6 +633,16 @@ jg .L12 ALIGN_3 +.L14: + movq M, I + subq IS, I + subq $2, I + testq $2, I + jle .L16 + + MOVDDUP(6 * SIZE - (4 * SIZE), A2, a2) + jmp .L15_pastcheck + .L15: movq M, I subq IS, I @@ -639,6 +650,7 @@ testq $2, I jle .L16 +.L15_pastcheck: movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index 5945f3f81..13176ce9c 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index 484d74f14..1657885c0 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/param.h b/param.h index 48770fa7a..8dd2a7461 100644 --- a/param.h +++ b/param.h @@ -1669,14 +1669,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #define SGEMM_DEFAULT_UNROLL_M 16 +#ifndef DYNAMIC_ARCH #define DGEMM_DEFAULT_UNROLL_M 16 +#else +#define DGEMM_DEFAULT_UNROLL_M 4 +#endif #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_M 4 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 +#ifndef DYNAMIC_ARCH #define DGEMM_DEFAULT_UNROLL_N 2 +#else +#define DGEMM_DEFAULT_UNROLL_N 8 +#endif #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 @@ -1751,6 +1759,139 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#ifdef SAPPHIRERAPIDS + +#define SNUMOPT 16 +#define DNUMOPT 8 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#define GEMM_PREFERED_SIZE 8 +#else +#define SWITCH_RATIO 16 +#define GEMM_PREFERED_SIZE 16 +#endif +#define USE_SGEMM_KERNEL_DIRECT 1 + +#undef SBGEMM_DEFAULT_UNROLL_N +#undef SBGEMM_DEFAULT_UNROLL_M +#undef SBGEMM_DEFAULT_P +#undef SBGEMM_DEFAULT_R +#undef SBGEMM_DEFAULT_Q +// FIXME: actually UNROLL_M = UNROLL_N = 16 +// If M and N is equal, OpenBLAS will reuse OCOPY as ICOPY. +// But for AMX, they are not the same, set UNROLL_M = 32 to workaround +#define SBGEMM_DEFAULT_UNROLL_N 16 +#define SBGEMM_DEFAULT_UNROLL_M 32 +#define SBGEMM_DEFAULT_P 256 +#define SBGEMM_DEFAULT_Q 1024 +#define SBGEMM_DEFAULT_R sbgemm_r + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#else + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 16 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_MN 32 +#define DGEMM_DEFAULT_UNROLL_MN 32 +#endif + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_R 1024 +#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 +#define XGEMM_DEFAULT_Q 128 + +#else + +#define SGEMM_DEFAULT_P 640 +#define DGEMM_DEFAULT_P 192 +#define CGEMM_DEFAULT_P 384 +#define ZGEMM_DEFAULT_P 256 + +#define SGEMM_DEFAULT_Q 320 +#define DGEMM_DEFAULT_Q 384 +#define CGEMM_DEFAULT_Q 192 +#define ZGEMM_DEFAULT_Q 128 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R 8640 +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r + +#define QGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define XGEMM_DEFAULT_Q 
128 + +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 4 +#define ZGEMM3M_DEFAULT_UNROLL_M 4 + +#define CGEMM3M_DEFAULT_P 320 +#define ZGEMM3M_DEFAULT_P 256 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 320 +#define ZGEMM3M_DEFAULT_Q 256 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define XGEMM3M_DEFAULT_R 12288 + +#endif +#endif + #ifdef COOPERLAKE #define SNUMOPT 16 @@ -2465,7 +2606,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 -#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#if defined(HAVE_GAS) && (HAVE_GAS == 1) #define DGEMM_DEFAULT_UNROLL_M 16 #define DGEMM_DEFAULT_UNROLL_N 4 #else @@ -2756,7 +2897,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN (BLASLONG) 0x03fffUL -#ifdef HAVE_MSA +#if defined(HAVE_MSA) && !defined(NO_MSA) #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -2977,7 +3118,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(CORTEXA57) || \ defined(CORTEXA72) || defined(CORTEXA73) || \ - defined(FALKOR) || defined(TSV110) || defined(EMAG8180) + defined(FALKOR) || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2994,7 +3135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*FIXME: this should be using the cache size, but there is currently no easy way to query that on ARM. So if getarch counted more than 8 cores we simply assume the host is a big desktop or server with abundant cache rather than a phone or embedded device */ -#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) +#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) #define SGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_P 256 #define CGEMM_DEFAULT_P 256 @@ -3026,7 +3167,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 8 @@ -3166,13 +3307,52 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(ARMV8SVE) || defined(A64FX) + +/* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". +Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. */ +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 8 +/* SGEMM_UNROLL_MN is calculated as max(SGEMM_UNROLL_M, SGEMM_UNROLL_N) + * Since we don't define SGEMM_UNROLL_M correctly we have to manually set this macro. + * If SVE size is ever more than 1024, this should be increased also. */ +#define SGEMM_DEFAULT_UNROLL_MN 32 + +/* When all BLAS3 routines are implemeted with SVE, DGEMM_DEFAULT_UNROLL_M should be "sve_vl". +Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. 
*/ +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_MN 32 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #else /* Other/undetected ARMv8 cores */ #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 8 #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 @@ -3197,6 +3377,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #endif /* Cores */ + #endif /* ARMv8 */ #if defined(ARMV5) @@ -3474,6 +3655,20 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define XGEMM_DEFAULT_UNROLL_M 1 #endif +#ifdef ARCH_MIPS +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 +#else #define SGEMM_DEFAULT_P sgemm_p #define DGEMM_DEFAULT_P dgemm_p #define QGEMM_DEFAULT_P qgemm_p @@ -3494,6 +3689,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 128 #define XGEMM_DEFAULT_Q 128 +#endif #define SYMV_P 16
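A minimal, self-contained sketch (illustrative, not part of this patch) of the AMX flow that the SPR kernel's TCONF / LOAD_A / LOAD_B / MATMUL / STORE_C macros wrap: configure the tiles with a 64-byte tile configuration, load bf16 panels with _tile_loadd, accumulate with _tile_dpbf16ps, and store FP32 results with _tile_stored. Each 16x16 FP32 tile consumes 32 k-values per _tile_dpbf16ps, which is why the kernel steps k_count by 32 and why the SBGEMM_DEFAULT_UNROLL_M/N and P/Q blocking added to param.h above are multiples of the 16x32 tile shape. The helper name, the arch_prctl permission constants, and the build flags (-mamx-tile -mamx-bf16) are assumptions for the sketch, not code from this PR.

/*
 * One 16x16 FP32 tile of C accumulated from a 16x32 bf16 A panel and a
 * 16x32 bf16 B panel, the same tile shapes the SPR kernel configures
 * through TCONF.  Assumes Linux >= 5.16, gcc/clang with
 * -mamx-tile -mamx-bf16, and a CPU with AMX-BF16 (Sapphire Rapids).
 */
#include <immintrin.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

typedef uint16_t bf16_t;                 /* raw bf16 bits, as in the packed panels */

struct tile_config {                     /* 64-byte hardware tile configuration */
    uint8_t  palette_id;                 /* must be 1 */
    uint8_t  start_row;
    uint8_t  reserved0[14];
    uint16_t colsb[8];                   /* bytes per row of each tile */
    uint8_t  reserved1[16];
    uint8_t  rows[8];                    /* rows of each tile */
    uint8_t  reserved2[8];
};

/* C[16][16] += A[16][32] * B[16][32]: one MATMUL step over 32 k values */
static void tile_gemm_16x16x32(const bf16_t *A, const bf16_t *B, float *C, long ldc)
{
    /* Linux must grant this thread the XTILEDATA state before any tile use.
     * Constants per the arch_prctl(2) documentation:
     * ARCH_REQ_XCOMP_PERM = 0x1023, XFEATURE_XTILEDATA = 18.
     * Real code should check the return value. */
    syscall(SYS_arch_prctl, 0x1023, 18);

    struct tile_config cfg;
    memset(&cfg, 0, sizeof(cfg));
    cfg.palette_id = 1;
    cfg.rows[0] = 16; cfg.colsb[0] = 16 * sizeof(float);   /* tile 0: C, 16x16 fp32            */
    cfg.rows[1] = 16; cfg.colsb[1] = 32 * sizeof(bf16_t);  /* tile 1: A, 16x32 bf16            */
    cfg.rows[2] = 16; cfg.colsb[2] = 32 * sizeof(bf16_t);  /* tile 2: B, 16 rows of 16 k-pairs */
    _tile_loadconfig(&cfg);                                /* what TCONF ends with             */

    _tile_zero(0);                              /* LOAD_C in the non-ALPHA_ONE path     */
    _tile_loadd(1, A, 32 * sizeof(bf16_t));     /* LOAD_A: row stride = lda * 2 bytes   */
    _tile_loadd(2, B, 32 * sizeof(bf16_t));     /* LOAD_B: row stride = ldb * 2 bytes   */
    _tile_dpbf16ps(0, 1, 2);                    /* MATMUL: C += A x B over 32 k values  */
    _tile_stored(0, C, ldc * sizeof(float));    /* STORE_C: row stride = ldc * 4 bytes  */
    _tile_release();
}

The kernel in the patch does the same work four tiles at a time (T_C00..T_C11 against T_A0/T_A1 and T_B0/T_B1), keeps the accumulators live across the whole k loop, and reloads the tile configuration whenever the tail shapes change, which is why separate TCONF and TCONF_TAIL invocations appear ahead of the 32-wide, 16-wide, and odd-k remainder paths.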