Merge branch 'develop' of https://github.com/openmathlib/openblas into develop
This commit is contained in:
commit
edb7ab5ccf
|
@ -42,6 +42,7 @@ jobs:
|
||||||
- name: Install Dependencies
|
- name: Install Dependencies
|
||||||
run: |
|
run: |
|
||||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||||
|
sudo apt-get update
|
||||||
sudo apt-get install -y gfortran cmake ccache libtinfo5
|
sudo apt-get install -y gfortran cmake ccache libtinfo5
|
||||||
elif [ "$RUNNER_OS" == "macOS" ]; then
|
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||||
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
|
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
|
||||||
|
|
|
@ -24,6 +24,8 @@ option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, d
|
||||||
|
|
||||||
option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
|
option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
|
||||||
|
|
||||||
|
option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF)
|
||||||
|
|
||||||
option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
|
option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
|
||||||
|
|
||||||
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
|
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
|
||||||
|
@ -328,7 +330,7 @@ if (NOT NOFORTRAN)
|
||||||
# Build test and ctest
|
# Build test and ctest
|
||||||
add_subdirectory(test)
|
add_subdirectory(test)
|
||||||
endif()
|
endif()
|
||||||
if (BUILD_TESTING)
|
if (BUILD_TESTING AND NOT BUILD_WITHOUT_LAPACK)
|
||||||
add_subdirectory(lapack-netlib/TESTING)
|
add_subdirectory(lapack-netlib/TESTING)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
@ -458,6 +460,61 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
# Optional benchmark executables, enabled with -DBUILD_BENCHMARKS=ON.
# Each benchmark/*.c source is built as "benchmark_<name>" and again with the
# COMPLEX, DOUBLE and COMPLEX+DOUBLE compile definitions, producing
# "benchmark_<name>_COMPLEX", "benchmark_<name>_DOUBLE" and
# "benchmark_<name>_COMPLEX_DOUBLE".
if (BUILD_BENCHMARKS)
  file(GLOB benchmark_sources "${CMAKE_CURRENT_SOURCE_DIR}/benchmark/*.c")

  # Sources that are not built, identified by base name (no directory, no
  # extension). Filtering by name instead of list(REMOVE_ITEM <list> <glob
  # result>) keeps configuration working when one of these files is absent:
  # an empty glob result would leave REMOVE_ITEM without a value argument,
  # which CMake rejects.
  set(benchmark_skipped_sources zdot-intel cula_wrapper)
  if (NOT USE_OPENMP)
    list(APPEND benchmark_skipped_sources smallscaling)
  endif()
  if (BUILD_WITHOUT_LAPACK)
    list(APPEND benchmark_skipped_sources
      cholesky
      geev
      gesv
      getri
      potrf
      spmv
      symv
      linpack)
  endif()
  if (NOT USE_GEMM3M)
    list(APPEND benchmark_skipped_sources gemm3m)
  endif()

  # Source/definition combinations for which no executable is created.
  set(benchmark_skipped_targets
    benchmark_imax_COMPLEX
    benchmark_imax_COMPLEX_DOUBLE
    benchmark_imin_COMPLEX
    benchmark_imin_COMPLEX_DOUBLE
    benchmark_max_COMPLEX
    benchmark_max_COMPLEX_DOUBLE
    benchmark_min_COMPLEX
    benchmark_min_COMPLEX_DOUBLE)

  # The escaped semicolon keeps "COMPLEX;DOUBLE" together as one list element,
  # so the inner loop sees four variants rather than five.
  set(defines DEFAULT COMPLEX DOUBLE "COMPLEX\;DOUBLE")

  foreach(source ${benchmark_sources})
    get_filename_component(name "${source}" NAME_WE)
    list(FIND benchmark_skipped_sources "${name}" skipped_source_index)
    if (skipped_source_index EQUAL -1)
      foreach(define ${defines})
        set(target_name "benchmark_${name}")
        if (NOT "${define}" STREQUAL "DEFAULT")
          # Unquoted on purpose: splits "COMPLEX;DOUBLE" into two JOIN inputs.
          string(JOIN "_" define_str ${define})
          set(target_name "${target_name}_${define_str}")
        endif()

        list(FIND benchmark_skipped_targets "${target_name}" skipped_target_index)
        if (skipped_target_index EQUAL -1)
          add_executable(${target_name} "${source}")
          target_include_directories(${target_name}
            PRIVATE
              ${CMAKE_CURRENT_SOURCE_DIR}
              ${CMAKE_CURRENT_BINARY_DIR})
          target_link_libraries(${target_name} PRIVATE ${OpenBLAS_LIBNAME})
          if (NOT "${define}" STREQUAL "DEFAULT")
            # Unquoted on purpose: each list element becomes one definition.
            target_compile_definitions(${target_name} PRIVATE ${define})
          endif()
        endif()
      endforeach()
    endif()
  endforeach()
endif()
|
||||||
|
|
||||||
|
|
||||||
# Install project
|
# Install project
|
||||||
|
|
|
@ -1520,10 +1520,18 @@ ifndef LIBNAMEPREFIX
|
||||||
LIBNAMEPREFIX =
|
LIBNAMEPREFIX =
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
SYMPREFIX=$(SYMBOLPREFIX)
|
||||||
|
ifeq ($(SYMBOLPREFIX),$(LIBNAMEPREFIX))
|
||||||
|
SYMPREFIX=
|
||||||
|
endif
|
||||||
|
SYMSUFFIX=$(SYMBOLSUFFIX)
|
||||||
|
ifeq ($(SYMBOLSUFFIX),$(LIBNAMESUFFIX))
|
||||||
|
SYMSUFFIX=
|
||||||
|
endif
|
||||||
ifndef LIBNAMESUFFIX
|
ifndef LIBNAMESUFFIX
|
||||||
LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)
|
LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)
|
||||||
else
|
else
|
||||||
LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)$(LIBNAMESUFFIX)
|
LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)$(LIBNAMESUFFIX)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(OSNAME), CYGWIN_NT)
|
ifeq ($(OSNAME), CYGWIN_NT)
|
||||||
|
|
|
@ -88,6 +88,17 @@ if (NOT NOFORTRAN)
|
||||||
auxiliary.c
|
auxiliary.c
|
||||||
c_xerbla.c
|
c_xerbla.c
|
||||||
constant.c)
|
constant.c)
|
||||||
|
if (USE_GEMM3M)
|
||||||
|
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
|
||||||
|
add_executable(x${float_char}cblat3_3m
|
||||||
|
c_${float_char}blat3_3m.f
|
||||||
|
c_${float_char}blas3_3m.c
|
||||||
|
c_${float_char}3chke_3m.c
|
||||||
|
auxiliary.c
|
||||||
|
c_xerbla.c
|
||||||
|
constant.c)
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
else()
|
else()
|
||||||
add_executable(x${float_char}cblat3
|
add_executable(x${float_char}cblat3
|
||||||
c_${float_char}blat3c.c
|
c_${float_char}blat3c.c
|
||||||
|
@ -96,6 +107,17 @@ else()
|
||||||
auxiliary.c
|
auxiliary.c
|
||||||
c_xerbla.c
|
c_xerbla.c
|
||||||
constant.c)
|
constant.c)
|
||||||
|
if (USE_GEMM3M)
|
||||||
|
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
|
||||||
|
add_executable(x${float_char}cblat3_3m
|
||||||
|
c_${float_char}blat3c_3m.c
|
||||||
|
c_${float_char}blas3_3m.c
|
||||||
|
c_${float_char}3chke_3m.c
|
||||||
|
auxiliary.c
|
||||||
|
c_xerbla.c
|
||||||
|
constant.c)
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
|
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
|
||||||
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
|
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
|
||||||
|
@ -105,7 +127,24 @@ endif()
|
||||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
|
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
|
||||||
target_link_libraries(x${float_char}cblat3 m)
|
target_link_libraries(x${float_char}cblat3 m)
|
||||||
endif()
|
endif()
|
||||||
|
# Link the GEMM3M variant of the level-3 CBLAS test driver. The executable
# x<float_char>cblat3_3m is only created for the complex types (c and z) when
# USE_GEMM3M is set, so the same guards are repeated here.
if (USE_GEMM3M)
  if (("${float_char}" STREQUAL "c") OR ("${float_char}" STREQUAL "z"))
    target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME})
    if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang"))
      string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
      # The OpenMP runtime must be linked to the _3m executable itself; the
      # plain x<float_char>cblat3 target is handled by its own block.
      target_link_libraries(x${float_char}cblat3_3m omp pthread)
    endif()
    if ("${CMAKE_SYSTEM_NAME}" MATCHES "Linux" OR "${CMAKE_SYSTEM_NAME}" MATCHES "FreeBSD" OR "${CMAKE_SYSTEM_NAME}" MATCHES "QNX")
      target_link_libraries(x${float_char}cblat3_3m m)
    endif()
  endif()
endif()
|
||||||
add_test(NAME "x${float_char}cblat3"
|
add_test(NAME "x${float_char}cblat3"
|
||||||
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
|
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
|
||||||
|
if (USE_GEMM3M)
|
||||||
|
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
|
||||||
|
add_test(NAME "x${float_char}cblat3_3m"
|
||||||
|
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3_3m> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3_3m")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
endforeach()
|
endforeach()
|
||||||
|
|
|
@ -5,6 +5,24 @@
|
||||||
TOPDIR = ..
|
TOPDIR = ..
|
||||||
include $(TOPDIR)/Makefile.system
|
include $(TOPDIR)/Makefile.system
|
||||||
|
|
||||||
|
SUPPORT_GEMM3M = 0
|
||||||
|
|
||||||
|
ifeq ($(ARCH), x86)
|
||||||
|
SUPPORT_GEMM3M = 1
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ARCH), x86_64)
|
||||||
|
SUPPORT_GEMM3M = 1
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ARCH), ia64)
|
||||||
|
SUPPORT_GEMM3M = 1
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ARCH), MIPS)
|
||||||
|
SUPPORT_GEMM3M = 1
|
||||||
|
endif
|
||||||
|
|
||||||
override CFLAGS += -DADD$(BU) -DCBLAS
|
override CFLAGS += -DADD$(BU) -DCBLAS
|
||||||
ifeq ($(F_COMPILER),GFORTRAN)
|
ifeq ($(F_COMPILER),GFORTRAN)
|
||||||
override FFLAGS += -fno-tree-vectorize
|
override FFLAGS += -fno-tree-vectorize
|
||||||
|
@ -144,9 +162,15 @@ all3targets += xdcblat3
|
||||||
endif
|
endif
|
||||||
ifeq ($(BUILD_COMPLEX),1)
|
ifeq ($(BUILD_COMPLEX),1)
|
||||||
all3targets += xccblat3
|
all3targets += xccblat3
|
||||||
|
ifeq ($(SUPPORT_GEMM3M),1)
|
||||||
|
all3targets += xccblat3_3m
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
ifeq ($(BUILD_COMPLEX16),1)
|
ifeq ($(BUILD_COMPLEX16),1)
|
||||||
all3targets += xzcblat3
|
all3targets += xzcblat3
|
||||||
|
ifeq ($(SUPPORT_GEMM3M),1)
|
||||||
|
all3targets += xzcblat3_3m
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
all3: $(all3targets)
|
all3: $(all3targets)
|
||||||
|
@ -181,9 +205,9 @@ endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
all3_3m: xzcblat3_3m xccblat3_3m
|
ifeq ($(SUPPORT_GEMM3M),1)
|
||||||
ifeq ($(USE_OPENMP), 1)
|
ifeq ($(USE_OPENMP), 1)
|
||||||
ifeq ($(BUILD_SINGLE),1)
|
ifeq ($(BUILD_COMPLEX),1)
|
||||||
OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
|
OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
|
||||||
endif
|
endif
|
||||||
ifeq ($(BUILD_COMPLEX16),1)
|
ifeq ($(BUILD_COMPLEX16),1)
|
||||||
|
@ -197,6 +221,7 @@ ifeq ($(BUILD_COMPLEX16),1)
|
||||||
OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
|
OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -271,8 +296,10 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
$(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||||
xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME)
|
xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
$(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||||
|
ifeq ($(SUPPORT_GEMM3M),1)
|
||||||
xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME)
|
xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
$(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||||
|
endif
|
||||||
else
|
else
|
||||||
xccblat1: $(ctestl1o) c_cblat1c.o $(TOPDIR)/$(LIBNAME)
|
xccblat1: $(ctestl1o) c_cblat1c.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o xccblat1 c_cblat1c.o $(ctestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
$(CC) $(CFLAGS) -o xccblat1 c_cblat1c.o $(ctestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||||
|
@ -280,6 +307,10 @@ xccblat2: $(ctestl2o) c_cblat2c.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o xccblat2 c_cblat2c.o $(ctestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
$(CC) $(CFLAGS) -o xccblat2 c_cblat2c.o $(ctestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||||
xccblat3: $(ctestl3o) c_cblat3c.o $(TOPDIR)/$(LIBNAME)
|
xccblat3: $(ctestl3o) c_cblat3c.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o xccblat3 c_cblat3c.o $(ctestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
$(CC) $(CFLAGS) -o xccblat3 c_cblat3c.o $(ctestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||||
|
ifeq ($(SUPPORT_GEMM3M),1)
# GEMM3M level-3 test driver for single-precision complex, linked with $(CC).
# The sibling rules in this branch (xccblat1, xccblat2, xccblat3) drop
# -lgfortran from EXTRALIB when linking with $(CC); do the same here so this
# target links under the same conditions as they do.
xccblat3_3m: $(ctestl3o_3m) c_cblat3c_3m.o $(TOPDIR)/$(LIBNAME)
	$(CC) $(CFLAGS) -o xccblat3_3m c_cblat3c_3m.o $(ctestl3o_3m) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -293,8 +324,10 @@ xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
$(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||||
xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME)
|
xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
$(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||||
|
ifeq ($(SUPPORT_GEMM3M),1)
|
||||||
xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME)
|
xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
$(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||||
|
endif
|
||||||
else
|
else
|
||||||
xzcblat1: $(ztestl1o) c_zblat1c.o $(TOPDIR)/$(LIBNAME)
|
xzcblat1: $(ztestl1o) c_zblat1c.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o xzcblat1 c_zblat1c.o $(ztestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
$(CC) $(CFLAGS) -o xzcblat1 c_zblat1c.o $(ztestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||||
|
@ -302,6 +335,10 @@ xzcblat2: $(ztestl2o) c_zblat2c.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o xzcblat2 c_zblat2c.o $(ztestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
$(CC) $(CFLAGS) -o xzcblat2 c_zblat2c.o $(ztestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||||
xzcblat3: $(ztestl3o) c_zblat3c.o $(TOPDIR)/$(LIBNAME)
|
xzcblat3: $(ztestl3o) c_zblat3c.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o xzcblat3 c_zblat3c.o $(ztestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
$(CC) $(CFLAGS) -o xzcblat3 c_zblat3c.o $(ztestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||||
|
ifeq ($(SUPPORT_GEMM3M),1)
# GEMM3M level-3 test driver for double-precision complex, linked with $(CC).
# The sibling rules in this branch (xzcblat1, xzcblat2, xzcblat3) drop
# -lgfortran from EXTRALIB when linking with $(CC); do the same here so this
# target links under the same conditions as they do.
xzcblat3_3m: $(ztestl3o_3m) c_zblat3c_3m.o $(TOPDIR)/$(LIBNAME)
	$(CC) $(CFLAGS) -o xzcblat3_3m c_zblat3c_3m.o $(ztestl3o_3m) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -85,6 +85,12 @@ ZSWAPKERNEL = cswap_lsx.S
|
||||||
CSUMKERNEL = csum_lsx.S
|
CSUMKERNEL = csum_lsx.S
|
||||||
ZSUMKERNEL = csum_lsx.S
|
ZSUMKERNEL = csum_lsx.S
|
||||||
|
|
||||||
|
SGEMVNKERNEL = sgemv_n_lsx.S
|
||||||
|
SGEMVTKERNEL = sgemv_t_lsx.S
|
||||||
|
|
||||||
|
DGEMVNKERNEL = dgemv_n_lsx.S
|
||||||
|
DGEMVTKERNEL = dgemv_t_lsx.S
|
||||||
|
|
||||||
DGEMMKERNEL = dgemm_kernel_8x4.S
|
DGEMMKERNEL = dgemm_kernel_8x4.S
|
||||||
DGEMMINCOPY = dgemm_ncopy_8_lsx.S
|
DGEMMINCOPY = dgemm_ncopy_8_lsx.S
|
||||||
DGEMMITCOPY = dgemm_tcopy_8_lsx.S
|
DGEMMITCOPY = dgemm_tcopy_8_lsx.S
|
||||||
|
@ -100,6 +106,9 @@ DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
CGEMVNKERNEL = cgemv_n_4_lsx.S
|
||||||
|
CGEMVTKERNEL = cgemv_t_4_lsx.S
|
||||||
|
|
||||||
CGEMMKERNEL = cgemm_kernel_8x4_lsx.S
|
CGEMMKERNEL = cgemm_kernel_8x4_lsx.S
|
||||||
CGEMMINCOPY = cgemm_ncopy_8_lsx.S
|
CGEMMINCOPY = cgemm_ncopy_8_lsx.S
|
||||||
CGEMMITCOPY = cgemm_tcopy_8_lsx.S
|
CGEMMITCOPY = cgemm_tcopy_8_lsx.S
|
||||||
|
@ -115,6 +124,9 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
ZGEMVNKERNEL = zgemv_n_2_lsx.S
|
||||||
|
ZGEMVTKERNEL = zgemv_t_2_lsx.S
|
||||||
|
|
||||||
ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S
|
ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S
|
||||||
ZGEMMONCOPY = zgemm_ncopy_4_lsx.S
|
ZGEMMONCOPY = zgemm_ncopy_4_lsx.S
|
||||||
ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S
|
ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S
|
||||||
|
|
|
@ -0,0 +1,323 @@
|
||||||
|
/*******************************************************************************
|
||||||
|
Copyright (c) 2024, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*******************************************************************************/
|
||||||
|
#define ASSEMBLER
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include "loongarch64_asm.S"
|
||||||
|
|
||||||
|
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
|
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||||
|
*/
|
||||||
|
#define M $r4
|
||||||
|
#define N $r5
|
||||||
|
#define ALPHA_R $f0
|
||||||
|
#define ALPHA_I $f1
|
||||||
|
#define A $r7
|
||||||
|
#define LDA $r8
|
||||||
|
#define X $r9
|
||||||
|
#define INC_X $r10
|
||||||
|
#define Y $r11
|
||||||
|
#define INC_Y $r6
|
||||||
|
|
||||||
|
#define J $r12
|
||||||
|
#define I $r13
|
||||||
|
#define K $r14
|
||||||
|
#define Y_ORG $r15
|
||||||
|
#define OFFSET $r16
|
||||||
|
#define K_LDA $r17
|
||||||
|
#define M8 $r18
|
||||||
|
#define T0 $r19
|
||||||
|
#define PA0 $r20
|
||||||
|
#define PA1 $r23
|
||||||
|
#define PA2 $r24
|
||||||
|
#define PA3 $r25
|
||||||
|
#define PA4 $r26
|
||||||
|
#define PA5 $r27
|
||||||
|
#define PA6 $r28
|
||||||
|
#define PA7 $r29
|
||||||
|
|
||||||
|
#define VALPHA $vr1
|
||||||
|
#define X0 $vr2
|
||||||
|
#define X1 $vr3
|
||||||
|
#define X2 $vr4
|
||||||
|
#define X3 $vr5
|
||||||
|
#define X4 $vr6
|
||||||
|
#define X5 $vr7
|
||||||
|
#define X6 $vr8
|
||||||
|
#define X7 $vr9
|
||||||
|
#define Y0 $vr10
|
||||||
|
#define Y1 $vr11
|
||||||
|
#define A0 $vr12
|
||||||
|
#define A1 $vr13
|
||||||
|
#define A2 $vr14
|
||||||
|
#define A3 $vr15
|
||||||
|
#define A4 $vr16
|
||||||
|
#define A5 $vr17
|
||||||
|
#define A6 $vr18
|
||||||
|
#define A7 $vr19
|
||||||
|
#define A8 $vr20
|
||||||
|
#define A9 $vr21
|
||||||
|
#define A10 $vr22
|
||||||
|
#define A11 $vr23
|
||||||
|
#define A12 $vr24
|
||||||
|
#define A13 $vr25
|
||||||
|
#define A14 $vr26
|
||||||
|
#define A15 $vr27
|
||||||
|
#define TMP0 $vr28
|
||||||
|
#define TMP1 $vr29
|
||||||
|
#define TMP2 $vr30
|
||||||
|
|
||||||
|
#if !defined(CONJ)
|
||||||
|
#if !defined(XCONJ)
|
||||||
|
#define GXCONJ 0
|
||||||
|
#define GCONJ 0
|
||||||
|
#else
|
||||||
|
#define GXCONJ 1
|
||||||
|
#define GCONJ 0
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
#if !defined(XCONJ)
|
||||||
|
#define GXCONJ 0
|
||||||
|
#define GCONJ 1
|
||||||
|
#else
|
||||||
|
#define GXCONJ 1
|
||||||
|
#define GCONJ 1
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
.macro CLOAD_X_4
|
||||||
|
GLDREPL v, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18
|
||||||
|
GCOMPLEXMUL GXCONJ, \
|
||||||
|
vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
|
||||||
|
X1, VALPHA, X1, TMP0, TMP1, TMP2, \
|
||||||
|
X2, VALPHA, X2, TMP0, TMP1, TMP2, \
|
||||||
|
X3, VALPHA, X3, TMP0, TMP1, TMP2
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro CLOAD_X_4_GAP
|
||||||
|
vldrepl.d X0, X, 0x00
|
||||||
|
PTR_ADD T0, X, INC_X
|
||||||
|
vldrepl.d X1, T0, 0x00
|
||||||
|
PTR_ADD T0, T0, INC_X
|
||||||
|
vldrepl.d X2, T0, 0x00
|
||||||
|
PTR_ADD T0, T0, INC_X
|
||||||
|
vldrepl.d X3, T0, 0x00
|
||||||
|
|
||||||
|
GCOMPLEXMUL GXCONJ, \
|
||||||
|
vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
|
||||||
|
X1, VALPHA, X1, TMP0, TMP1, TMP2, \
|
||||||
|
X2, VALPHA, X2, TMP0, TMP1, TMP2, \
|
||||||
|
X3, VALPHA, X3, TMP0, TMP1, TMP2
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro CLOAD_X_1
|
||||||
|
GLDREPL v, d, X0, X, 0x00
|
||||||
|
GCOMPLEXMUL GXCONJ, \
|
||||||
|
vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro CLOAD_Y_4
|
||||||
|
GLD v, , Y0, Y, 0, Y1, Y, 0x10
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Gather four 64-bit Y elements that are INC_Y apart into Y0 and Y1.
 * Each element is fetched with a scalar 64-bit load and pairs are then merged
 * with vpackev.d. The scalar register numbers match the vector aliases defined
 * in this file: $f10/$f11 correspond to Y0/Y1 ($vr10/$vr11) and $f13/$f17 to
 * A1/A5 ($vr13/$vr17), so A1 and A5 serve as scratch registers here.
 * NOTE(review): relies on $fN aliasing the low 64 bits of $vrN and on INC_Y
 * already being a byte stride when this macro runs - confirm.
 */
.macro CLOAD_Y_4_GAP
|
||||||
|
fld.d $f10, Y, 0
|
||||||
|
fldx.d $f13, Y, INC_Y
|
||||||
|
/* T0 presumably becomes Y + 2 * INC_Y (PTR_ALSL is defined outside this file) */
PTR_ALSL T0, INC_Y, Y, 1
|
||||||
|
fld.d $f11, T0, 0
|
||||||
|
fldx.d $f17, T0, INC_Y
|
||||||
|
/* Y0 = { element 0, element 1 }: low half from Y0, high half from A1 ($f13) */
vpackev.d Y0, A1, Y0
|
||||||
|
/* Y1 = { element 2, element 3 }: low half from Y1, high half from A5 ($f17) */
vpackev.d Y1, A5, Y1
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro CLOAD_Y_1
|
||||||
|
fld.d $f10, Y, 0
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro CSTORE_Y_4
|
||||||
|
GST v, , Y0, Y, 0, Y1, Y, 0x10
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro CSTORE_Y_4_GAP
|
||||||
|
vstelm.d Y0, Y, 0, 0
|
||||||
|
PTR_ADD T0, Y, INC_Y
|
||||||
|
vstelm.d Y0, T0, 0, 1
|
||||||
|
PTR_ADD T0, T0, INC_Y
|
||||||
|
vstelm.d Y1, T0, 0, 0
|
||||||
|
PTR_ADD T0, T0, INC_Y
|
||||||
|
vstelm.d Y1, T0, 0, 1
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro CSTORE_Y_1
|
||||||
|
fst.d $f10, Y, 0
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro CGEMV_N_4x4
|
||||||
|
GLD_INC v, , 0x10, \
|
||||||
|
A0, PA0, 0, A1, PA0, 0, \
|
||||||
|
A2, PA1, 0, A3, PA1, 0, \
|
||||||
|
A4, PA2, 0, A5, PA2, 0, \
|
||||||
|
A6, PA3, 0, A7, PA3, 0
|
||||||
|
|
||||||
|
GCOMPLEXMADD GXCONJ, GCONJ, \
|
||||||
|
vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \
|
||||||
|
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \
|
||||||
|
Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \
|
||||||
|
Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro CGEMV_N_1x4
|
||||||
|
GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0
|
||||||
|
GCOMPLEXMADD GXCONJ, GCONJ, \
|
||||||
|
vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \
|
||||||
|
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \
|
||||||
|
Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \
|
||||||
|
Y0, X3, A6, Y0, TMP0, TMP1, TMP2
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro CGEMV_N_1x1
|
||||||
|
fld.d $f12, PA0, 0
|
||||||
|
PTR_ADDI PA0, PA0, 0x08
|
||||||
|
GCOMPLEXMADD GXCONJ, GCONJ, \
|
||||||
|
vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro CGEMV_N_LSX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req
|
||||||
|
PTR_SRLI J, N, 2
|
||||||
|
beqz J, .L_\XW\()_N_3
|
||||||
|
PTR_SLLI K_LDA, LDA, 2
|
||||||
|
PTR_SUB K_LDA, K_LDA, M8
|
||||||
|
.L_\XW\()_N_L4:
|
||||||
|
CLOAD_\X_4
|
||||||
|
xor K, K, K
|
||||||
|
move Y, Y_ORG
|
||||||
|
PTR_SRLI I, M, 2
|
||||||
|
beqz I, .L_\XW\()_M_3
|
||||||
|
.align 5
|
||||||
|
.L_\XW\()_M_L4:
|
||||||
|
CLOAD_\Y_4
|
||||||
|
CGEMV_N_4x4
|
||||||
|
CSTORE_\Y_4
|
||||||
|
PTR_ADDI I, I, -1
|
||||||
|
PTR_ALSL Y, INC_Y, Y, 2
|
||||||
|
PTR_ADDI K, K, 4
|
||||||
|
bnez I, .L_\XW\()_M_L4
|
||||||
|
.L_\XW\()_M_3:
|
||||||
|
andi I, M, 3
|
||||||
|
beqz I, .L_\XW\()_M_END
|
||||||
|
.align 5
|
||||||
|
.L_\XW\()_M_L1:
|
||||||
|
CLOAD_\Y_1
|
||||||
|
CGEMV_N_1x4
|
||||||
|
CSTORE_\Y_1
|
||||||
|
PTR_ADDI I, I, -1
|
||||||
|
PTR_ADD Y, Y, INC_Y
|
||||||
|
PTR_ADDI K, K, 1
|
||||||
|
bnez I, .L_\XW\()_M_L1
|
||||||
|
.L_\XW\()_M_END:
|
||||||
|
PTR_ADDI J, J, -1
|
||||||
|
#if __loongarch_grlen == 64
|
||||||
|
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
|
||||||
|
#elif __loongarch_grlen == 32
|
||||||
|
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
|
||||||
|
#else
|
||||||
|
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
|
||||||
|
#endif
|
||||||
|
PTR_ALSL X, INC_X, X, 2
|
||||||
|
bnez J, .L_\XW\()_N_L4
|
||||||
|
.L_\XW\()_N_3:
|
||||||
|
andi J, N, 3
|
||||||
|
beqz J, .L_END
|
||||||
|
.L_\XW\()_N_L1:
|
||||||
|
CLOAD_\X_1
|
||||||
|
xor K, K, K
|
||||||
|
move Y, Y_ORG
|
||||||
|
move I, M
|
||||||
|
beqz I, .L_END
|
||||||
|
.align 5
|
||||||
|
.L_\XW\()_N_1_M_L1:
|
||||||
|
CLOAD_\Y_1
|
||||||
|
CGEMV_N_1x1
|
||||||
|
CSTORE_\Y_1
|
||||||
|
PTR_ADDI I, I, -1
|
||||||
|
PTR_ADD Y, Y, INC_Y
|
||||||
|
PTR_ADDI K, K, 1
|
||||||
|
bnez I, .L_\XW\()_N_1_M_L1
|
||||||
|
.L_\XW\()_N_1_M_END:
|
||||||
|
PTR_ADDI J, J, -1
|
||||||
|
PTR_SUB K_LDA, LDA, M8
|
||||||
|
PTR_ADD PA0, PA0, K_LDA
|
||||||
|
PTR_ADD X, X, INC_X
|
||||||
|
bnez J, .L_\XW\()_N_L1
|
||||||
|
|
||||||
|
b .L_END
|
||||||
|
.endm
|
||||||
|
|
||||||
|
PROLOGUE
|
||||||
|
PTR_LD INC_Y, $sp, 0
|
||||||
|
push_if_used 17 + 7, 31
|
||||||
|
PTR_ADDI K, $r0, 0x01
|
||||||
|
PTR_SUB I, INC_X, K
|
||||||
|
PTR_SUB J, INC_Y, K
|
||||||
|
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
|
||||||
|
maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */
|
||||||
|
PTR_ALSL I, I, J, 1
|
||||||
|
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
|
||||||
|
// Init VALPHA
|
||||||
|
vpackev.w $vr0, $vr1, $vr0
|
||||||
|
vpackev.d VALPHA, $vr0, $vr0
|
||||||
|
move Y_ORG, Y
|
||||||
|
move PA0, A
|
||||||
|
#if __loongarch_grlen == 64
|
||||||
|
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
|
||||||
|
#elif __loongarch_grlen == 32
|
||||||
|
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
|
||||||
|
#else
|
||||||
|
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
|
||||||
|
#endif
|
||||||
|
la.local T0, .L_GAP_TABLE
|
||||||
|
PTR_ALSL I, I, T0, 1
|
||||||
|
ld.h K, I, 0 // Obtain the offset address
|
||||||
|
PTR_ADD T0, T0, K
|
||||||
|
jirl $r0, T0, 0
|
||||||
|
.L_GAP_TABLE:
|
||||||
|
.hword .L_GAP_0_0 - .L_GAP_TABLE
|
||||||
|
.hword .L_GAP_0_1 - .L_GAP_TABLE
|
||||||
|
.hword .L_GAP_1_0 - .L_GAP_TABLE
|
||||||
|
.hword .L_GAP_1_1 - .L_GAP_TABLE
|
||||||
|
.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
|
||||||
|
CGEMV_N_LSX GAP_0_0, X_4, X_1, Y_4, Y_1
|
||||||
|
.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
|
||||||
|
CGEMV_N_LSX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1
|
||||||
|
.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
|
||||||
|
CGEMV_N_LSX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1
|
||||||
|
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
|
||||||
|
CGEMV_N_LSX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1
|
||||||
|
.L_END:
|
||||||
|
pop_if_used 17 + 7, 31
|
||||||
|
jirl $r0, $r1, 0x0
|
||||||
|
EPILOGUE
|
|
@ -122,14 +122,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
|
GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
|
||||||
X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
|
X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
|
||||||
GCOMPLEXMUL GXCONJ, \
|
GCOMPLEXMUL GXCONJ, \
|
||||||
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
|
xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
|
||||||
X1, X1, VALPHA, TMP0, TMP1, TMP2, \
|
X1, VALPHA, X1, TMP0, TMP1, TMP2, \
|
||||||
X2, X2, VALPHA, TMP0, TMP1, TMP2, \
|
X2, VALPHA, X2, TMP0, TMP1, TMP2, \
|
||||||
X3, X3, VALPHA, TMP0, TMP1, TMP2, \
|
X3, VALPHA, X3, TMP0, TMP1, TMP2, \
|
||||||
X4, X4, VALPHA, TMP0, TMP1, TMP2, \
|
X4, VALPHA, X4, TMP0, TMP1, TMP2, \
|
||||||
X5, X5, VALPHA, TMP0, TMP1, TMP2, \
|
X5, VALPHA, X5, TMP0, TMP1, TMP2, \
|
||||||
X6, X6, VALPHA, TMP0, TMP1, TMP2, \
|
X6, VALPHA, X6, TMP0, TMP1, TMP2, \
|
||||||
X7, X7, VALPHA, TMP0, TMP1, TMP2
|
X7, VALPHA, X7, TMP0, TMP1, TMP2
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro CLOAD_X_8_GAP
|
.macro CLOAD_X_8_GAP
|
||||||
|
@ -150,14 +150,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
xvldrepl.d X7, T0, 0x00
|
xvldrepl.d X7, T0, 0x00
|
||||||
|
|
||||||
GCOMPLEXMUL GXCONJ, \
|
GCOMPLEXMUL GXCONJ, \
|
||||||
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
|
xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
|
||||||
X1, X1, VALPHA, TMP0, TMP1, TMP2, \
|
X1, VALPHA, X1, TMP0, TMP1, TMP2, \
|
||||||
X2, X2, VALPHA, TMP0, TMP1, TMP2, \
|
X2, VALPHA, X2, TMP0, TMP1, TMP2, \
|
||||||
X3, X3, VALPHA, TMP0, TMP1, TMP2, \
|
X3, VALPHA, X3, TMP0, TMP1, TMP2, \
|
||||||
X4, X4, VALPHA, TMP0, TMP1, TMP2, \
|
X4, VALPHA, X4, TMP0, TMP1, TMP2, \
|
||||||
X5, X5, VALPHA, TMP0, TMP1, TMP2, \
|
X5, VALPHA, X5, TMP0, TMP1, TMP2, \
|
||||||
X6, X6, VALPHA, TMP0, TMP1, TMP2, \
|
X6, VALPHA, X6, TMP0, TMP1, TMP2, \
|
||||||
X7, X7, VALPHA, TMP0, TMP1, TMP2
|
X7, VALPHA, X7, TMP0, TMP1, TMP2
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro CLOAD_Y_8
|
.macro CLOAD_Y_8
|
||||||
|
@ -228,7 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
.macro CLOAD_X_1
|
.macro CLOAD_X_1
|
||||||
GLDREPL xv, d, X0, X, 0x00
|
GLDREPL xv, d, X0, X, 0x00
|
||||||
GCOMPLEXMUL GXCONJ, \
|
GCOMPLEXMUL GXCONJ, \
|
||||||
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2
|
xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro CLOAD_Y_1
|
.macro CLOAD_Y_1
|
||||||
|
|
|
@ -0,0 +1,290 @@
|
||||||
|
/*******************************************************************************
|
||||||
|
Copyright (c) 2024, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*******************************************************************************/
|
||||||
|
#define ASSEMBLER
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include "loongarch64_asm.S"
|
||||||
|
|
||||||
|
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
|
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||||
|
*/
|
||||||
|
#define M $r4
|
||||||
|
#define N $r5
|
||||||
|
#define ALPHA_R $f0
|
||||||
|
#define ALPHA_I $f1
|
||||||
|
#define A $r7
|
||||||
|
#define LDA $r8
|
||||||
|
#define X $r9
|
||||||
|
#define INC_X $r10
|
||||||
|
#define Y $r11
|
||||||
|
#define INC_Y $r6
|
||||||
|
|
||||||
|
#define J $r12
|
||||||
|
#define I $r13
|
||||||
|
#define K $r14
|
||||||
|
#define PY0 $r14
|
||||||
|
#define X_ORG $r15
|
||||||
|
#define PY1 $r16
|
||||||
|
#define K_LDA $r17
|
||||||
|
#define PY2 $r18
|
||||||
|
#define T0 $r19
|
||||||
|
#define PA0 $r20
|
||||||
|
#define PA1 $r23
|
||||||
|
#define PA2 $r24
|
||||||
|
#define PA3 $r25
|
||||||
|
#define PA4 $r26
|
||||||
|
#define PA5 $r27
|
||||||
|
#define PA6 $r28
|
||||||
|
#define PA7 $r29
|
||||||
|
#define M8 $r30
|
||||||
|
|
||||||
|
#define VALPHA $vr0
|
||||||
|
#define X0 $vr1
|
||||||
|
#define X1 $vr2
|
||||||
|
#define A0 $vr3
|
||||||
|
#define A1 $vr4
|
||||||
|
#define A2 $vr5
|
||||||
|
#define A3 $vr6
|
||||||
|
#define A4 $vr7
|
||||||
|
#define A5 $vr8
|
||||||
|
#define A6 $vr9
|
||||||
|
#define A7 $vr10
|
||||||
|
#define A8 $vr11
|
||||||
|
#define A9 $vr12
|
||||||
|
#define A10 $vr13
|
||||||
|
#define A11 $vr14
|
||||||
|
#define A12 $vr15
|
||||||
|
#define A13 $vr16
|
||||||
|
#define A14 $vr17
|
||||||
|
#define A15 $vr18
|
||||||
|
#define TP0 $vr19
|
||||||
|
#define TP1 $vr20
|
||||||
|
#define TP2 $vr21
|
||||||
|
#define TP3 $vr22
|
||||||
|
#define TP4 $vr23
|
||||||
|
#define TP5 $vr24
|
||||||
|
#define TP6 $vr25
|
||||||
|
#define TP7 $vr26
|
||||||
|
#define TMP0 $vr27
|
||||||
|
#define TMP1 $vr28
|
||||||
|
#define TMP2 $vr29
|
||||||
|
#define Y0 $vr3
|
||||||
|
#define Y1 $vr4
|
||||||
|
#define Y2 $vr5
|
||||||
|
#define Y3 $vr6
|
||||||
|
#define Y4 $vr7
|
||||||
|
#define Y5 $vr8
|
||||||
|
#define Y6 $vr9
|
||||||
|
#define Y7 $vr10
|
||||||
|
|
||||||
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
|
#define GXCONJ1 0
|
||||||
|
#define GCONJ1 0
|
||||||
|
#else
|
||||||
|
#define GXCONJ1 1
|
||||||
|
#define GCONJ1 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !defined(XCONJ)
|
||||||
|
#define GXCONJ2 0
|
||||||
|
#define GCONJ2 0
|
||||||
|
#else
|
||||||
|
#define GXCONJ2 0
|
||||||
|
#define GCONJ2 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// ZERO_Y4: clear the four accumulators TP0..TP3 used by the 4-column path
// (each is passed to GXOR as destination and as both sources; GXOR comes
// from loongarch64_asm.S - presumably a vector XOR, confirm there).
.macro ZERO_Y4
|
||||||
|
GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3
|
||||||
|
.endm
|
||||||
|
|
||||||
|
// ZERO_Y1: clear the single accumulator TP0 used by the one-column tail path
// (same GXOR idiom as ZERO_Y4, restricted to TP0).
.macro ZERO_Y1
|
||||||
|
GXOR v, v, TP0, TP0, TP0
|
||||||
|
.endm
|
||||||
|
|
||||||
|
// CLOAD_X4: x loader for the contiguous case (selected by the .L_GAP_0 /
// inc_x == 1 expansion). Fills X0 and X1 from byte offsets 0x00 and 0x10 of X,
// i.e. 32 consecutive bytes = four 8-byte complex elements.
// GLD is a helper from loongarch64_asm.S - presumably a plain vector load.
.macro CLOAD_X4
|
||||||
|
GLD v, , X0, X, 0x00, X1, X, 0x10
|
||||||
|
.endm
|
||||||
|
|
||||||
|
// CLOAD_X4_GAP: x loader for the strided case (selected by the .L_GAP_1 /
// inc_x != 1 expansion). Gathers four 8-byte elements spaced INC_X bytes
// apart and packs them pairwise into X0 and X1.
// Clobbers T0 and the low halves of $vr3/$vr4 (A0/A1), which serve as scratch.
.macro CLOAD_X4_GAP
|
||||||
|
// $f1 (low half of X0) <- element at X
fld.d $f1, X, 0x00
|
||||||
|
// $f3 (low half of A0) <- element at X + INC_X
fldx.d $f3, X, INC_X
|
||||||
|
// T0 = X + 2 * INC_X
PTR_ALSL T0, INC_X, X, 1
|
||||||
|
// $f2 (low half of X1) <- element at X + 2 * INC_X
fld.d $f2, T0, 0x00
|
||||||
|
// $f4 (low half of A1) <- element at X + 3 * INC_X
fldx.d $f4, T0, INC_X
|
||||||
|
// X0 = { $f1, $f3 }: even (low) doublewords of X0 and A0 interleaved
vpackev.d X0, A0, X0
|
||||||
|
// X1 = { $f2, $f4 }
vpackev.d X1, A1, X1
|
||||||
|
.endm
|
||||||
|
|
||||||
|
// CGEMV_T_4x4: one step of the 4-row x 4-column inner loop.
// Expects X0/X1 to hold the four current x elements (see the CLOAD_X4* macros)
// and TP0..TP3 to hold the running per-column accumulators.
.macro CGEMV_T_4x4
|
||||||
|
// Fetch two vectors from each of the four column pointers PA0..PA3 into
// A0..A7. GLD_INC is a helper from loongarch64_asm.S; the leading 0x10 is
// presumably the post-increment applied to the pointer after each load -
// confirm against the helper.
GLD_INC v, , 0x10, \
|
||||||
|
A0, PA0, 0, A1, PA0, 0, \
|
||||||
|
A2, PA1, 0, A3, PA1, 0, \
|
||||||
|
A4, PA2, 0, A5, PA2, 0, \
|
||||||
|
A6, PA3, 0, A7, PA3, 0
|
||||||
|
// Complex multiply-accumulate: TPk receives the products of column k's two
// vectors with X0 and X1 respectively. GXCONJ1/GCONJ1 (chosen from CONJ/XCONJ
// earlier in the file) select the conjugation variant; TMP0..TMP2 are scratch.
GCOMPLEXMADD GXCONJ1, GCONJ1, \
|
||||||
|
vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \
|
||||||
|
TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \
|
||||||
|
TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \
|
||||||
|
TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2
|
||||||
|
.endm
|
||||||
|
|
||||||
|
// CGEMV_T_LSX XW, X4 - body of the transposed complex GEMV kernel.
//   XW : text spliced into every local label (.L_<XW>_...), so the macro can
//        be expanded more than once without label clashes.
//   X4 : suffix of the x-loader macro to invoke (expands to CLOAD_<X4>).
// Layout: columns are handled four at a time (J = N >> 2), then one at a time
// for the N & 3 leftovers. Within a column group, rows are consumed four at a
// time and then one at a time for the M & 3 leftovers.
// Every path leaves through .L_END, which is defined by the caller of the macro.
.macro CGEMV_T_LSX XW:req, X4:req
|
||||||
|
// J = number of 4-column groups; skip to the tail when there are none
PTR_SRLI J, N, 2
|
||||||
|
beqz J, .L_\XW\()_N_3
|
||||||
|
// K_LDA = 4 * LDA - M8: added to PA0..PA3 after a group, once the row loops
// have walked each pointer down its column (M8 is M << 3, set by the entry code)
PTR_SLLI K_LDA, LDA, 2
|
||||||
|
PTR_SUB K_LDA, K_LDA, M8
|
||||||
|
.L_\XW\()_N_L4:
|
||||||
|
// New column group: clear TP0..TP3 and rewind X to the start of x
ZERO_Y4
|
||||||
|
move X, X_ORG
|
||||||
|
// I = number of 4-row blocks
PTR_SRLI I, M, 2
|
||||||
|
beqz I, .L_\XW\()_M_3
|
||||||
|
.align 5
|
||||||
|
.L_\XW\()_M_L4:
|
||||||
|
CLOAD_\X4
|
||||||
|
CGEMV_T_4x4
|
||||||
|
PTR_ADDI I, I, -1
|
||||||
|
// X += 4 * INC_X
PTR_ALSL X, INC_X, X, 2
|
||||||
|
bnez I, .L_\XW\()_M_L4
|
||||||
|
.L_\XW\()_M_3:
|
||||||
|
// Accumulated
|
||||||
|
// Reduce each TPk into Yk. Y0..Y3 are the same registers as A0..A3, so from
// here on A0..A3 hold the per-column sums.
GCOMPLEXACC vf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
|
||||||
|
// I = leftover rows (M & 3)
andi I, M, 3
|
||||||
|
beqz I, .L_\XW\()_M_END
|
||||||
|
.align 5
|
||||||
|
.L_\XW\()_M_L1:
|
||||||
|
// One row: $f1 (low half of X0) <- x element; $f11..$f14 (low halves of
// A8..A11) <- the matching element of each of the four columns
fld.d $f1, X, 0x00
|
||||||
|
fld.d $f11, PA0, 0x00
|
||||||
|
fld.d $f12, PA1, 0x00
|
||||||
|
fld.d $f13, PA2, 0x00
|
||||||
|
fld.d $f14, PA3, 0x00
|
||||||
|
// Advance the four column pointers by one 8-byte element
#if __loongarch_grlen == 64
|
||||||
|
GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
|
||||||
|
#elif __loongarch_grlen == 32
|
||||||
|
GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
|
||||||
|
#else
|
||||||
|
GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Add this row's products into the per-column sums held in A0..A3
GCOMPLEXMADD GXCONJ1, GCONJ1, \
|
||||||
|
vf, s, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \
|
||||||
|
A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2
|
||||||
|
|
||||||
|
PTR_ADDI I, I, -1
|
||||||
|
PTR_ADD X, X, INC_X
|
||||||
|
bnez I, .L_\XW\()_M_L1
|
||||||
|
.L_\XW\()_M_END:
|
||||||
|
// Load the four y entries of this group into $f11..$f14 (A8..A11);
// PY0 = Y + 2 * INC_Y addresses the third and fourth
fld.d $f11, Y, 0x00
|
||||||
|
fldx.d $f12, Y, INC_Y
|
||||||
|
PTR_ALSL PY0, INC_Y, Y, 1
|
||||||
|
fld.d $f13, PY0, 0x00
|
||||||
|
fldx.d $f14, PY0, INC_Y
|
||||||
|
|
||||||
|
// y_k += VALPHA * sum_k, using the GXCONJ2/GCONJ2 conjugation variant
GCOMPLEXMADD GXCONJ2, GCONJ2, \
|
||||||
|
vf, s, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\
|
||||||
|
A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2
|
||||||
|
|
||||||
|
PTR_ADDI J, J, -1
|
||||||
|
// Step PA0..PA3 to the next column group
#if __loongarch_grlen == 64
|
||||||
|
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
|
||||||
|
#elif __loongarch_grlen == 32
|
||||||
|
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
|
||||||
|
#else
|
||||||
|
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
|
||||||
|
#endif
|
||||||
|
// Write the four updated y entries back to where they were loaded from
fst.d $f11, Y, 0x00
|
||||||
|
fstx.d $f12, Y, INC_Y
|
||||||
|
fst.d $f13, PY0, 0x00
|
||||||
|
fstx.d $f14, PY0, INC_Y
|
||||||
|
// Y += 4 * INC_Y
PTR_ALSL Y, INC_Y, Y, 2
|
||||||
|
bnez J, .L_\XW\()_N_L4
|
||||||
|
.L_\XW\()_N_3:
|
||||||
|
// J = leftover columns (N & 3); nothing left means done
andi J, N, 3
|
||||||
|
beqz J, .L_END
|
||||||
|
// Single-column stride: K_LDA = LDA - M8
PTR_SUB K_LDA, LDA, M8
|
||||||
|
.L_\XW\()_N_1:
|
||||||
|
// One leftover column: clear TP0, rewind X, and walk all M rows one by one
ZERO_Y1
|
||||||
|
move X, X_ORG
|
||||||
|
move I, M
|
||||||
|
beqz I, .L_END
|
||||||
|
.align 5
|
||||||
|
.L_\XW\()_N_1_M_L1:
|
||||||
|
// $f3 (low half of A0) <- matrix element, $f1 (low half of X0) <- x element
fld.d $f3, PA0, 0x00
|
||||||
|
fld.d $f1, X, 0x00
|
||||||
|
GCOMPLEXMADD GXCONJ1, GCONJ1, \
|
||||||
|
vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
|
||||||
|
PTR_ADDI I, I, -1
|
||||||
|
PTR_ADD X, X, INC_X
|
||||||
|
PTR_ADDI PA0, PA0, 0x08
|
||||||
|
bnez I, .L_\XW\()_N_1_M_L1
|
||||||
|
.L_\XW\()_N_1_M_END:
|
||||||
|
PTR_ADDI J, J, -1
|
||||||
|
// y += VALPHA * TP0 for this column; only the low 64 bits ($f3) are stored
fld.d $f3, Y, 0x00
|
||||||
|
GCOMPLEXMADD GXCONJ2, GCONJ2, \
|
||||||
|
vf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
|
||||||
|
fst.d $f3, Y, 0x00
|
||||||
|
PTR_ADD PA0, PA0, K_LDA
|
||||||
|
PTR_ADD Y, Y, INC_Y
|
||||||
|
bnez J, .L_\XW\()_N_1
|
||||||
|
|
||||||
|
// Explicit jump so one expansion never falls through into the next
b .L_END
|
||||||
|
.endm
|
||||||
|
|
||||||
|
// Kernel entry point. Register arguments are named by the #defines earlier in
// this file; inc_y is read from the first stack slot into $r6, which the
// signature comment lists as the unused dummy1.
PROLOGUE
|
||||||
|
PTR_LD INC_Y, $sp, 0
|
||||||
|
// push_if_used / pop_if_used are helpers from loongarch64_asm.S - presumably
// they save and restore the callee-saved registers this kernel uses (confirm)
push_if_used 17 + 8, 30
|
||||||
|
// K = 1, I = INC_X - 1 (INC_X is still an element count here)
PTR_ADDI K, $r0, 0x01
|
||||||
|
PTR_SUB I, INC_X, K
|
||||||
|
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
|
||||||
|
// Convert LDA, INC_X, INC_Y from elements to bytes (<< 3) and set M8 = M << 3
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
|
||||||
|
// Init VALPHA
|
||||||
|
// Word 0 of $vr0 keeps alpha_r ($f0) and word 1 receives alpha_i ($f1);
// the second pack copies that 64-bit pair into both halves of VALPHA
vpackev.w $vr0, $vr1, $vr0
|
||||||
|
vpackev.d VALPHA, $vr0, $vr0
|
||||||
|
// Remember the start of x; the column loops rewind X to it
move X_ORG, X
|
||||||
|
move PA0, A
|
||||||
|
// PA1..PA4 = PA0 + 1..4 * LDA (PA4 is computed here but not referenced by the
// LSX macros in this file)
#if __loongarch_grlen == 64
|
||||||
|
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
|
||||||
|
#elif __loongarch_grlen == 32
|
||||||
|
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
|
||||||
|
#else
|
||||||
|
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
|
||||||
|
#endif
|
||||||
|
// Dispatch through a two-entry table of 16-bit offsets relative to
// .L_GAP_TABLE: entry I (0 or 1, computed above) is loaded, added to the
// table address, and jumped to
la.local T0, .L_GAP_TABLE
|
||||||
|
PTR_ALSL I, I, T0, 1
|
||||||
|
ld.h K, I, 0
|
||||||
|
PTR_ADD T0, T0, K
|
||||||
|
jirl $r0, T0, 0
|
||||||
|
.L_GAP_TABLE:
|
||||||
|
.hword .L_GAP_0 - .L_GAP_TABLE
|
||||||
|
.hword .L_GAP_1 - .L_GAP_TABLE
|
||||||
|
.L_GAP_0: /* if (incx == 1) */
|
||||||
|
CGEMV_T_LSX GAP_0, X4
|
||||||
|
.L_GAP_1: /* if (incx != 1) */
|
||||||
|
CGEMV_T_LSX GAP_1, X4_GAP
|
||||||
|
// Common exit for both expansions: undo push_if_used and return via $r1
.L_END:
|
||||||
|
pop_if_used 17 + 8, 30
|
||||||
|
jirl $r0, $r1, 0x0
|
||||||
|
EPILOGUE
|
|
@ -0,0 +1,229 @@
|
||||||
|
/*******************************************************************************
|
||||||
|
Copyright (c) 2024, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*******************************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/* Param */
|
||||||
|
#define M $r4
|
||||||
|
#define N $r5
|
||||||
|
#define A $r7
|
||||||
|
#define LDA $r8
|
||||||
|
#define X $r9
|
||||||
|
#define INCX $r10
|
||||||
|
#define Y $r11
|
||||||
|
#define INCY $r6
|
||||||
|
#define BUFFER $r16
|
||||||
|
#define ALPHA $f0
|
||||||
|
|
||||||
|
#define YORIG $r18
|
||||||
|
#define T0 $r19
|
||||||
|
#define T1 $r20
|
||||||
|
#define XX $r12
|
||||||
|
#define YY $r13
|
||||||
|
#define I $r14
|
||||||
|
#define J $r15
|
||||||
|
#define AO1 $r23
|
||||||
|
#define AO2 $r24
|
||||||
|
#define IX $r25
|
||||||
|
#define IY $r26
|
||||||
|
#define II $r27
|
||||||
|
#define T2 $r28
|
||||||
|
#define T3 $r29
|
||||||
|
#define T4 $r30
|
||||||
|
|
||||||
|
/* LSX vectors */
|
||||||
|
#define U0 $vr11
|
||||||
|
#define U1 $vr12
|
||||||
|
#define U2 $vr2
|
||||||
|
#define U3 $vr3
|
||||||
|
#define U4 $vr4
|
||||||
|
#define U5 $vr5
|
||||||
|
#define U6 $vr6
|
||||||
|
#define U7 $vr7
|
||||||
|
#define U8 $vr8
|
||||||
|
#define U9 $vr9
|
||||||
|
#define VALPHA $vr10
|
||||||
|
|
||||||
|
#define a1 $f3
|
||||||
|
#define a2 $f4
|
||||||
|
#define a3 $f5
|
||||||
|
#define a4 $f6
|
||||||
|
#define a5 $f7
|
||||||
|
#define a6 $f8
|
||||||
|
#define a7 $f9
|
||||||
|
#define a8 $f10
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * DGEMV "N" kernel (LSX): for each column j of A, y += (alpha * x[j]) * A(:, j).
 *
 * Register arguments follow the #defines above (M, N, A, LDA, X, INCX, Y);
 * INCY and BUFFER are read from the stack. BUFFER is loaded but not used.
 * Rows are updated four at a time, then a pair (M & 2), then a single (M & 1).
 *
 * Change from the previous revision: x[j] is now fetched with a one-element
 * vldrepl.d. The former "vldx U0, XX, IX" read 16 bytes, i.e. 8 bytes past the
 * end of x when j was the last column.
 */
    PROLOGUE

    LDARG      INCY,   $sp,  0
    LDARG      BUFFER, $sp,  8

    /* 80-byte frame: $r23..$r30 plus a spill slot for alpha (offset 24 is unused) */
    addi.d     $sp,  $sp,  -80

    SDARG      $r23, $sp,  0
    SDARG      $r24, $sp,  8
    SDARG      $r25, $sp,  16
    SDARG      $r26, $sp,  32
    SDARG      $r27, $sp,  40
    SDARG      $r28, $sp,  48
    SDARG      $r29, $sp,  56
    SDARG      $r30, $sp,  64
    ST         ALPHA, $sp, 72

    /* VALPHA = { alpha, alpha }, reloaded from the spill slot */
    vldrepl.d  VALPHA, $sp, 72

    /* element counts -> byte strides */
    slli.d     LDA,  LDA,  BASE_SHIFT
    slli.d     INCX, INCX, BASE_SHIFT
    slli.d     INCY, INCY, BASE_SHIFT

    /* nothing to do for M <= 0 or N <= 0 */
    bge        $r0,  M,    .L999
    bge        $r0,  N,    .L999

    move       J,    $r0
    move       IX,   $r0              /* byte offset of x[j] */

    move       AO1,  A                /* start of column j */
    move       XX,   X
    move       YY,   Y

    /* defensive only: J is 0 and M > 0 here, so this never branches */
    beq        J,    M,    .L999

.L01:                                 /* ---- per column j ---- */
    /* U1 = { temp, temp } with temp = alpha * x[j]; load exactly one element */
    add.d      T0,   XX,   IX
    vldrepl.d  U0,   T0,   0

    vfmul.d    U1,   VALPHA, U0       /* temp1 */

    move       IY,   $r0              /* byte offset into y */
    move       II,   $r0              /* byte offset into the column */
    move       I,    $r0

    srai.d     T0,   M,    2          /* m / 4 */
    beq        I,    T0,   .L03

.L02:                                 /* ---- four rows per iteration ---- */
    vldx       U2,   AO1,  II         /* a[i], a[i+1] */
    addi.d     II,   II,   16
    vldx       U7,   AO1,  II         /* a[i+2], a[i+3] */

    /* byte offsets of y[i..i+3] */
    move       T1,   IY
    add.d      T2,   T1,   INCY
    add.d      T3,   T2,   INCY
    add.d      T4,   T3,   INCY

    fldx.d     a1,   YY,   T1
    fldx.d     a2,   YY,   T2
    fldx.d     a3,   YY,   T3
    fldx.d     a4,   YY,   T4

    /* pack scalars: U3 = { a1, a2 }, U5 = { a3, a4 } */
    vextrins.d U3,   U4,   0x10
    vextrins.d U5,   U6,   0x10

    vfmadd.d   U3,   U1,   U2,   U3
    vfmadd.d   U5,   U1,   U7,   U5

    /* unpack the high lanes back into a2 / a4 */
    vextrins.d U4,   U3,   0x01
    vextrins.d U6,   U5,   0x01

    fstx.d     a1,   YY,   T1
    fstx.d     a2,   YY,   T2
    fstx.d     a3,   YY,   T3
    fstx.d     a4,   YY,   T4

    add.d      IY,   T4,   INCY
    addi.d     II,   II,   16
    addi.d     I,    I,    1
    blt        I,    T0,   .L02

.L03:                                 /* ---- two leftover rows (M & 2) ---- */
    andi       T0,   M,    2
    beq        $r0,  T0,   .L04

    /* II = (M - M % 4) elements, converted to bytes */
    addi.d     T1,   $r0,  4
    mod.d      T1,   M,    T1
    sub.d      II,   M,    T1
    slli.d     II,   II,   BASE_SHIFT

    move       T1,   IY
    add.d      T2,   T1,   INCY

    vldx       U2,   AO1,  II

    fldx.d     a1,   YY,   T1
    fldx.d     a2,   YY,   T2

    vextrins.d U3,   U4,   0x10

    vfmadd.d   U3,   U1,   U2,   U3

    vextrins.d U4,   U3,   0x01

    fstx.d     a1,   YY,   T1
    fstx.d     a2,   YY,   T2

    add.d      IY,   T2,   INCY

.L04:                                 /* ---- one leftover row (M & 1) ---- */
    andi       T0,   M,    1
    beq        $r0,  T0,   .L05

    addi.d     II,   M,    -1
    slli.d     II,   II,   BASE_SHIFT

    fldx.d     a1,   AO1,  II
    fldx.d     a3,   YY,   IY

    /* $f12 is the low lane of U1, i.e. temp */
    fmadd.d    a3,   $f12, a1,   a3

    fstx.d     a3,   YY,   IY

    add.d      IY,   IY,   INCY

.L05:                                 /* ---- next column ---- */
    add.d      AO1,  AO1,  LDA
    add.d      IX,   IX,   INCX

    addi.d     J,    J,    1
    blt        J,    N,    .L01

.L999:                                /* ---- restore and return ---- */
    LDARG      $r23, $sp,  0
    LDARG      $r24, $sp,  8
    LDARG      $r25, $sp,  16
    LDARG      $r26, $sp,  32
    LDARG      $r27, $sp,  40
    LDARG      $r28, $sp,  48
    LDARG      $r29, $sp,  56
    LDARG      $r30, $sp,  64
    LD         ALPHA, $sp, 72
    addi.d     $sp,  $sp,  80
    jirl       $r0,  $r1,  0x0

    EPILOGUE
|
|
@ -0,0 +1,279 @@
|
||||||
|
/*******************************************************************************
|
||||||
|
Copyright (c) 2024, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*******************************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/* Param */
|
||||||
|
#define M $r4
|
||||||
|
#define N $r5
|
||||||
|
#define A $r7
|
||||||
|
#define LDA $r8
|
||||||
|
#define X $r9
|
||||||
|
#define INCX $r10
|
||||||
|
#define Y $r11
|
||||||
|
#define INCY $r6
|
||||||
|
#define BUFFER $r16
|
||||||
|
#define ALPHA $f0
|
||||||
|
|
||||||
|
#define YORIG $r18
|
||||||
|
#define T0 $r19
|
||||||
|
#define T1 $r20
|
||||||
|
#define AO3 $r12
|
||||||
|
#define AO4 $r13
|
||||||
|
#define I $r14
|
||||||
|
#define J $r15
|
||||||
|
#define AO1 $r23
|
||||||
|
#define AO2 $r24
|
||||||
|
#define IX $r25
|
||||||
|
#define IY $r26
|
||||||
|
#define II $r27
|
||||||
|
#define T2 $r28
|
||||||
|
#define T3 $r29
|
||||||
|
#define T4 $r30
|
||||||
|
|
||||||
|
/* LSX vectors */
|
||||||
|
#define U0 $vr11
|
||||||
|
#define U1 $vr12
|
||||||
|
#define U2 $vr2
|
||||||
|
#define U3 $vr3
|
||||||
|
#define U4 $vr4
|
||||||
|
#define U5 $vr5
|
||||||
|
#define U6 $vr6
|
||||||
|
#define U7 $vr7
|
||||||
|
#define U8 $vr8
|
||||||
|
#define U9 $vr9
|
||||||
|
#define VALPHA $vr10
|
||||||
|
|
||||||
|
#define a1 $f3
|
||||||
|
#define a2 $f4
|
||||||
|
#define a3 $f5
|
||||||
|
#define a4 $f6
|
||||||
|
#define a5 $f7
|
||||||
|
#define a6 $f8
|
||||||
|
#define a7 $f9
|
||||||
|
#define a8 $f10
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * DGEMV "T" kernel (LSX): for each column j of A,
 *     y[j] += alpha * sum_i A(i, j) * x[i].
 *
 * Register arguments follow the #defines above (M, N, A, LDA, X, INCX, Y);
 * INCY and BUFFER are read from the stack. BUFFER is loaded but not used.
 * Columns are processed four at a time, then a pair (N & 2), then one (N & 1).
 *
 * Change from the previous revision: x[i] is now fetched with a one-element
 * vldrepl.d in both vector inner loops. The former "vldx U1, X, IX" read
 * 16 bytes, i.e. 8 bytes past the end of x on the last row.
 */
    PROLOGUE

    LDARG      INCY,   $sp,  0
    LDARG      BUFFER, $sp,  8

    /* 80-byte frame: $r23..$r30 plus a spill slot for alpha (offset 24 is unused) */
    addi.d     $sp,  $sp,  -80

    SDARG      $r23, $sp,  0
    SDARG      $r24, $sp,  8
    SDARG      $r25, $sp,  16
    SDARG      $r26, $sp,  32
    SDARG      $r27, $sp,  40
    SDARG      $r28, $sp,  48
    SDARG      $r29, $sp,  56
    SDARG      $r30, $sp,  64
    ST         ALPHA, $sp, 72

    /* VALPHA = { alpha, alpha }, reloaded from the spill slot */
    vldrepl.d  VALPHA, $sp, 72

    /* element counts -> byte strides */
    slli.d     LDA,  LDA,  BASE_SHIFT
    slli.d     INCX, INCX, BASE_SHIFT
    slli.d     INCY, INCY, BASE_SHIFT

    /* nothing to do for M <= 0 or N <= 0 */
    bge        $r0,  M,    .L999
    bge        $r0,  N,    .L999

    move       J,    $r0
    move       IY,   $r0              /* byte offset into y */

    move       AO1,  A                /* first column of the current group */

    srai.d     T0,   N,    2          /* n / 4 */
    beq        J,    T0,   .L04

.L01:                                 /* ---- j < n/4: four columns ---- */
    vxor.v     U0,   U0,   U0         /* sums for columns 0, 1 */
    vxor.v     U7,   U7,   U7         /* sums for columns 2, 3 */
    add.d      AO2,  AO1,  LDA
    add.d      AO3,  AO2,  LDA
    add.d      AO4,  AO3,  LDA

    move       IX,   $r0
    move       I,    $r0
    move       II,   $r0

    /* M > 0 was checked above, so this guard never fires */
    beq        $r0,  M,    .L03

.L02:                                 /* ---- i < m ---- */
    /* U1 = { x[i], x[i] }; T1 is free scratch until .L03 reassigns it */
    add.d      T1,   X,    IX
    vldrepl.d  U1,   T1,   0
    fldx.d     $f2,  AO1,  II
    fldx.d     $f3,  AO2,  II
    fldx.d     $f4,  AO3,  II
    fldx.d     $f5,  AO4,  II

    /* pack the four column elements: U2 = { $f2, $f3 }, U4 = { $f4, $f5 } */
    vextrins.d U2,   U3,   0x10
    vextrins.d U4,   U5,   0x10

    vfmadd.d   U0,   U2,   U1,   U0   /* temp1,2 */
    vfmadd.d   U7,   U4,   U1,   U7   /* temp3,4 */

    add.d      IX,   IX,   INCX

    addi.d     II,   II,   8
    addi.d     I,    I,    1
    blt        I,    M,    .L02

.L03:                                 /* ---- y[j..j+3] += alpha * sums ---- */
    move       T1,   IY
    add.d      T2,   T1,   INCY
    add.d      T3,   T2,   INCY
    add.d      T4,   T3,   INCY

    fldx.d     $f3,  Y,    T1
    fldx.d     $f4,  Y,    T2
    fldx.d     $f5,  Y,    T3
    fldx.d     $f6,  Y,    T4

    /* pack: U3 = { $f3, $f4 }, U5 = { $f5, $f6 } */
    vextrins.d U3,   U4,   0x10
    vextrins.d U5,   U6,   0x10

    vfmadd.d   U3,   VALPHA, U0,  U3
    vfmadd.d   U5,   VALPHA, U7,  U5

    /* unpack the high lanes back into $f4 / $f6 */
    vextrins.d U4,   U3,   0x01
    vextrins.d U6,   U5,   0x01

    fstx.d     $f3,  Y,    T1
    fstx.d     $f4,  Y,    T2
    fstx.d     $f5,  Y,    T3
    fstx.d     $f6,  Y,    T4

    /* advance A by four columns and y by four entries */
    slli.d     T1,   LDA,  2
    add.d      AO1,  AO1,  T1
    add.d      IY,   T4,   INCY

    addi.d     J,    J,    1
    blt        J,    T0,   .L01

.L04:                                 /* ---- if (n & 2): two columns ---- */
    andi       T0,   N,    2
    beq        $r0,  T0,   .L07

    vxor.v     U0,   U0,   U0

    add.d      AO2,  AO1,  LDA

    move       IX,   $r0
    move       I,    $r0
    move       II,   $r0

    beq        $r0,  M,    .L06

.L05:                                 /* ---- i < m ---- */
    /* U1 = { x[i], x[i] }; T1 is free scratch until .L06 reassigns it */
    add.d      T1,   X,    IX
    vldrepl.d  U1,   T1,   0
    fldx.d     $f2,  AO1,  II
    fldx.d     $f3,  AO2,  II

    vextrins.d U2,   U3,   0x10

    vfmadd.d   U0,   U2,   U1,   U0   /* temp1,2 */

    add.d      IX,   IX,   INCX

    addi.d     II,   II,   8
    addi.d     I,    I,    1
    blt        I,    M,    .L05

.L06:
    move       T1,   IY
    add.d      T2,   T1,   INCY

    fldx.d     a1,   Y,    T1
    fldx.d     a2,   Y,    T2

    vextrins.d U3,   U4,   0x10

    vfmadd.d   U3,   VALPHA, U0,  U3

    vextrins.d U4,   U3,   0x01

    fstx.d     a1,   Y,    T1
    fstx.d     a2,   Y,    T2

    /* advance A by two columns and y by two entries */
    slli.d     T0,   LDA,  1
    add.d      AO1,  AO1,  T0
    add.d      IY,   T2,   INCY

.L07:                                 /* ---- if (n & 1): last column, scalar ---- */
    andi       T0,   N,    1
    beq        $r0,  T0,   .L999

    MTC        a1,   $r0              /* running sum = 0 */

    move       IX,   $r0
    move       I,    $r0
    move       II,   $r0

    beq        $r0,  M,    .L09

.L08:                                 /* ---- i < m ---- */
    fldx.d     a3,   X,    IX
    fldx.d     a4,   AO1,  II

    fmadd.d    a1,   a4,   a3,   a1   /* temp1 */

    add.d      IX,   IX,   INCX

    addi.d     II,   II,   8
    addi.d     I,    I,    1
    blt        I,    M,    .L08

.L09:
    fldx.d     a3,   Y,    IY

    fmadd.d    a3,   ALPHA, a1,  a3

    fstx.d     a3,   Y,    IY

    add.d      AO1,  AO1,  LDA
    add.d      IY,   IY,   INCY

.L999:                                /* ---- restore and return ---- */
    LDARG      $r23, $sp,  0
    LDARG      $r24, $sp,  8
    LDARG      $r25, $sp,  16
    LDARG      $r26, $sp,  32
    LDARG      $r27, $sp,  40
    LDARG      $r28, $sp,  48
    LDARG      $r29, $sp,  56
    LDARG      $r30, $sp,  64
    LD         ALPHA, $sp, 72
    addi.d     $sp,  $sp,  80
    jirl       $r0,  $r1,  0x0

    EPILOGUE
|
|
@ -406,9 +406,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
.ifeqs "\suf_op", "s"
|
.ifeqs "\suf_op", "s"
|
||||||
vpackod.d \out, \in, \in
|
vpackod.d \out, \in, \in
|
||||||
\pre_op\()add.\suf_op \out, \out, \in
|
\pre_op\()add.\suf_op \out, \out, \in
|
||||||
|
.else
|
||||||
|
vor.v \out, \in, \in
|
||||||
.endif
|
.endif
|
||||||
.endif
|
.endif
|
||||||
|
|
||||||
|
|
||||||
.ifnb \more
|
.ifnb \more
|
||||||
GCOMPLEXACC \pre_op, \suf_op, \more
|
GCOMPLEXACC \pre_op, \suf_op, \more
|
||||||
.endif
|
.endif
|
||||||
|
|
|
@ -0,0 +1,227 @@
|
||||||
|
/*******************************************************************************
|
||||||
|
Copyright (c) 2024, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*******************************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/* Param */
|
||||||
|
#define M $r4
|
||||||
|
#define N $r5
|
||||||
|
#define A $r7
|
||||||
|
#define LDA $r8
|
||||||
|
#define X $r9
|
||||||
|
#define INCX $r10
|
||||||
|
#define Y $r11
|
||||||
|
#define INCY $r6
|
||||||
|
#define BUFFER $r16
|
||||||
|
#define ALPHA $f0
|
||||||
|
|
||||||
|
#define YORIG $r18
|
||||||
|
#define T0 $r19
|
||||||
|
#define T1 $r20
|
||||||
|
#define XX $r12
|
||||||
|
#define YY $r13
|
||||||
|
#define I $r14
|
||||||
|
#define J $r15
|
||||||
|
#define AO1 $r23
|
||||||
|
#define AO2 $r24
|
||||||
|
#define IX $r25
|
||||||
|
#define IY $r26
|
||||||
|
#define II $r27
|
||||||
|
#define T2 $r28
|
||||||
|
#define T3 $r29
|
||||||
|
#define T4 $r30
|
||||||
|
|
||||||
|
/* LSX vectors */
|
||||||
|
#define U0 $vr11
|
||||||
|
#define U1 $vr12
|
||||||
|
#define U2 $vr2
|
||||||
|
#define U3 $vr3
|
||||||
|
#define U4 $vr4
|
||||||
|
#define U5 $vr5
|
||||||
|
#define U6 $vr6
|
||||||
|
#define U7 $vr7
|
||||||
|
#define U8 $vr8
|
||||||
|
#define U9 $vr9
|
||||||
|
#define VALPHA $vr10
|
||||||
|
|
||||||
|
#define a1 $f3
|
||||||
|
#define a2 $f4
|
||||||
|
#define a3 $f5
|
||||||
|
#define a4 $f6
|
||||||
|
#define a5 $f7
|
||||||
|
#define a6 $f8
|
||||||
|
#define a7 $f9
|
||||||
|
#define a8 $f10
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * SGEMV "N" kernel (LSX): for each column j of A, y += (alpha * x[j]) * A(:, j).
 *
 * Register arguments follow the #defines above (M, N, A, LDA, X, INCX, Y);
 * INCY and BUFFER are read from the stack. BUFFER is loaded but not used.
 * Rows are updated four at a time (one vector), then a pair (M & 2) and a
 * single (M & 1) with scalar instructions.
 *
 * Change from the previous revision: x[j] is now fetched with a one-element
 * vldrepl.w. The former "vldx U0, XX, IX" read 16 bytes to use 4 of them,
 * i.e. up to 12 bytes past the end of x near the last column.
 */
    PROLOGUE

    LDARG      INCY,   $sp,  0
    LDARG      BUFFER, $sp,  8

    /* 80-byte frame: $r23..$r30 plus a spill slot for alpha (offset 24 is unused) */
    addi.d     $sp,  $sp,  -80

    SDARG      $r23, $sp,  0
    SDARG      $r24, $sp,  8
    SDARG      $r25, $sp,  16
    SDARG      $r26, $sp,  32
    SDARG      $r27, $sp,  40
    SDARG      $r28, $sp,  48
    SDARG      $r29, $sp,  56
    SDARG      $r30, $sp,  64
    ST         ALPHA, $sp, 72

    /* VALPHA = alpha in all four lanes, reloaded from the spill slot */
    vldrepl.w  VALPHA, $sp, 72

    /* element counts -> byte strides */
    slli.d     LDA,  LDA,  BASE_SHIFT
    slli.d     INCX, INCX, BASE_SHIFT
    slli.d     INCY, INCY, BASE_SHIFT

    /* nothing to do for M <= 0 or N <= 0 */
    bge        $r0,  M,    .L999
    bge        $r0,  N,    .L999

    move       J,    $r0
    move       IX,   $r0              /* byte offset of x[j] */

    move       AO1,  A                /* start of column j */
    move       XX,   X
    move       YY,   Y

    /* defensive only: J is 0 and M > 0 here, so this never branches */
    beq        J,    M,    .L999

.L01:                                 /* ---- per column j ---- */
    /* U1 = temp in every lane, temp = alpha * x[j]; load exactly one element */
    add.d      T0,   XX,   IX
    vldrepl.w  U0,   T0,   0

    vfmul.s    U1,   VALPHA, U0       /* temp1 */

    move       IY,   $r0              /* byte offset into y */
    move       II,   $r0              /* byte offset into the column */
    move       I,    $r0

    srai.d     T0,   M,    2          /* m / 4 */
    beq        I,    T0,   .L03

.L02:                                 /* ---- four rows per iteration ---- */
    vldx       U2,   AO1,  II         /* a[i..i+3] */

    /* byte offsets of y[i..i+3] */
    move       T1,   IY
    add.d      T2,   T1,   INCY
    add.d      T3,   T2,   INCY
    add.d      T4,   T3,   INCY

    fldx.s     a1,   YY,   T1
    fldx.s     a2,   YY,   T2
    fldx.s     a3,   YY,   T3
    fldx.s     a4,   YY,   T4

    /* pack scalars: U3 = { a1, a2, a3, a4 } */
    vextrins.w U3,   U4,   0x10
    vextrins.w U3,   U5,   0x20
    vextrins.w U3,   U6,   0x30

    vfmadd.s   U3,   U1,   U2,   U3

    /* unpack lanes 1..3 back into a2..a4 */
    vextrins.w U4,   U3,   0x01
    vextrins.w U5,   U3,   0x02
    vextrins.w U6,   U3,   0x03

    fstx.s     a1,   YY,   T1
    fstx.s     a2,   YY,   T2
    fstx.s     a3,   YY,   T3
    fstx.s     a4,   YY,   T4

    add.d      IY,   T4,   INCY
    addi.d     II,   II,   16
    addi.d     I,    I,    1
    blt        I,    T0,   .L02

.L03:                                 /* ---- two leftover rows (M & 2), scalar ---- */
    andi       T0,   M,    2
    beq        $r0,  T0,   .L04

    /* II = (M - M % 4) elements, converted to bytes */
    addi.d     T1,   $r0,  4
    mod.d      T1,   M,    T1
    sub.d      II,   M,    T1
    slli.d     II,   II,   BASE_SHIFT

    move       T1,   IY
    add.d      T2,   T1,   INCY

    fldx.s     a1,   AO1,  II
    addi.d     T0,   II,   4          /* next float in the column */
    fldx.s     a2,   AO1,  T0

    fldx.s     a3,   YY,   T1
    fldx.s     a4,   YY,   T2

    /* $f12 is lane 0 of U1, i.e. temp */
    fmadd.s    a3,   $f12, a1,   a3
    fmadd.s    a4,   $f12, a2,   a4

    fstx.s     a3,   YY,   T1
    fstx.s     a4,   YY,   T2

    add.d      IY,   T2,   INCY

.L04:                                 /* ---- one leftover row (M & 1) ---- */
    andi       T0,   M,    1
    beq        $r0,  T0,   .L05

    addi.d     II,   M,    -1
    slli.d     II,   II,   BASE_SHIFT

    fldx.s     a1,   AO1,  II
    fldx.s     a3,   YY,   IY

    fmadd.s    a3,   $f12, a1,   a3

    fstx.s     a3,   YY,   IY

    add.d      IY,   IY,   INCY

.L05:                                 /* ---- next column ---- */
    add.d      AO1,  AO1,  LDA
    add.d      IX,   IX,   INCX

    addi.d     J,    J,    1
    blt        J,    N,    .L01

.L999:                                /* ---- restore and return ---- */
    LDARG      $r23, $sp,  0
    LDARG      $r24, $sp,  8
    LDARG      $r25, $sp,  16
    LDARG      $r26, $sp,  32
    LDARG      $r27, $sp,  40
    LDARG      $r28, $sp,  48
    LDARG      $r29, $sp,  56
    LDARG      $r30, $sp,  64
    LD         ALPHA, $sp, 72
    addi.d     $sp,  $sp,  80
    jirl       $r0,  $r1,  0x0

    EPILOGUE
|
|
@ -0,0 +1,275 @@
|
||||||
|
/*******************************************************************************
|
||||||
|
Copyright (c) 2024, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*******************************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/* Param: incoming arguments (INCY and BUFFER are fetched from the stack
 * by the kernel before it adjusts $sp). */
#define M       $r4
#define N       $r5
#define A       $r7
#define LDA     $r8
#define X       $r9
#define INCX    $r10
#define Y       $r11
#define INCY    $r6
#define BUFFER  $r16
#define ALPHA   $f0

/* Integer scratch registers, column pointers and byte offsets. */
#define YORIG   $r18
#define T0      $r19
#define T1      $r20
#define AO3     $r12
#define AO4     $r13
#define I       $r14
#define J       $r15
#define AO1     $r23
#define AO2     $r24
#define IX      $r25
#define IY      $r26
#define II      $r27
#define T2      $r28
#define T3      $r29
#define T4      $r30

/* LSX vectors */
#define U0      $vr11
#define U1      $vr12
#define U2      $vr2
#define U3      $vr3
#define U4      $vr4
#define U5      $vr5
#define U6      $vr6
#define U7      $vr7
#define U8      $vr8
#define U9      $vr9
#define VALPHA  $vr10

/* Scalar FP names; a1..a7 carry the same register numbers as U3..U9,
 * a8 the same number as VALPHA. */
#define a1      $f3
#define a2      $f4
#define a3      $f5
#define a4      $f6
#define a5      $f7
#define a6      $f8
#define a7      $f9
#define a8      $f10
|
||||||
|
|
||||||
|
|
||||||
|
PROLOGUE
    /*
     * Single-precision transposed GEMV kernel.  For every column j of A
     * the code accumulates temp = sum_i A[i + j*lda] * x[i*incx] and then
     * updates y[j*incy] += alpha * temp.  Columns are processed four at a
     * time (.L01), then an optional pair (.L04) and an optional single
     * column (.L07).
     */

    /* Stack-passed arguments must be read before $sp is moved. */
    LDARG   INCY,   $sp,  0
    LDARG   BUFFER, $sp,  8

    addi.d  $sp, $sp, -80

    /* Save $r23..$r30 (offset 24 is left unused) and spill alpha. */
    SDARG   $r23, $sp,  0
    SDARG   $r24, $sp,  8
    SDARG   $r25, $sp, 16
    SDARG   $r26, $sp, 32
    SDARG   $r27, $sp, 40
    SDARG   $r28, $sp, 48
    SDARG   $r29, $sp, 56
    SDARG   $r30, $sp, 64
    ST      ALPHA, $sp, 72

    /* Broadcast the spilled alpha into every 32-bit lane of VALPHA. */
    vldrepl.w   VALPHA, $sp, 72

    /* Convert element strides into byte strides. */
    slli.d  LDA,  LDA,  BASE_SHIFT
    slli.d  INCX, INCX, BASE_SHIFT
    slli.d  INCY, INCY, BASE_SHIFT

    /* Nothing to do when m <= 0 or n <= 0. */
    bge     $r0, M, .L999
    bge     $r0, N, .L999

    move    J,  $r0
    move    IY, $r0

    move    AO1, A              //a_ptr1

    srai.d  T0, N, 2            //n/4
    beq     J, T0, .L04

.L01: /* j<n/4 */
    vxor.v  U0, U0, U0          // temp1..temp4 = 0
    add.d   AO2, AO1, LDA
    add.d   AO3, AO2, LDA
    add.d   AO4, AO3, LDA

    move    IX, $r0
    move    I,  $r0
    move    II, $r0

    beq     $r0, M, .L03

.L02: /* i<m */
    /*
     * Load only x[i] (4 bytes) into lane 0 of U1 ($f12 is the low part of
     * $vr12).  A 16-byte vector load here would read up to 12 bytes past
     * the last element of x; only lane 0 is consumed by the broadcast
     * below, so the scalar load is sufficient.
     */
    fldx.s  $f12, X, IX
    /* One element from each of the four columns, into lane 0 of U2..U5. */
    fldx.s  $f2, AO1, II
    fldx.s  $f3, AO2, II
    fldx.s  $f4, AO3, II
    fldx.s  $f5, AO4, II

    vpermi.w    U1, U1, 0x00    // broadcast x[i] to all four lanes
    /* Gather lane 0 of U3/U4/U5 into lanes 1/2/3 of U2. */
    vextrins.w  U2, U3, 0x10
    vextrins.w  U2, U4, 0x20
    vextrins.w  U2, U5, 0x30

    vfmadd.s    U0, U2, U1, U0  //temp1,2,3,4

    add.d   IX, IX, INCX

    addi.d  II, II, 4
    addi.d  I,  I,  1
    blt     I, M, .L02

.L03:
    /* Byte offsets of the four y elements updated by this column group. */
    move    T1, IY
    add.d   T2, T1, INCY
    add.d   T3, T2, INCY
    add.d   T4, T3, INCY

    fldx.s  a1, Y, T1
    fldx.s  a2, Y, T2
    fldx.s  a3, Y, T3
    fldx.s  a4, Y, T4

    /* Pack the four y values into U3 (a1..a4 are lane 0 of U3..U6). */
    vextrins.w  U3, U4, 0x10
    vextrins.w  U3, U5, 0x20
    vextrins.w  U3, U6, 0x30

    vfmadd.s    U3, VALPHA, U0, U3  // y += alpha * temp

    /* Unpack lanes 1..3 of U3 back into lane 0 of U4..U6 for the stores. */
    vextrins.w  U4, U3, 0x01
    vextrins.w  U5, U3, 0x02
    vextrins.w  U6, U3, 0x03

    fstx.s  a1, Y, T1
    fstx.s  a2, Y, T2
    fstx.s  a3, Y, T3
    fstx.s  a4, Y, T4

    /* Advance A by four columns and y by four elements. */
    slli.d  T1, LDA, 2
    add.d   AO1, AO1, T1
    add.d   IY, T4, INCY

    addi.d  J, J, 1
    blt     J, T0, .L01

.L04: /* if(n&2) */
    andi    T0, N, 2
    beq     $r0, T0, .L07

    MTC     a1, $r0             // temp1 = 0
    MTC     a2, $r0             // temp2 = 0

    add.d   AO2, AO1, LDA

    move    IX, $r0
    move    I,  $r0
    move    II, $r0

    beq     $r0, M, .L06

.L05: /* i<m */
    fldx.s  a3, X,   IX
    fldx.s  a4, AO1, II
    fldx.s  a5, AO2, II

    fmadd.s a1, a4, a3, a1      //temp1
    fmadd.s a2, a5, a3, a2      //temp2

    add.d   IX, IX, INCX

    addi.d  II, II, 4
    addi.d  I,  I,  1
    blt     I, M, .L05

.L06:
    move    T1, IY
    add.d   T2, T1, INCY

    fldx.s  a3, Y, T1
    fldx.s  a4, Y, T2

    fmadd.s a3, ALPHA, a1, a3
    fmadd.s a4, ALPHA, a2, a4

    fstx.s  a3, Y, T1
    fstx.s  a4, Y, T2

    /* Advance A by two columns and y by two elements. */
    slli.d  T0, LDA, 1
    add.d   AO1, AO1, T0
    add.d   IY, T2, INCY

.L07: /* if(n&1) */
    andi    T0, N, 1
    beq     $r0, T0, .L999

    MTC     a1, $r0             // temp1 = 0

    move    IX, $r0
    move    I,  $r0
    move    II, $r0

    beq     $r0, M, .L09

.L08: /* i<m */
    fldx.s  a3, X,   IX
    fldx.s  a4, AO1, II

    fmadd.s a1, a4, a3, a1      //temp1

    add.d   IX, IX, INCX

    addi.d  II, II, 4
    addi.d  I,  I,  1
    blt     I, M, .L08

.L09:
    fldx.s  a3, Y, IY

    fmadd.s a3, ALPHA, a1, a3

    fstx.s  a3, Y, IY

    add.d   AO1, AO1, LDA
    add.d   IY, IY, INCY

.L999:
    /* Restore the saved registers and alpha, release the frame, return. */
    LDARG   $r23, $sp,  0
    LDARG   $r24, $sp,  8
    LDARG   $r25, $sp, 16
    LDARG   $r26, $sp, 32
    LDARG   $r27, $sp, 40
    LDARG   $r28, $sp, 48
    LDARG   $r29, $sp, 56
    LDARG   $r30, $sp, 64
    LD      ALPHA, $sp, 72
    addi.d  $sp, $sp, 80
    jirl    $r0, $r1, 0x0

    EPILOGUE
|
|
@ -0,0 +1,296 @@
|
||||||
|
/*******************************************************************************
|
||||||
|
Copyright (c) 2024, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*******************************************************************************/
|
||||||
|
#define ASSEMBLER
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include "loongarch64_asm.S"
|
||||||
|
|
||||||
|
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
|
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||||
|
*/
|
||||||
|
/* Incoming arguments (INC_Y is loaded from the stack by the kernel). */
#define M        $r4
#define N        $r5
#define ALPHA_R  $f0
#define ALPHA_I  $f1
#define A        $r7
#define LDA      $r8
#define X        $r9
#define INC_X    $r10
#define Y        $r11
#define INC_Y    $r6

/* Loop counters, saved pointers and scratch registers. */
#define J        $r12
#define I        $r13
#define K        $r14
#define Y_ORG    $r15
#define OFFSET   $r16
#define K_LDA    $r17
#define M16      $r18
#define T0       $r19

/* Column pointers into A. */
#define PA0      $r20
#define PA1      $r23
#define PA2      $r24
#define PA3      $r25
#define PA4      $r26
#define PA5      $r27
#define PA6      $r28
#define PA7      $r29

/* LSX vector registers. */
#define VALPHA   $vr1
#define X0       $vr2
#define X1       $vr3
#define X2       $vr4
#define X3       $vr5
#define X4       $vr6
#define X5       $vr7
#define X6       $vr8
#define X7       $vr9
#define Y0       $vr10
#define Y1       $vr11
#define A0       $vr12
#define A1       $vr13
#define A2       $vr14
#define A3       $vr15
#define A4       $vr16
#define A5       $vr17
#define A6       $vr18
#define A7       $vr19
#define A8       $vr20
#define A9       $vr21
#define A10      $vr22
#define A11      $vr23
#define A12      $vr24
#define A13      $vr25
#define A14      $vr26
#define A15      $vr27
#define TMP0     $vr28
#define TMP1     $vr29
#define TMP2     $vr30
|
||||||
|
|
||||||
|
/* GXCONJ is 1 exactly when XCONJ is defined; GCONJ is 1 exactly when CONJ
 * is defined.  The two flags are independent of each other. */
#if defined(XCONJ)
#define GXCONJ 1
#else
#define GXCONJ 0
#endif

#if defined(CONJ)
#define GCONJ 1
#else
#define GCONJ 0
#endif
|
||||||
|
|
||||||
|
.macro ZLOAD_X_2
    // Fetch X0 and X1 from byte offsets 0x00 and 0x10 of X.
    GLD v, , X0, X, 0x00, X1, X, 0x10
    // NOTE(review): GCOMPLEXMUL comes from loongarch64_asm.S; presumably it
    // leaves VALPHA * Xn in Xn, using TMP0..TMP2 as scratch - confirm.
    GCOMPLEXMUL GXCONJ, \
                vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
                       X1, VALPHA, X1, TMP0, TMP1, TMP2
.endm
|
||||||
|
|
||||||
|
.macro ZLOAD_X_2_GAP
    // Strided variant: X0 comes from X, X1 from X + INC_X (T0 is scratch).
    vld         X0, X, 0
    PTR_ADD     T0, X, INC_X
    vld         X1, T0, 0

    // NOTE(review): GCOMPLEXMUL comes from loongarch64_asm.S; presumably it
    // leaves VALPHA * Xn in Xn, using TMP0..TMP2 as scratch - confirm.
    GCOMPLEXMUL GXCONJ, \
                vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
                       X1, VALPHA, X1, TMP0, TMP1, TMP2
.endm
|
||||||
|
|
||||||
|
.macro ZLOAD_X_1
    // Fetch a single vector X0 from X.
    GLD v, , X0, X, 0x00
    // NOTE(review): presumably X0 = VALPHA * X0 (see GCOMPLEXMUL in
    // loongarch64_asm.S) - confirm.
    GCOMPLEXMUL GXCONJ, \
                vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2
.endm
|
||||||
|
|
||||||
|
.macro ZLOAD_Y_2
    // Contiguous y: Y0 from offset 0, Y1 from offset 0x10.
    GLD v, , Y0, Y, 0, Y1, Y, 0x10
.endm
|
||||||
|
|
||||||
|
.macro ZLOAD_Y_2_GAP
    // Strided y: Y0 ($vr10) from Y, Y1 ($vr11) from Y + INC_Y.
    vld     Y0, Y, 0
    vldx    Y1, Y, INC_Y
.endm
|
||||||
|
|
||||||
|
.macro ZLOAD_Y_1
    // Single y vector; Y0 is $vr10.
    vld     Y0, Y, 0
.endm
|
||||||
|
|
||||||
|
.macro ZGEMV_N_2x2
    // A0, A1 come from the PA0 column and A2, A3 from the PA1 column.
    // NOTE(review): GLD_INC presumably advances the pointer by 0x10 after
    // each load - confirm in loongarch64_asm.S.
    GLD_INC v, , 0x10, \
            A0, PA0, 0, A1, PA0, 0, \
            A2, PA1, 0, A3, PA1, 0
    // Y0 takes the X0*A0 and X1*A2 terms, Y1 takes X0*A1 and X1*A3.
    GCOMPLEXMADD GXCONJ, GCONJ, \
                 vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \
                        Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2
.endm
|
||||||
|
|
||||||
|
.macro ZGEMV_N_1x2
    // One vector from each of the two columns (A0 = $vr12, A2 = $vr14).
    GLD_INC v, , 0x10, A0, PA0, 0, A2, PA1, 0
    // Y0 takes the X0*A0 and X1*A2 terms.
    GCOMPLEXMADD GXCONJ, GCONJ, \
                 vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \
                        Y0, X1, A2, Y0, TMP0, TMP1, TMP2
.endm
|
||||||
|
|
||||||
|
.macro ZGEMV_N_1x1
    // One vector from the single remaining column (A0 = $vr12).
    GLD_INC v, , 0x10, A0, PA0, 0
    GCOMPLEXMADD GXCONJ, GCONJ, \
                 vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
.endm
|
||||||
|
|
||||||
|
.macro ZSTORE_Y_2
    // Contiguous y: write Y0 to offset 0 and Y1 to offset 0x10.
    GST v, , Y0, Y, 0, Y1, Y, 0x10
.endm
|
||||||
|
|
||||||
|
.macro ZSTORE_Y_2_GAP
    // Strided y: write Y0 to Y and Y1 to Y + INC_Y.
    vst     Y0, Y, 0
    vstx    Y1, Y, INC_Y
.endm
|
||||||
|
|
||||||
|
.macro ZSTORE_Y_1
    // Write back the single y vector; Y0 is $vr10.
    vst     Y0, Y, 0
.endm
|
||||||
|
|
||||||
|
.macro ZGEMV_N_LSX XW:req, X_2:req, X_1:req, Y_2:req, Y_1:req
    // XW makes the local labels unique per expansion.  The remaining
    // arguments pick the ZLOAD_ / ZSTORE_ variants used for x and y.

    // ---- column pairs: J = N / 2 ----
    PTR_SRLI    J, N, 1
    beqz        J, .L_\XW\()_N_1
    // K_LDA = 2 * LDA - M16: step from the end of one column pair to the
    // start of the next pair.
    PTR_SLLI    K_LDA, LDA, 1
    PTR_SUB     K_LDA, K_LDA, M16
.L_\XW\()_N_L2:
    ZLOAD_\X_2
    move        K, $r0
    move        Y, Y_ORG
    // Row pairs: I = M / 2.
    PTR_SRLI    I, M, 1
    beqz        I, .L_\XW\()_M_1
.align 5
.L_\XW\()_M_L2:
    ZLOAD_\Y_2
    ZGEMV_N_2x2
    ZSTORE_\Y_2
    PTR_ADDI    I, I, -1
    // Y += 2 * INC_Y.
    PTR_ALSL    Y, INC_Y, Y, 1
    PTR_ADDI    K, K, 4
    bnez        I, .L_\XW\()_M_L2
.L_\XW\()_M_1:
    // Odd row, if any.
    andi        I, M, 1
    beqz        I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
    ZLOAD_\Y_1
    ZGEMV_N_1x2
    ZSTORE_\Y_1
    PTR_ADDI    I, I, -1
    PTR_ADD     Y, Y, INC_Y
    PTR_ADDI    K, K, 1
    bnez        I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
    PTR_ADDI    J, J, -1
    // Move both column pointers on by K_LDA; the word form is used only
    // when the register length is 32.
#if __loongarch_grlen == 32
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#else
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#endif
    // X += 2 * INC_X.
    PTR_ALSL    X, INC_X, X, 1
    bnez        J, .L_\XW\()_N_L2

    // ---- odd column, if any ----
.L_\XW\()_N_1:
    andi        J, N, 1
    beqz        J, .L_END
.L_\XW\()_N_L1:
    ZLOAD_\X_1
    move        K, $r0
    move        Y, Y_ORG
    move        I, M
    beqz        I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
    ZLOAD_\Y_1
    ZGEMV_N_1x1
    ZSTORE_\Y_1
    PTR_ADDI    I, I, -1
    PTR_ADD     Y, Y, INC_Y
    PTR_ADDI    K, K, 1
    bnez        I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
    PTR_ADDI    J, J, -1
    PTR_SUB     K_LDA, LDA, M16
    PTR_ADD     PA0, PA0, K_LDA
    PTR_ADD     X, X, INC_X
    bnez        J, .L_\XW\()_N_L1

    b           .L_END
.endm
|
||||||
|
|
||||||
|
PROLOGUE
    // INC_Y is passed on the stack; read it before push_if_used runs.
    PTR_LD      INC_Y, $sp, 0
    // NOTE(review): push_if_used / pop_if_used come from loongarch64_asm.S
    // and presumably save / restore callee-saved registers - confirm.
    push_if_used 17 + 7, 31

    // Build the dispatch index I = 2 * (inc_x != 1) + (inc_y != 1).
    PTR_ADDI    K, $r0, 0x01
    PTR_SUB     I, INC_X, K
    PTR_SUB     J, INC_Y, K
    maskeqz     I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
    maskeqz     J, K, J /* if(inc_y == 1) j = 0; else j = 1; */
    PTR_ALSL    I, I, J, 1

    // Scale LDA, INC_X and INC_Y by 16, and set M16 = M * 16.
    GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4

    // Init VALPHA: low doubleword from $vr0 (ALPHA_R), high from $vr1 (ALPHA_I).
    vpackev.d   VALPHA, $vr1, $vr0

    move        Y_ORG, Y
    move        PA0, A
    // PA1 = PA0 + LDA; the word form is used only when the register length is 32.
#if __loongarch_grlen == 32
    GADD , w, PA1, PA0, LDA
#else
    GADD , d, PA1, PA0, LDA
#endif

    // Jump through the table of 16-bit offsets relative to .L_GAP_TABLE.
    la.local    T0, .L_GAP_TABLE
    PTR_ALSL    I, I, T0, 1
    ld.h        K, I, 0 // Obtain the offset address
    PTR_ADD     T0, T0, K
    jirl        $r0, T0, 0
.L_GAP_TABLE:
    .hword .L_GAP_0_0 - .L_GAP_TABLE
    .hword .L_GAP_0_1 - .L_GAP_TABLE
    .hword .L_GAP_1_0 - .L_GAP_TABLE
    .hword .L_GAP_1_1 - .L_GAP_TABLE
.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
    ZGEMV_N_LSX GAP_0_0, X_2, X_1, Y_2, Y_1
.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
    ZGEMV_N_LSX GAP_0_1, X_2, X_1, Y_2_GAP, Y_1
.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
    ZGEMV_N_LSX GAP_1_0, X_2_GAP, X_1, Y_2, Y_1
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
    ZGEMV_N_LSX GAP_1_1, X_2_GAP, X_1, Y_2_GAP, Y_1
.L_END:
    pop_if_used 17 + 7, 31
    jirl        $r0, $r1, 0x0
    EPILOGUE
|
|
@ -122,10 +122,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
GLD xv, , X0, X, 0x00, X1, X, 0x10, X2, X, 0x20, X3, X, 0x30
|
GLD xv, , X0, X, 0x00, X1, X, 0x10, X2, X, 0x20, X3, X, 0x30
|
||||||
GPERMI xv, q, X0, X0, 0, X1, X1, 0, X2, X2, 0, X3, X3, 0
|
GPERMI xv, q, X0, X0, 0, X1, X1, 0, X2, X2, 0, X3, X3, 0
|
||||||
GCOMPLEXMUL GXCONJ, \
|
GCOMPLEXMUL GXCONJ, \
|
||||||
xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
|
xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
|
||||||
X1, X1, VALPHA, TMP0, TMP1, TMP2, \
|
X1, VALPHA, X1, TMP0, TMP1, TMP2, \
|
||||||
X2, X2, VALPHA, TMP0, TMP1, TMP2, \
|
X2, VALPHA, X2, TMP0, TMP1, TMP2, \
|
||||||
X3, X3, VALPHA, TMP0, TMP1, TMP2
|
X3, VALPHA, X3, TMP0, TMP1, TMP2
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro ZLOAD_X_4_GAP
|
.macro ZLOAD_X_4_GAP
|
||||||
|
@ -145,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
xvpermi.q X3, X3, 0
|
xvpermi.q X3, X3, 0
|
||||||
|
|
||||||
GCOMPLEXMUL GXCONJ, \
|
GCOMPLEXMUL GXCONJ, \
|
||||||
xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
|
xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
|
||||||
X1, X1, VALPHA, TMP0, TMP1, TMP2, \
|
X1, VALPHA, X1, TMP0, TMP1, TMP2, \
|
||||||
X2, X2, VALPHA, TMP0, TMP1, TMP2, \
|
X2, VALPHA, X2, TMP0, TMP1, TMP2, \
|
||||||
X3, X3, VALPHA, TMP0, TMP1, TMP2
|
X3, VALPHA, X3, TMP0, TMP1, TMP2
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro ZLOAD_Y_4
|
.macro ZLOAD_Y_4
|
||||||
|
@ -216,7 +216,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
GLD xv, , X0, X, 0x00
|
GLD xv, , X0, X, 0x00
|
||||||
GPERMI xv, q, X0, X0, 0
|
GPERMI xv, q, X0, X0, 0
|
||||||
GCOMPLEXMUL GXCONJ, \
|
GCOMPLEXMUL GXCONJ, \
|
||||||
xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2
|
xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro ZGEMV_N_1x1
|
.macro ZGEMV_N_1x1
|
||||||
|
|
|
@ -0,0 +1,268 @@
|
||||||
|
/*******************************************************************************
|
||||||
|
Copyright (c) 2024, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*******************************************************************************/
|
||||||
|
#define ASSEMBLER
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include "loongarch64_asm.S"
|
||||||
|
|
||||||
|
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
|
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||||
|
*/
|
||||||
|
/* Incoming arguments (INC_Y is loaded from the stack by the kernel). */
#define M        $r4
#define N        $r5
#define ALPHA_R  $f0
#define ALPHA_I  $f1
#define A        $r7
#define LDA      $r8
#define X        $r9
#define INC_X    $r10
#define Y        $r11
#define INC_Y    $r6

/* Loop counters, saved pointers and scratch registers.
 * K and PY0 are two names for the same register ($r14). */
#define J        $r12
#define I        $r13
#define K        $r14
#define PY0      $r14
#define X_ORG    $r15
#define PY1      $r16
#define K_LDA    $r17
#define PY2      $r18
#define T0       $r19

/* Column pointers into A, and M * 16. */
#define PA0      $r20
#define PA1      $r23
#define PA2      $r24
#define PA3      $r25
#define PA4      $r26
#define PA5      $r27
#define PA6      $r28
#define PA7      $r29
#define M16      $r30

/* LSX vector registers. */
#define VALPHA   $vr0
#define X0       $vr1
#define X1       $vr2
#define A0       $vr3
#define A1       $vr4
#define A2       $vr5
#define A3       $vr6
#define A4       $vr7
#define A5       $vr8
#define A6       $vr9
#define A7       $vr10
#define A8       $vr11
#define A9       $vr12
#define A10      $vr13
#define A11      $vr14
#define A12      $vr15
#define A13      $vr16
#define A14      $vr17
#define A15      $vr18
#define TP0      $vr19
#define TP1      $vr20
#define TP2      $vr21
#define TP3      $vr22
#define TP4      $vr23
#define TP5      $vr24
#define TP6      $vr25
#define TP7      $vr26
#define TMP0     $vr27
#define TMP1     $vr28
#define TMP2     $vr29

/* Y0..Y7 name the same registers as A0..A7 ($vr3..$vr10). */
#define Y0       $vr3
#define Y1       $vr4
#define Y2       $vr5
#define Y3       $vr6
#define Y4       $vr7
#define Y5       $vr8
#define Y6       $vr9
#define Y7       $vr10
|
||||||
|
|
||||||
|
/* GCONJ1 and GXCONJ2 are 0 in every configuration. */
#define GCONJ1  0
#define GXCONJ2 0

/* GXCONJ1 is 0 when CONJ and XCONJ are both defined or both undefined,
 * and 1 when exactly one of them is defined. */
#if defined(CONJ) == defined(XCONJ)
#define GXCONJ1 0
#else
#define GXCONJ1 1
#endif

/* GCONJ2 is 1 exactly when XCONJ is defined. */
#if defined(XCONJ)
#define GCONJ2 1
#else
#define GCONJ2 0
#endif
|
||||||
|
|
||||||
|
.macro ZERO_Y2
    // Clear both accumulators by XOR-ing each with itself.
    GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1
.endm
|
||||||
|
|
||||||
|
.macro ZERO_Y1
    // Clear the single accumulator by XOR-ing it with itself.
    GXOR v, v, TP0, TP0, TP0
.endm
|
||||||
|
|
||||||
|
.macro ZLOAD_X2
    // Contiguous x: X0 from offset 0x00, X1 from offset 0x10.
    GLD v, , X0, X, 0x00, X1, X, 0x10
.endm
|
||||||
|
|
||||||
|
.macro ZLOAD_X2_GAP
    // Strided x: X0 from X, X1 from X + INC_X.
    vld     X0, X, 0
    vldx    X1, X, INC_X
.endm
|
||||||
|
|
||||||
|
.macro ZGEMV_T_2x2
    // A0, A1 come from the PA0 column and A2, A3 from the PA1 column.
    // NOTE(review): GLD_INC presumably advances the pointer by 0x10 after
    // each load - confirm in loongarch64_asm.S.
    GLD_INC v, , 0x10, \
            A0, PA0, 0, A1, PA0, 0, \
            A2, PA1, 0, A3, PA1, 0
    // TP0 takes the A0*X0 and A1*X1 terms, TP1 takes A2*X0 and A3*X1.
    GCOMPLEXMADD GXCONJ1, GCONJ1, \
                 vf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \
                        TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2
.endm
|
||||||
|
|
||||||
|
.macro ZGEMV_T_LSX XW:req, X2:req
    // XW makes the local labels unique per expansion.  X2 picks the
    // ZLOAD_ variant used to fetch two x vectors.

    // ---- column pairs: J = N / 2 ----
    PTR_SRLI    J, N, 1
    beqz        J, .L_\XW\()_N_1
    // K_LDA = 2 * LDA - M16: step from the end of one column pair to the
    // start of the next pair.
    PTR_SLLI    K_LDA, LDA, 1
    PTR_SUB     K_LDA, K_LDA, M16
.L_\XW\()_N_L2:
    ZERO_Y2
    move        X, X_ORG
    // Row pairs: I = M / 2.
    PTR_SRLI    I, M, 1
    beqz        I, .L_\XW\()_M_1
.align 5
.L_\XW\()_M_L2:
    ZLOAD_\X2
    ZGEMV_T_2x2
    PTR_ADDI    I, I, -1
    // X += 2 * INC_X.
    PTR_ALSL    X, INC_X, X, 1
    bnez        I, .L_\XW\()_M_L2
.L_\XW\()_M_1:
    // Accumulated
    GCOMPLEXACC vf, d, Y0, TP0, Y1, TP1
    // Odd row, if any.
    andi        I, M, 1
    beqz        I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
    GLD v, , X0, X, 0x00, A8, PA0, 0x00, A9, PA1, 0x00
    // Step both column pointers past the vector just loaded; the word
    // form is used only when the register length is 32.
#if __loongarch_grlen == 32
    GADDI , w, PA0, PA0, 0x10, PA1, PA1, 0x10
#else
    GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10
#endif

    // A0 and A1 are the same registers as Y0 and Y1, so this adds the
    // last row onto the per-column results.
    GCOMPLEXMADD GXCONJ1, GCONJ1, \
                 vf, d, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2

    PTR_ADDI    I, I, -1
    PTR_ADD     X, X, INC_X
    bnez        I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
    // Load the two y entries, apply the VALPHA multiply-add and store
    // them back (A8 = $vr11, A9 = $vr12).
    vld         A8, Y, 0x00
    vldx        A9, Y, INC_Y

    GCOMPLEXMADD GXCONJ2, GCONJ2, \
                 vf, d, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2

    vst         A8, Y, 0x00
    vstx        A9, Y, INC_Y

    PTR_ADDI    J, J, -1
#if __loongarch_grlen == 32
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#else
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#endif
    // Y += 2 * INC_Y.
    PTR_ALSL    Y, INC_Y, Y, 1
    bnez        J, .L_\XW\()_N_L2

    // ---- odd column, if any ----
.L_\XW\()_N_1:
    andi        J, N, 1
    beqz        J, .L_END
    PTR_SUB     K_LDA, LDA, M16
.L_\XW\()_N_L1:
    ZERO_Y1
    move        X, X_ORG
    move        I, M
    beqz        I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
    GLD v, , A0, PA0, 0x00, X0, X, 0x00
    GCOMPLEXMADD GXCONJ1, GCONJ1, \
                 vf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
    PTR_ADDI    I, I, -1
    PTR_ADD     X, X, INC_X
    PTR_ADDI    PA0, PA0, 0x10
    bnez        I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
    PTR_ADDI    J, J, -1
    // Load y, apply the VALPHA multiply-add with TP0, store (A0 = $vr3).
    vld         A0, Y, 0x00
    GCOMPLEXMADD GXCONJ2, GCONJ2, \
                 vf, d, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
    vst         A0, Y, 0x00
    PTR_ADD     PA0, PA0, K_LDA
    PTR_ADD     Y, Y, INC_Y
    bnez        J, .L_\XW\()_N_L1

    b           .L_END
.endm
|
||||||
|
|
||||||
|
PROLOGUE
    // INC_Y is passed on the stack; read it before push_if_used runs.
    PTR_LD      INC_Y, $sp, 0
    // NOTE(review): push_if_used / pop_if_used come from loongarch64_asm.S
    // and presumably save / restore callee-saved registers - confirm.
    push_if_used 17 + 8, 30

    // Dispatch index I = (inc_x != 1).
    PTR_ADDI    K, $r0, 0x01
    PTR_SUB     I, INC_X, K
    maskeqz     I, K, I /* if(inc_x == 1) I = 0; else I = 1; */

    // Scale LDA, INC_X and INC_Y by 16, and set M16 = M * 16.
    GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4

    // Init VALPHA: low doubleword from $vr0 (ALPHA_R), high from $vr1 (ALPHA_I).
    vpackev.d   VALPHA, $vr1, $vr0

    move        X_ORG, X
    move        PA0, A
    // PA1 = PA0 + LDA; the word form is used only when the register length is 32.
#if __loongarch_grlen == 32
    GADD , w, PA1, PA0, LDA
#else
    GADD , d, PA1, PA0, LDA
#endif

    // Jump through the table of 16-bit offsets relative to .L_GAP_TABLE.
    la.local    T0, .L_GAP_TABLE
    PTR_ALSL    I, I, T0, 1
    ld.h        K, I, 0
    PTR_ADD     T0, T0, K
    jirl        $r0, T0, 0
.L_GAP_TABLE:
    .hword .L_GAP_0 - .L_GAP_TABLE
    .hword .L_GAP_1 - .L_GAP_TABLE
.L_GAP_0: /* if (incx == 1) */
    ZGEMV_T_LSX GAP_0, X2
.L_GAP_1: /* if (incx != 1) */
    ZGEMV_T_LSX GAP_1, X2_GAP
.L_END:
    pop_if_used 17 + 8, 30
    jirl        $r0, $r1, 0x0
    EPILOGUE
|
|
@ -16,13 +16,8 @@ SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
STRMMKERNEL = sgemm_kernel_power10.c
|
STRMMKERNEL = sgemm_kernel_power10.c
|
||||||
DTRMMKERNEL = dgemm_kernel_power10.c
|
DTRMMKERNEL = dgemm_kernel_power10.c
|
||||||
ifeq ($(OSNAME), AIX)
|
CTRMMKERNEL = cgemm_kernel_power10.c
|
||||||
CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
|
ZTRMMKERNEL = zgemm_kernel_power10.c
|
||||||
ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
|
|
||||||
else
|
|
||||||
CTRMMKERNEL = cgemm_kernel_power10.S
|
|
||||||
ZTRMMKERNEL = zgemm_kernel_power10.S
|
|
||||||
endif
|
|
||||||
|
|
||||||
SGEMMKERNEL = sgemm_kernel_power10.c
|
SGEMMKERNEL = sgemm_kernel_power10.c
|
||||||
SGEMMINCOPY = sgemm_ncopy_16_power.c
|
SGEMMINCOPY = sgemm_ncopy_16_power.c
|
||||||
|
@ -64,11 +59,7 @@ DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c
|
||||||
DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c
|
DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c
|
||||||
DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c
|
DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c
|
||||||
|
|
||||||
ifeq ($(OSNAME), AIX)
|
CGEMMKERNEL = cgemm_kernel_power10.c
|
||||||
CGEMMKERNEL = cgemm_kernel_8x4_power8.S
|
|
||||||
else
|
|
||||||
CGEMMKERNEL = cgemm_kernel_power10.S
|
|
||||||
endif
|
|
||||||
#CGEMMKERNEL = cgemm_kernel_8x4_power8.S
|
#CGEMMKERNEL = cgemm_kernel_8x4_power8.S
|
||||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||||
ifeq ($(OSNAME), AIX)
|
ifeq ($(OSNAME), AIX)
|
||||||
|
@ -83,11 +74,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
ifeq ($(OSNAME), AIX)
|
ZGEMMKERNEL = zgemm_kernel_power10.c
|
||||||
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
|
|
||||||
else
|
|
||||||
ZGEMMKERNEL = zgemm_kernel_power10.S
|
|
||||||
endif
|
|
||||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,736 @@
|
||||||
|
/*********************************************************************************
|
||||||
|
Copyright (c) 2020, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
**********************************************************************************/
|
||||||
|
#include "common.h"
|
||||||
|
#include <altivec.h>
|
||||||
|
|
||||||
|
/* AltiVec byte vector; in this kernel it is the type of the B-side operand loaded from BO and passed to __builtin_mma_xvf64gerpp. */
typedef __vector unsigned char vec_t;
|
||||||
|
/* 16-byte vector of FLOAT; the kernel's `result` scratch array, into which the MMA accumulators are disassembled, is declared with this type. */
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
|
||||||
|
|
||||||
|
/* Clear all eight MMA accumulators (acc0 .. acc7) before a new tile is
   accumulated.  The expansion already ends in ';', so call sites write
   SET_ACC_ZERO() with no trailing semicolon. */
#define SET_ACC_ZERO()                                                  \
  __builtin_mma_xxsetaccz (&acc0); __builtin_mma_xxsetaccz (&acc1);     \
  __builtin_mma_xxsetaccz (&acc2); __builtin_mma_xxsetaccz (&acc3);     \
  __builtin_mma_xxsetaccz (&acc4); __builtin_mma_xxsetaccz (&acc5);     \
  __builtin_mma_xxsetaccz (&acc6); __builtin_mma_xxsetaccz (&acc7);
|
||||||
|
|
||||||
|
/*
 * COMP_MUL / COMP_MAC: combine four partial products into the real and
 * imaginary part of a complex product.
 *
 *   _real, _imag : lvalues receiving the real / imaginary result
 *   _arbr, _aibi : the two terms that form the real part
 *   _arbi, _aibr : the two terms that form the imaginary part
 *                  (presumably a_r*b_r, a_i*b_i, a_r*b_i and a_i*b_r, as the
 *                  parameter names suggest)
 *
 * COMP_MUL assigns the result, COMP_MAC adds it to the existing value.  The
 * sign pattern is chosen by the variant macro the file is compiled with
 * (NN, NR, RN, RR and their siblings).
 *
 * Each macro expands to a brace-enclosed block, so call sites use it without
 * a trailing semicolon.  Every parameter is parenthesized so that expression
 * arguments (e.g. `x + y`) keep their intended grouping.
 */
#if (defined(NN) || defined(NT) || defined(TN) || defined(TT))
#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) \
  { (_real) = (_arbr) - (_aibi); (_imag) = (_arbi) + (_aibr); }
#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) \
  { (_real) += (_arbr) - (_aibi); (_imag) += (_arbi) + (_aibr); }
#endif

#if (defined(NR) || defined(NC) || defined(TR) || defined(TC))
#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) \
  { (_real) = (_arbr) + (_aibi); (_imag) = -(_arbi) + (_aibr); }
#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) \
  { (_real) += (_arbr) + (_aibi); (_imag) += -(_arbi) + (_aibr); }
#endif

#if (defined(RN) || defined(RT) || defined(CN) || defined(CT))
#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) \
  { (_real) = (_arbr) + (_aibi); (_imag) = (_arbi) - (_aibr); }
#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) \
  { (_real) += (_arbr) + (_aibi); (_imag) += (_arbi) - (_aibr); }
#endif

#if (defined(RR) || defined(RC) || defined(CR) || defined(CC))
#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) \
  { (_real) = (_arbr) - (_aibi); (_imag) = -(_arbi) - (_aibr); }
#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) \
  { (_real) += (_arbr) - (_aibi); (_imag) += -(_arbi) - (_aibr); }
#endif
|
||||||
|
|
||||||
|
/* Operator used by the SAVE_* macros and the kernel when writing to C:
   plain assignment when built as a TRMM kernel, accumulation otherwise. */
#ifndef TRMMKERNEL
#define A_OP +=
#else
#define A_OP =
#endif
|
||||||
|
|
||||||
|
/* Spill all eight accumulators into the `result` scratch array, four vector
   slots per accumulator: acc0 -> result[0..3], acc1 -> result[4..7], ...,
   acc7 -> result[28..31].  The expansion ends in ';'. */
#define BUILTIN_MMA_DISASSEMBLE_ACC_8                              \
  __builtin_mma_disassemble_acc ((void *)&result[ 0], &acc0);      \
  __builtin_mma_disassemble_acc ((void *)&result[ 4], &acc1);      \
  __builtin_mma_disassemble_acc ((void *)&result[ 8], &acc2);      \
  __builtin_mma_disassemble_acc ((void *)&result[12], &acc3);      \
  __builtin_mma_disassemble_acc ((void *)&result[16], &acc4);      \
  __builtin_mma_disassemble_acc ((void *)&result[20], &acc5);      \
  __builtin_mma_disassemble_acc ((void *)&result[24], &acc6);      \
  __builtin_mma_disassemble_acc ((void *)&result[28], &acc7);
|
||||||
|
|
||||||
|
/* Save a single complex result.  All eight accumulators are disassembled and
   the first four `res` entries of each (res[8*j] .. res[8*j+3]) are folded
   into tr[0]/ti[0]; the value is then scaled by alpha (alpha_r, alpha_i) and
   written to CO[0] (real) and CO[1] (imaginary) with A_OP. */
#define SAVE_ACC_COMPLEX_11                                     \
  BUILTIN_MMA_DISASSEMBLE_ACC_8                                 \
  COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])    \
  COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10])    \
  COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])    \
  COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26])    \
  COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])    \
  COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42])    \
  COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])    \
  COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58])    \
  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                 \
  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
|
||||||
|
|
||||||
|
/* Save two complex results that live in different columns of C (used by the
   kernel for the `m & 1` tile of a two-column panel).  Even-numbered
   accumulators are folded into tr[0]/ti[0], odd-numbered ones into
   tr[1]/ti[1]; the alpha-scaled values go to CO[0..1] and CO[2*ldc+0..1]. */
#define SAVE_ACC_COMPLEX_12                                     \
  BUILTIN_MMA_DISASSEMBLE_ACC_8                                 \
  COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])    \
  COMP_MUL(tr[1], res[ 8], res[11], ti[1], res[ 9], res[10])    \
  COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])    \
  COMP_MAC(tr[1], res[24], res[27], ti[1], res[25], res[26])    \
  COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])    \
  COMP_MAC(tr[1], res[40], res[43], ti[1], res[41], res[42])    \
  COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])    \
  COMP_MAC(tr[1], res[56], res[59], ti[1], res[57], res[58])    \
  CO[0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;           \
  CO[1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;           \
  CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i;           \
  CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
|
||||||
|
|
||||||
|
/* Save two consecutive complex results of one column of C.  Each of the
   eight accumulators contributes its first four `res` entries to tr[0]/ti[0]
   and its last four to tr[1]/ti[1]; the alpha-scaled values are written to
   CO[0..3] with A_OP. */
#define SAVE_ACC_COMPLEX_21_1                                   \
  BUILTIN_MMA_DISASSEMBLE_ACC_8                                 \
  COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])    \
  COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6])    \
  COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10])    \
  COMP_MAC(tr[1], res[12], res[15], ti[1], res[13], res[14])    \
  COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])    \
  COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22])    \
  COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26])    \
  COMP_MAC(tr[1], res[28], res[31], ti[1], res[29], res[30])    \
  COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])    \
  COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38])    \
  COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42])    \
  COMP_MAC(tr[1], res[44], res[47], ti[1], res[45], res[46])    \
  COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])    \
  COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54])    \
  COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58])    \
  COMP_MAC(tr[1], res[60], res[63], ti[1], res[61], res[62])    \
  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                 \
  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                 \
  CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                 \
  CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
|
||||||
|
|
||||||
|
/* Save four consecutive complex results of one column of C (used by the
   kernel for the `m & 4` tile of the single-column panel).  acc0/acc1 seed
   tr[0..3]/ti[0..3]; acc2/acc3, acc4/acc5 and acc6/acc7 are then added on
   top.  The alpha-scaled values are written to CO[0..7] with A_OP. */
#define SAVE_ACC_COMPLEX_21_2                                   \
  BUILTIN_MMA_DISASSEMBLE_ACC_8                                 \
  COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])    \
  COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6])    \
  COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10])    \
  COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14])    \
  COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])    \
  COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22])    \
  COMP_MAC(tr[2], res[24], res[27], ti[2], res[25], res[26])    \
  COMP_MAC(tr[3], res[28], res[31], ti[3], res[29], res[30])    \
  COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])    \
  COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38])    \
  COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42])    \
  COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46])    \
  COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])    \
  COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54])    \
  COMP_MAC(tr[2], res[56], res[59], ti[2], res[57], res[58])    \
  COMP_MAC(tr[3], res[60], res[63], ti[3], res[61], res[62])    \
  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                 \
  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                 \
  CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                 \
  CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;                 \
  CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;                 \
  CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;                 \
  CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;                 \
  CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
|
||||||
|
|
||||||
|
/* Save eight consecutive complex results of one column of C (used by the
   kernel for the 8-row tile of the single-column panel).  acc0..acc3 seed
   tr[0..7]/ti[0..7] and acc4..acc7 are added on top; the alpha-scaled values
   are written to CO[0..15] with A_OP. */
#define SAVE_ACC_COMPLEX_21_4                                   \
  BUILTIN_MMA_DISASSEMBLE_ACC_8                                 \
  COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])    \
  COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6])    \
  COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10])    \
  COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14])    \
  COMP_MUL(tr[4], res[16], res[19], ti[4], res[17], res[18])    \
  COMP_MUL(tr[5], res[20], res[23], ti[5], res[21], res[22])    \
  COMP_MUL(tr[6], res[24], res[27], ti[6], res[25], res[26])    \
  COMP_MUL(tr[7], res[28], res[31], ti[7], res[29], res[30])    \
  COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])    \
  COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38])    \
  COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42])    \
  COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46])    \
  COMP_MAC(tr[4], res[48], res[51], ti[4], res[49], res[50])    \
  COMP_MAC(tr[5], res[52], res[55], ti[5], res[53], res[54])    \
  COMP_MAC(tr[6], res[56], res[59], ti[6], res[57], res[58])    \
  COMP_MAC(tr[7], res[60], res[63], ti[7], res[61], res[62])    \
  CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                \
  CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                \
  CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                \
  CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;                \
  CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;                \
  CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;                \
  CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;                \
  CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;                \
  CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i;                \
  CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i;                \
  CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i;                \
  CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i;                \
  CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i;                \
  CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i;                \
  CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i;                \
  CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i;
|
||||||
|
|
||||||
|
/* Save a 2x2 complex tile (used by the kernel for the `m & 2` tile of a
   two-column panel).  Only acc0 and acc1 are read: acc0 supplies the two
   results stored at CO[0..3], acc1 the two stored at CO[2*ldc+0..3].  No
   accumulation across accumulators takes place, so only COMP_MUL is used. */
#define SAVE_ACC_COMPLEX_22_1                                   \
  __builtin_mma_disassemble_acc ((void *)&result[0], &acc0);    \
  __builtin_mma_disassemble_acc ((void *)&result[4], &acc1);    \
  COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])    \
  COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6])    \
  COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10])    \
  COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14])    \
  CO[0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;           \
  CO[1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;           \
  CO[2]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;           \
  CO[3]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;           \
  CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;           \
  CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;           \
  CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;           \
  CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
|
||||||
|
|
||||||
|
/*
 * Save a 2x2 complex tile taken from two caller-selected accumulators.
 *
 *   ACC1 : pointer to the accumulator holding the two results of the first
 *          column  (stored at CO[CI+0 .. CI+3])
 *   ACC2 : pointer to the accumulator holding the two results of the second
 *          column  (stored at CO[2*ldc+CI+0 .. 2*ldc+CI+3])
 *   CI   : offset, in FLOAT elements, added to both store positions
 *
 * The kernel invokes it twice for the `m & 4` tile of a two-column panel,
 * with CI = 0 and CI = 4.  The parameters are parenthesized in the expansion
 * so that an expression argument (e.g. `i << 2`) cannot be re-associated by
 * the surrounding index arithmetic.  The expansion ends in ';'.
 */
#define SAVE_ACC_COMPLEX_22_2(ACC1, ACC2, CI)                       \
  __builtin_mma_disassemble_acc ((void *)result, (ACC1));           \
  __builtin_mma_disassemble_acc ((void *)(&result[4]), (ACC2));     \
  COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])        \
  COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6])        \
  COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10])        \
  COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14])        \
  CO[(CI)+0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;          \
  CO[(CI)+1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;          \
  CO[(CI)+2]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;          \
  CO[(CI)+3]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;          \
  CO[2*ldc+(CI)+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;          \
  CO[2*ldc+(CI)+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;          \
  CO[2*ldc+(CI)+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;          \
  CO[2*ldc+(CI)+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
|
||||||
|
|
||||||
|
/* Emit a `dcbt` instruction (PowerPC data-cache-block-touch prefetch hint) with x in a general register and y in a base register; the expansion carries its own trailing ';'. */
#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
|
||||||
|
|
||||||
|
/* TRMM helper: set `temp`, the k-loop trip count for the current tile.
   The kernel passes the tile's extent along m as x and along n as y.
     - exactly one of LEFT / TRANSA defined : temp = k - off
     - LEFT (together with TRANSA)          : temp = off + x
     - neither defined                      : temp = off + y
   The expansion ends in ';'. */
#if defined(LEFT) != defined(TRANSA)
#define REFRESH_TEMP_BK(x, y) temp = k - off;
#elif defined(LEFT)
#define REFRESH_TEMP_BK(x, y) temp = off + x;
#else
#define REFRESH_TEMP_BK(x, y) temp = off + y;
#endif
|
||||||
|
/* TRMM helper run at the start of every tile: position AO/BO and compute the
   k-loop trip count through REFRESH_TEMP_BK.
     - LEFT and TRANSA both defined or both undefined: BO restarts at B and
       AO is left where it is.
     - otherwise: both pointers skip the first `off` k-steps; the factor 2
       presumably accounts for the two FLOATs of each complex element. */
#if defined(LEFT) == defined(TRANSA)
#define REFRESH_POINTERS(x, y)  \
  BO = B;                       \
  REFRESH_TEMP_BK(x, y)
#else
#define REFRESH_POINTERS(x, y)  \
  AO += off * (2*x);            \
  BO = B + off * (2*y);         \
  REFRESH_TEMP_BK(x, y)
#endif
|
||||||
|
|
||||||
|
/* TRMM helper: with LEFT defined, advance `off` by x once a tile has been
   saved; without LEFT the macro expands to nothing. */
#ifndef LEFT
#define REFRESH_OFF(x)
#else
#define REFRESH_OFF(x) off += x;
#endif
|
||||||
|
|
||||||
|
/* TRMM helper: subtract the tile extent from `temp` -- x when LEFT is
   defined, y otherwise.  The expansion ends in ';'. */
#ifndef LEFT
#define UPDATE_TEMP(x, y) temp -= y;
#else
#define UPDATE_TEMP(x, y) temp -= x;
#endif
|
||||||
|
|
||||||
|
/* TRMM helper run after a tile has been stored.  When LEFT and TRANSA are
   both defined or both undefined, recompute temp as (k - off) reduced by the
   tile extent (UPDATE_TEMP) and advance AO/BO by that many k-steps; in the
   remaining configurations the macro expands to nothing. */
#if defined(LEFT) == defined(TRANSA)
#define REFRESH_TMP_AFTER_SAVE(x, y)  \
  temp = k - off;                     \
  UPDATE_TEMP(x, y)                   \
  AO += temp * (2*x);                 \
  BO += temp * (2*y);
#else
#define REFRESH_TMP_AFTER_SAVE(x, y)
#endif
|
||||||
|
|
||||||
|
/* TRMM bookkeeping after each tile: first REFRESH_TMP_AFTER_SAVE(x, y), then
   REFRESH_OFF(x). */
#define REFRESH_AFTER_SAVE(x, y) REFRESH_TMP_AFTER_SAVE(x, y) REFRESH_OFF(x)
|
||||||
|
/*************************************************************************************
|
||||||
|
* GEMM Kernel
|
||||||
|
*************************************************************************************/
|
||||||
|
int
|
||||||
|
#ifdef TRMMKERNEL
|
||||||
|
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
|
FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc, BLASLONG offset)
|
||||||
|
#else
|
||||||
|
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
|
FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc)
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
BLASLONG i1, i, l, temp;
|
||||||
|
FLOAT *AO, *BO, *CO;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
BLASLONG off;
|
||||||
|
#endif
|
||||||
|
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||||
|
off = -offset;
|
||||||
|
#endif
|
||||||
|
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
|
||||||
|
|
||||||
|
v4sf_t result[32];
|
||||||
|
FLOAT *res, tr[16], ti[16];
|
||||||
|
res = (FLOAT *) result;
|
||||||
|
|
||||||
|
for (i1 = 0; i1 < (n >> 1); i1++) {
|
||||||
|
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||||
|
off = offset;
|
||||||
|
#endif
|
||||||
|
AO = A;
|
||||||
|
CO = C;
|
||||||
|
C += ldc<<2;
|
||||||
|
for (i = 0; i < (m >> 3); i++) {
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (8, 2)
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
SET_ACC_ZERO()
|
||||||
|
for (l = 0; l < temp; ++l) {
|
||||||
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
|
||||||
|
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
|
||||||
|
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
|
||||||
|
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
|
||||||
|
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||||
|
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc4, rowA1, rowB2);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc5, rowA2, rowB2);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc6, rowA3, rowB2);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc7, rowA4, rowB2);
|
||||||
|
}
|
||||||
|
__builtin_mma_disassemble_acc ((void *)result, &acc0);
|
||||||
|
__builtin_mma_disassemble_acc ((void *)(&result[ 4]), &acc1);
|
||||||
|
__builtin_mma_disassemble_acc ((void *)(&result[ 8]), &acc2);
|
||||||
|
__builtin_mma_disassemble_acc ((void *)(&result[12]), &acc3);
|
||||||
|
__builtin_mma_disassemble_acc ((void *)(&result[16]), &acc4);
|
||||||
|
__builtin_mma_disassemble_acc ((void *)(&result[20]), &acc5);
|
||||||
|
__builtin_mma_disassemble_acc ((void *)(&result[24]), &acc6);
|
||||||
|
__builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7);
|
||||||
|
COMP_MUL(tr[ 0], res[ 0], res[ 3], ti[ 0], res[ 1], res[ 2])
|
||||||
|
COMP_MUL(tr[ 1], res[ 4], res[ 7], ti[ 1], res[ 5], res[ 6])
|
||||||
|
COMP_MUL(tr[ 2], res[ 8], res[11], ti[ 2], res[ 9], res[10])
|
||||||
|
COMP_MUL(tr[ 3], res[12], res[15], ti[ 3], res[13], res[14])
|
||||||
|
COMP_MUL(tr[ 4], res[16], res[19], ti[ 4], res[17], res[18])
|
||||||
|
COMP_MUL(tr[ 5], res[20], res[23], ti[ 5], res[21], res[22])
|
||||||
|
COMP_MUL(tr[ 6], res[24], res[27], ti[ 6], res[25], res[26])
|
||||||
|
COMP_MUL(tr[ 7], res[28], res[31], ti[ 7], res[29], res[30])
|
||||||
|
COMP_MUL(tr[ 8], res[32], res[35], ti[ 8], res[33], res[34])
|
||||||
|
COMP_MUL(tr[ 9], res[36], res[39], ti[ 9], res[37], res[38])
|
||||||
|
COMP_MUL(tr[10], res[40], res[43], ti[10], res[41], res[42])
|
||||||
|
COMP_MUL(tr[11], res[44], res[47], ti[11], res[45], res[46])
|
||||||
|
COMP_MUL(tr[12], res[48], res[51], ti[12], res[49], res[50])
|
||||||
|
COMP_MUL(tr[13], res[52], res[55], ti[13], res[53], res[54])
|
||||||
|
COMP_MUL(tr[14], res[56], res[59], ti[14], res[57], res[58])
|
||||||
|
COMP_MUL(tr[15], res[60], res[63], ti[15], res[61], res[62])
|
||||||
|
CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;
|
||||||
|
CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
|
||||||
|
CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;
|
||||||
|
CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
|
||||||
|
CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;
|
||||||
|
CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;
|
||||||
|
CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;
|
||||||
|
CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
|
||||||
|
CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i;
|
||||||
|
CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i;
|
||||||
|
CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i;
|
||||||
|
CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i;
|
||||||
|
CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i;
|
||||||
|
CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i;
|
||||||
|
CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i;
|
||||||
|
CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i;
|
||||||
|
CO[2*ldc+ 0] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i;
|
||||||
|
CO[2*ldc+ 1] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i;
|
||||||
|
CO[2*ldc+ 2] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i;
|
||||||
|
CO[2*ldc+ 3] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i;
|
||||||
|
CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i;
|
||||||
|
CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i;
|
||||||
|
CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i;
|
||||||
|
CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i;
|
||||||
|
CO[2*ldc+ 8] A_OP tr[12] * alpha_r - ti[12] * alpha_i;
|
||||||
|
CO[2*ldc+ 9] A_OP ti[12] * alpha_r + tr[12] * alpha_i;
|
||||||
|
CO[2*ldc+10] A_OP tr[13] * alpha_r - ti[13] * alpha_i;
|
||||||
|
CO[2*ldc+11] A_OP ti[13] * alpha_r + tr[13] * alpha_i;
|
||||||
|
CO[2*ldc+12] A_OP tr[14] * alpha_r - ti[14] * alpha_i;
|
||||||
|
CO[2*ldc+13] A_OP ti[14] * alpha_r + tr[14] * alpha_i;
|
||||||
|
CO[2*ldc+14] A_OP tr[15] * alpha_r - ti[15] * alpha_i;
|
||||||
|
CO[2*ldc+15] A_OP ti[15] * alpha_r + tr[15] * alpha_i;
|
||||||
|
|
||||||
|
AO += temp << 4;
|
||||||
|
BO += temp << 2;
|
||||||
|
CO += 16;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (8, 2)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
if (m & 4) {
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (4, 2)
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
SET_ACC_ZERO()
|
||||||
|
for (l = 0; l < (temp & (~1)); l+=2) {
|
||||||
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
|
||||||
|
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
|
||||||
|
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
|
||||||
|
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12]));
|
||||||
|
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||||
|
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
|
||||||
|
vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
|
||||||
|
vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA3, rowB3);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB3);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
|
||||||
|
}
|
||||||
|
for (l = (temp & (~1)); l < temp; ++l) {
|
||||||
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
|
||||||
|
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
|
||||||
|
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||||
|
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2);
|
||||||
|
}
|
||||||
|
SAVE_ACC_COMPLEX_22_2(&acc0, &acc2, 0)
|
||||||
|
SAVE_ACC_COMPLEX_22_2(&acc1, &acc3, 4)
|
||||||
|
AO += temp << 3;
|
||||||
|
BO += temp << 2;
|
||||||
|
CO += 8;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (4, 2)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
if (m & 2) {
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (2, 2)
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
SET_ACC_ZERO()
|
||||||
|
for (l = 0; l < (temp & (~3)); l+=4) {
|
||||||
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
|
||||||
|
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
|
||||||
|
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
|
||||||
|
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12]));
|
||||||
|
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||||
|
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
|
||||||
|
vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
|
||||||
|
vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
|
||||||
|
vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8];
|
||||||
|
vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10];
|
||||||
|
vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12];
|
||||||
|
vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14];
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
|
||||||
|
}
|
||||||
|
for (l = (temp & (~3)); l < temp; ++l) {
|
||||||
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
|
||||||
|
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||||
|
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
|
||||||
|
}
|
||||||
|
SAVE_ACC_COMPLEX_22_1
|
||||||
|
AO += temp << 2;
|
||||||
|
BO += temp << 2;
|
||||||
|
CO += 4;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (2, 2)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
if (m & 1) {
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (1, 2)
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
SET_ACC_ZERO()
|
||||||
|
for (l = 0; l < (temp & (~3)); l+=4) {
|
||||||
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
|
||||||
|
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
|
||||||
|
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
|
||||||
|
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6]));
|
||||||
|
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||||
|
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
|
||||||
|
vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
|
||||||
|
vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
|
||||||
|
vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8];
|
||||||
|
vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10];
|
||||||
|
vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12];
|
||||||
|
vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14];
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
|
||||||
|
}
|
||||||
|
for (l = (temp & (~3)); l < temp; ++l) {
|
||||||
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
|
||||||
|
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||||
|
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
|
||||||
|
}
|
||||||
|
SAVE_ACC_COMPLEX_12
|
||||||
|
AO += temp << 1;
|
||||||
|
BO += temp << 2;
|
||||||
|
CO += 2;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (1, 2)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||||
|
off += 2; // number of values in A
|
||||||
|
#endif
|
||||||
|
B += k << 2;
|
||||||
|
}
|
||||||
|
if (n & 1) {
|
||||||
|
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||||
|
off = offset;
|
||||||
|
#endif
|
||||||
|
AO = A;
|
||||||
|
CO = C;
|
||||||
|
C += ldc<<1;
|
||||||
|
for (i = 0; i < (m >> 3); i++) {
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (8, 1)
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
SET_ACC_ZERO()
|
||||||
|
for (l = 0; l < (temp & (~1)); l+=2) {
|
||||||
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
|
||||||
|
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
|
||||||
|
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
|
||||||
|
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
|
||||||
|
__vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<4)+16]));
|
||||||
|
__vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<4)+20]));
|
||||||
|
__vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<4)+24]));
|
||||||
|
__vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<4)+28]));
|
||||||
|
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||||
|
vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA5, rowB2);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc1, rowA6, rowB2);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2);
|
||||||
|
}
|
||||||
|
for (l = (temp & (~1)); l < temp; ++l) {
|
||||||
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
|
||||||
|
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
|
||||||
|
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
|
||||||
|
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
|
||||||
|
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
|
||||||
|
}
|
||||||
|
SAVE_ACC_COMPLEX_21_4
|
||||||
|
|
||||||
|
AO += temp << 4;
|
||||||
|
BO += temp << 1;
|
||||||
|
CO += 16;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (8, 1)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
if (m & 4) {
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (4, 1)
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
SET_ACC_ZERO()
|
||||||
|
for (l = 0; l < (temp & (~3)); l+=4) {
|
||||||
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
|
||||||
|
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
|
||||||
|
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
|
||||||
|
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12]));
|
||||||
|
__vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<3)+16]));
|
||||||
|
__vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<3)+20]));
|
||||||
|
__vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<3)+24]));
|
||||||
|
__vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<3)+28]));
|
||||||
|
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||||
|
vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
|
||||||
|
vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
|
||||||
|
vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB2);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB2);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc4, rowA5, rowB3);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc5, rowA6, rowB3);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4);
|
||||||
|
}
|
||||||
|
for (l = (temp & (~3)); l < temp; ++l) {
|
||||||
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
|
||||||
|
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
|
||||||
|
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
|
||||||
|
}
|
||||||
|
SAVE_ACC_COMPLEX_21_2
|
||||||
|
AO += temp << 3;
|
||||||
|
BO += temp << 1;
|
||||||
|
CO += 8;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (4, 1)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
if (m & 2) {
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (2, 1)
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
SET_ACC_ZERO()
|
||||||
|
for (l = 0; l < (temp & (~7)); l+=8) {
|
||||||
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
|
||||||
|
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
|
||||||
|
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
|
||||||
|
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12]));
|
||||||
|
__vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<2)+16]));
|
||||||
|
__vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<2)+20]));
|
||||||
|
__vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<2)+24]));
|
||||||
|
__vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<2)+28]));
|
||||||
|
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||||
|
vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
|
||||||
|
vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
|
||||||
|
vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
|
||||||
|
vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
|
||||||
|
vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
|
||||||
|
vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
|
||||||
|
vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
|
||||||
|
}
|
||||||
|
for (l = (temp & (~7)); l < temp; ++l) {
|
||||||
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
|
||||||
|
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||||
|
}
|
||||||
|
SAVE_ACC_COMPLEX_21_1
|
||||||
|
AO += temp << 2;
|
||||||
|
BO += temp << 1;
|
||||||
|
CO += 4;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (2, 1)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
if (m & 1) {
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (1, 1)
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
SET_ACC_ZERO()
|
||||||
|
for (l = 0; l < (temp & (~7)); l+=8) {
|
||||||
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
|
||||||
|
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
|
||||||
|
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
|
||||||
|
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6]));
|
||||||
|
__vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<1)+8]));
|
||||||
|
__vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<1)+10]));
|
||||||
|
__vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<1)+12]));
|
||||||
|
__vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<1)+14]));
|
||||||
|
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||||
|
vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
|
||||||
|
vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
|
||||||
|
vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
|
||||||
|
vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
|
||||||
|
vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
|
||||||
|
vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
|
||||||
|
vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
|
||||||
|
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
|
||||||
|
}
|
||||||
|
for (l = (temp & (~7)); l < temp; ++l) {
|
||||||
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
|
||||||
|
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||||
|
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||||
|
}
|
||||||
|
SAVE_ACC_COMPLEX_11
|
||||||
|
AO += temp << 1;
|
||||||
|
BO += temp << 1;
|
||||||
|
CO += 2;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (1, 1)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||||
|
off += 1; // number of values in A
|
||||||
|
#endif
|
||||||
|
B += k << 1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -35,10 +35,10 @@ DASUMKERNEL = dasum.c
|
||||||
CASUMKERNEL = ../arm/zasum.c
|
CASUMKERNEL = ../arm/zasum.c
|
||||||
ZASUMKERNEL = zasum.c
|
ZASUMKERNEL = zasum.c
|
||||||
|
|
||||||
SSUMKERNEL = ../arm/asum.c
|
SSUMKERNEL = ../arm/sum.c
|
||||||
DSUMKERNEL = dasum.c
|
DSUMKERNEL = dsum.c
|
||||||
CSUMKERNEL = ../arm/zasum.c
|
CSUMKERNEL = ../arm/zsum.c
|
||||||
ZSUMKERNEL = zasum.c
|
ZSUMKERNEL = zsum.c
|
||||||
|
|
||||||
SAXPYKERNEL = ../arm/axpy.c
|
SAXPYKERNEL = ../arm/axpy.c
|
||||||
DAXPYKERNEL = daxpy.c
|
DAXPYKERNEL = daxpy.c
|
||||||
|
|
|
@ -21,7 +21,16 @@ endif()
|
||||||
if (BUILD_COMPLEX16)
|
if (BUILD_COMPLEX16)
|
||||||
list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3)
|
list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3)
|
||||||
endif()
|
endif()
|
||||||
message (STATUS CCOMP ${CMAKE_C_COMPILER_ID} FCOMP ${CMAKE_Fortran_COMPILER_ID})
|
|
||||||
|
if (USE_GEMM3M)
|
||||||
|
if (BUILD_COMPLEX)
|
||||||
|
list (APPEND OpenBLAS_Tests cblat3_3m)
|
||||||
|
endif ()
|
||||||
|
if (BUILD_COMPLEX16)
|
||||||
|
list (APPEND OpenBLAS_Tests zblat3_3m)
|
||||||
|
endif ()
|
||||||
|
endif ()
|
||||||
|
|
||||||
foreach(test_bin ${OpenBLAS_Tests})
|
foreach(test_bin ${OpenBLAS_Tests})
|
||||||
add_executable(${test_bin} ${test_bin}.f)
|
add_executable(${test_bin} ${test_bin}.f)
|
||||||
target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME})
|
target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME})
|
||||||
|
@ -82,4 +91,10 @@ add_test(NAME "${float_type}blas2"
|
||||||
COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat2> "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM)
|
COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat2> "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM)
|
||||||
add_test(NAME "${float_type}blas3"
|
add_test(NAME "${float_type}blas3"
|
||||||
COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat3> "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM)
|
COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat3> "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM)
|
||||||
|
if (USE_GEMM3M)
|
||||||
|
if ((${float_type} STREQUAL "c") OR (${float_type} STREQUAL "z"))
|
||||||
|
add_test(NAME "${float_type}blas3_3m"
|
||||||
|
COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat3_3m> "${PROJECT_SOURCE_DIR}/test/${float_type}blat3_3m.dat" ${float_type_upper}BLAT3_3M.SUMM)
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
endforeach()
|
endforeach()
|
||||||
|
|
|
@ -4,6 +4,24 @@ ifeq ($(F_COMPILER),GFORTRAN)
|
||||||
override FFLAGS += -fno-tree-vectorize
|
override FFLAGS += -fno-tree-vectorize
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
SUPPORT_GEMM3M = 0
|
||||||
|
|
||||||
|
ifeq ($(ARCH), x86)
|
||||||
|
SUPPORT_GEMM3M = 1
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ARCH), x86_64)
|
||||||
|
SUPPORT_GEMM3M = 1
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ARCH), ia64)
|
||||||
|
SUPPORT_GEMM3M = 1
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ARCH), MIPS)
|
||||||
|
SUPPORT_GEMM3M = 1
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(NOFORTRAN),1)
|
ifeq ($(NOFORTRAN),1)
|
||||||
all ::
|
all ::
|
||||||
else
|
else
|
||||||
|
@ -153,11 +171,20 @@ ifeq ($(BUILD_DOUBLE),1)
|
||||||
D3=dblat3
|
D3=dblat3
|
||||||
endif
|
endif
|
||||||
ifeq ($(BUILD_COMPLEX),1)
|
ifeq ($(BUILD_COMPLEX),1)
|
||||||
|
ifeq ($(SUPPORT_GEMM3M),1)
|
||||||
|
C3=cblat3 cblat3_3m
|
||||||
|
else
|
||||||
C3=cblat3
|
C3=cblat3
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
ifeq ($(BUILD_COMPLEX16),1)
|
ifeq ($(BUILD_COMPLEX16),1)
|
||||||
|
ifeq ($(SUPPORT_GEMM3M),1)
|
||||||
|
Z3=zblat3 zblat3_3m
|
||||||
|
else
|
||||||
Z3=zblat3
|
Z3=zblat3
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
level3: $(B3) $(S3) $(D3) $(C3) $(Z3)
|
level3: $(B3) $(S3) $(D3) $(C3) $(Z3)
|
||||||
|
|
||||||
|
|
|
@ -126,7 +126,7 @@ static float check_cgemv(char api, char order, char trans, blasint m, blasint n,
|
||||||
srand_generate(data_cgemv_t.y_test, m * inc_y * 2);
|
srand_generate(data_cgemv_t.y_test, m * inc_y * 2);
|
||||||
|
|
||||||
// Copy vector y for reference funcs
|
// Copy vector y for reference funcs
|
||||||
for (int i = 0; i < m * inc_y * 2; i++) {
|
for (i = 0; i < m * inc_y * 2; i++) {
|
||||||
data_cgemv_t.y_verify[i] = data_cgemv_t.y_test[i];
|
data_cgemv_t.y_verify[i] = data_cgemv_t.y_test[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1129,4 +1129,4 @@ CTEST(cgemv, c_api_xerbla_invalid_order_col_major)
|
||||||
int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
|
int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
|
||||||
ASSERT_EQUAL(TRUE, passed);
|
ASSERT_EQUAL(TRUE, passed);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -188,7 +188,7 @@ static float check_csbmv(char uplo, blasint n, blasint k, float *alpha, blasint
|
||||||
char trans = 'N';
|
char trans = 'N';
|
||||||
|
|
||||||
// Symmetric band packed matrix for sbmv
|
// Symmetric band packed matrix for sbmv
|
||||||
float a[lda * n * 2];
|
float *a = (float*) malloc(lda * n * 2 * sizeof(float));
|
||||||
|
|
||||||
// Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test
|
// Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test
|
||||||
srand_generate(data_csbmv.sp_matrix, n * (n + 1));
|
srand_generate(data_csbmv.sp_matrix, n * (n + 1));
|
||||||
|
@ -216,7 +216,7 @@ static float check_csbmv(char uplo, blasint n, blasint k, float *alpha, blasint
|
||||||
// Find the differences between output vector caculated by csbmv and cgemv
|
// Find the differences between output vector caculated by csbmv and cgemv
|
||||||
for (i = 0; i < n * inc_c * 2; i++)
|
for (i = 0; i < n * inc_c * 2; i++)
|
||||||
data_csbmv.c_test[i] -= data_csbmv.c_verify[i];
|
data_csbmv.c_test[i] -= data_csbmv.c_verify[i];
|
||||||
|
free(a);
|
||||||
// Find the norm of differences
|
// Find the norm of differences
|
||||||
return BLASFUNC(scnrm2)(&n, data_csbmv.c_test, &inc_c);
|
return BLASFUNC(scnrm2)(&n, data_csbmv.c_test, &inc_c);
|
||||||
}
|
}
|
||||||
|
@ -603,4 +603,4 @@ CTEST(csbmv, xerbla_lda_invalid)
|
||||||
int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
|
int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
|
||||||
ASSERT_EQUAL(TRUE, passed);
|
ASSERT_EQUAL(TRUE, passed);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -402,13 +402,14 @@ CTEST(idamin, min_idx_in_vec_tail){
|
||||||
CTEST(idamin, min_idx_in_vec_tail_inc_1){
|
CTEST(idamin, min_idx_in_vec_tail_inc_1){
|
||||||
blasint i;
|
blasint i;
|
||||||
blasint N = ELEMENTS, inc = 1;
|
blasint N = ELEMENTS, inc = 1;
|
||||||
double x[ELEMENTS * inc];
|
double *x = (double*)malloc(ELEMENTS * inc * sizeof(double));
|
||||||
for (i = 0; i < N * inc; i ++) {
|
for (i = 0; i < N * inc; i ++) {
|
||||||
x[i] = i + 1000;
|
x[i] = i + 1000;
|
||||||
}
|
}
|
||||||
|
|
||||||
x[(N - 1) * inc] = 0.0f;
|
x[(N - 1) * inc] = 0.0f;
|
||||||
blasint index = BLASFUNC(idamin)(&N, x, &inc);
|
blasint index = BLASFUNC(idamin)(&N, x, &inc);
|
||||||
|
free(x);
|
||||||
ASSERT_EQUAL(N, index);
|
ASSERT_EQUAL(N, index);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -775,13 +776,14 @@ CTEST(idamin, c_api_min_idx_in_vec_tail){
|
||||||
CTEST(idamin, c_api_min_idx_in_vec_tail_inc_1){
|
CTEST(idamin, c_api_min_idx_in_vec_tail_inc_1){
|
||||||
blasint i;
|
blasint i;
|
||||||
blasint N = ELEMENTS, inc = 1;
|
blasint N = ELEMENTS, inc = 1;
|
||||||
double x[ELEMENTS * inc];
|
double *x = (double*) malloc(ELEMENTS * inc * sizeof(double));
|
||||||
for (i = 0; i < N * inc; i ++) {
|
for (i = 0; i < N * inc; i ++) {
|
||||||
x[i] = i + 1000;
|
x[i] = i + 1000;
|
||||||
}
|
}
|
||||||
|
|
||||||
x[(N - 1) * inc] = 0.0;
|
x[(N - 1) * inc] = 0.0;
|
||||||
blasint index = cblas_idamin(N, x, inc);
|
blasint index = cblas_idamin(N, x, inc);
|
||||||
|
free(x);
|
||||||
ASSERT_EQUAL(N - 1, index);
|
ASSERT_EQUAL(N - 1, index);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -402,13 +402,14 @@ CTEST(isamin, min_idx_in_vec_tail){
|
||||||
CTEST(isamin, min_idx_in_vec_tail_inc_1){
|
CTEST(isamin, min_idx_in_vec_tail_inc_1){
|
||||||
blasint i;
|
blasint i;
|
||||||
blasint N = ELEMENTS, inc = 1;
|
blasint N = ELEMENTS, inc = 1;
|
||||||
float x[ELEMENTS * inc];
|
float *x = (float*) malloc(ELEMENTS * inc * sizeof(float));
|
||||||
for (i = 0; i < N * inc; i ++) {
|
for (i = 0; i < N * inc; i ++) {
|
||||||
x[i] = i + 1000;
|
x[i] = i + 1000;
|
||||||
}
|
}
|
||||||
|
|
||||||
x[(N - 1) * inc] = 0.0f;
|
x[(N - 1) * inc] = 0.0f;
|
||||||
blasint index = BLASFUNC(isamin)(&N, x, &inc);
|
blasint index = BLASFUNC(isamin)(&N, x, &inc);
|
||||||
|
free(x);
|
||||||
ASSERT_EQUAL(N, index);
|
ASSERT_EQUAL(N, index);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -775,13 +776,14 @@ CTEST(isamin, c_api_min_idx_in_vec_tail){
|
||||||
CTEST(isamin, c_api_min_idx_in_vec_tail_inc_1){
|
CTEST(isamin, c_api_min_idx_in_vec_tail_inc_1){
|
||||||
blasint i;
|
blasint i;
|
||||||
blasint N = ELEMENTS, inc = 1;
|
blasint N = ELEMENTS, inc = 1;
|
||||||
float x[ELEMENTS * inc];
|
float *x = (float*)malloc(ELEMENTS * inc * sizeof(float));
|
||||||
for (i = 0; i < N * inc; i ++) {
|
for (i = 0; i < N * inc; i ++) {
|
||||||
x[i] = i + 1000;
|
x[i] = i + 1000;
|
||||||
}
|
}
|
||||||
|
|
||||||
x[(N - 1) * inc] = 0.0f;
|
x[(N - 1) * inc] = 0.0f;
|
||||||
blasint index = cblas_isamin(N, x, inc);
|
blasint index = cblas_isamin(N, x, inc);
|
||||||
|
free(x);
|
||||||
ASSERT_EQUAL(N - 1, index);
|
ASSERT_EQUAL(N - 1, index);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -126,7 +126,7 @@ static double check_zgemv(char api, char order, char trans, blasint m, blasint n
|
||||||
drand_generate(data_zgemv_t.y_test, m * inc_y * 2);
|
drand_generate(data_zgemv_t.y_test, m * inc_y * 2);
|
||||||
|
|
||||||
// Copy vector y for reference funcs
|
// Copy vector y for reference funcs
|
||||||
for (int i = 0; i < m * inc_y * 2; i++)
|
for (i = 0; i < m * inc_y * 2; i++)
|
||||||
{
|
{
|
||||||
data_zgemv_t.y_verify[i] = data_zgemv_t.y_test[i];
|
data_zgemv_t.y_verify[i] = data_zgemv_t.y_test[i];
|
||||||
}
|
}
|
||||||
|
@ -1133,4 +1133,4 @@ CTEST(zgemv, c_api_xerbla_invalid_order_col_major)
|
||||||
int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
|
int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
|
||||||
ASSERT_EQUAL(TRUE, passed);
|
ASSERT_EQUAL(TRUE, passed);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -188,7 +188,7 @@ static double check_zsbmv(char uplo, blasint n, blasint k, double *alpha, blasin
|
||||||
char trans = 'N';
|
char trans = 'N';
|
||||||
|
|
||||||
// Symmetric band packed matrix for sbmv
|
// Symmetric band packed matrix for sbmv
|
||||||
double a[lda * n * 2];
|
double *a = (double*) malloc(lda * n * 2 * sizeof(double));
|
||||||
|
|
||||||
// Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test
|
// Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test
|
||||||
drand_generate(data_zsbmv.sp_matrix, n * (n + 1));
|
drand_generate(data_zsbmv.sp_matrix, n * (n + 1));
|
||||||
|
@ -213,6 +213,7 @@ static double check_zsbmv(char uplo, blasint n, blasint k, double *alpha, blasin
|
||||||
BLASFUNC(zsbmv)(&uplo, &n, &k, alpha, a, &lda,
|
BLASFUNC(zsbmv)(&uplo, &n, &k, alpha, a, &lda,
|
||||||
data_zsbmv.b_test, &inc_b, beta, data_zsbmv.c_test, &inc_c);
|
data_zsbmv.b_test, &inc_b, beta, data_zsbmv.c_test, &inc_c);
|
||||||
|
|
||||||
|
free(a);
|
||||||
// Find the differences between output vector caculated by zsbmv and zgemv
|
// Find the differences between output vector caculated by zsbmv and zgemv
|
||||||
for (i = 0; i < n * inc_c * 2; i++)
|
for (i = 0; i < n * inc_c * 2; i++)
|
||||||
data_zsbmv.c_test[i] -= data_zsbmv.c_verify[i];
|
data_zsbmv.c_test[i] -= data_zsbmv.c_verify[i];
|
||||||
|
@ -603,4 +604,4 @@ CTEST(zsbmv, xerbla_lda_invalid)
|
||||||
int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
|
int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
|
||||||
ASSERT_EQUAL(TRUE, passed);
|
ASSERT_EQUAL(TRUE, passed);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Reference in New Issue