Merge branch 'develop' of https://github.com/openmathlib/openblas into develop

Chip-Kerchner 2024-03-01 07:43:41 -06:00
commit edb7ab5ccf
31 changed files with 12191 additions and 69 deletions

View File

@ -42,6 +42,7 @@ jobs:
- name: Install Dependencies
run: |
if [ "$RUNNER_OS" == "Linux" ]; then
sudo apt-get update
sudo apt-get install -y gfortran cmake ccache libtinfo5
elif [ "$RUNNER_OS" == "macOS" ]; then
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.

View File

@ -24,6 +24,8 @@ option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, d
option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF)
option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
@ -328,7 +330,7 @@ if (NOT NOFORTRAN)
# Build test and ctest
add_subdirectory(test)
endif()
if (BUILD_TESTING)
if (BUILD_TESTING AND NOT BUILD_WITHOUT_LAPACK)
add_subdirectory(lapack-netlib/TESTING)
endif()
endif()
@ -458,6 +460,61 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
endif()
endif()
if (BUILD_BENCHMARKS)
#find_package(OpenMP REQUIRED)
file(GLOB SOURCES "benchmark/*.c")
if (NOT USE_OPENMP)
file(GLOB REMFILE "benchmark/smallscaling.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
endif()
if (BUILD_WITHOUT_LAPACK)
file(GLOB REMFILE "benchmark/cholesky.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
file(GLOB REMFILE "benchmark/geev.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
file(GLOB REMFILE "benchmark/gesv.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
file(GLOB REMFILE "benchmark/getri.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
file(GLOB REMFILE "benchmark/potrf.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
file(GLOB REMFILE "benchmark/spmv.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
file(GLOB REMFILE "benchmark/symv.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
file(GLOB REMFILE "benchmark/linpack.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
endif()
if (NOT USE_GEMM3M)
file(GLOB REMFILE "benchmark/gemm3m.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
endif()
foreach(source ${SOURCES})
get_filename_component(name ${source} NAME_WE)
if ((NOT ${name} STREQUAL "zdot-intel") AND (NOT ${name} STREQUAL "cula_wrapper"))
set(defines DEFAULT COMPLEX DOUBLE "COMPLEX\;DOUBLE")
foreach(define ${defines})
set(target_name "benchmark_${name}")
if (NOT "${define}" STREQUAL "DEFAULT")
string(JOIN "_" define_str ${define})
set(target_name "${target_name}_${define_str}")
endif()
if ((NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX_DOUBLE") AND
(NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX_DOUBLE") AND
(NOT ${target_name} STREQUAL "benchmark_max_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX_DOUBLE") AND
(NOT ${target_name} STREQUAL "benchmark_min_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX_DOUBLE"))
add_executable(${target_name} ${source})
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} )
# target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} OpenMP::OpenMP_C)
if (NOT "${define}" STREQUAL "DEFAULT")
target_compile_definitions(${target_name} PRIVATE ${define})
endif()
endif()
endforeach()
endif()
endforeach()
endif()
# Install project
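For orientation: each target generated by the BUILD_BENCHMARKS loop above compiles one benchmark/*.c file into a standalone driver linked against the library, with the COMPLEX/DOUBLE compile definitions selecting the precision per target. A minimal sketch of such a driver follows; it is illustrative only (the matrix size, the timing scaffold, and the choice of cblas_dgemm are assumptions of this sketch, not taken from this diff):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cblas.h>

int main(void) {
    const int n = 512;                      /* problem size (illustrative) */
    double *a = malloc(sizeof(double) * n * n);
    double *b = malloc(sizeof(double) * n * n);
    double *c = malloc(sizeof(double) * n * n);
    if (!a || !b || !c) return 1;
    for (int i = 0; i < n * n; i++) { a[i] = 1.0; b[i] = 2.0; c[i] = 0.0; }

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    /* C = alpha*A*B + beta*C, column-major */
    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
                n, n, n, 1.0, a, n, b, n, 0.0, c, n);
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double s = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
    printf("dgemm %dx%d: %.3f s, %.2f GFLOPS\n",
           n, n, s, 2.0 * n * n * n / s * 1e-9);
    free(a); free(b); free(c);
    return 0;
}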

View File

@ -1520,10 +1520,18 @@ ifndef LIBNAMEPREFIX
LIBNAMEPREFIX =
endif
SYMPREFIX=$(SYMBOLPREFIX)
ifeq ($(SYMBOLPREFIX),$(LIBNAMEPREFIX))
SYMPREFIX=
endif
SYMSUFFIX=$(SYMBOLSUFFIX)
ifeq ($(SYMBOLSUFFIX),$(LIBNAMESUFFIX))
SYMSUFFIX=
endif
ifndef LIBNAMESUFFIX
LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)
LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)
else
LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)$(LIBNAMESUFFIX)
LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)$(LIBNAMESUFFIX)
endif
ifeq ($(OSNAME), CYGWIN_NT)

View File

@ -88,6 +88,17 @@ if (NOT NOFORTRAN)
auxiliary.c
c_xerbla.c
constant.c)
if (USE_GEMM3M)
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
add_executable(x${float_char}cblat3_3m
c_${float_char}blat3_3m.f
c_${float_char}blas3_3m.c
c_${float_char}3chke_3m.c
auxiliary.c
c_xerbla.c
constant.c)
endif()
endif()
else()
add_executable(x${float_char}cblat3
c_${float_char}blat3c.c
@ -96,6 +107,17 @@ else()
auxiliary.c
c_xerbla.c
constant.c)
if (USE_GEMM3M)
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
add_executable(x${float_char}cblat3_3m
c_${float_char}blat3c_3m.c
c_${float_char}blas3_3m.c
c_${float_char}3chke_3m.c
auxiliary.c
c_xerbla.c
constant.c)
endif()
endif()
endif()
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
@ -105,7 +127,24 @@ endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
target_link_libraries(x${float_char}cblat3 m)
endif()
if (USE_GEMM3M)
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME})
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
target_link_libraries(x${float_char}cblat3_3m omp pthread)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
target_link_libraries(x${float_char}cblat3_3m m)
endif()
endif()
endif()
add_test(NAME "x${float_char}cblat3"
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
if (USE_GEMM3M)
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
add_test(NAME "x${float_char}cblat3_3m"
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3_3m> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3_3m")
endif()
endif()
endforeach()

View File

@ -5,6 +5,24 @@
TOPDIR = ..
include $(TOPDIR)/Makefile.system
SUPPORT_GEMM3M = 0
ifeq ($(ARCH), x86)
SUPPORT_GEMM3M = 1
endif
ifeq ($(ARCH), x86_64)
SUPPORT_GEMM3M = 1
endif
ifeq ($(ARCH), ia64)
SUPPORT_GEMM3M = 1
endif
ifeq ($(ARCH), MIPS)
SUPPORT_GEMM3M = 1
endif
override CFLAGS += -DADD$(BU) -DCBLAS
ifeq ($(F_COMPILER),GFORTRAN)
override FFLAGS += -fno-tree-vectorize
@ -144,9 +162,15 @@ all3targets += xdcblat3
endif
ifeq ($(BUILD_COMPLEX),1)
all3targets += xccblat3
ifeq ($(SUPPORT_GEMM3M),1)
all3targets += xccblat3_3m
endif
endif
ifeq ($(BUILD_COMPLEX16),1)
all3targets += xzcblat3
ifeq ($(SUPPORT_GEMM3M),1)
all3targets += xzcblat3_3m
endif
endif
all3: $(all3targets)
@ -181,9 +205,9 @@ endif
endif
endif
all3_3m: xzcblat3_3m xccblat3_3m
ifeq ($(SUPPORT_GEMM3M),1)
ifeq ($(USE_OPENMP), 1)
ifeq ($(BUILD_SINGLE),1)
ifeq ($(BUILD_COMPLEX),1)
OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
endif
ifeq ($(BUILD_COMPLEX16),1)
@ -197,6 +221,7 @@ ifeq ($(BUILD_COMPLEX16),1)
OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
endif
endif
endif
@ -271,8 +296,10 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
ifeq ($(SUPPORT_GEMM3M),1)
xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
endif
else
xccblat1: $(ctestl1o) c_cblat1c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xccblat1 c_cblat1c.o $(ctestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
@ -280,6 +307,10 @@ xccblat2: $(ctestl2o) c_cblat2c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xccblat2 c_cblat2c.o $(ctestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
xccblat3: $(ctestl3o) c_cblat3c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xccblat3 c_cblat3c.o $(ctestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
ifeq ($(SUPPORT_GEMM3M),1)
xccblat3_3m: $(ctestl3o_3m) c_cblat3c_3m.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xccblat3_3m c_cblat3c_3m.o $(ctestl3o_3m) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
endif
endif
endif
@ -293,8 +324,10 @@ xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
ifeq ($(SUPPORT_GEMM3M),1)
xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
endif
else
xzcblat1: $(ztestl1o) c_zblat1c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xzcblat1 c_zblat1c.o $(ztestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
@ -302,6 +335,10 @@ xzcblat2: $(ztestl2o) c_zblat2c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xzcblat2 c_zblat2c.o $(ztestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
xzcblat3: $(ztestl3o) c_zblat3c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xzcblat3 c_zblat3c.o $(ztestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
ifeq ($(SUPPORT_GEMM3M),1)
xzcblat3_3m: $(ztestl3o_3m) c_zblat3c_3m.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xzcblat3_3m c_zblat3c_3m.o $(ztestl3o_3m) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
endif
endif
endif
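
The new xccblat3_3m / xzcblat3_3m targets exercise the 3M variant of complex matrix multiply, which trades four real matrix products per complex GEMM for three plus extra additions. Calling it through CBLAS looks roughly like the sketch below (cgemm3m is an OpenBLAS-specific extension; the header name and its availability in a given build are assumptions of this sketch):

#include <complex.h>
#include <cblas.h>           /* OpenBLAS CBLAS header (assumed) */

int main(void) {
    enum { N = 2 };
    /* column-major 2x2 operands */
    float complex a[N * N] = { 1.0f + 1.0f * I, 0.0f, 0.0f, 1.0f - 1.0f * I };
    float complex b[N * N] = { 2.0f, 0.0f, 0.0f, 2.0f };
    float complex c[N * N] = { 0.0f };
    float complex alpha = 1.0f, beta = 0.0f;

    /* same interface as cblas_cgemm, but computed with the 3M algorithm */
    cblas_cgemm3m(CblasColMajor, CblasNoTrans, CblasNoTrans,
                  N, N, N, &alpha, a, N, b, N, &beta, c, N);
    return 0;
}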

ctest/c_cblat3c_3m.c (new file, 3942 additions)

File diff suppressed because it is too large.

ctest/c_zblat3c_3m.c (new file, 3951 additions)

File diff suppressed because it is too large.

View File

@ -85,6 +85,12 @@ ZSWAPKERNEL = cswap_lsx.S
CSUMKERNEL = csum_lsx.S
ZSUMKERNEL = csum_lsx.S
SGEMVNKERNEL = sgemv_n_lsx.S
SGEMVTKERNEL = sgemv_t_lsx.S
DGEMVNKERNEL = dgemv_n_lsx.S
DGEMVTKERNEL = dgemv_t_lsx.S
DGEMMKERNEL = dgemm_kernel_8x4.S
DGEMMINCOPY = dgemm_ncopy_8_lsx.S
DGEMMITCOPY = dgemm_tcopy_8_lsx.S
@ -100,6 +106,9 @@ DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CGEMVNKERNEL = cgemv_n_4_lsx.S
CGEMVTKERNEL = cgemv_t_4_lsx.S
CGEMMKERNEL = cgemm_kernel_8x4_lsx.S
CGEMMINCOPY = cgemm_ncopy_8_lsx.S
CGEMMITCOPY = cgemm_tcopy_8_lsx.S
@ -115,6 +124,9 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZGEMVNKERNEL = zgemv_n_2_lsx.S
ZGEMVTKERNEL = zgemv_t_2_lsx.S
ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S
ZGEMMONCOPY = zgemm_ncopy_4_lsx.S
ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S

View File

@ -0,0 +1,323 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
*/
#define M $r4
#define N $r5
#define ALPHA_R $f0
#define ALPHA_I $f1
#define A $r7
#define LDA $r8
#define X $r9
#define INC_X $r10
#define Y $r11
#define INC_Y $r6
#define J $r12
#define I $r13
#define K $r14
#define Y_ORG $r15
#define OFFSET $r16
#define K_LDA $r17
#define M8 $r18
#define T0 $r19
#define PA0 $r20
#define PA1 $r23
#define PA2 $r24
#define PA3 $r25
#define PA4 $r26
#define PA5 $r27
#define PA6 $r28
#define PA7 $r29
#define VALPHA $vr1
#define X0 $vr2
#define X1 $vr3
#define X2 $vr4
#define X3 $vr5
#define X4 $vr6
#define X5 $vr7
#define X6 $vr8
#define X7 $vr9
#define Y0 $vr10
#define Y1 $vr11
#define A0 $vr12
#define A1 $vr13
#define A2 $vr14
#define A3 $vr15
#define A4 $vr16
#define A5 $vr17
#define A6 $vr18
#define A7 $vr19
#define A8 $vr20
#define A9 $vr21
#define A10 $vr22
#define A11 $vr23
#define A12 $vr24
#define A13 $vr25
#define A14 $vr26
#define A15 $vr27
#define TMP0 $vr28
#define TMP1 $vr29
#define TMP2 $vr30
#if !defined(CONJ)
#if !defined(XCONJ)
#define GXCONJ 0
#define GCONJ 0
#else
#define GXCONJ 1
#define GCONJ 0
#endif
#else
#if !defined(XCONJ)
#define GXCONJ 0
#define GCONJ 1
#else
#define GXCONJ 1
#define GCONJ 1
#endif
#endif
.macro CLOAD_X_4
GLDREPL v, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18
GCOMPLEXMUL GXCONJ, \
vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
X1, VALPHA, X1, TMP0, TMP1, TMP2, \
X2, VALPHA, X2, TMP0, TMP1, TMP2, \
X3, VALPHA, X3, TMP0, TMP1, TMP2
.endm
.macro CLOAD_X_4_GAP
vldrepl.d X0, X, 0x00
PTR_ADD T0, X, INC_X
vldrepl.d X1, T0, 0x00
PTR_ADD T0, T0, INC_X
vldrepl.d X2, T0, 0x00
PTR_ADD T0, T0, INC_X
vldrepl.d X3, T0, 0x00
GCOMPLEXMUL GXCONJ, \
vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
X1, VALPHA, X1, TMP0, TMP1, TMP2, \
X2, VALPHA, X2, TMP0, TMP1, TMP2, \
X3, VALPHA, X3, TMP0, TMP1, TMP2
.endm
.macro CLOAD_X_1
GLDREPL v, d, X0, X, 0x00
GCOMPLEXMUL GXCONJ, \
vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2
.endm
.macro CLOAD_Y_4
GLD v, , Y0, Y, 0, Y1, Y, 0x10
.endm
.macro CLOAD_Y_4_GAP
fld.d $f10, Y, 0
fldx.d $f13, Y, INC_Y
PTR_ALSL T0, INC_Y, Y, 1
fld.d $f11, T0, 0
fldx.d $f17, T0, INC_Y
vpackev.d Y0, A1, Y0
vpackev.d Y1, A5, Y1
.endm
.macro CLOAD_Y_1
fld.d $f10, Y, 0
.endm
.macro CSTORE_Y_4
GST v, , Y0, Y, 0, Y1, Y, 0x10
.endm
.macro CSTORE_Y_4_GAP
vstelm.d Y0, Y, 0, 0
PTR_ADD T0, Y, INC_Y
vstelm.d Y0, T0, 0, 1
PTR_ADD T0, T0, INC_Y
vstelm.d Y1, T0, 0, 0
PTR_ADD T0, T0, INC_Y
vstelm.d Y1, T0, 0, 1
.endm
.macro CSTORE_Y_1
fst.d $f10, Y, 0
.endm
.macro CGEMV_N_4x4
GLD_INC v, , 0x10, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0, \
A4, PA2, 0, A5, PA2, 0, \
A6, PA3, 0, A7, PA3, 0
GCOMPLEXMADD GXCONJ, GCONJ, \
vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \
Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \
Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2
.endm
.macro CGEMV_N_1x4
GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0
GCOMPLEXMADD GXCONJ, GCONJ, \
vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \
Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \
Y0, X3, A6, Y0, TMP0, TMP1, TMP2
.endm
.macro CGEMV_N_1x1
fld.d $f12, PA0, 0
PTR_ADDI PA0, PA0, 0x08
GCOMPLEXMADD GXCONJ, GCONJ, \
vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
.endm
.macro CGEMV_N_LSX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req
PTR_SRLI J, N, 2
beqz J, .L_\XW\()_N_3
PTR_SLLI K_LDA, LDA, 2
PTR_SUB K_LDA, K_LDA, M8
.L_\XW\()_N_L4:
CLOAD_\X_4
xor K, K, K
move Y, Y_ORG
PTR_SRLI I, M, 2
beqz I, .L_\XW\()_M_3
.align 5
.L_\XW\()_M_L4:
CLOAD_\Y_4
CGEMV_N_4x4
CSTORE_\Y_4
PTR_ADDI I, I, -1
PTR_ALSL Y, INC_Y, Y, 2
PTR_ADDI K, K, 4
bnez I, .L_\XW\()_M_L4
.L_\XW\()_M_3:
andi I, M, 3
beqz I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
CLOAD_\Y_1
CGEMV_N_1x4
CSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#elif __loongarch_grlen == 32
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#else
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#endif
PTR_ALSL X, INC_X, X, 2
bnez J, .L_\XW\()_N_L4
.L_\XW\()_N_3:
andi J, N, 3
beqz J, .L_END
.L_\XW\()_N_L1:
CLOAD_\X_1
xor K, K, K
move Y, Y_ORG
move I, M
beqz I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
CLOAD_\Y_1
CGEMV_N_1x1
CSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
PTR_ADDI J, J, -1
PTR_SUB K_LDA, LDA, M8
PTR_ADD PA0, PA0, K_LDA
PTR_ADD X, X, INC_X
bnez J, .L_\XW\()_N_L1
b .L_END
.endm
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 7, 31
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
maskeqz J, K, J /* if(inc_y == 1) J = 0; else J = 1; */
PTR_ALSL I, I, J, 1
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
// Init VALPHA
vpackev.w $vr0, $vr1, $vr0
vpackev.d VALPHA, $vr0, $vr0
move Y_ORG, Y
move PA0, A
#if __loongarch_grlen == 64
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#elif __loongarch_grlen == 32
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#else
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#endif
la.local T0, .L_GAP_TABLE
PTR_ALSL I, I, T0, 1
ld.h K, I, 0 // Obtain the offset address
PTR_ADD T0, T0, K
jirl $r0, T0, 0
.L_GAP_TABLE:
.hword .L_GAP_0_0 - .L_GAP_TABLE
.hword .L_GAP_0_1 - .L_GAP_TABLE
.hword .L_GAP_1_0 - .L_GAP_TABLE
.hword .L_GAP_1_1 - .L_GAP_TABLE
.L_GAP_0_0: /* inc_x == 1 && inc_y == 1 */
CGEMV_N_LSX GAP_0_0, X_4, X_1, Y_4, Y_1
.L_GAP_0_1: /* inc_x == 1 && inc_y != 1 */
CGEMV_N_LSX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1
.L_GAP_1_0: /* inc_x != 1 && inc_y == 1 */
CGEMV_N_LSX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1
.L_GAP_1_1: /* inc_x != 1 && inc_y != 1 */
CGEMV_N_LSX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1
.L_END:
pop_if_used 17 + 7, 31
jirl $r0, $r1, 0x0
EPILOGUE
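
Reading aid: stripped of the 4x4 tiling, the gap (strided) paths, and the CONJ/XCONJ variants, the operation this cgemv_n LSX kernel implements is the following scalar reference. This is a sketch with unit strides and column-major A (as the per-column PA0..PA3 pointers above suggest); the function name is illustrative, not the kernel's entry point:

#include <complex.h>
#include <stddef.h>

/* y += alpha * A * x; A is m x n, column-major with leading dimension lda.
   The kernel pre-scales x by alpha (CLOAD_X_*), then accumulates 4x4 tiles
   (CGEMV_N_4x4); this loop is the unblocked equivalent. */
void cgemv_n_ref(int m, int n, float complex alpha,
                 const float complex *a, int lda,
                 const float complex *x, float complex *y)
{
    for (int j = 0; j < n; j++) {
        float complex t = alpha * x[j];
        for (int i = 0; i < m; i++)
            y[i] += t * a[i + (size_t)j * lda];
    }
}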

View File

@ -122,14 +122,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
GCOMPLEXMUL GXCONJ, \
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
X1, X1, VALPHA, TMP0, TMP1, TMP2, \
X2, X2, VALPHA, TMP0, TMP1, TMP2, \
X3, X3, VALPHA, TMP0, TMP1, TMP2, \
X4, X4, VALPHA, TMP0, TMP1, TMP2, \
X5, X5, VALPHA, TMP0, TMP1, TMP2, \
X6, X6, VALPHA, TMP0, TMP1, TMP2, \
X7, X7, VALPHA, TMP0, TMP1, TMP2
xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
X1, VALPHA, X1, TMP0, TMP1, TMP2, \
X2, VALPHA, X2, TMP0, TMP1, TMP2, \
X3, VALPHA, X3, TMP0, TMP1, TMP2, \
X4, VALPHA, X4, TMP0, TMP1, TMP2, \
X5, VALPHA, X5, TMP0, TMP1, TMP2, \
X6, VALPHA, X6, TMP0, TMP1, TMP2, \
X7, VALPHA, X7, TMP0, TMP1, TMP2
.endm
.macro CLOAD_X_8_GAP
@ -150,14 +150,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvldrepl.d X7, T0, 0x00
GCOMPLEXMUL GXCONJ, \
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
X1, X1, VALPHA, TMP0, TMP1, TMP2, \
X2, X2, VALPHA, TMP0, TMP1, TMP2, \
X3, X3, VALPHA, TMP0, TMP1, TMP2, \
X4, X4, VALPHA, TMP0, TMP1, TMP2, \
X5, X5, VALPHA, TMP0, TMP1, TMP2, \
X6, X6, VALPHA, TMP0, TMP1, TMP2, \
X7, X7, VALPHA, TMP0, TMP1, TMP2
xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
X1, VALPHA, X1, TMP0, TMP1, TMP2, \
X2, VALPHA, X2, TMP0, TMP1, TMP2, \
X3, VALPHA, X3, TMP0, TMP1, TMP2, \
X4, VALPHA, X4, TMP0, TMP1, TMP2, \
X5, VALPHA, X5, TMP0, TMP1, TMP2, \
X6, VALPHA, X6, TMP0, TMP1, TMP2, \
X7, VALPHA, X7, TMP0, TMP1, TMP2
.endm
.macro CLOAD_Y_8
@ -228,7 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro CLOAD_X_1
GLDREPL xv, d, X0, X, 0x00
GCOMPLEXMUL GXCONJ, \
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2
xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2
.endm
.macro CLOAD_Y_1

View File

@ -0,0 +1,290 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
*/
#define M $r4
#define N $r5
#define ALPHA_R $f0
#define ALPHA_I $f1
#define A $r7
#define LDA $r8
#define X $r9
#define INC_X $r10
#define Y $r11
#define INC_Y $r6
#define J $r12
#define I $r13
#define K $r14
#define PY0 $r14
#define X_ORG $r15
#define PY1 $r16
#define K_LDA $r17
#define PY2 $r18
#define T0 $r19
#define PA0 $r20
#define PA1 $r23
#define PA2 $r24
#define PA3 $r25
#define PA4 $r26
#define PA5 $r27
#define PA6 $r28
#define PA7 $r29
#define M8 $r30
#define VALPHA $vr0
#define X0 $vr1
#define X1 $vr2
#define A0 $vr3
#define A1 $vr4
#define A2 $vr5
#define A3 $vr6
#define A4 $vr7
#define A5 $vr8
#define A6 $vr9
#define A7 $vr10
#define A8 $vr11
#define A9 $vr12
#define A10 $vr13
#define A11 $vr14
#define A12 $vr15
#define A13 $vr16
#define A14 $vr17
#define A15 $vr18
#define TP0 $vr19
#define TP1 $vr20
#define TP2 $vr21
#define TP3 $vr22
#define TP4 $vr23
#define TP5 $vr24
#define TP6 $vr25
#define TP7 $vr26
#define TMP0 $vr27
#define TMP1 $vr28
#define TMP2 $vr29
#define Y0 $vr3
#define Y1 $vr4
#define Y2 $vr5
#define Y3 $vr6
#define Y4 $vr7
#define Y5 $vr8
#define Y6 $vr9
#define Y7 $vr10
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
#define GXCONJ1 0
#define GCONJ1 0
#else
#define GXCONJ1 1
#define GCONJ1 0
#endif
#if !defined(XCONJ)
#define GXCONJ2 0
#define GCONJ2 0
#else
#define GXCONJ2 0
#define GCONJ2 1
#endif
.macro ZERO_Y4
GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3
.endm
.macro ZERO_Y1
GXOR v, v, TP0, TP0, TP0
.endm
.macro CLOAD_X4
GLD v, , X0, X, 0x00, X1, X, 0x10
.endm
.macro CLOAD_X4_GAP
fld.d $f1, X, 0x00
fldx.d $f3, X, INC_X
PTR_ALSL T0, INC_X, X, 1
fld.d $f2, T0, 0x00
fldx.d $f4, T0, INC_X
vpackev.d X0, A0, X0
vpackev.d X1, A1, X1
.endm
.macro CGEMV_T_4x4
GLD_INC v, , 0x10, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0, \
A4, PA2, 0, A5, PA2, 0, \
A6, PA3, 0, A7, PA3, 0
GCOMPLEXMADD GXCONJ1, GCONJ1, \
vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \
TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \
TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \
TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2
.endm
.macro CGEMV_T_LSX XW:req, X4:req
PTR_SRLI J, N, 2
beqz J, .L_\XW\()_N_3
PTR_SLLI K_LDA, LDA, 2
PTR_SUB K_LDA, K_LDA, M8
.L_\XW\()_N_L4:
ZERO_Y4
move X, X_ORG
PTR_SRLI I, M, 2
beqz I, .L_\XW\()_M_3
.align 5
.L_\XW\()_M_L4:
CLOAD_\X4
CGEMV_T_4x4
PTR_ADDI I, I, -1
PTR_ALSL X, INC_X, X, 2
bnez I, .L_\XW\()_M_L4
.L_\XW\()_M_3:
// Accumulate (reduce) the vector partial sums
GCOMPLEXACC vf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
andi I, M, 3
beqz I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
fld.d $f1, X, 0x00
fld.d $f11, PA0, 0x00
fld.d $f12, PA1, 0x00
fld.d $f13, PA2, 0x00
fld.d $f14, PA3, 0x00
#if __loongarch_grlen == 64
GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
#elif __loongarch_grlen == 32
GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
#else
GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
#endif
GCOMPLEXMADD GXCONJ1, GCONJ1, \
vf, s, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \
A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
bnez I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
fld.d $f11, Y, 0x00
fldx.d $f12, Y, INC_Y
PTR_ALSL PY0, INC_Y, Y, 1
fld.d $f13, PY0, 0x00
fldx.d $f14, PY0, INC_Y
GCOMPLEXMADD GXCONJ2, GCONJ2, \
vf, s, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\
A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2
PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#elif __loongarch_grlen == 32
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#else
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#endif
fst.d $f11, Y, 0x00
fstx.d $f12, Y, INC_Y
fst.d $f13, PY0, 0x00
fstx.d $f14, PY0, INC_Y
PTR_ALSL Y, INC_Y, Y, 2
bnez J, .L_\XW\()_N_L4
.L_\XW\()_N_3:
andi J, N, 3
beqz J, .L_END
PTR_SUB K_LDA, LDA, M8
.L_\XW\()_N_1:
ZERO_Y1
move X, X_ORG
move I, M
beqz I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
fld.d $f3, PA0, 0x00
fld.d $f1, X, 0x00
GCOMPLEXMADD GXCONJ1, GCONJ1, \
vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
PTR_ADDI PA0, PA0, 0x08
bnez I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
PTR_ADDI J, J, -1
fld.d $f3, Y, 0x00
GCOMPLEXMADD GXCONJ2, GCONJ2, \
vf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
fst.d $f3, Y, 0x00
PTR_ADD PA0, PA0, K_LDA
PTR_ADD Y, Y, INC_Y
bnez J, .L_\XW\()_N_1
b .L_END
.endm
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 8, 30
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
// Init VALPHA
vpackev.w $vr0, $vr1, $vr0
vpackev.d VALPHA, $vr0, $vr0
move X_ORG, X
move PA0, A
#if __loongarch_grlen == 64
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#elif __loongarch_grlen == 32
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#else
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#endif
la.local T0, .L_GAP_TABLE
PTR_ALSL I, I, T0, 1
ld.h K, I, 0
PTR_ADD T0, T0, K
jirl $r0, T0, 0
.L_GAP_TABLE:
.hword .L_GAP_0 - .L_GAP_TABLE
.hword .L_GAP_1 - .L_GAP_TABLE
.L_GAP_0: /* inc_x == 1 */
CGEMV_T_LSX GAP_0, X4
.L_GAP_1: /* inc_x != 1 */
CGEMV_T_LSX GAP_1, X4_GAP
.L_END:
pop_if_used 17 + 8, 30
jirl $r0, $r1, 0x0
EPILOGUE
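
Likewise, the cgemv_t kernel above accumulates per-column dot products (ZERO_Y*, CGEMV_T_4x4, then the GCOMPLEXACC reduction) and applies alpha only when storing to y. The scalar equivalent is the sketch below (unit strides, conjugation variants omitted, names illustrative):

#include <complex.h>
#include <stddef.h>

/* y += alpha * A^T * x (A^H when CONJ is defined); A is m x n, column-major. */
void cgemv_t_ref(int m, int n, float complex alpha,
                 const float complex *a, int lda,
                 const float complex *x, float complex *y)
{
    for (int j = 0; j < n; j++) {
        float complex t = 0.0f;                  /* ZERO_Y* clears accumulators */
        for (int i = 0; i < m; i++)
            t += a[i + (size_t)j * lda] * x[i];  /* CGEMV_T_4x4 dot step */
        y[j] += alpha * t;                       /* alpha applied at the store */
    }
}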

View File

@ -0,0 +1,229 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Param */
#define M $r4
#define N $r5
#define A $r7
#define LDA $r8
#define X $r9
#define INCX $r10
#define Y $r11
#define INCY $r6
#define BUFFER $r16
#define ALPHA $f0
#define YORIG $r18
#define T0 $r19
#define T1 $r20
#define XX $r12
#define YY $r13
#define I $r14
#define J $r15
#define AO1 $r23
#define AO2 $r24
#define IX $r25
#define IY $r26
#define II $r27
#define T2 $r28
#define T3 $r29
#define T4 $r30
/* LSX vectors */
#define U0 $vr11
#define U1 $vr12
#define U2 $vr2
#define U3 $vr3
#define U4 $vr4
#define U5 $vr5
#define U6 $vr6
#define U7 $vr7
#define U8 $vr8
#define U9 $vr9
#define VALPHA $vr10
#define a1 $f3
#define a2 $f4
#define a3 $f5
#define a4 $f6
#define a5 $f7
#define a6 $f8
#define a7 $f9
#define a8 $f10
PROLOGUE
LDARG INCY, $sp, 0
LDARG BUFFER, $sp, 8
addi.d $sp, $sp, -80
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 32
SDARG $r27, $sp, 40
SDARG $r28, $sp, 48
SDARG $r29, $sp, 56
SDARG $r30, $sp, 64
ST ALPHA, $sp, 72
vldrepl.d VALPHA, $sp, 72
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
bge $r0, M, .L999
bge $r0, N, .L999
move J, $r0
move IX, $r0
move AO1, A //a_ptr
move XX, X
move YY, Y
beq J, M, .L999
.L01:
vldx U0, XX, IX
vshuf4i.d U0, U0, 0x00
vfmul.d U1, VALPHA, U0 //temp1
move IY, $r0
move II, $r0
move I, $r0
srai.d T0, M, 2 //m/4
beq I, T0, .L03
.L02:
vldx U2, AO1, II
addi.d II, II, 16
vldx U7, AO1, II
move T1, IY
add.d T2, T1, INCY
add.d T3, T2, INCY
add.d T4, T3, INCY
fldx.d a1, YY, T1
fldx.d a2, YY, T2
fldx.d a3, YY, T3
fldx.d a4, YY, T4
vextrins.d U3, U4, 0x10
vextrins.d U5, U6, 0x10
vfmadd.d U3, U1, U2, U3
vfmadd.d U5, U1, U7, U5
vextrins.d U4, U3, 0x01
vextrins.d U6, U5, 0x01
fstx.d a1, YY, T1
fstx.d a2, YY, T2
fstx.d a3, YY, T3
fstx.d a4, YY, T4
add.d IY, T4, INCY
addi.d II, II, 16
addi.d I, I, 1
blt I, T0, .L02
.L03:
andi T0, M, 2
beq $r0, T0, .L04
addi.d T1, $r0, 4
mod.d T1, M, T1
sub.d II, M, T1
slli.d II, II, BASE_SHIFT
move T1, IY
add.d T2, T1, INCY
vldx U2, AO1, II
fldx.d a1, YY, T1
fldx.d a2, YY, T2
vextrins.d U3, U4, 0x10
vfmadd.d U3, U1, U2, U3
vextrins.d U4, U3, 0x01
fstx.d a1, YY, T1
fstx.d a2, YY, T2
add.d IY, T2, INCY
.L04:
andi T0, M, 1
beq $r0, T0, .L05
addi.d II, M, -1
slli.d II, II, BASE_SHIFT
fldx.d a1, AO1, II
fldx.d a3, YY, IY
fmadd.d a3, $f12, a1, a3
fstx.d a3, YY, IY
add.d IY, IY, INCY
.L05:
add.d AO1, AO1, LDA
add.d IX, IX, INCX
addi.d J, J, 1
blt J, N, .L01
.L999:
LDARG $r23, $sp, 0
LDARG $r24, $sp, 8
LDARG $r25, $sp, 16
LDARG $r26, $sp, 32
LDARG $r27, $sp, 40
LDARG $r28, $sp, 48
LDARG $r29, $sp, 56
LDARG $r30, $sp, 64
LD ALPHA, $sp, 72
addi.d $sp, $sp, 80
jirl $r0, $r1, 0x0
EPILOGUE
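
The real-valued dgemv_n kernel above follows the same pattern without complex arithmetic, but it keeps explicit byte offsets (IX, IY, II) so arbitrary incx/incy work; the transposed variants further below accumulate dot products instead. Its scalar shape is roughly (a sketch; names illustrative):

/* y += alpha * A * x, double precision, column-major, strided vectors. */
void dgemv_n_ref(long m, long n, double alpha,
                 const double *a, long lda,
                 const double *x, long incx,
                 double *y, long incy)
{
    for (long j = 0; j < n; j++) {
        double t = alpha * x[j * incx];      /* "temp1" in the kernel */
        for (long i = 0; i < m; i++)
            y[i * incy] += t * a[i + j * lda];
    }
}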

View File

@ -0,0 +1,279 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Param */
#define M $r4
#define N $r5
#define A $r7
#define LDA $r8
#define X $r9
#define INCX $r10
#define Y $r11
#define INCY $r6
#define BUFFER $r16
#define ALPHA $f0
#define YORIG $r18
#define T0 $r19
#define T1 $r20
#define AO3 $r12
#define AO4 $r13
#define I $r14
#define J $r15
#define AO1 $r23
#define AO2 $r24
#define IX $r25
#define IY $r26
#define II $r27
#define T2 $r28
#define T3 $r29
#define T4 $r30
/* LSX vectors */
#define U0 $vr11
#define U1 $vr12
#define U2 $vr2
#define U3 $vr3
#define U4 $vr4
#define U5 $vr5
#define U6 $vr6
#define U7 $vr7
#define U8 $vr8
#define U9 $vr9
#define VALPHA $vr10
#define a1 $f3
#define a2 $f4
#define a3 $f5
#define a4 $f6
#define a5 $f7
#define a6 $f8
#define a7 $f9
#define a8 $f10
PROLOGUE
LDARG INCY, $sp, 0
LDARG BUFFER, $sp, 8
addi.d $sp, $sp, -80
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 32
SDARG $r27, $sp, 40
SDARG $r28, $sp, 48
SDARG $r29, $sp, 56
SDARG $r30, $sp, 64
ST ALPHA, $sp, 72
vldrepl.d VALPHA, $sp, 72
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
bge $r0, M, .L999
bge $r0, N, .L999
move J, $r0
move IY, $r0
move AO1, A //a_ptr1
srai.d T0, N, 2 //n/4
beq J, T0, .L04
.L01: /* j<n/4 */
vxor.v U0, U0, U0
vxor.v U7, U7, U7
add.d AO2, AO1, LDA
add.d AO3, AO2, LDA
add.d AO4, AO3, LDA
move IX, $r0
move I, $r0
move II, $r0
beq $r0, M, .L03
.L02: /* i<m */
vldx U1, X, IX
fldx.d $f2, AO1, II
fldx.d $f3, AO2, II
fldx.d $f4, AO3, II
fldx.d $f5, AO4, II
vshuf4i.d U1, U1, 0x00
vextrins.d U2, U3, 0x10
vextrins.d U4, U5, 0x10
vfmadd.d U0, U2, U1, U0 //temp1,2
vfmadd.d U7, U4, U1, U7 //temp3,4
add.d IX, IX, INCX
addi.d II, II, 8
addi.d I, I, 1
blt I, M, .L02
.L03:
move T1, IY
add.d T2, T1, INCY
add.d T3, T2, INCY
add.d T4, T3, INCY
fldx.d $f3, Y, T1
fldx.d $f4, Y, T2
fldx.d $f5, Y, T3
fldx.d $f6, Y, T4
vextrins.d U3, U4, 0x10
vextrins.d U5, U6, 0x10
vfmadd.d U3, VALPHA, U0, U3
vfmadd.d U5, VALPHA, U7, U5
vextrins.d U4, U3, 0x01
vextrins.d U6, U5, 0x01
fstx.d $f3, Y, T1
fstx.d $f4, Y, T2
fstx.d $f5, Y, T3
fstx.d $f6, Y, T4
slli.d T1, LDA, 2
add.d AO1, AO1, T1
add.d IY, T4, INCY
addi.d J, J, 1
blt J, T0, .L01
.L04: /* if(n&2) */
andi T0, N, 2
beq $r0, T0, .L07
vxor.v U0, U0, U0
add.d AO2, AO1, LDA
move IX, $r0
move I, $r0
move II, $r0
beq $r0, M, .L06
.L05: /* i<m */
vldx U1, X, IX
fldx.d $f2, AO1, II
fldx.d $f3, AO2, II
vshuf4i.d U1, U1, 0x00
vextrins.d U2, U3, 0x10
vfmadd.d U0, U2, U1, U0 //temp1,2
add.d IX, IX, INCX
addi.d II, II, 8
addi.d I, I, 1
blt I, M, .L05
.L06:
move T1, IY
add.d T2, T1, INCY
fldx.d a1, Y, T1
fldx.d a2, Y, T2
vextrins.d U3, U4, 0x10
vfmadd.d U3, VALPHA, U0, U3
vextrins.d U4, U3, 0x01
fstx.d a1, Y, T1
fstx.d a2, Y, T2
slli.d T0, LDA, 1
add.d AO1, AO1, T0
add.d IY, T2, INCY
.L07: /* if(n&1) */
andi T0, N, 1
beq $r0, T0, .L999
MTC a1, $r0
move IX, $r0
move I, $r0
move II, $r0
beq $r0, M, .L09
.L08: /* i<m */
fldx.d a3, X, IX
fldx.d a4, AO1, II
fmadd.d a1, a4, a3, a1 //temp1
add.d IX, IX, INCX
addi.d II, II, 8
addi.d I, I, 1
blt I, M, .L08
.L09:
fldx.d a3, Y, IY
fmadd.d a3, ALPHA, a1, a3
fstx.d a3, Y, IY
add.d AO1, AO1, LDA
add.d IY, IY, INCY
.L999:
LDARG $r23, $sp, 0
LDARG $r24, $sp, 8
LDARG $r25, $sp, 16
LDARG $r26, $sp, 32
LDARG $r27, $sp, 40
LDARG $r28, $sp, 48
LDARG $r29, $sp, 56
LDARG $r30, $sp, 64
LD ALPHA, $sp, 72
addi.d $sp, $sp, 80
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -406,9 +406,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.ifeqs "\suf_op", "s"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.else
vor.v \out, \in, \in
.endif
.endif
.ifnb \more
GCOMPLEXACC \pre_op, \suf_op, \more
.endif

View File

@ -0,0 +1,227 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Param */
#define M $r4
#define N $r5
#define A $r7
#define LDA $r8
#define X $r9
#define INCX $r10
#define Y $r11
#define INCY $r6
#define BUFFER $r16
#define ALPHA $f0
#define YORIG $r18
#define T0 $r19
#define T1 $r20
#define XX $r12
#define YY $r13
#define I $r14
#define J $r15
#define AO1 $r23
#define AO2 $r24
#define IX $r25
#define IY $r26
#define II $r27
#define T2 $r28
#define T3 $r29
#define T4 $r30
/* LSX vectors */
#define U0 $vr11
#define U1 $vr12
#define U2 $vr2
#define U3 $vr3
#define U4 $vr4
#define U5 $vr5
#define U6 $vr6
#define U7 $vr7
#define U8 $vr8
#define U9 $vr9
#define VALPHA $vr10
#define a1 $f3
#define a2 $f4
#define a3 $f5
#define a4 $f6
#define a5 $f7
#define a6 $f8
#define a7 $f9
#define a8 $f10
PROLOGUE
LDARG INCY, $sp, 0
LDARG BUFFER, $sp, 8
addi.d $sp, $sp, -80
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 32
SDARG $r27, $sp, 40
SDARG $r28, $sp, 48
SDARG $r29, $sp, 56
SDARG $r30, $sp, 64
ST ALPHA, $sp, 72
vldrepl.w VALPHA, $sp, 72
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
bge $r0, M, .L999
bge $r0, N, .L999
move J, $r0
move IX, $r0
move AO1, A //a_ptr
move XX, X
move YY, Y
beq J, M, .L999
.L01:
vldx U0, XX, IX
vpermi.w U0, U0, 0x00
vfmul.s U1, VALPHA, U0 //temp1
move IY, $r0
move II, $r0
move I, $r0
srai.d T0, M, 2 //m/4
beq I, T0, .L03
.L02:
vldx U2, AO1, II
move T1, IY
add.d T2, T1, INCY
add.d T3, T2, INCY
add.d T4, T3, INCY
fldx.s a1, YY, T1
fldx.s a2, YY, T2
fldx.s a3, YY, T3
fldx.s a4, YY, T4
vextrins.w U3, U4, 0x10
vextrins.w U3, U5, 0x20
vextrins.w U3, U6, 0x30
vfmadd.s U3, U1, U2, U3
vextrins.w U4, U3, 0x01
vextrins.w U5, U3, 0x02
vextrins.w U6, U3, 0x03
fstx.s a1, YY, T1
fstx.s a2, YY, T2
fstx.s a3, YY, T3
fstx.s a4, YY, T4
add.d IY, T4, INCY
addi.d II, II, 16
addi.d I, I, 1
blt I, T0, .L02
.L03:
andi T0, M, 2
beq $r0, T0, .L04
addi.d T1, $r0, 4
mod.d T1, M, T1
sub.d II, M, T1
slli.d II, II, BASE_SHIFT
move T1, IY
add.d T2, T1, INCY
fldx.s a1, AO1, II
addi.d T0, II, 4
fldx.s a2, AO1, T0
fldx.s a3, YY, T1
fldx.s a4, YY, T2
fmadd.s a3, $f12, a1, a3
fmadd.s a4, $f12, a2, a4
fstx.s a3, YY, T1
fstx.s a4, YY, T2
add.d IY, T2, INCY
.L04:
andi T0, M, 1
beq $r0, T0, .L05
addi.d II, M, -1
slli.d II, II, BASE_SHIFT
fldx.s a1, AO1, II
fldx.s a3, YY, IY
fmadd.s a3, $f12, a1, a3
fstx.s a3, YY, IY
add.d IY, IY, INCY
.L05:
add.d AO1, AO1, LDA
add.d IX, IX, INCX
addi.d J, J, 1
blt J, N, .L01
.L999:
LDARG $r23, $sp, 0
LDARG $r24, $sp, 8
LDARG $r25, $sp, 16
LDARG $r26, $sp, 32
LDARG $r27, $sp, 40
LDARG $r28, $sp, 48
LDARG $r29, $sp, 56
LDARG $r30, $sp, 64
LD ALPHA, $sp, 72
addi.d $sp, $sp, 80
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -0,0 +1,275 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Param */
#define M $r4
#define N $r5
#define A $r7
#define LDA $r8
#define X $r9
#define INCX $r10
#define Y $r11
#define INCY $r6
#define BUFFER $r16
#define ALPHA $f0
#define YORIG $r18
#define T0 $r19
#define T1 $r20
#define AO3 $r12
#define AO4 $r13
#define I $r14
#define J $r15
#define AO1 $r23
#define AO2 $r24
#define IX $r25
#define IY $r26
#define II $r27
#define T2 $r28
#define T3 $r29
#define T4 $r30
/* LSX vectors */
#define U0 $vr11
#define U1 $vr12
#define U2 $vr2
#define U3 $vr3
#define U4 $vr4
#define U5 $vr5
#define U6 $vr6
#define U7 $vr7
#define U8 $vr8
#define U9 $vr9
#define VALPHA $vr10
#define a1 $f3
#define a2 $f4
#define a3 $f5
#define a4 $f6
#define a5 $f7
#define a6 $f8
#define a7 $f9
#define a8 $f10
PROLOGUE
LDARG INCY, $sp, 0
LDARG BUFFER, $sp, 8
addi.d $sp, $sp, -80
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 32
SDARG $r27, $sp, 40
SDARG $r28, $sp, 48
SDARG $r29, $sp, 56
SDARG $r30, $sp, 64
ST ALPHA, $sp, 72
vldrepl.w VALPHA, $sp, 72
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
bge $r0, M, .L999
bge $r0, N, .L999
move J, $r0
move IY, $r0
move AO1, A //a_ptr1
srai.d T0, N, 2 //n/4
beq J, T0, .L04
.L01: /* j<n/4 */
vxor.v U0, U0, U0
add.d AO2, AO1, LDA
add.d AO3, AO2, LDA
add.d AO4, AO3, LDA
move IX, $r0
move I, $r0
move II, $r0
beq $r0, M, .L03
.L02: /* i<m */
vldx U1, X, IX
fldx.s $f2, AO1, II
fldx.s $f3, AO2, II
fldx.s $f4, AO3, II
fldx.s $f5, AO4, II
vpermi.w U1, U1, 0x00
vextrins.w U2, U3, 0x10
vextrins.w U2, U4, 0x20
vextrins.w U2, U5, 0x30
vfmadd.s U0, U2, U1, U0 //temp1,2,3,4
add.d IX, IX, INCX
addi.d II, II, 4
addi.d I, I, 1
blt I, M, .L02
.L03:
move T1, IY
add.d T2, T1, INCY
add.d T3, T2, INCY
add.d T4, T3, INCY
fldx.s a1, Y, T1
fldx.s a2, Y, T2
fldx.s a3, Y, T3
fldx.s a4, Y, T4
vextrins.w U3, U4, 0x10
vextrins.w U3, U5, 0x20
vextrins.w U3, U6, 0x30
vfmadd.s U3, VALPHA, U0, U3
vextrins.w U4, U3, 0x01
vextrins.w U5, U3, 0x02
vextrins.w U6, U3, 0x03
fstx.s a1, Y, T1
fstx.s a2, Y, T2
fstx.s a3, Y, T3
fstx.s a4, Y, T4
slli.d T1, LDA, 2
add.d AO1, AO1, T1
add.d IY, T4, INCY
addi.d J, J, 1
blt J, T0, .L01
.L04: /* if(n&2) */
andi T0, N, 2
beq $r0, T0, .L07
MTC a1, $r0
MTC a2, $r0
add.d AO2, AO1, LDA
move IX, $r0
move I, $r0
move II, $r0
beq $r0, M, .L06
.L05: /* i<m */
fldx.s a3, X, IX
fldx.s a4, AO1, II
fldx.s a5, AO2, II
fmadd.s a1, a4, a3, a1 //temp1
fmadd.s a2, a5, a3, a2 //temp2
add.d IX, IX, INCX
addi.d II, II, 4
addi.d I, I, 1
blt I, M, .L05
.L06:
move T1, IY
add.d T2, T1, INCY
fldx.s a3, Y, T1
fldx.s a4, Y, T2
fmadd.s a3, ALPHA, a1, a3
fmadd.s a4, ALPHA, a2, a4
fstx.s a3, Y, T1
fstx.s a4, Y, T2
slli.d T0, LDA, 1
add.d AO1, AO1, T0
add.d IY, T2, INCY
.L07: /* if(n&1) */
andi T0, N, 1
beq $r0, T0, .L999
MTC a1, $r0
move IX, $r0
move I, $r0
move II, $r0
beq $r0, M, .L09
.L08: /* i<m */
fldx.s a3, X, IX
fldx.s a4, AO1, II
fmadd.s a1, a4, a3, a1 //temp1
add.d IX, IX, INCX
addi.d II, II, 4
addi.d I, I, 1
blt I, M, .L08
.L09:
fldx.s a3, Y, IY
fmadd.s a3, ALPHA, a1, a3
fstx.s a3, Y, IY
add.d AO1, AO1, LDA
add.d IY, IY, INCY
.L999:
LDARG $r23, $sp, 0
LDARG $r24, $sp, 8
LDARG $r25, $sp, 16
LDARG $r26, $sp, 32
LDARG $r27, $sp, 40
LDARG $r28, $sp, 48
LDARG $r29, $sp, 56
LDARG $r30, $sp, 64
LD ALPHA, $sp, 72
addi.d $sp, $sp, 80
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -0,0 +1,296 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
*/
#define M $r4
#define N $r5
#define ALPHA_R $f0
#define ALPHA_I $f1
#define A $r7
#define LDA $r8
#define X $r9
#define INC_X $r10
#define Y $r11
#define INC_Y $r6
#define J $r12
#define I $r13
#define K $r14
#define Y_ORG $r15
#define OFFSET $r16
#define K_LDA $r17
#define M16 $r18
#define T0 $r19
#define PA0 $r20
#define PA1 $r23
#define PA2 $r24
#define PA3 $r25
#define PA4 $r26
#define PA5 $r27
#define PA6 $r28
#define PA7 $r29
#define VALPHA $vr1
#define X0 $vr2
#define X1 $vr3
#define X2 $vr4
#define X3 $vr5
#define X4 $vr6
#define X5 $vr7
#define X6 $vr8
#define X7 $vr9
#define Y0 $vr10
#define Y1 $vr11
#define A0 $vr12
#define A1 $vr13
#define A2 $vr14
#define A3 $vr15
#define A4 $vr16
#define A5 $vr17
#define A6 $vr18
#define A7 $vr19
#define A8 $vr20
#define A9 $vr21
#define A10 $vr22
#define A11 $vr23
#define A12 $vr24
#define A13 $vr25
#define A14 $vr26
#define A15 $vr27
#define TMP0 $vr28
#define TMP1 $vr29
#define TMP2 $vr30
#if !defined(CONJ)
#if !defined(XCONJ)
#define GXCONJ 0
#define GCONJ 0
#else
#define GXCONJ 1
#define GCONJ 0
#endif
#else
#if !defined(XCONJ)
#define GXCONJ 0
#define GCONJ 1
#else
#define GXCONJ 1
#define GCONJ 1
#endif
#endif
.macro ZLOAD_X_2
GLD v, , X0, X, 0x00, X1, X, 0x10
GCOMPLEXMUL GXCONJ, \
vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
X1, VALPHA, X1, TMP0, TMP1, TMP2
.endm
.macro ZLOAD_X_2_GAP
vld X0, X, 0
PTR_ADD T0, X, INC_X
vld X1, T0, 0
GCOMPLEXMUL GXCONJ, \
vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
X1, VALPHA, X1, TMP0, TMP1, TMP2
.endm
.macro ZLOAD_X_1
GLD v, , X0, X, 0x00
GCOMPLEXMUL GXCONJ, \
vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2
.endm
.macro ZLOAD_Y_2
GLD v, , Y0, Y, 0, Y1, Y, 0x10
.endm
.macro ZLOAD_Y_2_GAP
vld $vr10, Y, 0
vldx $vr11, Y, INC_Y
.endm
.macro ZLOAD_Y_1
vld $vr10, Y, 0
.endm
.macro ZGEMV_N_2x2
GLD_INC v, , 0x10, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0
GCOMPLEXMADD GXCONJ, GCONJ, \
vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2
.endm
.macro ZGEMV_N_1x2
GLD_INC v, , 0x10, $vr12, PA0, 0, $vr14, PA1, 0
GCOMPLEXMADD GXCONJ, GCONJ, \
vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \
Y0, X1, A2, Y0, TMP0, TMP1, TMP2
.endm
.macro ZGEMV_N_1x1
GLD_INC v, , 0x10, $vr12, PA0, 0
GCOMPLEXMADD GXCONJ, GCONJ, \
vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
.endm
.macro ZSTORE_Y_2
GST v, , Y0, Y, 0, Y1, Y, 0x10
.endm
.macro ZSTORE_Y_2_GAP
vst Y0, Y, 0
vstx Y1, Y, INC_Y
.endm
.macro ZSTORE_Y_1
vst $vr10, Y, 0
.endm
.macro ZGEMV_N_LSX XW:req, X_2:req, X_1:req, Y_2:req, Y_1:req
PTR_SRLI J, N, 1
beqz J, .L_\XW\()_N_1
PTR_SLLI K_LDA, LDA, 1
PTR_SUB K_LDA, K_LDA, M16
.L_\XW\()_N_L2:
ZLOAD_\X_2
xor K, K, K
move Y, Y_ORG
PTR_SRLI I, M, 1
beqz I, .L_\XW\()_M_1
.align 5
.L_\XW\()_M_L2:
ZLOAD_\Y_2
ZGEMV_N_2x2
ZSTORE_\Y_2
PTR_ADDI I, I, -1
PTR_ALSL Y, INC_Y, Y, 1
PTR_ADDI K, K, 4
bnez I, .L_\XW\()_M_L2
.L_\XW\()_M_1:
andi I, M, 1
beqz I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
ZLOAD_\Y_1
ZGEMV_N_1x2
ZSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#elif __loongarch_grlen == 32
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#else
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#endif
PTR_ALSL X, INC_X, X, 1
bnez J, .L_\XW\()_N_L2
.L_\XW\()_N_1:
andi J, N, 1
beqz J, .L_END
.L_\XW\()_N_L1:
ZLOAD_\X_1
xor K, K, K
move Y, Y_ORG
move I, M
beqz I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
ZLOAD_\Y_1
ZGEMV_N_1x1
ZSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
PTR_ADDI J, J, -1
PTR_SUB K_LDA, LDA, M16
PTR_ADD PA0, PA0, K_LDA
PTR_ADD X, X, INC_X
bnez J, .L_\XW\()_N_L1
b .L_END
.endm
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 7, 31
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
maskeqz J, K, J /* if(inc_y == 1) J = 0; else J = 1; */
PTR_ALSL I, I, J, 1
GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4
// Init VALPHA
vpackev.d VALPHA, $vr1, $vr0
move Y_ORG, Y
move PA0, A
#if __loongarch_grlen == 64
GADD , d, PA1, PA0, LDA
#elif __loongarch_grlen == 32
GADD , w, PA1, PA0, LDA
#else
GADD , d, PA1, PA0, LDA
#endif
la.local T0, .L_GAP_TABLE
PTR_ALSL I, I, T0, 1
ld.h K, I, 0 // Obtain the offset address
PTR_ADD T0, T0, K
jirl $r0, T0, 0
.L_GAP_TABLE:
.hword .L_GAP_0_0 - .L_GAP_TABLE
.hword .L_GAP_0_1 - .L_GAP_TABLE
.hword .L_GAP_1_0 - .L_GAP_TABLE
.hword .L_GAP_1_1 - .L_GAP_TABLE
.L_GAP_0_0: /* inc_x == 1 && inc_y == 1 */
ZGEMV_N_LSX GAP_0_0, X_2, X_1, Y_2, Y_1
.L_GAP_0_1: /* inc_x == 1 && inc_y != 1 */
ZGEMV_N_LSX GAP_0_1, X_2, X_1, Y_2_GAP, Y_1
.L_GAP_1_0: /* inc_x != 1 && inc_y == 1 */
ZGEMV_N_LSX GAP_1_0, X_2_GAP, X_1, Y_2, Y_1
.L_GAP_1_1: /* inc_x != 1 && inc_y != 1 */
ZGEMV_N_LSX GAP_1_1, X_2_GAP, X_1, Y_2_GAP, Y_1
.L_END:
pop_if_used 17 + 7, 31
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -122,10 +122,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
GLD xv, , X0, X, 0x00, X1, X, 0x10, X2, X, 0x20, X3, X, 0x30
GPERMI xv, q, X0, X0, 0, X1, X1, 0, X2, X2, 0, X3, X3, 0
GCOMPLEXMUL GXCONJ, \
xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
X1, X1, VALPHA, TMP0, TMP1, TMP2, \
X2, X2, VALPHA, TMP0, TMP1, TMP2, \
X3, X3, VALPHA, TMP0, TMP1, TMP2
xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
X1, VALPHA, X1, TMP0, TMP1, TMP2, \
X2, VALPHA, X2, TMP0, TMP1, TMP2, \
X3, VALPHA, X3, TMP0, TMP1, TMP2
.endm
.macro ZLOAD_X_4_GAP
@ -145,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvpermi.q X3, X3, 0
GCOMPLEXMUL GXCONJ, \
xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
X1, X1, VALPHA, TMP0, TMP1, TMP2, \
X2, X2, VALPHA, TMP0, TMP1, TMP2, \
X3, X3, VALPHA, TMP0, TMP1, TMP2
xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
X1, VALPHA, X1, TMP0, TMP1, TMP2, \
X2, VALPHA, X2, TMP0, TMP1, TMP2, \
X3, VALPHA, X3, TMP0, TMP1, TMP2
.endm
.macro ZLOAD_Y_4
@ -216,7 +216,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
GLD xv, , X0, X, 0x00
GPERMI xv, q, X0, X0, 0
GCOMPLEXMUL GXCONJ, \
xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2
xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2
.endm
.macro ZGEMV_N_1x1

View File

@ -0,0 +1,268 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
*/
#define M $r4
#define N $r5
#define ALPHA_R $f0
#define ALPHA_I $f1
#define A $r7
#define LDA $r8
#define X $r9
#define INC_X $r10
#define Y $r11
#define INC_Y $r6
#define J $r12
#define I $r13
#define K $r14
#define PY0 $r14
#define X_ORG $r15
#define PY1 $r16
#define K_LDA $r17
#define PY2 $r18
#define T0 $r19
#define PA0 $r20
#define PA1 $r23
#define PA2 $r24
#define PA3 $r25
#define PA4 $r26
#define PA5 $r27
#define PA6 $r28
#define PA7 $r29
#define M16 $r30
#define VALPHA $vr0
#define X0 $vr1
#define X1 $vr2
#define A0 $vr3
#define A1 $vr4
#define A2 $vr5
#define A3 $vr6
#define A4 $vr7
#define A5 $vr8
#define A6 $vr9
#define A7 $vr10
#define A8 $vr11
#define A9 $vr12
#define A10 $vr13
#define A11 $vr14
#define A12 $vr15
#define A13 $vr16
#define A14 $vr17
#define A15 $vr18
#define TP0 $vr19
#define TP1 $vr20
#define TP2 $vr21
#define TP3 $vr22
#define TP4 $vr23
#define TP5 $vr24
#define TP6 $vr25
#define TP7 $vr26
#define TMP0 $vr27
#define TMP1 $vr28
#define TMP2 $vr29
#define Y0 $vr3
#define Y1 $vr4
#define Y2 $vr5
#define Y3 $vr6
#define Y4 $vr7
#define Y5 $vr8
#define Y6 $vr9
#define Y7 $vr10
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
#define GXCONJ1 0
#define GCONJ1 0
#else
#define GXCONJ1 1
#define GCONJ1 0
#endif
#if !defined(XCONJ)
#define GXCONJ2 0
#define GCONJ2 0
#else
#define GXCONJ2 0
#define GCONJ2 1
#endif
.macro ZERO_Y2
GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1
.endm
.macro ZERO_Y1
GXOR v, v, TP0, TP0, TP0
.endm
.macro ZLOAD_X2
GLD v, , X0, X, 0x00, X1, X, 0x10
.endm
.macro ZLOAD_X2_GAP
vld X0, X, 0
vldx X1, X, INC_X
.endm
.macro ZGEMV_T_2x2
GLD_INC v, , 0x10, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0
GCOMPLEXMADD GXCONJ1, GCONJ1, \
vf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \
TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2
.endm
.macro ZGEMV_T_LSX XW:req, X2:req
PTR_SRLI J, N, 1
beqz J, .L_\XW\()_N_1
PTR_SLLI K_LDA, LDA, 1
PTR_SUB K_LDA, K_LDA, M16
.L_\XW\()_N_L2:
ZERO_Y2
move X, X_ORG
PTR_SRLI I, M, 1
beqz I, .L_\XW\()_M_1
.align 5
.L_\XW\()_M_L2:
ZLOAD_\X2
ZGEMV_T_2x2
PTR_ADDI I, I, -1
PTR_ALSL X, INC_X, X, 1
bnez I, .L_\XW\()_M_L2
.L_\XW\()_M_1:
// Horizontal accumulation of the vector partial sums
GCOMPLEXACC vf, d, Y0, TP0, Y1, TP1
andi I, M, 1
beqz I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
GLD v, , X0, X, 0x00, A8, PA0, 0x00, A9, PA1, 0x00
#if __loongarch_grlen == 64
GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10
#elif __loongarch_grlen == 32
GADDI , w, PA0, PA0, 0x10, PA1, PA1, 0x10
#else
GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10
#endif
GCOMPLEXMADD GXCONJ1, GCONJ1, \
vf, d, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
bnez I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
vld A8, Y, 0x00
vldx A9, Y, INC_Y
GCOMPLEXMADD GXCONJ2, GCONJ2, \
vf, d, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2
PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#elif __loongarch_grlen == 32
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#else
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#endif
vst $vr11, Y, 0x00 // $vr11 is A8
vstx $vr12, Y, INC_Y // $vr12 is A9
PTR_ALSL Y, INC_Y, Y, 1
bnez J, .L_\XW\()_N_L2
.L_\XW\()_N_1:
andi J, N, 1
beqz J, .L_END
PTR_SUB K_LDA, LDA, M16
.L_\XW\()_N_L1:
ZERO_Y1
move X, X_ORG
move I, M
beqz I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
GLD v, , A0, PA0, 0x00, X0, X, 0x00
GCOMPLEXMADD GXCONJ1, GCONJ1, \
vf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
PTR_ADDI PA0, PA0, 0x10
bnez I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
PTR_ADDI J, J, -1
vld A0, Y, 0x00
GCOMPLEXMADD GXCONJ2, GCONJ2, \
vf, d, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
vst $vr3, Y, 0x00 // $vr3 is A0
PTR_ADD PA0, PA0, K_LDA
PTR_ADD Y, Y, INC_Y
bnez J, .L_\XW\()_N_L1
b .L_END
.endm
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 8, 30
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if (inc_x == 1) I = 0; else I = 1; */
GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4
// Init VALPHA = {ALPHA_R, ALPHA_I}
vpackev.d VALPHA, $vr1, $vr0
move X_ORG, X
move PA0, A
#if __loongarch_grlen == 64
GADD , d, PA1, PA0, LDA
#elif __loongarch_grlen == 32
GADD , w, PA1, PA0, LDA
#else
GADD , d, PA1, PA0, LDA
#endif
la.local T0, .L_GAP_TABLE
PTR_ALSL I, I, T0, 1
ld.h K, I, 0 // Load the halfword offset from the jump table
PTR_ADD T0, T0, K
jirl $r0, T0, 0
.L_GAP_TABLE:
.hword .L_GAP_0 - .L_GAP_TABLE
.hword .L_GAP_1 - .L_GAP_TABLE
.L_GAP_0: /* if (inc_x == 1) */
ZGEMV_T_LSX GAP_0, X2
.L_GAP_1: /* if (inc_x != 1) */
ZGEMV_T_LSX GAP_1, X2_GAP
.L_END:
pop_if_used 17 + 8, 30
jirl $r0, $r1, 0x0
EPILOGUE
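Per output element, the transposed kernel above computes y[j] += alpha * sum_i op(a[i + j*lda]) * x[i], vectorizing the inner sum two complex elements at a time (TP0/TP1) and applying VALPHA with a second complex madd at store time. A hedged scalar reference for the unconjugated, unit-stride case, assuming column-major storage with lda counted in complex elements (the CONJ/XCONJ variants only flip signs, as in the GXCONJ*/GCONJ* macros):

#include <complex.h>
#include <stdio.h>

static void zgemv_t_ref(long m, long n, double complex alpha,
                        const double complex *a, long lda,
                        const double complex *x, double complex *y)
{
    for (long j = 0; j < n; j++) {
        double complex acc = 0.0;            /* ZERO_Y1 / TP0 */
        for (long i = 0; i < m; i++)
            acc += a[j * lda + i] * x[i];    /* GCOMPLEXMADD step */
        y[j] += alpha * acc;                 /* VALPHA applied at store */
    }
}

int main(void)
{
    double complex a[4] = { 1, 2, 3, 4 };    /* 2x2 matrix, column-major */
    double complex x[2] = { 1, I };
    double complex y[2] = { 0, 0 };
    zgemv_t_ref(2, 2, 1.0, a, 2, x, y);
    for (int j = 0; j < 2; j++)
        printf("%g%+gi\n", creal(y[j]), cimag(y[j]));
    return 0;
}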

View File

@@ -16,13 +16,8 @@ SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRMMKERNEL = sgemm_kernel_power10.c
DTRMMKERNEL = dgemm_kernel_power10.c
ifeq ($(OSNAME), AIX)
CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
else
CTRMMKERNEL = cgemm_kernel_power10.S
ZTRMMKERNEL = zgemm_kernel_power10.S
endif
CTRMMKERNEL = cgemm_kernel_power10.c
ZTRMMKERNEL = zgemm_kernel_power10.c
SGEMMKERNEL = sgemm_kernel_power10.c
SGEMMINCOPY = sgemm_ncopy_16_power.c
@@ -64,11 +59,7 @@ DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c
DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c
DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c
ifeq ($(OSNAME), AIX)
CGEMMKERNEL = cgemm_kernel_8x4_power8.S
else
CGEMMKERNEL = cgemm_kernel_power10.S
endif
CGEMMKERNEL = cgemm_kernel_power10.c
#CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
ifeq ($(OSNAME), AIX)
@@ -83,11 +74,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
ifeq ($(OSNAME), AIX)
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
else
ZGEMMKERNEL = zgemm_kernel_power10.S
endif
ZGEMMKERNEL = zgemm_kernel_power10.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c

File diff suppressed because it is too large

View File

@@ -0,0 +1,736 @@
/*********************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include "common.h"
#include <altivec.h>
typedef __vector unsigned char vec_t;
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
#define SET_ACC_ZERO() \
__builtin_mma_xxsetaccz (&acc0); \
__builtin_mma_xxsetaccz (&acc1); \
__builtin_mma_xxsetaccz (&acc2); \
__builtin_mma_xxsetaccz (&acc3); \
__builtin_mma_xxsetaccz (&acc4); \
__builtin_mma_xxsetaccz (&acc5); \
__builtin_mma_xxsetaccz (&acc6); \
__builtin_mma_xxsetaccz (&acc7);
#if (defined(NN) || defined(NT) || defined(TN) || defined(TT))
#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = _arbi + _aibr; }
#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += _arbi + _aibr; }
#endif
#if (defined(NR) || defined(NC) || defined(TR) || defined(TC))
#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = -_arbi + _aibr; }
#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += -_arbi + _aibr; }
#endif
#if (defined(RN) || defined(RT) || defined(CN) || defined(CT))
#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = _arbi - _aibr; }
#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += _arbi - _aibr; }
#endif
#if (defined(RR) || defined(RC) || defined(CR) || defined(CC))
#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = -_arbi - _aibr; }
#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += -_arbi - _aibr; }
#endif
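/* Worked expansion of the four variants above, with a = ar + i*ai and
 * b = br + i*bi, so _arbr = ar*br, _aibi = ai*bi, _arbi = ar*bi,
 * _aibr = ai*br:
 *   NN/NT/TN/TT:  a * b           = (arbr - aibi) + i*( arbi + aibr)
 *   NR/NC/TR/TC:  a * conj(b)     = (arbr + aibi) + i*(-arbi + aibr)
 *   RN/RT/CN/CT:  conj(a) * b     = (arbr + aibi) + i*( arbi - aibr)
 *   RR/RC/CR/CC:  conj(a)*conj(b) = (arbr - aibi) + i*(-arbi - aibr)
 */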
#if defined(TRMMKERNEL)
#define A_OP =
#else
#define A_OP +=
#endif
#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \
__builtin_mma_disassemble_acc ((void *)result, &acc0); \
__builtin_mma_disassemble_acc ((void *)&result[4], &acc1); \
__builtin_mma_disassemble_acc ((void *)&result[8], &acc2); \
__builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \
__builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \
__builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \
__builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \
__builtin_mma_disassemble_acc ((void *)&result[28], &acc7);
#define SAVE_ACC_COMPLEX_11 \
BUILTIN_MMA_DISASSEMBLE_ACC_8 \
COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \
COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \
COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \
COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \
COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \
COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \
COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \
COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \
CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \
CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
#define SAVE_ACC_COMPLEX_12 \
BUILTIN_MMA_DISASSEMBLE_ACC_8 \
COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \
COMP_MUL(tr[1], res[ 8], res[11], ti[1], res[ 9], res[10]) \
COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \
COMP_MAC(tr[1], res[24], res[27], ti[1], res[25], res[26]) \
COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \
COMP_MAC(tr[1], res[40], res[43], ti[1], res[41], res[42]) \
COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \
COMP_MAC(tr[1], res[56], res[59], ti[1], res[57], res[58]) \
CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \
CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \
CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \
CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
#define SAVE_ACC_COMPLEX_21_1 \
BUILTIN_MMA_DISASSEMBLE_ACC_8 \
COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \
COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \
COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \
COMP_MAC(tr[1], res[12], res[15], ti[1], res[13], res[14]) \
COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \
COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \
COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \
COMP_MAC(tr[1], res[28], res[31], ti[1], res[29], res[30]) \
COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \
COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \
COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \
COMP_MAC(tr[1], res[44], res[47], ti[1], res[45], res[46]) \
COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \
COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \
COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \
COMP_MAC(tr[1], res[60], res[63], ti[1], res[61], res[62]) \
CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \
CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \
CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \
CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
#define SAVE_ACC_COMPLEX_21_2 \
BUILTIN_MMA_DISASSEMBLE_ACC_8 \
COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \
COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \
COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \
COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \
COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \
COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \
COMP_MAC(tr[2], res[24], res[27], ti[2], res[25], res[26]) \
COMP_MAC(tr[3], res[28], res[31], ti[3], res[29], res[30]) \
COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \
COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \
COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \
COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \
COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \
COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \
COMP_MAC(tr[2], res[56], res[59], ti[2], res[57], res[58]) \
COMP_MAC(tr[3], res[60], res[63], ti[3], res[61], res[62]) \
CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \
CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \
CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \
CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \
CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \
CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \
CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \
CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
#define SAVE_ACC_COMPLEX_21_4 \
BUILTIN_MMA_DISASSEMBLE_ACC_8 \
COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \
COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \
COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \
COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \
COMP_MUL(tr[4], res[16], res[19], ti[4], res[17], res[18]) \
COMP_MUL(tr[5], res[20], res[23], ti[5], res[21], res[22]) \
COMP_MUL(tr[6], res[24], res[27], ti[6], res[25], res[26]) \
COMP_MUL(tr[7], res[28], res[31], ti[7], res[29], res[30]) \
COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \
COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \
COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \
COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \
COMP_MAC(tr[4], res[48], res[51], ti[4], res[49], res[50]) \
COMP_MAC(tr[5], res[52], res[55], ti[5], res[53], res[54]) \
COMP_MAC(tr[6], res[56], res[59], ti[6], res[57], res[58]) \
COMP_MAC(tr[7], res[60], res[63], ti[7], res[61], res[62]) \
CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \
CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \
CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \
CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \
CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \
CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \
CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \
CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \
CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \
CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \
CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \
CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \
CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \
CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \
CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \
CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i;
#define SAVE_ACC_COMPLEX_22_1 \
__builtin_mma_disassemble_acc ((void *)result, &acc0); \
__builtin_mma_disassemble_acc ((void *)(&result[4]), &acc1); \
COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \
COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \
COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \
COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14] ) \
CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \
CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \
CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \
CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \
CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \
CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \
CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \
CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
#define SAVE_ACC_COMPLEX_22_2(ACC1, ACC2, CI) \
__builtin_mma_disassemble_acc ((void *)result, ACC1); \
__builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \
COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \
COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \
COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \
COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \
CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \
CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \
CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \
CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \
CO[2*ldc+CI+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \
CO[2*ldc+CI+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \
CO[2*ldc+CI+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \
CO[2*ldc+CI+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
#define REFRESH_TEMP_BK(x, y) \
temp = k - off;
#elif defined(LEFT)
#define REFRESH_TEMP_BK(x, y) \
temp = off + x;
#else
#define REFRESH_TEMP_BK(x, y) \
temp = off + y;
#endif
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
#define REFRESH_POINTERS(x, y) \
BO = B; \
REFRESH_TEMP_BK(x, y)
#else
#define REFRESH_POINTERS(x, y) \
AO += off * (2*x); \
BO = B + off * (2*y); \
REFRESH_TEMP_BK(x, y)
#endif
#ifdef LEFT
#define REFRESH_OFF(x) \
off += x;
#else
#define REFRESH_OFF(x)
#endif
#ifdef LEFT
#define UPDATE_TEMP(x, y) \
temp -= x;
#else
#define UPDATE_TEMP(x, y) \
temp -= y;
#endif
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
#define REFRESH_TMP_AFTER_SAVE(x, y) \
temp = k - off; \
UPDATE_TEMP(x, y) \
AO += temp * (2*x); \
BO += temp * (2*y);
#else
#define REFRESH_TMP_AFTER_SAVE(x, y)
#endif
#define REFRESH_AFTER_SAVE(x,y) \
REFRESH_TMP_AFTER_SAVE(x, y) \
REFRESH_OFF(x)
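/* TRMM bookkeeping, restated from the macros above: `temp` is the
 * number of inner-product steps that touch the stored triangle.  When
 * (LEFT && !TRANSA) || (!LEFT && TRANSA), the panel starts at `off`,
 * so temp = k - off; otherwise temp = off + x (LEFT, x = m-tile rows)
 * or temp = off + y (y = n-tile columns).  REFRESH_POINTERS skips
 * `off` complex elements per tile row/column (hence the 2*x and 2*y
 * factors), and REFRESH_TMP_AFTER_SAVE advances AO/BO past the part
 * of the panels the next tile must not revisit. */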
/*************************************************************************************
* GEMM Kernel
*************************************************************************************/
int
#ifdef TRMMKERNEL
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc, BLASLONG offset)
#else
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc)
#endif
{
BLASLONG i1, i, l, temp;
FLOAT *AO, *BO, *CO;
#if defined(TRMMKERNEL)
BLASLONG off;
#endif
#if defined(TRMMKERNEL) && !defined(LEFT)
off = -offset;
#endif
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
v4sf_t result[32];
FLOAT *res, tr[16], ti[16];
res = (FLOAT *) result;
for (i1 = 0; i1 < (n >> 1); i1++) {
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
AO = A;
CO = C;
C += ldc<<2;
for (i = 0; i < (m >> 3); i++) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (8, 2)
#else
BO = B;
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < temp; ++l) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
__builtin_mma_xvf64gerpp(&acc4, rowA1, rowB2);
__builtin_mma_xvf64gerpp(&acc5, rowA2, rowB2);
__builtin_mma_xvf64gerpp(&acc6, rowA3, rowB2);
__builtin_mma_xvf64gerpp(&acc7, rowA4, rowB2);
}
__builtin_mma_disassemble_acc ((void *)result, &acc0);
__builtin_mma_disassemble_acc ((void *)(&result[ 4]), &acc1);
__builtin_mma_disassemble_acc ((void *)(&result[ 8]), &acc2);
__builtin_mma_disassemble_acc ((void *)(&result[12]), &acc3);
__builtin_mma_disassemble_acc ((void *)(&result[16]), &acc4);
__builtin_mma_disassemble_acc ((void *)(&result[20]), &acc5);
__builtin_mma_disassemble_acc ((void *)(&result[24]), &acc6);
__builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7);
COMP_MUL(tr[ 0], res[ 0], res[ 3], ti[ 0], res[ 1], res[ 2])
COMP_MUL(tr[ 1], res[ 4], res[ 7], ti[ 1], res[ 5], res[ 6])
COMP_MUL(tr[ 2], res[ 8], res[11], ti[ 2], res[ 9], res[10])
COMP_MUL(tr[ 3], res[12], res[15], ti[ 3], res[13], res[14])
COMP_MUL(tr[ 4], res[16], res[19], ti[ 4], res[17], res[18])
COMP_MUL(tr[ 5], res[20], res[23], ti[ 5], res[21], res[22])
COMP_MUL(tr[ 6], res[24], res[27], ti[ 6], res[25], res[26])
COMP_MUL(tr[ 7], res[28], res[31], ti[ 7], res[29], res[30])
COMP_MUL(tr[ 8], res[32], res[35], ti[ 8], res[33], res[34])
COMP_MUL(tr[ 9], res[36], res[39], ti[ 9], res[37], res[38])
COMP_MUL(tr[10], res[40], res[43], ti[10], res[41], res[42])
COMP_MUL(tr[11], res[44], res[47], ti[11], res[45], res[46])
COMP_MUL(tr[12], res[48], res[51], ti[12], res[49], res[50])
COMP_MUL(tr[13], res[52], res[55], ti[13], res[53], res[54])
COMP_MUL(tr[14], res[56], res[59], ti[14], res[57], res[58])
COMP_MUL(tr[15], res[60], res[63], ti[15], res[61], res[62])
CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;
CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;
CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;
CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;
CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;
CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i;
CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i;
CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i;
CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i;
CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i;
CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i;
CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i;
CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i;
CO[2*ldc+ 0] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i;
CO[2*ldc+ 1] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i;
CO[2*ldc+ 2] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i;
CO[2*ldc+ 3] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i;
CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i;
CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i;
CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i;
CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i;
CO[2*ldc+ 8] A_OP tr[12] * alpha_r - ti[12] * alpha_i;
CO[2*ldc+ 9] A_OP ti[12] * alpha_r + tr[12] * alpha_i;
CO[2*ldc+10] A_OP tr[13] * alpha_r - ti[13] * alpha_i;
CO[2*ldc+11] A_OP ti[13] * alpha_r + tr[13] * alpha_i;
CO[2*ldc+12] A_OP tr[14] * alpha_r - ti[14] * alpha_i;
CO[2*ldc+13] A_OP ti[14] * alpha_r + tr[14] * alpha_i;
CO[2*ldc+14] A_OP tr[15] * alpha_r - ti[15] * alpha_i;
CO[2*ldc+15] A_OP ti[15] * alpha_r + tr[15] * alpha_i;
AO += temp << 4;
BO += temp << 2;
CO += 16;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (8, 2)
#endif
}
if (m & 4) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (4, 2)
#else
BO = B;
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~1)); l+=2) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
__builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2);
__builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2);
__builtin_mma_xvf64gerpp(&acc0, rowA3, rowB3);
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB3);
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4);
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
}
for (l = (temp & (~1)); l < temp; ++l) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
__builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2);
__builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2);
}
SAVE_ACC_COMPLEX_22_2(&acc0, &acc2, 0)
SAVE_ACC_COMPLEX_22_2(&acc1, &acc3, 4)
AO += temp << 3;
BO += temp << 2;
CO += 8;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 2)
#endif
}
if (m & 2) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (2, 2)
#else
BO = B;
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~3)); l+=4) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8];
vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10];
vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12];
vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
__builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4);
__builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5);
__builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6);
__builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
}
for (l = (temp & (~3)); l < temp; ++l) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
}
SAVE_ACC_COMPLEX_22_1
AO += temp << 2;
BO += temp << 2;
CO += 4;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (2, 2)
#endif
}
if (m & 1) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (1, 2)
#else
BO = B;
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~3)); l+=4) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8];
vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10];
vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12];
vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
__builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4);
__builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5);
__builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6);
__builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
}
for (l = (temp & (~3)); l < temp; ++l) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
}
SAVE_ACC_COMPLEX_12
AO += temp << 1;
BO += temp << 2;
CO += 2;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (1, 2)
#endif
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 2; // number of values in A
#endif
B += k << 2;
}
if (n & 1) {
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
AO = A;
CO = C;
C += ldc<<1;
for (i = 0; i < (m >> 3); i++) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (8, 1)
#else
BO = B;
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~1)); l+=2) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
__vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<4)+16]));
__vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<4)+20]));
__vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<4)+24]));
__vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<4)+28]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
__builtin_mma_xvf64gerpp(&acc0, rowA5, rowB2);
__builtin_mma_xvf64gerpp(&acc1, rowA6, rowB2);
__builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2);
__builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2);
}
for (l = (temp & (~1)); l < temp; ++l) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
}
SAVE_ACC_COMPLEX_21_4
AO += temp << 4;
BO += temp << 1;
CO += 16;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (8, 1)
#endif
}
if (m & 4) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (4, 1)
#else
BO = B;
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~3)); l+=4) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12]));
__vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<3)+16]));
__vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<3)+20]));
__vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<3)+24]));
__vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<3)+28]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB2);
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB2);
__builtin_mma_xvf64gerpp(&acc4, rowA5, rowB3);
__builtin_mma_xvf64gerpp(&acc5, rowA6, rowB3);
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4);
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4);
}
for (l = (temp & (~3)); l < temp; ++l) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
}
SAVE_ACC_COMPLEX_21_2
AO += temp << 3;
BO += temp << 1;
CO += 8;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 1)
#endif
}
if (m & 2) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (2, 1)
#else
BO = B;
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~7)); l+=8) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12]));
__vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<2)+16]));
__vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<2)+20]));
__vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<2)+24]));
__vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<2)+28]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2);
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3);
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
__builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5);
__builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6);
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
}
for (l = (temp & (~7)); l < temp; ++l) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
}
SAVE_ACC_COMPLEX_21_1
AO += temp << 2;
BO += temp << 1;
CO += 4;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (2, 1)
#endif
}
if (m & 1) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (1, 1)
#else
BO = B;
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~7)); l+=8) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6]));
__vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<1)+8]));
__vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<1)+10]));
__vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<1)+12]));
__vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<1)+14]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2);
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3);
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
__builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5);
__builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6);
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
}
for (l = (temp & (~7)); l < temp; ++l) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
}
SAVE_ACC_COMPLEX_11
AO += temp << 1;
BO += temp << 1;
CO += 2;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (1, 1)
#endif
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 1; // number of values in A
#endif
B += k << 1;
}
return 0;
}
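The kernel above is built on the POWER10 MMA builtins: __builtin_mma_xvf64gerpp accumulates the outer product of four doubles (a __vector_pair) and two doubles (a 16-byte vector) into a 4x2 __vector_quad accumulator. A minimal standalone sketch, assuming GCC 10+ with -mcpu=power10 -mmma; the lane order of the disassembled accumulator depends on the target layout, so treat the printed arrangement as illustrative:

#include <stdio.h>

typedef __vector unsigned char vec_t;

int main(void)
{
    double a[4] = { 1.0, 2.0, 3.0, 4.0 };   /* one 4-element column of A */
    double b[2] = { 10.0, 100.0 };          /* one 2-element row of B    */
    double out[4][2];
    __vector_quad acc;

    __builtin_mma_xxsetaccz(&acc);                    /* acc = 0         */
    __vector_pair pa = *(__vector_pair *)(void *)a;   /* load 4 doubles  */
    vec_t vb = *(vec_t *)(void *)b;                   /* load 2 doubles  */
    __builtin_mma_xvf64gerpp(&acc, pa, vb);           /* acc += outer(a, b) */
    __builtin_mma_disassemble_acc((void *)out, &acc);

    for (int i = 0; i < 4; i++)              /* expect rows of          */
        printf("%g %g\n", out[i][0], out[i][1]);  /* {a[i]*10, a[i]*100} */
    return 0;
}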

View File

@@ -35,10 +35,10 @@ DASUMKERNEL = dasum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = zasum.c
SSUMKERNEL = ../arm/asum.c
DSUMKERNEL = dasum.c
CSUMKERNEL = ../arm/zasum.c
ZSUMKERNEL = zasum.c
SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = dsum.c
CSUMKERNEL = ../arm/zsum.c
ZSUMKERNEL = zsum.c
SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = daxpy.c

View File

@@ -21,7 +21,16 @@ endif()
if (BUILD_COMPLEX16)
list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3)
endif()
message (STATUS CCOMP ${CMAKE_C_COMPILER_ID} FCOMP ${CMAKE_Fortran_COMPILER_ID})
if (USE_GEMM3M)
if (BUILD_COMPLEX)
list (APPEND OpenBLAS_Tests cblat3_3m)
endif ()
if (BUILD_COMPLEX16)
list (APPEND OpenBLAS_Tests zblat3_3m)
endif ()
endif ()
foreach(test_bin ${OpenBLAS_Tests})
add_executable(${test_bin} ${test_bin}.f)
target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME})
@@ -82,4 +91,10 @@ add_test(NAME "${float_type}blas2"
COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat2> "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM)
add_test(NAME "${float_type}blas3"
COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat3> "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM)
if (USE_GEMM3M)
if ((${float_type} STREQUAL "c") OR (${float_type} STREQUAL "z"))
add_test(NAME "${float_type}blas3_3m"
COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat3_3m> "${PROJECT_SOURCE_DIR}/test/${float_type}blat3_3m.dat" ${float_type_upper}BLAT3_3M.SUMM)
endif()
endif()
endforeach()

View File

@@ -4,6 +4,24 @@ ifeq ($(F_COMPILER),GFORTRAN)
override FFLAGS += -fno-tree-vectorize
endif
SUPPORT_GEMM3M = 0
ifeq ($(ARCH), x86)
SUPPORT_GEMM3M = 1
endif
ifeq ($(ARCH), x86_64)
SUPPORT_GEMM3M = 1
endif
ifeq ($(ARCH), ia64)
SUPPORT_GEMM3M = 1
endif
ifeq ($(ARCH), MIPS)
SUPPORT_GEMM3M = 1
endif
ifeq ($(NOFORTRAN),1)
all ::
else
@@ -153,11 +171,20 @@ ifeq ($(BUILD_DOUBLE),1)
D3=dblat3
endif
ifeq ($(BUILD_COMPLEX),1)
ifeq ($(SUPPORT_GEMM3M),1)
C3=cblat3 cblat3_3m
else
C3=cblat3
endif
endif
ifeq ($(BUILD_COMPLEX16),1)
ifeq ($(SUPPORT_GEMM3M),1)
Z3=zblat3 zblat3_3m
else
Z3=zblat3
endif
endif
level3: $(B3) $(S3) $(D3) $(C3) $(Z3)

View File

@@ -126,7 +126,7 @@ static float check_cgemv(char api, char order, char trans, blasint m, blasint n,
srand_generate(data_cgemv_t.y_test, m * inc_y * 2);
// Copy vector y for reference funcs
for (int i = 0; i < m * inc_y * 2; i++) {
for (i = 0; i < m * inc_y * 2; i++) {
data_cgemv_t.y_verify[i] = data_cgemv_t.y_test[i];
}

View File

@@ -188,7 +188,7 @@ static float check_csbmv(char uplo, blasint n, blasint k, float *alpha, blasint
char trans = 'N';
// Symmetric band packed matrix for sbmv
float a[lda * n * 2];
float *a = (float*) malloc(lda * n * 2 * sizeof(float));
// Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test
srand_generate(data_csbmv.sp_matrix, n * (n + 1));
@@ -216,7 +216,7 @@ // Find the differences between output vector calculated by csbmv and cgemv
// Find the differences between output vector calculated by csbmv and cgemv
for (i = 0; i < n * inc_c * 2; i++)
data_csbmv.c_test[i] -= data_csbmv.c_verify[i];
free(a);
// Find the norm of differences
return BLASFUNC(scnrm2)(&n, data_csbmv.c_test, &inc_c);
}

View File

@@ -402,13 +402,14 @@ CTEST(idamin, min_idx_in_vec_tail){
CTEST(idamin, min_idx_in_vec_tail_inc_1){
blasint i;
blasint N = ELEMENTS, inc = 1;
double x[ELEMENTS * inc];
double *x = (double*)malloc(ELEMENTS * inc * sizeof(double));
for (i = 0; i < N * inc; i ++) {
x[i] = i + 1000;
}
x[(N - 1) * inc] = 0.0;
blasint index = BLASFUNC(idamin)(&N, x, &inc);
free(x);
ASSERT_EQUAL(N, index);
}
@@ -775,13 +776,14 @@ CTEST(idamin, c_api_min_idx_in_vec_tail){
CTEST(idamin, c_api_min_idx_in_vec_tail_inc_1){
blasint i;
blasint N = ELEMENTS, inc = 1;
double x[ELEMENTS * inc];
double *x = (double*) malloc(ELEMENTS * inc * sizeof(double));
for (i = 0; i < N * inc; i ++) {
x[i] = i + 1000;
}
x[(N - 1) * inc] = 0.0;
blasint index = cblas_idamin(N, x, inc);
free(x);
ASSERT_EQUAL(N - 1, index);
}
#endif

View File

@@ -402,13 +402,14 @@ CTEST(isamin, min_idx_in_vec_tail){
CTEST(isamin, min_idx_in_vec_tail_inc_1){
blasint i;
blasint N = ELEMENTS, inc = 1;
float x[ELEMENTS * inc];
float *x = (float*) malloc(ELEMENTS * inc * sizeof(float));
for (i = 0; i < N * inc; i ++) {
x[i] = i + 1000;
}
x[(N - 1) * inc] = 0.0f;
blasint index = BLASFUNC(isamin)(&N, x, &inc);
free(x);
ASSERT_EQUAL(N, index);
}
@@ -775,13 +776,14 @@ CTEST(isamin, c_api_min_idx_in_vec_tail){
CTEST(isamin, c_api_min_idx_in_vec_tail_inc_1){
blasint i;
blasint N = ELEMENTS, inc = 1;
float x[ELEMENTS * inc];
float *x = (float*)malloc(ELEMENTS * inc * sizeof(float));
for (i = 0; i < N * inc; i ++) {
x[i] = i + 1000;
}
x[(N - 1) * inc] = 0.0f;
blasint index = cblas_isamin(N, x, inc);
free(x);
ASSERT_EQUAL(N - 1, index);
}
#endif
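These idamin/isamin tests, like the csbmv/zsbmv checks above, replace large stack VLAs with heap buffers: an ELEMENTS-sized VLA can overflow the default stack. A minimal sketch of the pattern with illustrative names (note the utests above omit the allocation check):

#include <stdlib.h>

/* Before: `double x[n];` -- stack VLA, can overflow for large n.
 * After: heap buffer, freed once the result has been computed. */
static double *make_test_vec(size_t n)
{
    double *x = malloc(n * sizeof *x);
    if (x == NULL)
        return NULL;
    for (size_t i = 0; i < n; i++)
        x[i] = (double)i + 1000.0;   /* same fill as the tests above */
    return x;
}

int main(void)
{
    double *x = make_test_vec(1024);
    if (x) { x[1023] = 0.0; free(x); }
    return 0;
}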

View File

@@ -126,7 +126,7 @@ static double check_zgemv(char api, char order, char trans, blasint m, blasint n
drand_generate(data_zgemv_t.y_test, m * inc_y * 2);
// Copy vector y for reference funcs
for (int i = 0; i < m * inc_y * 2; i++)
for (i = 0; i < m * inc_y * 2; i++)
{
data_zgemv_t.y_verify[i] = data_zgemv_t.y_test[i];
}

View File

@@ -188,7 +188,7 @@ static double check_zsbmv(char uplo, blasint n, blasint k, double *alpha, blasin
char trans = 'N';
// Symmetric band packed matrix for sbmv
double a[lda * n * 2];
double *a = (double*) malloc(lda * n * 2 * sizeof(double));
// Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test
drand_generate(data_zsbmv.sp_matrix, n * (n + 1));
@@ -213,6 +213,7 @@ static double check_zsbmv(char uplo, blasint n, blasint k, double *alpha, blasin
BLASFUNC(zsbmv)(&uplo, &n, &k, alpha, a, &lda,
data_zsbmv.b_test, &inc_b, beta, data_zsbmv.c_test, &inc_c);
free(a);
// Find the differences between output vector calculated by zsbmv and zgemv
for (i = 0; i < n * inc_c * 2; i++)
data_zsbmv.c_test[i] -= data_zsbmv.c_verify[i];