Merge remote-tracking branch 'origin/develop' into power10Copies

This commit is contained in:
Chip-Kerchner 2023-12-12 09:32:49 -06:00
commit 93747fb377
177 changed files with 53534 additions and 1370 deletions

View File

@ -29,7 +29,7 @@ task:
- mkdir build
- cd build
- cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
- make
- make -j 4
task:
name: AppleM1/GCC/MAKE/OPENMP

View File

@ -249,21 +249,22 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago
endif()
endif()
if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS)
# Seems that this hack doesn't required since macOS 11 Big Sur
if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
if (NOT NOFORTRAN)
set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
set (CMAKE_Fortran_CREATE_SHARED_LIBRARY
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
"sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
"sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
"sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
"sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
"sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -undefined dynamic_lookup -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
"sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'")
else ()
set (CMAKE_C_CREATE_SHARED_LIBRARY
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
"sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
"sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
"sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
"sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -undefined dynamic_lookup -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
endif ()
endif()
@ -541,7 +542,7 @@ if(NOT NO_LAPACKE)
ADD_CUSTOM_TARGET(genlapacke
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
)
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
endif()
# Install pkg-config files

View File

@ -216,3 +216,6 @@ In chronological order:
* Pablo Romero <https://github.com/pablorcum>
* [2022-08] Fix building from sources for QNX
* Mark Seminatore <https://github.com/mseminatore>
* [2023-11-09] Improve Windows threading performance scaling

View File

@ -11,7 +11,7 @@
operation is finished.
2. Simlar problem may happen under virtual machine. If supervisor
2. Similar problem may happen under virtual machine. If supervisor
allocates different cores for each scheduling, BLAS performnace
will be bad. This is because BLAS also utilizes all cache,
unexpected re-schedule for different core may result of heavy

View File

@ -11,7 +11,19 @@ endif
ifeq ($(CORE), POWER10)
ifneq ($(C_COMPILER), PGI)
ifeq ($(C_COMPILER), GCC)
ifeq ($(GCCVERSIONGTEQ10), 1)
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
else ifneq ($(GCCVERSIONGT4), 1)
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
else
$(warning your compiler is too old to fully support POWER10, getting a newer version of gcc is recommended)
CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
endif
else
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
endif
ifeq ($(F_COMPILER), IBM)
FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr10 -qtune=pwr10 -qfloat=nomaf -qzerosize
else

View File

@ -407,6 +407,7 @@ XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables |awk '/vers
endif
ifeq (x$(XCVER), x 15)
CCOMMON_OPT += -Wl,-ld_classic
FCOMMON_OPT += -Wl,-ld_classic
endif
endif

View File

@ -202,7 +202,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows to specify an individual list of targets to include instead of the default.
For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX, Cooper Lake, Sapphire Rapids. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows to specify an individual list of targets to include instead of the default.
`DYNAMIC_ARCH` is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias,
Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.

View File

@ -127,7 +127,7 @@ int main(int argc, char *argv[]){
long long muls = n*(n+1)/2.0;
long long adds = (n - 1.0)*n/2.0;
fprintf(stderr, "%10d %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg);
fprintf(stderr, "%10d : %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg);
if(a != NULL){
free(a);
}

View File

@ -199,8 +199,7 @@ if [ "$architecture" = "loongarch64" ]; then
tmpd="$(mktemp -d)"
tmplsx="$tmpd/lsx.c"
codelsx='"vadd.b $vr0, $vr0, $vr0"'
lsx_flags='-march=loongarch64 -mlsx'
printf "#include <lsxintrin.h>\n\n" >> "$tmplsx"
lsx_flags='-march=loongarch64'
printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx"
args="$lsx_flags -o $tmplsx.o $tmplsx"
{
@ -211,8 +210,7 @@ if [ "$architecture" = "loongarch64" ]; then
tmplasx="$tmpd/lasx.c"
codelasx='"xvadd.b $xr0, $xr0, $xr0"'
lasx_flags='-march=loongarch64 -mlasx'
printf "#include <lasxintrin.h>\n\n" >> "$tmplasx"
lasx_flags='-march=loongarch64'
printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx"
args="$lasx_flags -o $tmplasx.o $tmplasx"
{

View File

@ -241,8 +241,7 @@ if (($architecture eq "loongarch64")) {
} else {
$tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
$codelsx = '"vadd.b $vr0, $vr0, $vr0"';
$lsx_flags = "-march=loongarch64 -mlsx";
print $tmplsx "#include <lsxintrin.h>\n\n";
$lsx_flags = "-march=loongarch64";
print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n";
$args = "$lsx_flags -o $tmplsx.o $tmplsx";
@ -257,8 +256,7 @@ if (($architecture eq "loongarch64")) {
$tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
$codelasx = '"xvadd.b $xr0, $xr0, $xr0"';
$lasx_flags = "-march=loongarch64 -mlasx";
print $tmplasx "#include <lasxintrin.h>\n\n";
$lasx_flags = "-march=loongarch64";
print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n";
$args = "$lasx_flags -o $tmplasx.o $tmplasx";

View File

@ -52,7 +52,7 @@ set(SLASRC
sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f
sgehd2.f sgehrd.f sgelq2.f sgelqf.f
sgels.f sgelsd.f sgelss.f sgelsy.f sgeql2.f sgeqlf.f
sgeqp3.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f
sgeqp3.f sgeqp3rk.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f
sgesc2.f sgesdd.f sgesvd.f sgesvdx.f sgesvx.f sgetc2.f
sgetrf2.f sgetri.f
sggbak.f sggbal.f
@ -67,7 +67,7 @@ set(SLASRC
slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f
slansy.f slantb.f slantp.f slantr.f slanv2.f
slapll.f slapmt.f
slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f
slaqgb.f slaqge.f slaqp2.f slaqps.f slaqp2rk.f slaqp3rk.f slaqsb.f slaqsp.f slaqsy.f
slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f
slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f
slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f
@ -139,7 +139,7 @@ set(CLASRC
cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f
cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f
cgehd2.f cgehrd.f cgelq2.f cgelqf.f
cgels.f cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f
cgels.f cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f cgeqp3rk.f
cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f cgerq2.f cgerqf.f
cgesc2.f cgesdd.f cgesvd.f cgesvdx.f
cgesvj.f cgejsv.f cgsvj0.f cgsvj1.f
@ -173,7 +173,7 @@ set(CLASRC
clanhb.f clanhe.f
clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f
clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f
claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f
claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqp2rk.f claqp3rk.f claqsb.f
claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f
claqz0.f claqz1.f claqz2.f claqz3.f
claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f
@ -243,7 +243,7 @@ set(DLASRC
dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f
dgehd2.f dgehrd.f dgelq2.f dgelqf.f
dgels.f dgelsd.f dgelss.f dgelsy.f dgeql2.f dgeqlf.f
dgeqp3.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f
dgeqp3.f dgeqp3rk.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f
dgesc2.f dgesdd.f dgesvd.f dgesvdx.f dgesvx.f dgetc2.f
dgetrf2.f dgetri.f
dggbak.f dggbal.f
@ -258,7 +258,7 @@ set(DLASRC
dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f
dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f
dlapll.f dlapmt.f
dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f
dlaqgb.f dlaqge.f dlaqp2.f dlaqp2rk.f dlaqp3rk.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f
dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f
dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f
dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f
@ -331,7 +331,7 @@ set(ZLASRC
zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f
zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f
zgehd2.f zgehrd.f zgelq2.f zgelqf.f
zgels.f zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f
zgels.f zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f zgeqp3rk.f
zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f
zgesc2.f zgesdd.f zgesvd.f zgesvdx.f zgesvx.f
zgesvj.f zgejsv.f zgsvj0.f zgsvj1.f
@ -367,7 +367,7 @@ set(ZLASRC
zlanhe.f
zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f
zlantp.f zlantr.f zlapll.f zlapmt.f zlaqgb.f zlaqge.f
zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f
zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqp2rk.f zlaqp3rk.f zlaqps.f zlaqsb.f
zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f
zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f
zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f
@ -557,7 +557,7 @@ set(SLASRC
sgebrd.c sgecon.c sgeequ.c sgees.c sgeesx.c sgeev.c sgeevx.c
sgehd2.c sgehrd.c sgelq2.c sgelqf.c
sgels.c sgelsd.c sgelss.c sgelsy.c sgeql2.c sgeqlf.c
sgeqp3.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c
sgeqp3.c sgeqp3rk.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c
sgesc2.c sgesdd.c sgesvd.c sgesvdx.c sgesvx.c sgetc2.c
sgetrf2.c sgetri.c
sggbak.c sggbal.c
@ -571,7 +571,7 @@ set(SLASRC
slangb.c slange.c slangt.c slanhs.c slansb.c slansp.c
slansy.c slantb.c slantp.c slantr.c slanv2.c
slapll.c slapmt.c
slaqgb.c slaqge.c slaqp2.c slaqps.c slaqsb.c slaqsp.c slaqsy.c
slaqgb.c slaqge.c slaqp2.c slaqp2rk.c slaqp3rk.c slaqps.c slaqsb.c slaqsp.c slaqsy.c
slaqr0.c slaqr1.c slaqr2.c slaqr3.c slaqr4.c slaqr5.c
slaqtr.c slar1v.c slar2v.c ilaslr.c ilaslc.c
slarf.c slarfb.c slarfb_gett.c slarfg.c slarfgp.c slarft.c slarfx.c slarfy.c slargv.c
@ -643,7 +643,7 @@ set(CLASRC
cgbtf2.c cgbtrf.c cgbtrs.c cgebak.c cgebal.c cgebd2.c cgebrd.c
cgecon.c cgeequ.c cgees.c cgeesx.c cgeev.c cgeevx.c
cgehd2.c cgehrd.c cgelq2.c cgelqf.c
cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c
cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c cgeqp3rk.c
cgeqr2.c cgeqr2p.c cgeqrf.c cgeqrfp.c cgerfs.c cgerq2.c cgerqf.c
cgesc2.c cgesdd.c cgesvd.c cgesvdx.c
cgesvj.c cgejsv.c cgsvj0.c cgsvj1.c
@ -677,7 +677,7 @@ set(CLASRC
clanhb.c clanhe.c
clanhp.c clanhs.c clanht.c clansb.c clansp.c clansy.c clantb.c
clantp.c clantr.c clapll.c clapmt.c clarcm.c claqgb.c claqge.c
claqhb.c claqhe.c claqhp.c claqp2.c claqps.c claqsb.c
claqhb.c claqhe.c claqhp.c claqp2.c claqp2rk.c claqp3rk.c claqps.c claqsb.c
claqr0.c claqr1.c claqr2.c claqr3.c claqr4.c claqr5.c
claqsp.c claqsy.c clar1v.c clar2v.c ilaclr.c ilaclc.c
clarf.c clarfb.c clarfb_gett.c clarfg.c clarfgp.c clarft.c
@ -746,7 +746,7 @@ set(DLASRC
dgebrd.c dgecon.c dgeequ.c dgees.c dgeesx.c dgeev.c dgeevx.c
dgehd2.c dgehrd.c dgelq2.c dgelqf.c
dgels.c dgelsd.c dgelss.c dgelsy.c dgeql2.c dgeqlf.c
dgeqp3.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c
dgeqp3.c dgeqp3rk.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c
dgesc2.c dgesdd.c dgesvd.c dgesvdx.c dgesvx.c dgetc2.c
dgetrf2.c dgetri.c
dggbak.c dggbal.c
@ -760,7 +760,7 @@ set(DLASRC
dlangb.c dlange.c dlangt.c dlanhs.c dlansb.c dlansp.c
dlansy.c dlantb.c dlantp.c dlantr.c dlanv2.c
dlapll.c dlapmt.c
dlaqgb.c dlaqge.c dlaqp2.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c
dlaqgb.c dlaqge.c dlaqp2.c dlaqp2rk.c dlaqp3rk.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c
dlaqr0.c dlaqr1.c dlaqr2.c dlaqr3.c dlaqr4.c dlaqr5.c
dlaqtr.c dlar1v.c dlar2v.c iladlr.c iladlc.c
dlarf.c dlarfb.c dlarfb_gett.c dlarfg.c dlarfgp.c dlarft.c dlarfx.c dlarfy.c
@ -833,7 +833,7 @@ set(ZLASRC
zgbtf2.c zgbtrf.c zgbtrs.c zgebak.c zgebal.c zgebd2.c zgebrd.c
zgecon.c zgeequ.c zgees.c zgeesx.c zgeev.c zgeevx.c
zgehd2.c zgehrd.c zgelq2.c zgelqf.c
zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c
zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c zgeqp3rk.c
zgeqr2.c zgeqr2p.c zgeqrf.c zgeqrfp.c zgerfs.c zgerq2.c zgerqf.c
zgesc2.c zgesdd.c zgesvd.c zgesvdx.c zgesvx.c
zgesvj.c zgejsv.c zgsvj0.c zgsvj1.c
@ -868,7 +868,7 @@ set(ZLASRC
zlanhe.c
zlanhp.c zlanhs.c zlanht.c zlansb.c zlansp.c zlansy.c zlantb.c
zlantp.c zlantr.c zlapll.c zlapmt.c zlaqgb.c zlaqge.c
zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqps.c zlaqsb.c
zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqp2rk.c zlaqp3rk.c zlaqps.c zlaqsb.c
zlaqr0.c zlaqr1.c zlaqr2.c zlaqr3.c zlaqr4.c zlaqr5.c
zlaqsp.c zlaqsy.c zlar1v.c zlar2v.c ilazlr.c ilazlc.c
zlarcm.c zlarf.c zlarfb.c zlarfb_gett.c

View File

@ -38,7 +38,7 @@ if(CMAKE_CL_64 OR MINGW64)
endif()
elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING))
set(X86 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc.*"))
set(POWER 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
set(MIPS64 1)
@ -46,7 +46,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*")
set(LOONGARCH64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*")
set(RISCV64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*"))
if (NOT BINARY)
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(X86_64 1)
@ -109,7 +109,7 @@ else()
endif ()
if (NOT BINARY)
if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64)
if (X86_64 OR ARM64 OR MIPS64 OR LOONGARCH64 OR RISCV64 OR (POWER AND NOT (CMAKE_OSX_ARCHITECTURES STREQUAL "ppc")))
set(BINARY 64)
else ()
set(BINARY 32)

View File

@ -124,7 +124,17 @@ static inline int WhereAmI(void){
#define CMPLE fcmp.cle.d
#define CMPLT fcmp.clt.d
#define NEG fneg.d
#define XVFSUB xvfsub.d
#define XVFADD xvfadd.d
#define XVFMADD xvfmadd.d
#define VFSUB vfsub.d
#define VFADD vfadd.d
#define VFMADD vfmadd.d
#else
#define LD fld.s
#define ST fst.s
#define MADD fmadd.s
@ -142,6 +152,15 @@ static inline int WhereAmI(void){
#define CMPLE fcmp.cle.s
#define CMPLT fcmp.clt.s
#define NEG fneg.s
#define XVFSUB xvfsub.s
#define XVFADD xvfadd.s
#define XVFMADD xvfmadd.s
#define VFSUB vfsub.s
#define VFADD vfadd.s
#define VFMADD vfmadd.s
#endif /* defined(DOUBLE) */
#if defined(__64BIT__) && defined(USE64BITINT)

View File

@ -111,8 +111,9 @@ typedef struct blas_queue {
struct blas_queue *next;
#if defined( __WIN32__) || defined(__CYGWIN32__) || defined(_WIN32) || defined(__CYGWIN__)
CRITICAL_SECTION lock;
HANDLE finish;
// CRITICAL_SECTION lock;
// HANDLE finish;
volatile int finished;
#else
pthread_mutex_t lock;
pthread_cond_t finished;

View File

@ -47,8 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CPU_LOONGSON3R5 1
#define CPU_LOONGSON2K1000 2
#define LA_HWCAP_LSX (1<<4)
#define LA_HWCAP_LASX (1<<5)
#define LA_HWCAP_LSX (1U << 4)
#define LA_HWCAP_LASX (1U << 5)
static char *cpuname[] = {
"LOONGSONGENERIC",
@ -64,11 +64,11 @@ static char *cpuname_lower[] = {
int detect(void) {
#ifdef __linux
int flag = (int)getauxval(AT_HWCAP);
int hwcap = (int)getauxval(AT_HWCAP);
if (flag & LA_HWCAP_LASX)
if (hwcap & LA_HWCAP_LASX)
return CPU_LOONGSON3R5;
else if (flag & LA_HWCAP_LSX)
else if (hwcap & LA_HWCAP_LSX)
return CPU_LOONGSON2K1000;
else
return CPU_GENERIC;
@ -94,7 +94,9 @@ void get_subdirname(void) {
}
void get_cpuconfig(void) {
uint32_t hwcaps = 0;
int d = detect();
switch (d) {
case CPU_LOONGSON3R5:
printf("#define LOONGSON3R5\n");
@ -129,6 +131,10 @@ void get_cpuconfig(void) {
printf("#define L2_ASSOCIATIVE 16\n");
break;
}
hwcaps = (uint32_t)getauxval( AT_HWCAP );
if (hwcaps & LA_HWCAP_LSX) printf("#define HAVE_LSX\n");
if (hwcaps & LA_HWCAP_LASX) printf("#define HAVE_LASX\n");
}
void get_libname(void){

View File

@ -160,6 +160,7 @@ int detect(void){
infoCount = HOST_BASIC_INFO_COUNT;
host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, &infoCount);
if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7400) return CPUTYPE_PPCG4;
if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7450) return CPUTYPE_PPCG4;
if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_970) return CPUTYPE_PPC970;

View File

@ -51,15 +51,10 @@
/* This is a thread implementation for Win32 lazy implementation */
/* Thread server common information */
typedef struct{
CRITICAL_SECTION lock;
HANDLE filled;
HANDLE killed;
blas_queue_t *queue; /* Parameter Pointer */
int shutdown; /* server shutdown flag */
} blas_pool_t;
static blas_queue_t *work_queue = NULL;
static HANDLE kickoff_event = NULL;
static CRITICAL_SECTION queue_lock;
/* We need this global for checking if initialization is finished. */
int blas_server_avail = 0;
@ -67,11 +62,19 @@ int blas_server_avail = 0;
/* Local Variables */
static BLASULONG server_lock = 0;
static blas_pool_t pool;
static HANDLE blas_threads [MAX_CPU_NUMBER];
static DWORD blas_threads_id[MAX_CPU_NUMBER];
static volatile int thread_target; // target num of live threads, volatile for cross-thread reads
#if defined (__GNUC__) && (__GNUC__ < 6)
#define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch)
#else
#if defined(_WIN64)
#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp)
#else
#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp)
#endif
#endif
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
@ -202,14 +205,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
static DWORD WINAPI blas_thread_server(void *arg){
/* Thread identifier */
#ifdef SMP_DEBUG
BLASLONG cpu = (BLASLONG)arg;
#endif
void *buffer, *sa, *sb;
blas_queue_t *queue;
DWORD action;
HANDLE handles[] = {pool.filled, pool.killed};
/* Each server needs each buffer */
buffer = blas_memory_alloc(2);
@ -225,29 +224,44 @@ static DWORD WINAPI blas_thread_server(void *arg){
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu);
#endif
// event raised when work is added to the queue
WaitForSingleObject(kickoff_event, INFINITE);
do {
action = WaitForMultipleObjects(2, handles, FALSE, INFINITE);
} while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1));
if (action == WAIT_OBJECT_0 + 1) break;
if (cpu > thread_target - 2)
{
//printf("thread [%d] exiting.\n", cpu);
break; // excess thread, so worker thread exits
}
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Got it.\n", cpu);
#endif
EnterCriticalSection(&pool.lock);
#if 1
EnterCriticalSection(&queue_lock);
queue = pool.queue;
if (queue) pool.queue = queue->next;
queue = work_queue;
if (queue)
work_queue = work_queue->next;
LeaveCriticalSection(&pool.lock);
LeaveCriticalSection(&queue_lock);
#else
volatile blas_queue_t* queue_next;
INT_PTR prev_value;
do {
queue = (volatile blas_queue_t*)work_queue;
if (!queue)
break;
queue_next = (volatile blas_queue_t*)queue->next;
prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue);
} while (prev_value != queue);
#endif
if (queue) {
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
if (pool.queue) SetEvent(pool.filled);
sa = queue -> sa;
sb = queue -> sb;
@ -332,13 +346,8 @@ static DWORD WINAPI blas_thread_server(void *arg){
fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
#endif
EnterCriticalSection(&queue->lock);
queue->finished = 1;
queue -> status = BLAS_STATUS_FINISHED;
LeaveCriticalSection(&queue->lock);
SetEvent(queue->finish);
}
/* Shutdown procedure */
@ -366,15 +375,16 @@ int blas_thread_init(void){
#endif
if (!blas_server_avail){
// create the kickoff Event
kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
InitializeCriticalSection(&pool.lock);
pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL);
thread_target = blas_cpu_number;
pool.shutdown = 0;
pool.queue = NULL;
InitializeCriticalSection(&queue_lock);
for(i = 0; i < blas_cpu_number - 1; i++){
//printf("thread_init: creating thread [%d]\n", i);
blas_threads[i] = CreateThread(NULL, 0,
blas_thread_server, (void *)i,
0, &blas_threads_id[i]);
@ -409,8 +419,6 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
current = queue;
while (current) {
InitializeCriticalSection(&current -> lock);
current -> finish = CreateEvent(NULL, FALSE, FALSE, NULL);
current -> position = pos;
#ifdef CONSISTENT_FPCSR
@ -418,23 +426,32 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
__asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode));
#endif
current->finished = 0;
current = current -> next;
pos ++;
}
EnterCriticalSection(&pool.lock);
EnterCriticalSection(&queue_lock);
if (pool.queue) {
current = pool.queue;
while (current -> next) current = current -> next;
current -> next = queue;
} else {
pool.queue = queue;
if (!work_queue)
{
work_queue = queue;
}
else
{
blas_queue_t *next_item = work_queue;
// find the end of the work queue
while (next_item)
next_item = next_item->next;
// add new work to the end
next_item = queue;
}
LeaveCriticalSection(&pool.lock);
LeaveCriticalSection(&queue_lock);
SetEvent(pool.filled);
SetEvent(kickoff_event);
return 0;
}
@ -449,19 +466,24 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
#ifdef SMP_DEBUG
fprintf(STDERR, "Waiting Queue ..\n");
#endif
while (!queue->finished)
YIELDING;
WaitForSingleObject(queue->finish, INFINITE);
CloseHandle(queue->finish);
DeleteCriticalSection(&queue -> lock);
queue = queue -> next;
num --;
queue = queue->next;
num--;
}
#ifdef SMP_DEBUG
fprintf(STDERR, "Completely Done.\n\n");
#endif
// if work was added to the queue after this batch we can't sleep the worker threads
// by resetting the event
EnterCriticalSection(&queue_lock);
if (work_queue == NULL)
ResetEvent(kickoff_event);
LeaveCriticalSection(&queue_lock);
return 0;
}
@ -512,8 +534,6 @@ int BLASFUNC(blas_thread_shutdown)(void){
if (blas_server_avail){
SetEvent(pool.killed);
for(i = 0; i < blas_num_threads - 1; i++){
// Could also just use WaitForMultipleObjects
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
@ -528,9 +548,6 @@ int BLASFUNC(blas_thread_shutdown)(void){
CloseHandle(blas_threads[i]);
}
CloseHandle(pool.filled);
CloseHandle(pool.killed);
blas_server_avail = 0;
}
@ -552,23 +569,48 @@ void goto_set_num_threads(int num_threads)
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
if (blas_server_avail && num_threads < blas_num_threads) {
LOCK_COMMAND(&server_lock);
thread_target = num_threads;
SetEvent(kickoff_event);
for (i = num_threads - 1; i < blas_num_threads - 1; i++) {
//printf("set_num_threads: waiting on thread [%d] to quit.\n", i);
WaitForSingleObject(blas_threads[i], INFINITE);
//printf("set_num_threads: thread [%d] has quit.\n", i);
CloseHandle(blas_threads[i]);
}
blas_num_threads = num_threads;
ResetEvent(kickoff_event);
UNLOCK_COMMAND(&server_lock);
}
if (num_threads > blas_num_threads) {
LOCK_COMMAND(&server_lock);
thread_target = num_threads;
//increased_threads = 1;
if (!blas_server_avail){
// create the kickoff Event
kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
InitializeCriticalSection(&pool.lock);
pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL);
InitializeCriticalSection(&queue_lock);
pool.shutdown = 0;
pool.queue = NULL;
blas_server_avail = 1;
}
for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
//printf("set_num_threads: creating thread [%d]\n", i);
blas_threads[i] = CreateThread(NULL, 0,
blas_thread_server, (void *)i,

View File

@ -25,6 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include <sys/auxv.h>
#include "common.h"
extern gotoblas_t gotoblas_LOONGSON3R5;
@ -74,21 +75,15 @@ static gotoblas_t *force_coretype(char *coretype) {
return NULL;
}
#define LASX_MASK 1<<7
#define LSX_MASK 1<<6
#define LOONGARCH_CFG2 0x02
#define LA_HWCAP_LSX (1U << 4)
#define LA_HWCAP_LASX (1U << 5)
static gotoblas_t *get_coretype(void) {
int ret = 0;
__asm__ volatile (
"cpucfg %0, %1 \n\t"
: "+&r"(ret)
: "r"(LOONGARCH_CFG2)
);
int hwcap = (int)getauxval(AT_HWCAP);
if (ret & LASX_MASK)
if (hwcap & LA_HWCAP_LASX)
return &gotoblas_LOONGSON3R5;
else if (ret & LSX_MASK)
else if (hwcap & LA_HWCAP_LSX)
return &gotoblas_LOONGSON2K1000;
else
return &gotoblas_LOONGSONGENERIC;

View File

@ -66,8 +66,7 @@ static int cpuid(void)
#endif
return CPU_UNKNOWN;
}
#else
#if defined(C_PGI) || defined(__clang__)
#elif defined(C_PGI) || defined(__clang__)
/*
* NV HPC compilers do not yet implement __builtin_cpu_is().
* Fake a version here for use in the CPU detection code below.
@ -196,13 +195,21 @@ static int cpuid(void)
cpu_type = pvrPOWER[i].cpu_type;
return (int)(cpu_type);
}
#endif /* C_PGI */
#elif !defined(__BUILTIN_CPU_SUPPORTS__)
static int cpuid(void)
{
return CPU_UNKNOWN;
}
#endif /* _AIX */
#ifndef __BUILTIN_CPU_SUPPORTS__
#include <string.h>
#if defined(_AIX) || (defined(__has_builtin) && !__has_builtin(__builtin_cpu_is))
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
#if defined(_AIX) || !__has_builtin(__builtin_cpu_is)
static int __builtin_cpu_is(const char *arg)
{
static int ipinfo = -1;
@ -227,7 +234,7 @@ static int __builtin_cpu_is(const char *arg)
}
#endif
#if defined(_AIX) || (defined(__has_builtin) && !__has_builtin(__builtin_cpu_supports))
#if defined(_AIX) || !__has_builtin(__builtin_cpu_supports)
static int __builtin_cpu_supports(const char *arg)
{
return 0;

View File

@ -0,0 +1,58 @@
ifndef NO_LSX
SDOTKERNEL = dot_lsx.S
DSDOTKERNEL = dot_lsx.S
DDOTKERNEL = dot_lsx.S
SSCALKERNEL = sscal_lsx.S
DSCALKERNEL = dscal_lsx.S
SAMAXKERNEL = samax_lsx.S
DAMAXKERNEL = damax_lsx.S
SAMINKERNEL = samin_lsx.S
DAMINKERNEL = damin_lsx.S
SMAXKERNEL = smax_lsx.S
DMAXKERNEL = dmax_lsx.S
SMINKERNEL = smin_lsx.S
DMINKERNEL = dmin_lsx.S
ISMAXKERNEL = ismax_lsx.S
IDMAXKERNEL = idmax_lsx.S
ISMINKERNEL = ismin_lsx.S
IDMINKERNEL = idmin_lsx.S
ISAMAXKERNEL = isamax_lsx.S
IDAMAXKERNEL = idamax_lsx.S
ISAMINKERNEL = isamin_lsx.S
IDAMINKERNEL = idamin_lsx.S
SCOPYKERNEL = scopy_lsx.S
DCOPYKERNEL = dcopy_lsx.S
SSWAPKERNEL = sswap_lsx.S
DSWAPKERNEL = dswap_lsx.S
SAXPYKERNEL = saxpy_lsx.S
DAXPYKERNEL = daxpy_lsx.S
SAXPBYKERNEL = saxpby_lsx.S
DAXPBYKERNEL = daxpby_lsx.S
SSUMKERNEL = ssum_lsx.S
DSUMKERNEL = dsum_lsx.S
SASUMKERNEL = sasum_lsx.S
DASUMKERNEL = dasum_lsx.S
SROTKERNEL = srot_lsx.S
DROTKERNEL = drot_lsx.S
SNRM2KERNEL = snrm2_lsx.S
DNRM2KERNEL = dnrm2_lsx.S
endif

View File

@ -1,4 +1,60 @@
ifndef NO_LASX
SDOTKERNEL = dot_lasx.S
DSDOTKERNEL = dot_lasx.S
DDOTKERNEL = dot_lasx.S
SSCALKERNEL = sscal_lasx.S
DSCALKERNEL = dscal_lasx.S
SAMAXKERNEL = samax_lasx.S
DAMAXKERNEL = damax_lasx.S
SAMINKERNEL = samin_lasx.S
DAMINKERNEL = damin_lasx.S
SMAXKERNEL = smax_lasx.S
DMAXKERNEL = dmax_lasx.S
SMINKERNEL = smin_lasx.S
DMINKERNEL = dmin_lasx.S
ISMAXKERNEL = ismax_lasx.S
IDMAXKERNEL = idmax_lasx.S
ISMINKERNEL = ismin_lasx.S
IDMINKERNEL = idmin_lasx.S
ISAMAXKERNEL = isamax_lasx.S
IDAMAXKERNEL = idamax_lasx.S
ISAMINKERNEL = isamin_lasx.S
IDAMINKERNEL = idamin_lasx.S
SCOPYKERNEL = scopy_lasx.S
DCOPYKERNEL = dcopy_lasx.S
SSWAPKERNEL = sswap_lasx.S
DSWAPKERNEL = dswap_lasx.S
SAXPYKERNEL = saxpy_lasx.S
DAXPYKERNEL = daxpy_lasx.S
SAXPBYKERNEL = saxpby_lasx.S
DAXPBYKERNEL = daxpby_lasx.S
SSUMKERNEL = ssum_lasx.S
DSUMKERNEL = dsum_lasx.S
SASUMKERNEL = sasum_lasx.S
DASUMKERNEL = dasum_lasx.S
SROTKERNEL = srot_lasx.S
DROTKERNEL = drot_lasx.S
SNRM2KERNEL = snrm2_lasx.S
DNRM2KERNEL = dnrm2_lasx.S
DGEMMKERNEL = dgemm_kernel_16x4.S
DGEMMINCOPY = dgemm_ncopy_16.S
DGEMMITCOPY = dgemm_tcopy_16.S

View File

@ -0,0 +1,183 @@
// DAMAX kernel, LASX (256-bit vector) variant: result = max(|x[i]|)
// over N doubles with stride INCX.  xvfmaxa.d selects the operand
// element with the larger absolute value, so the running vector
// accumulator VM0 tracks magnitudes; a final fabs.d makes the
// returned scalar non-negative.
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define J $r13
#define t1 $r14
#define t2 $r18
#define t3 $r15
#define t4 $r17
#define TEMP $r16
#define m0 $xr8
#define x1 $xr9
#define x2 $xr10
#define x3 $xr11
#define x4 $xr12
#define x5 $xr13
#define x6 $xr14
#define x7 $xr15
#define x8 $xr16
#define VX0 $xr20
#define VX1 $xr21
// $f22 is the scalar (lane-0) view of VM0 ($xr22); the result is
// copied from it into $f0 on every exit path.
#define VM0 $xr22
#define VM1 $xr23
#define VM2 $xr18
#define VM3 $xr19
PROLOGUE
// Early exits for N<=0 or INCX<=0.
// NOTE(review): on these paths VM0/$f22 is never initialized before
// .L999 reads it; presumably the interface layer guards these cases
// before calling the kernel -- confirm.
bge $r0, N, .L999
bge $r0, INCX, .L999
// Scale INCX from elements to bytes; TEMP holds SIZE for the
// unit-stride test below.
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
// ---- contiguous path (INCX == 1) ----
// Seed VM0 with the first 4 elements, then fold 8 elements/iteration.
xvld VM0, X, 0
srai.d I, N, 3
bge $r0, I, .L12
.align 3
.L10:
xvld VX0, X, 0 * SIZE
xvld VX1, X, 4 * SIZE
addi.d I, I, -1
xvfmaxa.d VM1, VX1, VX0
addi.d X, X, 8 * SIZE
xvfmaxa.d VM0, VM0, VM1
blt $r0, I, .L10
.align 3
.L11:
// Horizontal reduction: fold the 4 lanes of VM0 down into lane 0.
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfmaxa.d VM1, x1, x2
xvfmaxa.d VM2, x3, x4
xvfmaxa.d VM0, VM1, VM2
.align 3
.L12: //INCX==1 and N<8
// Tail: for 4 < tail < 8, load the first 4 tail elements plus an
// overlapping load of the last 4; some elements are folded twice,
// which is harmless for a max reduction.
andi I, N, 7
li.d J, 4
bge J, I, .L13 // 4<N<8
xvld VX0, X, 0
slli.d J, J, 1 // 8
sub.d I, J, I
slli.d I, I, BASE_SHIFT
xvldx VX1, X, I
xvfmaxa.d m0, VX0, VX1 //patial repeat read
xvpickve.d x1, m0, 0
xvpickve.d x2, m0, 1
xvpickve.d x3, m0, 2
xvpickve.d x4, m0, 3
xvfmaxa.d VM1, x1, x2
xvfmaxa.d m0, x3, x4
xvfmaxa.d m0, m0, VM1
xvfmaxa.d VM0, m0, VM0
fabs.d $f22, $f22
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L13: //INCX==1 and 0<=N<=4
bge $r0, I, .L15
.align 3
.L14:
// Scalar-style tail: only lane 0 of x1 is live.
// NOTE(review): xvld still reads a full 32 bytes, so this may read
// past the end of X -- confirm callers guarantee this is safe.
xvld x1, X, 0
addi.d I, I, -1
xvfmaxa.d VM0, VM0, x1
addi.d X, X, SIZE
blt $r0, I, .L14
.align 3
.L15:
fabs.d $f22, $f22
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L20: // INCX!=1
// ---- strided path ----
// Seed VM0 from the leading element(s) through TEMP so X itself still
// points at the start of the data for the gather loop below.
move TEMP, X // initialize the maxa value
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L23
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t2, 1
xvinsgr2vr.d VM0, t3, 2
xvinsgr2vr.d VM0, t4, 3
.align 3
.L21:
// Gather 8 strided doubles per iteration into VX0/VX1 (the first
// iteration revisits the seed elements, harmless for a max).
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
addi.d I, I, -1
xvfmaxa.d VM1, VX1, VX0
xvfmaxa.d VM0, VM1, VM0
blt $r0, I, .L21
.align 3
.L22:
// Horizontal reduction for the strided accumulator.
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfmaxa.d VM1, x1, x2
xvfmaxa.d VM2, x3, x4
xvfmaxa.d VM0, VM1, VM2
.align 3
.L23: //INCX!=1 and N<8
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24:
// Strided scalar tail (same 32-byte over-read caveat as .L14).
xvld x1, X, 0
addi.d I, I, -1
xvfmaxa.d VM0, VM0, x1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
// Return |VM0 lane 0| in $f0.
fabs.d $f22, $f22
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,145 @@
// DAMAX kernel, LSX (128-bit vector) variant: result = max(|x[i]|)
// over N doubles with stride INCX.  vfmaxa.d keeps the element with
// the larger absolute value; the final fabs.d makes the returned
// scalar non-negative.
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r14
#define t2 $r18
#define t3 $r15
#define t4 $r17
#define TEMP $r16
#define x1 $vr9
#define x2 $vr10
#define VX0 $vr20
#define VX1 $vr21
// $f22 is the scalar (lane-0) view of VM0 ($vr22); it carries the
// result into $f0 on every exit path.
#define VM0 $vr22
#define VM1 $vr23
#define VM2 $vr18
#define VM3 $vr19
PROLOGUE
// Early exits for N<=0 or INCX<=0.
// NOTE(review): VM0/$f22 is not initialized on these paths before
// .L999 reads it; presumably the interface layer guards these cases.
bge $r0, N, .L999
bge $r0, INCX, .L999
// Scale INCX to bytes; TEMP = SIZE for the unit-stride test.
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
// ---- contiguous path (INCX == 1): 8 doubles per iteration in four
// 2-lane loads ----
vld VM0, X, 0
srai.d I, N, 3
bge $r0, I, .L12
.align 3
.L10:
vld VX0, X, 0 * SIZE
vld VX1, X, 2 * SIZE
addi.d I, I, -1
vfmaxa.d VM1, VX1, VX0
vld VX0, X, 4 * SIZE
vld VX1, X, 6 * SIZE
vfmaxa.d VM2, VX1, VX0
vfmaxa.d VM3, VM1, VM2
addi.d X, X, 8 * SIZE
vfmaxa.d VM0, VM0, VM3
blt $r0, I, .L10
.align 3
.L11:
// Fold the two lanes of VM0 into lane 0.
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
vfmaxa.d VM0, x1, x2
.align 3
.L12: //INCX==1 and N<8
andi I, N, 7
bge $r0, I, .L14
.align 3
.L13:
// Scalar-style tail; only lane 0 of x1 is live.
// NOTE(review): vld reads 16 bytes, so this can read one double past
// the end of X -- confirm that is acceptable here.
vld x1, X, 0
addi.d I, I, -1
vfmaxa.d VM0, VM0, x1
addi.d X, X, SIZE
blt $r0, I, .L13
.align 3
.L14:
fabs.d $f22, $f22
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L20: // INCX!=1
// ---- strided path: seed VM0 through TEMP, leaving X at the start ----
move TEMP, X // initialize the maxa value
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L23
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t2, 1
.align 3
.L21:
// Gather 8 strided doubles per iteration, two lanes at a time.
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vfmaxa.d VM1, VX0, VX1
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vfmaxa.d VM2, VX0, VX1
vfmaxa.d VM3, VM1, VM2
vfmaxa.d VM0, VM0, VM3
addi.d I, I, -1
blt $r0, I, .L21
.align 3
.L22:
// Fold the strided accumulator's two lanes into lane 0.
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
vfmaxa.d VM0, x1, x2
.align 3
.L23: //INCX!=1 and N<8
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24:
// Strided scalar tail (same 16-byte over-read caveat as .L13).
vld x1, X, 0
addi.d I, I, -1
vfmaxa.d VM0, VM0, x1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
// Return |VM0 lane 0| in $f0.
fabs.d $f22, $f22
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,178 @@
// DAMIN kernel, LASX (256-bit vector) variant: result = min(|x[i]|)
// over N doubles with stride INCX.  xvfmina.d keeps the element with
// the smaller absolute value; the final fabs.d makes the returned
// scalar non-negative.  Mirrors damax_lasx.S with min in place of max.
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define J $r13
#define t1 $r14
#define t2 $r18
#define t3 $r15
#define t4 $r17
#define TEMP $r16
#define m0 $xr8
#define x1 $xr9
#define x2 $xr10
#define x3 $xr11
#define x4 $xr12
#define VX0 $xr20
#define VX1 $xr21
// $f22 is the scalar (lane-0) view of VM0 ($xr22).
#define VM0 $xr22
#define VM1 $xr23
#define VM2 $xr19
PROLOGUE
// Early exits for N<=0 or INCX<=0.
// NOTE(review): VM0/$f22 is uninitialized on these paths; presumably
// the interface layer guards these cases before calling the kernel.
bge $r0, N, .L999
bge $r0, INCX, .L999
// Scale INCX to bytes; TEMP = SIZE for the unit-stride test.
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
// ---- contiguous path (INCX == 1): seed with first 4 elements, then
// fold 8 per iteration ----
xvld VM0, X, 0
srai.d I, N, 3
bge $r0, I, .L12
.align 3
.L10:
xvld VX0, X, 0 * SIZE
addi.d I, I, -1
xvld VX1, X, 4 * SIZE
xvfmina.d VM1, VX1, VX0
addi.d X, X, 8 * SIZE
xvfmina.d VM0, VM0, VM1
blt $r0, I, .L10
.align 3
.L11:
// Horizontal reduction: fold the 4 lanes of VM0 into lane 0.
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfmina.d VM1, x1, x2
xvfmina.d VM2, x3, x4
xvfmina.d VM0, VM1, VM2
.align 3
.L12: //INCX==1 and N<8
// Tail: for 4 < tail < 8, cover the last 4 elements with an
// overlapping xvldx load; duplicated elements are harmless for a min.
andi I, N, 7
li.d J, 4
bge J, I, .L13 // 4<N<8
xvld VX0, X, 0
slli.d J, J, 1 // 8
sub.d I, J, I
slli.d I, I, BASE_SHIFT
xvldx VX1, X, I
xvfmina.d m0, VX0, VX1 //patial repeat read
xvpickve.d x1, m0, 0
xvpickve.d x2, m0, 1
xvpickve.d x3, m0, 2
xvpickve.d x4, m0, 3
xvfmina.d VM1, x1, x2
xvfmina.d m0, x3, x4
xvfmina.d m0, m0, VM1
xvfmina.d VM0, m0, VM0
fabs.d $f22, $f22
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L13: //INCX==1 and 0<=N<=4
bge $r0, I, .L15
.align 3
.L14:
// Scalar-style tail; only lane 0 of x1 is live.
// NOTE(review): xvld reads a full 32 bytes and may read past the end
// of X -- confirm callers guarantee this is safe.
xvld x1, X, 0
addi.d I, I, -1
xvfmina.d VM0, VM0, x1
addi.d X, X, SIZE
blt $r0, I, .L14
.align 3
.L15:
fabs.d $f22, $f22
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L20: // INCX!=1
// ---- strided path: seed VM0 via TEMP so X still points at the data ----
move TEMP, X // initialize the mina value
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L23
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t2, 1
xvinsgr2vr.d VM0, t3, 2
xvinsgr2vr.d VM0, t4, 3
.align 3
.L21:
// Gather 8 strided doubles per iteration into VX0/VX1.
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
addi.d I, I, -1
xvfmina.d VM1, VX1, VX0
xvfmina.d VM0, VM1, VM0
blt $r0, I, .L21
.align 3
.L22:
// Horizontal reduction of the strided accumulator.
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfmina.d VM1, x1, x2
xvfmina.d VM2, x3, x4
xvfmina.d VM0, VM1, VM2
.align 3
.L23: //INCX!=1 and N<8
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24:
// Strided scalar tail (same 32-byte over-read caveat as .L14).
xvld x1, X, 0
addi.d I, I, -1
xvfmina.d VM0, VM0, x1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
// Return |VM0 lane 0| in $f0.
fabs.d $f22, $f22
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,145 @@
// DAMIN kernel, LSX (128-bit vector) variant: result = min(|x[i]|)
// over N doubles with stride INCX.  vfmina.d keeps the element with
// the smaller absolute value; the final fabs.d makes the returned
// scalar non-negative.  Mirrors damax_lsx.S with min in place of max.
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r14
#define t2 $r18
#define t3 $r15
#define t4 $r17
#define TEMP $r16
#define x1 $vr9
#define x2 $vr10
#define VX0 $vr20
#define VX1 $vr21
// $f22 is the scalar (lane-0) view of VM0 ($vr22).
#define VM0 $vr22
#define VM1 $vr23
#define VM2 $vr18
#define VM3 $vr19
PROLOGUE
// Early exits for N<=0 or INCX<=0.
// NOTE(review): VM0/$f22 is uninitialized on these paths; presumably
// the interface layer guards these cases before calling the kernel.
bge $r0, N, .L999
bge $r0, INCX, .L999
// Scale INCX to bytes; TEMP = SIZE for the unit-stride test.
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
// ---- contiguous path (INCX == 1): 8 doubles per iteration in four
// 2-lane loads ----
vld VM0, X, 0
srai.d I, N, 3
bge $r0, I, .L12
.align 3
.L10:
vld VX0, X, 0 * SIZE
vld VX1, X, 2 * SIZE
addi.d I, I, -1
vfmina.d VM1, VX1, VX0
vld VX0, X, 4 * SIZE
vld VX1, X, 6 * SIZE
vfmina.d VM2, VX1, VX0
vfmina.d VM3, VM1, VM2
addi.d X, X, 8 * SIZE
vfmina.d VM0, VM0, VM3
blt $r0, I, .L10
.align 3
.L11:
// Fold the two lanes of VM0 into lane 0.
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
vfmina.d VM0, x1, x2
.align 3
.L12: //INCX==1 and N<8
andi I, N, 7
bge $r0, I, .L14
.align 3
.L13:
// Scalar-style tail; only lane 0 of x1 is live.
// NOTE(review): vld reads 16 bytes and may read one double past the
// end of X -- confirm that is acceptable here.
vld x1, X, 0
addi.d I, I, -1
vfmina.d VM0, VM0, x1
addi.d X, X, SIZE
blt $r0, I, .L13
.align 3
.L14:
fabs.d $f22, $f22
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L20: // INCX!=1
// ---- strided path: seed VM0 through TEMP, leaving X at the start ----
move TEMP, X // initialize the mina value
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L23
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t2, 1
.align 3
.L21:
// Gather 8 strided doubles per iteration, two lanes at a time.
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vfmina.d VM1, VX0, VX1
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
addi.d I, I, -1
vfmina.d VM2, VX0, VX1
vfmina.d VM3, VM1, VM2
vfmina.d VM0, VM0, VM3
blt $r0, I, .L21
.align 3
.L22:
// Fold the strided accumulator's two lanes into lane 0.
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
vfmina.d VM0, x1, x2
.align 3
.L23: //INCX!=1 and N<8
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24:
// Strided scalar tail (same 16-byte over-read caveat as .L13).
vld x1, X, 0
vfmina.d VM0, VM0, x1
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
// Return |VM0 lane 0| in $f0.
fabs.d $f22, $f22
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,148 @@
// DASUM kernel, LASX (256-bit vector) variant: result = sum(|x[i]|)
// over N doubles with stride INCX.  |x| is computed branchlessly:
// multiply by a vector of -1.0 and select the negated lane wherever
// x < 0 (xvfcmp.clt + xvbitsel).
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define t1 $r15
#define t2 $r12
#define t3 $r13
#define t4 $r14
#define VX0 $xr12
#define VX1 $xr13
#define VX2 $xr14
#define VX3 $xr15
#define VT0 $xr23
#define VT1 $xr22
// res1 ($xr16) accumulates; $f16 is its scalar lane-0 view and also
// receives the scalar-tail additions.
#define res1 $xr16
#define res2 $xr17
#define res0 $xr18
#define neg1 $xr19
PROLOGUE
// Zero the accumulators first so the early exits return 0.0.
xvxor.v res1, res1, res1
xvxor.v res2, res2, res2
xvxor.v res0, res0, res0
bge $r0, N, .L999
bge $r0, INCX, .L999
// Build neg1 = {-1.0, -1.0, -1.0, -1.0} (integer -1 converted to double).
li.d t1, -1
xvreplgr2vr.d neg1, t1
xvffint.d.l neg1, neg1
// TEMP = SIZE for the unit-stride test; scale INCX to bytes.
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bge $r0, I, .L13
.align 3
.L11:
// Contiguous main loop: 8 doubles per iteration; accumulate |x| into res1.
xvld VX0, X, 0 * SIZE
xvld VX1, X, 4 * SIZE
xvfmul.d VX2, neg1, VX0
xvfmul.d VX3, neg1, VX1
xvfcmp.clt.d VT0, VX0, res0
xvfcmp.clt.d VT1, VX1, res0
xvbitsel.v VX0, VX0, VX2, VT0
xvbitsel.v VX1, VX1, VX3, VT1
xvfadd.d res2, VX0, VX1
xvfadd.d res1, res1, res2
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L11
.align 3
.L12:
// Horizontal reduction: add lanes 1..3 of res1 into lane 0.
xvpickve.d VX1, res1, 1
xvpickve.d VX2, res1, 2
xvpickve.d VX3, res1, 3
xvfadd.d res1, VX1, res1
xvfadd.d res1, VX2, res1
xvfadd.d res1, VX3, res1
.align 3
.L13:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L14:
// Contiguous scalar tail.
fld.d $f12, X, 0 * SIZE
fabs.d $f12, $f12
fadd.d $f16, $f12, $f16
addi.d I, I, -1
addi.d X, X, SIZE
blt $r0, I, .L14
b .L999
.align 3
.L20:
// ---- strided path ----
bge $r0, I, .L23
.align 3
.L21:
// Gather 8 strided doubles per iteration, then the same |x| select.
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvfmul.d VX2, neg1, VX0
xvfmul.d VX3, neg1, VX1
xvfcmp.clt.d VT0, VX0, res0
xvfcmp.clt.d VT1, VX1, res0
xvbitsel.v VX0, VX0, VX2, VT0
xvbitsel.v VX1, VX1, VX3, VT1
xvfadd.d res2, VX0, VX1
xvfadd.d res1, res1, res2
addi.d I, I, -1
blt $r0, I, .L21
.align 3
.L22:
// Horizontal reduction for the strided accumulator.
xvpickve.d VX1, res1, 1
xvpickve.d VX2, res1, 2
xvpickve.d VX3, res1, 3
xvfadd.d res1, VX1, res1
xvfadd.d res1, VX2, res1
xvfadd.d res1, VX3, res1
.align 3
.L23:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24:
// Strided scalar tail.
fld.d $f12, X, 0 * SIZE
fabs.d $f12, $f12
fadd.d $f16, $f12, $f16
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
// Return the accumulated sum (res1 lane 0) in $f0.
fmov.d $f0, $f16
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,158 @@
// DASUM kernel, LSX (128-bit vector) variant: result = sum(|x[i]|)
// over N doubles with stride INCX.  |x| is computed branchlessly via
// multiply-by--1.0 plus a compare/select (vfcmp.clt + vbitsel).
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define t1 $r15
#define t2 $r12
#define t3 $r13
#define t4 $r14
#define VX0 $vr12
#define VX1 $vr13
#define VX2 $vr14
#define VX3 $vr15
#define VT0 $vr23
#define VT1 $vr22
// res1 ($vr16) accumulates; $f16 is its scalar lane-0 view and also
// receives the scalar-tail additions.
#define res1 $vr16
#define res2 $vr17
#define res0 $vr18
#define neg1 $vr19
PROLOGUE
// Zero the accumulators first so the early exits return 0.0.
vxor.v res1, res1, res1
vxor.v res2, res2, res2
vxor.v res0, res0, res0
bge $r0, N, .L999
bge $r0, INCX, .L999
// Build neg1 = {-1.0, -1.0} (integer -1 converted to double).
li.d t1, -1
vreplgr2vr.d neg1, t1
vffint.d.l neg1, neg1
// TEMP = SIZE for the unit-stride test; scale INCX to bytes.
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bge $r0, I, .L13
.align 3
.L11:
// Contiguous main loop: 8 doubles per iteration in two 4-double halves.
vld VX0, X, 0 * SIZE
vld VX1, X, 2 * SIZE
vfmul.d VX2, neg1, VX0
vfmul.d VX3, neg1, VX1
vfcmp.clt.d VT0, VX0, res0
vfcmp.clt.d VT1, VX1, res0
vbitsel.v VX0, VX0, VX2, VT0
vbitsel.v VX1, VX1, VX3, VT1
vfadd.d res2, VX0, VX1
vfadd.d res1, res1, res2
vld VX0, X, 4 * SIZE
vld VX1, X, 6 * SIZE
vfmul.d VX2, neg1, VX0
vfmul.d VX3, neg1, VX1
vfcmp.clt.d VT0, VX0, res0
vfcmp.clt.d VT1, VX1, res0
vbitsel.v VX0, VX0, VX2, VT0
vbitsel.v VX1, VX1, VX3, VT1
vfadd.d res2, VX0, VX1
vfadd.d res1, res1, res2
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L11
.align 3
.L12:
// Fold lane 1 of res1 into lane 0.
vreplvei.d VX1, res1, 1
vfadd.d res1, VX1, res1
.align 3
.L13:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L14:
// Contiguous scalar tail.
fld.d $f12, X, 0 * SIZE
fabs.d $f12, $f12
fadd.d $f16, $f12, $f16
addi.d I, I, -1
addi.d X, X, SIZE
blt $r0, I, .L14
b .L999
.align 3
.L20:
// ---- strided path ----
bge $r0, I, .L23
.align 3
.L21:
// Gather 8 strided doubles per iteration, two lanes at a time.
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
vinsgr2vr.d VX1, t1, 0
vinsgr2vr.d VX1, t2, 1
add.d X, X, INCX
vfmul.d VX2, neg1, VX0
vfmul.d VX3, neg1, VX1
vfcmp.clt.d VT0, VX0, res0
vfcmp.clt.d VT1, VX1, res0
vbitsel.v VX0, VX0, VX2, VT0
vbitsel.v VX1, VX1, VX3, VT1
vfadd.d res2, VX0, VX1
vfadd.d res1, res1, res2
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t3, 0
vinsgr2vr.d VX0, t4, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
add.d X, X, INCX
vfmul.d VX2, neg1, VX0
vfmul.d VX3, neg1, VX1
vfcmp.clt.d VT0, VX0, res0
vfcmp.clt.d VT1, VX1, res0
vbitsel.v VX0, VX0, VX2, VT0
vbitsel.v VX1, VX1, VX3, VT1
vfadd.d res2, VX0, VX1
vfadd.d res1, res1, res2
addi.d I, I, -1
blt $r0, I, .L21
.align 3
.L22:
// Fold lane 1 of the strided accumulator into lane 0.
vreplvei.d VX1, res1, 1
vfadd.d res1, VX1, res1
.align 3
.L23:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24:
// Strided scalar tail.
fld.d $f12, X, 0 * SIZE
fabs.d $f12, $f12
fadd.d $f16, $f12, $f16
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
// Return the accumulated sum (res1 lane 0) in $f0.
fmov.d $f0, $f16
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,629 @@
// DAXPBY kernel, LASX (256-bit vector) variant:
//     y[i] := ALPHA * x[i] + BETA * y[i],  i = 0..N-1
// with strides INCX/INCY.  The code dispatches on four stride
// combinations (.L11/.L12/.L21/.L22) and, inside each, on whether
// ALPHA and/or BETA equal zero, so the zero cases skip the unneeded
// loads and multiplies.  All vector loops process 8 doubles per
// iteration; .L997/.L998 handle the scalar remainder for every path.
#define ASSEMBLER
#include "common.h"
#define N $r4
#define ALPHA $f0
#define X $r5
#define INCX $r6
#define BETA $f1
#define Y $r7
#define INCY $r8
#define I $r12
#define TEMP $r13
#define t1 $r14
#define t2 $r16
#define t3 $r15
#define t4 $r17
// NOTE(review): XX is defined but never used in this file.
#define XX $r18
#define YY $r19
#define a1 $f12
#define a2 $f13
#define VX0 $xr8
#define VX1 $xr20
#define VX2 $xr21
#define VX3 $xr22
// VXA/VXB/VXZ hold ALPHA, BETA and 0.0 broadcast to all four lanes.
#define VXA $xr23
#define VXB $xr9
#define VXZ $xr19
PROLOGUE
bge $r0, N, .L999
// a1 = 0.0 (for the ALPHA/BETA zero tests); scale both strides to
// bytes; broadcast ALPHA, BETA and 0.0 into VXA/VXB/VXZ.
li.d TEMP, 1
movgr2fr.d a1, $r0
ffint.d.l a1, a1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
movfr2gr.d t1, ALPHA
xvreplgr2vr.d VXA, t1
movfr2gr.d t2, BETA
xvreplgr2vr.d VXB, t2
movfr2gr.d t3, a1
xvreplgr2vr.d VXZ, t3
srai.d I, N, 3
// Dispatch on the stride combination.
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
// ==== INCX==1, INCY==1: dispatch on ALPHA/BETA being zero ====
bge $r0, I, .L997
fcmp.ceq.d $fcc0, ALPHA, a1
bcnez $fcc0, .L110
fcmp.ceq.d $fcc0, BETA, a1
bcnez $fcc0, .L112 // ALPHA!=0 BETA==0
b .L111 // ALPHA!=0 BETA!=0
.align 3
.L110:
fcmp.ceq.d $fcc0, BETA, a1
bcnez $fcc0, .L114 // ALPHA==0 BETA==0
b .L113 // ALPHA==0 BETA!=0
.align 3
.L111: // ALPHA!=0 BETA!=0
// y = alpha*x + beta*y, full contiguous fused multiply-add.
xvld VX0, X, 0 * SIZE
xvld VX2, Y, 0 * SIZE
xvld VX1, X, 4 * SIZE
xvld VX3, Y, 4 * SIZE
xvfmul.d VX0, VX0, VXA
xvfmul.d VX1, VX1, VXA
xvfmadd.d VX2, VX2, VXB, VX0
xvfmadd.d VX3, VX3, VXB, VX1
addi.d I, I, -1
xvst VX2, Y, 0 * SIZE
xvst VX3, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L111
b .L997
.align 3
.L112: // ALPHA!=0 BETA==0
// y = alpha*x (y is write-only).
xvld VX0, X, 0 * SIZE
xvld VX1, X, 4 * SIZE
xvfmul.d VX0, VX0, VXA
xvfmul.d VX1, VX1, VXA
xvst VX0, Y, 0 * SIZE
xvst VX1, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L112
b .L997
.align 3
.L113: // ALPHA==0 BETA!=0
// y = beta*y (x is not touched).
xvld VX2, Y, 0 * SIZE
xvld VX3, Y, 4 * SIZE
xvfmul.d VX2, VX2, VXB
xvfmul.d VX3, VX3, VXB
xvst VX2, Y, 0 * SIZE
xvst VX3, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L113
b .L997
.align 3
.L114: // ALPHA==0 BETA==0
// y = 0.
xvst VXZ, Y, 0 * SIZE
xvst VXZ, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L114
b .L997
.align 3
.L12: // INCX==1 and INCY!=1
// ==== contiguous x, strided y: YY trails Y for the scattered stores ====
bge $r0, I, .L997
move YY, Y
fcmp.ceq.d $fcc0, ALPHA, a1
bcnez $fcc0, .L120
fcmp.ceq.d $fcc0, BETA, a1
bcnez $fcc0, .L122 // ALPHA!=0 BETA==0
b .L121 // ALPHA!=0 BETA!=0
.align 3
.L120:
fcmp.ceq.d $fcc0, BETA, a1
bcnez $fcc0, .L124 // ALPHA==0 BETA==0
b .L123 // ALPHA==0 BETA!=0
.align 3
.L121: // ALPHA!=0 BETA!=0
xvld VX0, X, 0 * SIZE
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvinsgr2vr.d VX2, t1, 0
xvinsgr2vr.d VX2, t2, 1
xvinsgr2vr.d VX2, t3, 2
xvinsgr2vr.d VX2, t4, 3
add.d Y, Y, INCY
xvfmul.d VX0, VX0, VXA
xvld VX1, X, 4 * SIZE
xvfmadd.d VX2, VX2, VXB, VX0
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.d VX3, t1, 0
xvinsgr2vr.d VX3, t2, 1
xvinsgr2vr.d VX3, t3, 2
xvinsgr2vr.d VX3, t4, 3
xvstelm.d VX2, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VX2, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VX2, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VX2, YY, 0, 3
add.d YY, YY, INCY
xvfmul.d VX1, VX1, VXA
xvfmadd.d VX3, VX3, VXB, VX1
addi.d I, I, -1
xvstelm.d VX3, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VX3, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VX3, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VX3, YY, 0, 3
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
blt $r0, I, .L121
b .L997
.align 3
.L122: // ALPHA!=0 BETA==0
xvld VX0, X, 0 * SIZE
xvld VX1, X, 4 * SIZE
xvfmul.d VX0, VX0, VXA
xvfmul.d VX1, VX1, VXA
xvstelm.d VX0, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VX0, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VX0, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VX0, YY, 0, 3
add.d YY, YY, INCY
xvstelm.d VX1, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VX1, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VX1, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VX1, YY, 0, 3
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L122
b .L997
.align 3
.L123: // ALPHA==0 BETA!=0
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvinsgr2vr.d VX2, t1, 0
xvinsgr2vr.d VX2, t2, 1
xvinsgr2vr.d VX2, t3, 2
xvinsgr2vr.d VX2, t4, 3
add.d Y, Y, INCY
xvfmul.d VX2, VX2, VXB
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.d VX3, t1, 0
xvinsgr2vr.d VX3, t2, 1
xvinsgr2vr.d VX3, t3, 2
xvinsgr2vr.d VX3, t4, 3
xvstelm.d VX2, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VX2, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VX2, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VX2, YY, 0, 3
add.d YY, YY, INCY
xvfmul.d VX3, VX3, VXB
addi.d I, I, -1
xvstelm.d VX3, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VX3, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VX3, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VX3, YY, 0, 3
add.d YY, YY, INCY
blt $r0, I, .L123
b .L997
.align 3
.L124: // ALPHA==0 BETA==0
// Scatter zeros into strided y.
xvstelm.d VXZ, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 3
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 3
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L124
b .L997
.align 3
.L21:// INCX!=1 and INCY==1
// ==== strided x (gathered into lanes), contiguous y ====
bge $r0, I, .L997
fcmp.ceq.d $fcc0, ALPHA, a1
bcnez $fcc0, .L210
fcmp.ceq.d $fcc0, BETA, a1
bcnez $fcc0, .L212 // ALPHA!=0 BETA==0
b .L211 // ALPHA!=0 BETA!=0
.align 3
.L210:
fcmp.ceq.d $fcc0, BETA, a1
bcnez $fcc0, .L214 // ALPHA==0 BETA==0
b .L213 // ALPHA==0 BETA!=0
.align 3
.L211: // ALPHA!=0 BETA!=0
xvld VX2, Y, 0 * SIZE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
add.d X, X, INCX
xvfmul.d VX0, VXA, VX0
xvfmadd.d VX2, VX2, VXB, VX0
xvld VX3, Y, 4 * SIZE
xvst VX2, Y, 0 * SIZE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
add.d X, X, INCX
xvfmul.d VX1, VX1, VXA
xvfmadd.d VX3, VX3, VXB, VX1
addi.d I, I, -1
xvst VX3, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L211
b .L997
.align 3
.L212: // ALPHA!=0 BETA==0
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
add.d X, X, INCX
xvfmul.d VX0, VXA, VX0
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvst VX0, Y, 0 * SIZE
xvfmul.d VX1, VX1, VXA
addi.d I, I, -1
xvst VX1, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L212
b .L997
.align 3
.L213: // ALPHA==0 BETA!=0
// x is not read at all; identical to the contiguous beta-scale loop.
xvld VX2, Y, 0 * SIZE
xvld VX3, Y, 4 * SIZE
xvfmul.d VX2, VX2, VXB
xvfmul.d VX3, VX3, VXB
addi.d I, I, -1
xvst VX2, Y, 0 * SIZE
xvst VX3, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L213
b .L997
.align 3
.L214: // ALPHA==0 BETA==0
xvst VXZ, Y, 0 * SIZE
xvst VXZ, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L214
b .L997
.align 3
.L22:
// ==== strided x AND strided y: gather both, scatter through YY ====
bge $r0, I, .L997
move YY, Y
fcmp.ceq.d $fcc0, ALPHA, a1
bcnez $fcc0, .L220
fcmp.ceq.d $fcc0, BETA, a1
bcnez $fcc0, .L222 // ALPHA!=0 BETA==0
b .L221 // ALPHA!=0 BETA!=0
.align 3
.L220:
fcmp.ceq.d $fcc0, BETA, a1
bcnez $fcc0, .L224 // ALPHA==0 BETA==0
b .L223 // ALPHA==0 BETA!=0
.align 3
.L221: // ALPHA!=0 BETA!=0
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvinsgr2vr.d VX2, t1, 0
xvinsgr2vr.d VX2, t2, 1
xvinsgr2vr.d VX2, t3, 2
xvinsgr2vr.d VX2, t4, 3
add.d Y, Y, INCY
xvfmul.d VX0, VX0, VXA
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
xvfmadd.d VX2, VX2, VXB, VX0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvstelm.d VX2, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VX2, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VX2, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VX2, YY, 0, 3
add.d YY, YY, INCY
ld.d t1, Y, 0 * SIZE
xvinsgr2vr.d VX3, t1, 0
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvinsgr2vr.d VX3, t2, 1
xvinsgr2vr.d VX3, t3, 2
xvinsgr2vr.d VX3, t4, 3
add.d Y, Y, INCY
xvfmul.d VX1, VX1, VXA
xvfmadd.d VX3, VX3, VXB, VX1
addi.d I, I, -1
xvstelm.d VX3, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VX3, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VX3, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VX3, YY, 0, 3
add.d YY, YY, INCY
blt $r0, I, .L221
b .L997
.align 3
.L222: // ALPHA!=0 BETA==0
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
add.d X, X, INCX
xvfmul.d VX0, VX0, VXA
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvstelm.d VX0, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VX0, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VX0, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VX0, YY, 0, 3
add.d YY, YY, INCY
xvfmul.d VX1, VX1, VXA
addi.d I, I, -1
xvstelm.d VX1, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VX1, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VX1, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VX1, YY, 0, 3
add.d YY, YY, INCY
blt $r0, I, .L222
b .L997
.align 3
.L223: // ALPHA==0 BETA!=0
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvinsgr2vr.d VX2, t1, 0
xvinsgr2vr.d VX2, t2, 1
xvinsgr2vr.d VX2, t3, 2
xvinsgr2vr.d VX2, t4, 3
add.d Y, Y, INCY
xvfmul.d VX2, VX2, VXB
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.d VX3, t1, 0
xvinsgr2vr.d VX3, t2, 1
xvinsgr2vr.d VX3, t3, 2
xvinsgr2vr.d VX3, t4, 3
xvstelm.d VX2, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VX2, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VX2, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VX2, YY, 0, 3
add.d YY, YY, INCY
xvfmul.d VX3, VX3, VXB
addi.d I, I, -1
xvstelm.d VX3, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VX3, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VX3, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VX3, YY, 0, 3
add.d YY, YY, INCY
blt $r0, I, .L223
b .L997
.align 3
.L224: // ALPHA==0 BETA==0
// Scatter zeros into strided y.
xvstelm.d VXZ, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 3
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 3
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L224
b .L997
.align 3
.L997:
// Scalar remainder, shared by every path above (no ALPHA/BETA
// special-casing needed here).
andi I, N, 7
bge $r0, I, .L999
.align 3
.L998:
fld.d $f12, X, 0 * SIZE
fld.d $f13, Y, 0 * SIZE
addi.d I, I, -1
fmul.d $f12, $f12, ALPHA
fmadd.d $f13, $f13, BETA, $f12
fst.d $f13, Y, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L998
.align 3
.L999:
// NOTE(review): the integer return value is taken from I ($r12),
// which is 0 after the loops but uninitialized on the N<=0 early
// exit -- confirm the caller ignores the return value in that case.
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,693 @@
// daxpby kernel, double precision, LoongArch LSX (128-bit vectors, 2 doubles).
// Computes y := alpha * x + beta * y (see the scalar tail: fmul then fmadd).
// The body dispatches on INCX==1 / INCY==1 and on ALPHA==0 / BETA==0 cases.
#define ASSEMBLER
#include "common.h"
#define N $r4 // element count
#define ALPHA $f0 // scale factor applied to x
#define X $r5 // pointer to x
#define INCX $r6 // stride of x (shifted by BASE_SHIFT to bytes in the prologue)
#define BETA $f1 // scale factor applied to y
#define Y $r7 // pointer to y (read and written)
#define INCY $r8 // stride of y (shifted by BASE_SHIFT to bytes in the prologue)
#define I $r12 // loop counter
#define TEMP $r13
#define t1 $r14
#define t2 $r16
#define t3 $r15
#define t4 $r17
#define XX $r18
#define YY $r19 // write cursor for y when INCY != 1
#define a1 $f12
#define a2 $f13
#define VX0 $vr8
#define VX1 $vr20
#define VX2 $vr21
#define VX3 $vr22
#define VXA $vr23 // ALPHA broadcast to both lanes
#define VXB $vr9 // BETA broadcast to both lanes
#define VXZ $vr19 // 0.0 broadcast to both lanes
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
movgr2fr.d a1, $r0
ffint.d.l a1, a1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
movfr2gr.d t1, ALPHA
vreplgr2vr.d VXA, t1
movfr2gr.d t2, BETA
vreplgr2vr.d VXB, t2
movfr2gr.d t3, a1
vreplgr2vr.d VXZ, t3
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L997
fcmp.ceq.d $fcc0, ALPHA, a1
bcnez $fcc0, .L110
fcmp.ceq.d $fcc0, BETA, a1
bcnez $fcc0, .L112 // ALPHA!=0 BETA==0
b .L111 // ALPHA!=0 BETA!=0
.align 3
.L110:
fcmp.ceq.d $fcc0, BETA, a1
bcnez $fcc0, .L114 // ALPHA==0 BETA==0
b .L113 // ALPHA==0 BETA!=0
.align 3
.L111: // ALPHA!=0 BETA!=0
vld VX0, X, 0 * SIZE
vld VX2, Y, 0 * SIZE
vld VX1, X, 2 * SIZE
vld VX3, Y, 2 * SIZE
vfmul.d VX0, VX0, VXA
vfmul.d VX1, VX1, VXA
vfmadd.d VX2, VX2, VXB, VX0
vfmadd.d VX3, VX3, VXB, VX1
vst VX2, Y, 0 * SIZE
vst VX3, Y, 2 * SIZE
vld VX0, X, 4 * SIZE
vld VX2, Y, 4 * SIZE
vld VX1, X, 6 * SIZE
vld VX3, Y, 6 * SIZE
vfmul.d VX0, VX0, VXA
vfmul.d VX1, VX1, VXA
vfmadd.d VX2, VX2, VXB, VX0
vfmadd.d VX3, VX3, VXB, VX1
vst VX2, Y, 4 * SIZE
vst VX3, Y, 6 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L111
b .L997
.align 3
.L112: // ALPHA!=0 BETA==0
vld VX0, X, 0 * SIZE
vld VX1, X, 2 * SIZE
vfmul.d VX0, VX0, VXA
vfmul.d VX1, VX1, VXA
vst VX0, Y, 0 * SIZE
vst VX1, Y, 2 * SIZE
vld VX2, X, 4 * SIZE
vld VX3, X, 6 * SIZE
vfmul.d VX2, VX2, VXA
vfmul.d VX3, VX3, VXA
vst VX2, Y, 4 * SIZE
vst VX3, Y, 6 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L112
b .L997
.align 3
.L113: // ALPHA==0 BETA!=0 : y := beta*y, INCX==1, INCY==1, 8 doubles/iter
// FIX: removed the stray trailing backslash that used to end the comment
// above. .S sources go through the C preprocessor, where backslash-newline
// splicing merges the following line into the comment, silently deleting
// the first "vld VX0" and leaving stale data in VX0.
vld VX0, Y, 0 * SIZE
vld VX1, Y, 2 * SIZE
vfmul.d VX0, VX0, VXB
vfmul.d VX1, VX1, VXB
vst VX0, Y, 0 * SIZE
vst VX1, Y, 2 * SIZE
vld VX2, Y, 4 * SIZE
vld VX3, Y, 6 * SIZE
vfmul.d VX2, VX2, VXB
vfmul.d VX3, VX3, VXB
vst VX2, Y, 4 * SIZE
vst VX3, Y, 6 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L113
b .L997
.align 3
.L114: // ALPHA==0 BETA==0
vst VXZ, Y, 0 * SIZE
vst VXZ, Y, 2 * SIZE
vst VXZ, Y, 4 * SIZE
vst VXZ, Y, 6 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L114
b .L997
.align 3
.L12: // INCX==1 and INCY!=1
bge $r0, I, .L997
move YY, Y
fcmp.ceq.d $fcc0, ALPHA, a1
bcnez $fcc0, .L120
fcmp.ceq.d $fcc0, BETA, a1
bcnez $fcc0, .L122 // ALPHA!=0 BETA==0
b .L121 // ALPHA!=0 BETA!=0
.align 3
.L120:
fcmp.ceq.d $fcc0, BETA, a1
bcnez $fcc0, .L124 // ALPHA==0 BETA==0
b .L123 // ALPHA==0 BETA!=0
.align 3
.L121: // ALPHA!=0 BETA!=0
vld VX0, X, 0 * SIZE
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
vinsgr2vr.d VX2, t1, 0
vinsgr2vr.d VX2, t2, 1
vfmul.d VX0, VX0, VXA
vld VX1, X, 2 * SIZE
vfmadd.d VX2, VX2, VXB, VX0
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
add.d Y, Y, INCY
vinsgr2vr.d VX3, t3, 0
vinsgr2vr.d VX3, t4, 1
vstelm.d VX2, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX2, YY, 0, 1
add.d YY, YY, INCY
vfmul.d VX1, VX1, VXA
vld VX0, X, 4 * SIZE
vfmadd.d VX3, VX3, VXB, VX1
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
vinsgr2vr.d VX2, t1, 0
vinsgr2vr.d VX2, t2, 1
vstelm.d VX3, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX3, YY, 0, 1
add.d YY, YY, INCY
vfmul.d VX0, VX0, VXA
vld VX1, X, 6 * SIZE
vfmadd.d VX2, VX2, VXB, VX0
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
add.d Y, Y, INCY
vinsgr2vr.d VX3, t3, 0
vinsgr2vr.d VX3, t4, 1
vstelm.d VX2, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX2, YY, 0, 1
add.d YY, YY, INCY
vfmul.d VX1, VX1, VXA
vfmadd.d VX3, VX3, VXB, VX1
addi.d I, I, -1
vstelm.d VX3, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX3, YY, 0, 1
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
blt $r0, I, .L121
b .L997
.align 3
.L122: // ALPHA!=0 BETA==0
vld VX0, X, 0 * SIZE
vld VX1, X, 2 * SIZE
vfmul.d VX0, VX0, VXA
vfmul.d VX1, VX1, VXA
vstelm.d VX0, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX0, YY, 0, 1
add.d YY, YY, INCY
vstelm.d VX1, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX1, YY, 0, 1
add.d YY, YY, INCY
vld VX0, X, 4 * SIZE
vld VX1, X, 6 * SIZE
vfmul.d VX0, VX0, VXA
vfmul.d VX1, VX1, VXA
vstelm.d VX0, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX0, YY, 0, 1
add.d YY, YY, INCY
vstelm.d VX1, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX1, YY, 0, 1
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L122
b .L997
.align 3
.L123: // ALPHA==0 BETA!=0
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
vinsgr2vr.d VX2, t1, 0
vinsgr2vr.d VX2, t2, 1
add.d Y, Y, INCY
vfmul.d VX2, VX2, VXB
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
add.d Y, Y, INCY
vinsgr2vr.d VX3, t3, 0
vinsgr2vr.d VX3, t4, 1
vstelm.d VX2, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX2, YY, 0, 1
add.d YY, YY, INCY
vfmul.d VX3, VX3, VXB
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
vinsgr2vr.d VX2, t1, 0
vinsgr2vr.d VX2, t2, 1
vstelm.d VX3, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX3, YY, 0, 1
add.d YY, YY, INCY
vfmul.d VX2, VX2, VXB
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
add.d Y, Y, INCY
vinsgr2vr.d VX3, t3, 0
vinsgr2vr.d VX3, t4, 1
vstelm.d VX2, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX2, YY, 0, 1
add.d YY, YY, INCY
vfmul.d VX3, VX3, VXB
addi.d I, I, -1
vstelm.d VX3, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX3, YY, 0, 1
add.d YY, YY, INCY
blt $r0, I, .L123
b .L997
.align 3
.L124: // ALPHA==0 BETA==0
vstelm.d VXZ, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VXZ, YY, 0, 1
add.d YY, YY, INCY
vstelm.d VXZ, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VXZ, YY, 0, 1
add.d YY, YY, INCY
vstelm.d VXZ, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VXZ, YY, 0, 1
add.d YY, YY, INCY
vstelm.d VXZ, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VXZ, YY, 0, 1
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L124
b .L997
.align 3
.L21:// INCX!=1 and INCY==1
bge $r0, I, .L997
fcmp.ceq.d $fcc0, ALPHA, a1
bcnez $fcc0, .L210
fcmp.ceq.d $fcc0, BETA, a1
bcnez $fcc0, .L212 // ALPHA!=0 BETA==0
b .L211 // ALPHA!=0 BETA!=0
.align 3
.L210:
fcmp.ceq.d $fcc0, BETA, a1
bcnez $fcc0, .L214 // ALPHA==0 BETA==0
b .L213 // ALPHA==0 BETA!=0
.align 3
.L211: // ALPHA!=0 BETA!=0
vld VX2, Y, 0 * SIZE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
add.d X, X, INCX
vfmul.d VX0, VXA, VX0
vld VX3, Y, 2 * SIZE
vfmadd.d VX2, VX2, VXB, VX0
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vst VX2, Y, 0 * SIZE
vfmul.d VX1, VXA, VX1
vld VX2, Y, 4 * SIZE
vfmadd.d VX3, VX3, VXB, VX1
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vst VX3, Y, 2 * SIZE
vfmul.d VX0, VX0, VXA
vld VX3, Y, 6 * SIZE
vfmadd.d VX2, VX2, VXB, VX0
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vst VX2, Y, 4 * SIZE
vfmul.d VX1, VX1, VXA
vfmadd.d VX3, VX3, VXB, VX1
addi.d I, I, -1
vst VX3, Y, 6 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L211
b .L997
.align 3
.L212: // ALPHA!=0 BETA==0
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
add.d X, X, INCX
vfmul.d VX0, VXA, VX0
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vst VX0, Y, 0 * SIZE
vfmul.d VX1, VXA, VX1
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vst VX1, Y, 2 * SIZE
vfmul.d VX0, VX0, VXA
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vst VX0, Y, 4 * SIZE
vfmul.d VX1, VX1, VXA
addi.d I, I, -1
vst VX1, Y, 6 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L212
b .L997
.align 3
.L213: // ALPHA==0 BETA!=0
vld VX2, Y, 0 * SIZE
vld VX3, Y, 2 * SIZE
vfmul.d VX2, VX2, VXB
vfmul.d VX3, VX3, VXB
vst VX2, Y, 0 * SIZE
vst VX3, Y, 2 * SIZE
vld VX2, Y, 4 * SIZE
vld VX3, Y, 6 * SIZE
vfmul.d VX2, VX2, VXB
vfmul.d VX3, VX3, VXB
addi.d I, I, -1
vst VX2, Y, 4 * SIZE
vst VX3, Y, 6 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L213
b .L997
.align 3
.L214: // ALPHA==0 BETA==0
vst VXZ, Y, 0 * SIZE
vst VXZ, Y, 2 * SIZE
vst VXZ, Y, 4 * SIZE
vst VXZ, Y, 6 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L214
b .L997
.align 3
.L22:
bge $r0, I, .L997
move YY, Y
fcmp.ceq.d $fcc0, ALPHA, a1
bcnez $fcc0, .L220
fcmp.ceq.d $fcc0, BETA, a1
bcnez $fcc0, .L222 // ALPHA!=0 BETA==0
b .L221 // ALPHA!=0 BETA!=0
.align 3
.L220:
fcmp.ceq.d $fcc0, BETA, a1
bcnez $fcc0, .L224 // ALPHA==0 BETA==0
b .L223 // ALPHA==0 BETA!=0
.align 3
.L221: // ALPHA!=0 BETA!=0
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
vinsgr2vr.d VX2, t3, 0
vinsgr2vr.d VX2, t4, 1
add.d Y, Y, INCY
vfmul.d VX0, VX0, VXA
vfmadd.d VX2, VX2, VXB, VX0
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t1, 0
vinsgr2vr.d VX1, t2, 1
vstelm.d VX2, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX2, YY, 0, 1
add.d YY, YY, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
vinsgr2vr.d VX3, t3, 0
vinsgr2vr.d VX3, t4, 1
add.d Y, Y, INCY
vfmul.d VX1, VX1, VXA
vfmadd.d VX3, VX3, VXB, VX1
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vstelm.d VX3, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX3, YY, 0, 1
add.d YY, YY, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
vinsgr2vr.d VX2, t3, 0
vinsgr2vr.d VX2, t4, 1
add.d Y, Y, INCY
vfmul.d VX0, VX0, VXA
vfmadd.d VX2, VX2, VXB, VX0
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vstelm.d VX2, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX2, YY, 0, 1
add.d YY, YY, INCY
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
vinsgr2vr.d VX3, t1, 0
vinsgr2vr.d VX3, t2, 1
add.d Y, Y, INCY
vfmul.d VX1, VX1, VXA
vfmadd.d VX3, VX3, VXB, VX1
addi.d I, I, -1
vstelm.d VX3, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX3, YY, 0, 1
add.d YY, YY, INCY
blt $r0, I, .L221
b .L997
.align 3
.L222: // ALPHA!=0 BETA==0
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
add.d X, X, INCX
vfmul.d VX0, VX0, VXA
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vstelm.d VX0, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX0, YY, 0, 1
add.d YY, YY, INCY
vfmul.d VX1, VX1, VXA
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vstelm.d VX1, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX1, YY, 0, 1
add.d YY, YY, INCY
vfmul.d VX0, VX0, VXA
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vstelm.d VX0, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX0, YY, 0, 1
add.d YY, YY, INCY
vfmul.d VX1, VX1, VXA
addi.d I, I, -1
vstelm.d VX1, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX1, YY, 0, 1
add.d YY, YY, INCY
blt $r0, I, .L222
b .L997
.align 3
.L223: // ALPHA==0 BETA!=0
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
vinsgr2vr.d VX2, t1, 0
vinsgr2vr.d VX2, t2, 1
add.d Y, Y, INCY
vfmul.d VX2, VX2, VXB
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
add.d Y, Y, INCY
vinsgr2vr.d VX3, t3, 0
vinsgr2vr.d VX3, t4, 1
vstelm.d VX2, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX2, YY, 0, 1
add.d YY, YY, INCY
vfmul.d VX3, VX3, VXB
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
vinsgr2vr.d VX2, t1, 0
vinsgr2vr.d VX2, t2, 1
vstelm.d VX3, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX3, YY, 0, 1
add.d YY, YY, INCY
vfmul.d VX2, VX2, VXB
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
add.d Y, Y, INCY
vinsgr2vr.d VX3, t3, 0
vinsgr2vr.d VX3, t4, 1
vstelm.d VX2, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX2, YY, 0, 1
add.d YY, YY, INCY
vfmul.d VX3, VX3, VXB
addi.d I, I, -1
vstelm.d VX3, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX3, YY, 0, 1
add.d YY, YY, INCY
blt $r0, I, .L223
b .L997
.align 3
.L224: // ALPHA==0 BETA==0
vstelm.d VXZ, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VXZ, YY, 0, 1
add.d YY, YY, INCY
vstelm.d VXZ, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VXZ, YY, 0, 1
add.d YY, YY, INCY
vstelm.d VXZ, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VXZ, YY, 0, 1
add.d YY, YY, INCY
vstelm.d VXZ, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VXZ, YY, 0, 1
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L224
b .L997
.align 3
.L997:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L998:
fld.d $f12, X, 0 * SIZE
fld.d $f13, Y, 0 * SIZE
addi.d I, I, -1
fmul.d $f12, $f12, ALPHA
fmadd.d $f13, $f13, BETA, $f12
fst.d $f13, Y, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L998
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,338 @@
// daxpy kernel, double precision, LoongArch LASX (256-bit vectors, 4 doubles).
// Computes y := alpha * x + y (scalar tail: fmadd y += alpha*x).
// Returns immediately when alpha == 0; takes a plain-add path when alpha == 1.
#define ASSEMBLER
#include "common.h"
#define N $r4 // element count
#define XX $r5
#define YY $r6 // write cursor for y when INCY != 1
#define ALPHA $f0 // scale factor applied to x
#define X $r7 // pointer to x
#define INCX $r8 // stride of x (shifted by BASE_SHIFT to bytes in the prologue)
#define Y $r9 // pointer to y (read and written)
#define INCY $r10 // stride of y (shifted by BASE_SHIFT to bytes in the prologue)
#define I $r12 // loop counter
#define TEMP $r13
#define t1 $r14
#define t2 $r16
#define t3 $r15
#define t4 $r17
#define a1 $f12
#define a2 $f13 // holds constant 1.0 for the alpha==1 fast-path compare
#define a3 $f14
#define a4 $f15
#define b1 $f16
#define b2 $f17
#define b3 $f18
#define b4 $f19
#define VX0 $xr8
#define VX1 $xr20
#define VX2 $xr21
#define VX3 $xr22
#define VXA $xr23 // ALPHA broadcast to all four lanes
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
movgr2fr.d a1, $r0
ffint.d.l a1, a1
movgr2fr.d a2, TEMP
ffint.d.l a2, a2
fcmp.ceq.d $fcc0, ALPHA, a1
bcnez $fcc0, .L999
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
movfr2gr.d t1, ALPHA
xvreplgr2vr.d VXA, t1
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L113
fcmp.ceq.d $fcc0, ALPHA, a2
bceqz $fcc0, .L112
.align 3
.L111:
xvld VX0, X, 0 * SIZE
xvld VX2, Y, 0 * SIZE
xvld VX1, X, 4 * SIZE
xvld VX3, Y, 4 * SIZE
xvfadd.d VX2, VX0, VX2
xvfadd.d VX3, VX1, VX3
addi.d I, I, -1
xvst VX2, Y, 0 * SIZE
xvst VX3, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L111
b .L113
.align 3
.L112:
xvld VX0, X, 0 * SIZE
xvld VX2, Y, 0 * SIZE
xvld VX1, X, 4 * SIZE
xvld VX3, Y, 4 * SIZE
xvfmadd.d VX2, VX0, VXA, VX2
xvfmadd.d VX3, VX1, VXA, VX3
addi.d I, I, -1
xvst VX2, Y, 0 * SIZE
xvst VX3, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L112
.align 3
.L113:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L114:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fmadd.d $f14, $f12, $f0, $f14
fst.d $f14, Y, 0 * SIZE
addi.d X, X, SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L114
b .L999
.align 3
.L12: // INCX==1 and INCY!=1
bge $r0, I, .L122
move YY, Y
.align 3
.L121:
xvld VX0, X, 0 * SIZE
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvinsgr2vr.d VX2, t1, 0
xvinsgr2vr.d VX2, t2, 1
xvinsgr2vr.d VX2, t3, 2
xvinsgr2vr.d VX2, t4, 3
add.d Y, Y, INCY
xvfmadd.d VX2, VX0, VXA, VX2
xvld VX1, X, 4 * SIZE
xvstelm.d VX2, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VX2, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VX2, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VX2, YY, 0, 3
add.d YY, YY, INCY
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvinsgr2vr.d VX3, t1, 0
xvinsgr2vr.d VX3, t2, 1
xvinsgr2vr.d VX3, t3, 2
xvinsgr2vr.d VX3, t4, 3
add.d Y, Y, INCY
xvfmadd.d VX3, VX1, VXA, VX3
addi.d I, I, -1
xvstelm.d VX3, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VX3, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VX3, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VX3, YY, 0, 3
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
blt $r0, I, .L121
.align 3
.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fmadd.d $f14, $f12, $f0, $f14
fst.d $f14, Y, 0 * SIZE
addi.d X, X, SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
.L21:// INCX!=1 and INCY==1
bge $r0, I, .L212
.align 3
.L211:
xvld VX2, Y, 0 * SIZE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
add.d X, X, INCX
xvfmadd.d VX2, VX0, VXA, VX2
xvld VX3, Y, 4 * SIZE
xvst VX2, Y, 0 * SIZE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
add.d X, X, INCX
xvfmadd.d VX3, VX1, VXA, VX3
addi.d I, I, -1
xvst VX3, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L211
.align 3
.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fmadd.d $f14, $f12, $f0, $f14
fst.d $f14, Y, 0 * SIZE
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
.L22:
bge $r0, I, .L223
move YY, Y
.align 3
.L222:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
add.d X, X, INCX
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvinsgr2vr.d VX2, t1, 0
xvinsgr2vr.d VX2, t2, 1
xvinsgr2vr.d VX2, t3, 2
xvinsgr2vr.d VX2, t4, 3
add.d Y, Y, INCY
xvfmadd.d VX2, VX0, VXA, VX2
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvstelm.d VX2, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VX2, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VX2, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VX2, YY, 0, 3
add.d YY, YY, INCY
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvinsgr2vr.d VX3, t1, 0
xvinsgr2vr.d VX3, t2, 1
xvinsgr2vr.d VX3, t3, 2
xvinsgr2vr.d VX3, t4, 3
add.d Y, Y, INCY
xvfmadd.d VX3, VX1, VXA, VX3
addi.d I, I, -1
xvstelm.d VX3, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VX3, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VX3, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VX3, YY, 0, 3
add.d YY, YY, INCY
blt $r0, I, .L222
.align 3
.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fmadd.d $f14, $f12, $f0, $f14
fst.d $f14, Y, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
b .L999
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,365 @@
// daxpy kernel, double precision, LoongArch LSX (128-bit vectors, 2 doubles).
// Computes y := alpha * x + y (scalar tail: fmadd y += alpha*x).
// Returns immediately when alpha == 0; takes a plain-add path when alpha == 1.
#define ASSEMBLER
#include "common.h"
#define N $r4 // element count
#define XX $r5
#define YY $r6 // write cursor for y when INCY != 1
#define ALPHA $f0 // scale factor applied to x
#define X $r7 // pointer to x
#define INCX $r8 // stride of x (shifted by BASE_SHIFT to bytes in the prologue)
#define Y $r9 // pointer to y (read and written)
#define INCY $r10 // stride of y (shifted by BASE_SHIFT to bytes in the prologue)
#define I $r12 // loop counter
#define TEMP $r13
#define t1 $r14
#define t2 $r16
#define t3 $r15
#define t4 $r17
#define a1 $f12
#define a2 $f13 // holds constant 1.0 for the alpha==1 fast-path compare
#define a3 $f14
#define a4 $f15
#define b1 $f16
#define b2 $f17
#define b3 $f18
#define b4 $f19
#define VX0 $vr8
#define VX1 $vr20
#define VX2 $vr21
#define VX3 $vr22
#define VXA $vr23 // ALPHA broadcast to both lanes
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
movgr2fr.d a1, $r0
ffint.d.l a1, a1
movgr2fr.d a2, TEMP
ffint.d.l a2, a2
fcmp.ceq.d $fcc0, ALPHA, a1
bcnez $fcc0, .L999
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
movfr2gr.d t1, ALPHA
vreplgr2vr.d VXA, t1
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L113
fcmp.ceq.d $fcc0, ALPHA, a2
bceqz $fcc0, .L112
.align 3
.L111:
vld VX0, X, 0 * SIZE
vld VX2, Y, 0 * SIZE
vld VX1, X, 2 * SIZE
vld VX3, Y, 2 * SIZE
vfadd.d VX2, VX0, VX2
vfadd.d VX3, VX1, VX3
vst VX2, Y, 0 * SIZE
vst VX3, Y, 2 * SIZE
vld VX0, X, 4 * SIZE
vld VX2, Y, 4 * SIZE
vld VX1, X, 6 * SIZE
vld VX3, Y, 6 * SIZE
vfadd.d VX2, VX0, VX2
vfadd.d VX3, VX1, VX3
addi.d I, I, -1
vst VX2, Y, 4 * SIZE
vst VX3, Y, 6 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L111
b .L113
.align 3
.L112:
vld VX0, X, 0 * SIZE
vld VX2, Y, 0 * SIZE
vld VX1, X, 2 * SIZE
vld VX3, Y, 2 * SIZE
vfmadd.d VX2, VX0, VXA, VX2
vfmadd.d VX3, VX1, VXA, VX3
addi.d I, I, -1
vst VX2, Y, 0 * SIZE
vst VX3, Y, 2 * SIZE
vld VX0, X, 4 * SIZE
vld VX2, Y, 4 * SIZE
vld VX1, X, 6 * SIZE
vld VX3, Y, 6 * SIZE
addi.d X, X, 8 * SIZE
vfmadd.d VX2, VX0, VXA, VX2
vfmadd.d VX3, VX1, VXA, VX3
vst VX2, Y, 4 * SIZE
vst VX3, Y, 6 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L112
.align 3
.L113:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L114:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fmadd.d $f14, $f12, $f0, $f14
fst.d $f14, Y, 0 * SIZE
addi.d X, X, SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L114
b .L999
.align 3
.L12: // INCX==1 and INCY!=1
bge $r0, I, .L122
move YY, Y
.align 3
.L121:
vld VX0, X, 0 * SIZE
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
vinsgr2vr.d VX2, t1, 0
vinsgr2vr.d VX2, t2, 1
add.d Y, Y, INCY
vfmadd.d VX2, VX0, VXA, VX2
vld VX1, X, 2 * SIZE
vstelm.d VX2, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX2, YY, 0, 1
add.d YY, YY, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
vinsgr2vr.d VX3, t3, 0
vinsgr2vr.d VX3, t4, 1
add.d Y, Y, INCY
vfmadd.d VX3, VX1, VXA, VX3
vld VX0, X, 4 * SIZE
vstelm.d VX3, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX3, YY, 0, 1
add.d YY, YY, INCY
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
vinsgr2vr.d VX2, t1, 0
vinsgr2vr.d VX2, t2, 1
add.d Y, Y, INCY
vfmadd.d VX2, VX0, VXA, VX2
vld VX1, X, 6 * SIZE
vstelm.d VX2, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX2, YY, 0, 1
add.d YY, YY, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
vinsgr2vr.d VX3, t3, 0
vinsgr2vr.d VX3, t4, 1
add.d Y, Y, INCY
vfmadd.d VX3, VX1, VXA, VX3
vstelm.d VX3, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX3, YY, 0, 1
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
.align 3
.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fmadd.d $f14, $f12, $f0, $f14
fst.d $f14, Y, 0 * SIZE
addi.d X, X, SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
.L21:// INCX!=1 and INCY==1
bge $r0, I, .L212
.align 3
.L211:
vld VX2, Y, 0 * SIZE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
add.d X, X, INCX
vfmadd.d VX2, VX0, VXA, VX2
vld VX3, Y, 2 * SIZE
vst VX2, Y, 0 * SIZE
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
add.d X, X, INCX
vfmadd.d VX3, VX1, VXA, VX3
vld VX2, Y, 4 * SIZE
vst VX3, Y, 2 * SIZE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
add.d X, X, INCX
vfmadd.d VX2, VX0, VXA, VX2
vld VX3, Y, 6 * SIZE
vst VX2, Y, 4 * SIZE
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
add.d X, X, INCX
vfmadd.d VX3, VX1, VXA, VX3
addi.d I, I, -1
vst VX3, Y, 6 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L211
.align 3
.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fmadd.d $f14, $f12, $f0, $f14
fst.d $f14, Y, 0 * SIZE
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
.L22:
bge $r0, I, .L223
move YY, Y
.align 3
.L222:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
vinsgr2vr.d VX2, t1, 0
vinsgr2vr.d VX2, t2, 1
add.d Y, Y, INCY
vfmadd.d VX2, VX0, VXA, VX2
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vstelm.d VX2, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX2, YY, 0, 1
add.d YY, YY, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
vinsgr2vr.d VX3, t3, 0
vinsgr2vr.d VX3, t4, 1
add.d Y, Y, INCY
vfmadd.d VX3, VX1, VXA, VX3
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vstelm.d VX3, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX3, YY, 0, 1
add.d YY, YY, INCY
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
vinsgr2vr.d VX2, t1, 0
vinsgr2vr.d VX2, t2, 1
add.d Y, Y, INCY
vfmadd.d VX2, VX0, VXA, VX2
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vstelm.d VX2, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX2, YY, 0, 1
add.d YY, YY, INCY
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
vinsgr2vr.d VX3, t1, 0
vinsgr2vr.d VX3, t2, 1
add.d Y, Y, INCY
vfmadd.d VX3, VX1, VXA, VX3
addi.d I, I, -1
vstelm.d VX3, YY, 0, 0
add.d YY, YY, INCY
vstelm.d VX3, YY, 0, 1
add.d YY, YY, INCY
blt $r0, I, .L222
.align 3
.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fmadd.d $f14, $f12, $f0, $f14
fst.d $f14, Y, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,224 @@
// dcopy kernel, double precision, LoongArch LASX (256-bit vectors, 4 doubles).
// Copies y := x, with vectorized paths when a stride is 1 and a scalar
// load/store loop when both strides are non-unit.
#define ASSEMBLER
#include "common.h"
#define N $r4 // element count
#define X $r5 // source vector
#define INCX $r6 // stride of x (shifted by BASE_SHIFT to bytes in the prologue)
#define Y $r7 // destination vector
#define INCY $r8 // stride of y (shifted by BASE_SHIFT to bytes in the prologue)
#define I $r17 // loop counter
#define TEMP $r18
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define VX0 $xr12
#define VX1 $xr13
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L112
.align 3
.L111:
xvld VX0, X, 0 * SIZE
xvld VX1, X, 4 * SIZE
xvst VX0, Y, 0 * SIZE
xvst VX1, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L111
.align 3
.L112:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L113:
fld.d $f12, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
fst.d $f12, Y, 0 * SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L113
b .L999
.align 3
.L12:
bge $r0, I, .L122
.align 3
.L121:
xvld VX0, X, 0 * SIZE
xvld VX1, X, 4 * SIZE
xvstelm.d VX0, Y, 0, 0
add.d Y, Y, INCY
xvstelm.d VX0, Y, 0, 1
add.d Y, Y, INCY
xvstelm.d VX0, Y, 0, 2
add.d Y, Y, INCY
xvstelm.d VX0, Y, 0, 3
add.d Y, Y, INCY
xvstelm.d VX1, Y, 0, 0
add.d Y, Y, INCY
xvstelm.d VX1, Y, 0, 1
add.d Y, Y, INCY
xvstelm.d VX1, Y, 0, 2
add.d Y, Y, INCY
xvstelm.d VX1, Y, 0, 3
add.d Y, Y, INCY
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
.align 3
.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
fld.d $f12, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
fst.d $f12, Y, 0 * SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
.L21:
bge $r0, I, .L212
.align 3
.L211:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
xvst VX0, Y, 0 * SIZE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvst VX1, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L211
.align 3
.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
fld.d $f12, X, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
.L22: // INCX!=1 and INCY!=1: scalar copy, 8 doubles per iteration
bgez INCX, .L220 // no-op: both branch and fall-through land on .L220
.align 3
.L220:
bge $r0, I, .L223
.align 3
.L222:
fld.d a1, X, 0 * SIZE
add.d X, X, INCX
fld.d a2, X, 0 * SIZE
add.d X, X, INCX
fld.d a3, X, 0 * SIZE
add.d X, X, INCX
fld.d a4, X, 0 * SIZE
add.d X, X, INCX
fst.d a1, Y, 0 * SIZE
add.d Y, Y, INCY
fst.d a2, Y, 0 * SIZE
add.d Y, Y, INCY
fst.d a3, Y, 0 * SIZE // FIX: was "fst.d a3, X, ..." — wrote into source x
add.d Y, Y, INCY
fst.d a4, Y, 0 * SIZE // FIX: was "fst.d a4, X, ..." — wrote into source x
add.d Y, Y, INCY
fld.d a1, X, 0 * SIZE
add.d X, X, INCX
fld.d a2, X, 0 * SIZE
add.d X, X, INCX
fld.d a3, X, 0 * SIZE
add.d X, X, INCX
fld.d a4, X, 0 * SIZE
add.d X, X, INCX
fst.d a1, Y, 0 * SIZE
add.d Y, Y, INCY
fst.d a2, Y, 0 * SIZE
add.d Y, Y, INCY
fst.d a3, Y, 0 * SIZE // FIX: was "fst.d a3, X, ..." — wrote into source x
add.d Y, Y, INCY
fst.d a4, Y, 0 * SIZE // FIX: was "fst.d a4, X, ..." — wrote into source x
add.d Y, Y, INCY
addi.d I, I, -1
blt $r0, I, .L222
.align 3
.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
fld.d $f12, X, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,232 @@
// dcopy kernel, double precision, LoongArch LSX (128-bit vectors, 2 doubles).
// Copies y := x, with vectorized paths when a stride is 1 and a scalar
// load/store loop when both strides are non-unit.
#define ASSEMBLER
#include "common.h"
#define N $r4 // element count
#define X $r5 // source vector
#define INCX $r6 // stride of x (shifted by BASE_SHIFT to bytes in the prologue)
#define Y $r7 // destination vector
#define INCY $r8 // stride of y (shifted by BASE_SHIFT to bytes in the prologue)
#define I $r17 // loop counter
#define TEMP $r18
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define VX0 $vr12
#define VX1 $vr13
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L112
.align 3
.L111:
vld VX0, X, 0 * SIZE
vld VX1, X, 2 * SIZE
vst VX0, Y, 0 * SIZE
vst VX1, Y, 2 * SIZE
vld VX0, X, 4 * SIZE
vld VX1, X, 6 * SIZE
addi.d I, I, -1
vst VX0, Y, 4 * SIZE
vst VX1, Y, 6 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L111
.align 3
.L112:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L113:
fld.d $f12, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
fst.d $f12, Y, 0 * SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L113
b .L999
.align 3
.L12:
bge $r0, I, .L122
.align 3
.L121:
vld VX0, X, 0 * SIZE
vld VX1, X, 2 * SIZE
vstelm.d VX0, Y, 0, 0
add.d Y, Y, INCY
vstelm.d VX0, Y, 0, 1
add.d Y, Y, INCY
vstelm.d VX1, Y, 0, 0
add.d Y, Y, INCY
vstelm.d VX1, Y, 0, 1
add.d Y, Y, INCY
vld VX0, X, 4 * SIZE
vld VX1, X, 6 * SIZE
vstelm.d VX0, Y, 0, 0
add.d Y, Y, INCY
vstelm.d VX0, Y, 0, 1
add.d Y, Y, INCY
vstelm.d VX1, Y, 0, 0
add.d Y, Y, INCY
vstelm.d VX1, Y, 0, 1
add.d Y, Y, INCY
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
.align 3
.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
fld.d $f12, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
fst.d $f12, Y, 0 * SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
.L21:
bge $r0, I, .L212
.align 3
.L211:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vst VX0, Y, 0 * SIZE
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vst VX1, Y, 2 * SIZE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vst VX0, Y, 4 * SIZE
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vst VX1, Y, 6 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L211
.align 3
.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
fld.d $f12, X, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
.L22: // INCX!=1 and INCY!=1: scalar copy, 8 doubles per iteration
bgez INCX, .L220 // no-op: both branch and fall-through land on .L220
.align 3
.L220:
bge $r0, I, .L223
.align 3
.L222:
fld.d a1, X, 0 * SIZE
add.d X, X, INCX
fld.d a2, X, 0 * SIZE
add.d X, X, INCX
fld.d a3, X, 0 * SIZE
add.d X, X, INCX
fld.d a4, X, 0 * SIZE
add.d X, X, INCX
fst.d a1, Y, 0 * SIZE
add.d Y, Y, INCY
fst.d a2, Y, 0 * SIZE
add.d Y, Y, INCY
fst.d a3, Y, 0 * SIZE // FIX: was "fst.d a3, X, ..." — wrote into source x
add.d Y, Y, INCY
fst.d a4, Y, 0 * SIZE // FIX: was "fst.d a4, X, ..." — wrote into source x
add.d Y, Y, INCY
fld.d a1, X, 0 * SIZE
add.d X, X, INCX
fld.d a2, X, 0 * SIZE
add.d X, X, INCX
fld.d a3, X, 0 * SIZE
add.d X, X, INCX
fld.d a4, X, 0 * SIZE
add.d X, X, INCX
fst.d a1, Y, 0 * SIZE
add.d Y, Y, INCY
fst.d a2, Y, 0 * SIZE
add.d Y, Y, INCY
fst.d a3, Y, 0 * SIZE // FIX: was "fst.d a3, X, ..." — wrote into source x
add.d Y, Y, INCY
fst.d a4, Y, 0 * SIZE // FIX: was "fst.d a4, X, ..." — wrote into source x
add.d Y, Y, INCY
addi.d I, I, -1
blt $r0, I, .L222
.align 3
.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
fld.d $f12, X, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,175 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define J $r13
#define t1 $r14
#define t2 $r18
#define t3 $r15
#define t4 $r17
#define TEMP $r16
#define m0 $xr8
#define x1 $xr9
#define x2 $xr10
#define x3 $xr11
#define x4 $xr12
#define VX0 $xr20
#define VX1 $xr21
#define VM0 $xr22
#define VM1 $xr23
#define VM2 $xr19
// dmax kernel (LASX, double): returns the maximum of x[0..N-1].
// Requires N > 0 and INCX > 0; the result is returned in $f0
// (lane 0 of VM0 == $xr22 aliases $f22).
PROLOGUE
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
// Contiguous path: running 4-lane vector max, 8 doubles per iteration.
// NOTE(review): this initial 32-byte load (and the tail loads below)
// can read past the end of the array when N < 4 — presumably the
// callers' buffers tolerate this; confirm.
xvld VM0, X, 0
srai.d I, N, 3
bge $r0, I, .L12
.align 3
.L10:
xvld VX0, X, 0 * SIZE
xvld VX1, X, 4 * SIZE
addi.d I, I, -1
xvfmax.d VM1, VX1, VX0
addi.d X, X, 8 * SIZE
xvfmax.d VM0, VM0, VM1
blt $r0, I, .L10
.align 3
// .L11: horizontal reduction of the 4 lanes of VM0 into lane 0.
.L11:
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfmax.d VM1, x1, x2
xvfmax.d VM2, x3, x4
xvfmax.d VM0, VM1, VM2
.align 3
.L12: //INCX==1 and N<8
andi I, N, 7
li.d J, 4
bge J, I, .L13 // 4<N<8
// 4 < tail < 8: re-read the last 4 elements with an overlapping
// load so the tail is handled with a single vector max.
xvld VX0, X, 0
slli.d J, J, 1 // 8
sub.d I, J, I
slli.d I, I, BASE_SHIFT
xvldx VX1, X, I
xvfmax.d m0, VX0, VX1 //patial repeat read
xvpickve.d x1, m0, 0
xvpickve.d x2, m0, 1
xvpickve.d x3, m0, 2
xvpickve.d x4, m0, 3
xvfmax.d VM1, x1, x2
xvfmax.d m0, x3, x4
xvfmax.d m0, m0, VM1
xvfmax.d VM0, m0, VM0
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L13: //INCX==1 and 0<=N<=4
bge $r0, I, .L15
.align 3
// Scalar-ish tail: only lane 0 of x1/VM0 is meaningful here.
.L14:
xvld x1, X, 0
addi.d I, I, -1
xvfmax.d VM0, VM0, x1
addi.d X, X, SIZE
blt $r0, I, .L14
.align 3
.L15:
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L20: // INCX!=1
move TEMP, X // initialize the max value
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L23
// Seed all 4 lanes of VM0 with the first 4 strided elements.
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t2, 1
xvinsgr2vr.d VM0, t3, 2
xvinsgr2vr.d VM0, t4, 3
.align 3
// .L21: strided main loop — gather 8 elements, two vector maxes.
.L21:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
addi.d I, I, -1
xvfmax.d VM1, VX1, VX0
xvfmax.d VM0, VM1, VM0
blt $r0, I, .L21
.align 3
// .L22: horizontal reduction of VM0 into lane 0.
.L22:
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfmax.d VM1, x1, x2
xvfmax.d VM2, x3, x4
xvfmax.d VM0, VM1, VM2
.align 3
.L23: //INCX!=1 and N<8
andi I, N, 7
bge $r0, I, .L999
.align 3
// Strided tail: only lane 0 of each load participates in the result.
.L24:
xvld x1, X, 0
addi.d I, I, -1
xvfmax.d VM0, VM0, x1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,141 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r14
#define t2 $r18
#define t3 $r15
#define t4 $r17
#define TEMP $r16
#define x1 $vr9
#define x2 $vr10
#define VX0 $vr20
#define VX1 $vr21
#define VM0 $vr22
#define VM1 $vr23
#define VM2 $vr19
#define VM3 $vr18
// dmax kernel (LSX, double): returns the maximum of x[0..N-1].
// Requires N > 0 and INCX > 0; the result is returned in $f0
// (lane 0 of VM0 == $vr22 aliases $f22).
PROLOGUE
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
// Contiguous path: 2-lane running max over 8 doubles per iteration.
vld VM0, X, 0
srai.d I, N, 3
bge $r0, I, .L12
.align 3
.L10:
vld VX0, X, 0 * SIZE
vld VX1, X, 2 * SIZE
addi.d I, I, -1
vfmax.d VM1, VX1, VX0
vld VX0, X, 4 * SIZE
vld VX1, X, 6 * SIZE
vfmax.d VM2, VX1, VX0
vfmax.d VM3, VM1, VM2
addi.d X, X, 8 * SIZE
vfmax.d VM0, VM0, VM3
blt $r0, I, .L10
.align 3
// .L11: fold lane 1 into lane 0.
.L11:
vreplvei.d x2, VM0, 1
vfmax.d VM0, VM0, x2
.align 3
.L12: //INCX==1 and N<8
andi I, N, 7
bge $r0, I, .L14
.align 3
// Scalar-ish tail: only lane 0 of each load matters for the result.
.L13:
vld x1, X, 0
addi.d I, I, -1
vfmax.d VM0, VM0, x1
addi.d X, X, SIZE
blt $r0, I, .L13
.align 3
.L14:
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L20: // INCX!=1
move TEMP, X // initialize the max value
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L23
// Seed lane 1 of VM0 with the second strided element.
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t2, 1
.align 3
// .L21: strided main loop — gather 8 elements into four 2-lane maxes.
.L21:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vfmax.d VM1, VX0, VX1
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
addi.d I, I, -1
vfmax.d VM2, VX0, VX1
vfmax.d VM3, VM1, VM2
vfmax.d VM0, VM0, VM3
blt $r0, I, .L21
.align 3
// .L22: fold lane 1 into lane 0.
.L22:
vreplvei.d x2, VM0, 1
vfmax.d VM0, VM0, x2
.align 3
.L23: //INCX!=1 and N<8
andi I, N, 7
bge $r0, I, .L999
.align 3
// Strided tail: only lane 0 of each load participates.
.L24:
vld x1, X, 0
addi.d I, I, -1
vfmax.d VM0, VM0, x1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,175 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define J $r13
#define t1 $r14
#define t2 $r18
#define t3 $r15
#define t4 $r17
#define TEMP $r16
#define m0 $xr8
#define x1 $xr9
#define x2 $xr10
#define x3 $xr11
#define x4 $xr12
#define VX0 $xr20
#define VX1 $xr21
#define VM0 $xr22
#define VM1 $xr23
#define VM2 $xr19
// dmin kernel (LASX, double): returns the minimum of x[0..N-1].
// Mirror image of the dmax kernel with xvfmin instead of xvfmax.
// Requires N > 0 and INCX > 0; result returned in $f0 (lane 0 of
// VM0 == $xr22 aliases $f22).
PROLOGUE
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
// Contiguous path: running 4-lane vector min, 8 doubles per iteration.
xvld VM0, X, 0
srai.d I, N, 3
bge $r0, I, .L12
.align 3
.L10:
xvld VX0, X, 0 * SIZE
xvld VX1, X, 4 * SIZE
addi.d I, I, -1
xvfmin.d VM1, VX1, VX0
addi.d X, X, 8 * SIZE
xvfmin.d VM0, VM0, VM1
blt $r0, I, .L10
.align 3
// .L11: horizontal reduction of the 4 lanes of VM0 into lane 0.
.L11:
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfmin.d VM1, x1, x2
xvfmin.d VM2, x3, x4
xvfmin.d VM0, VM1, VM2
.align 3
.L12: //INCX==1 and N<8
andi I, N, 7
li.d J, 4
bge J, I, .L13 // 4<N<8
// 4 < tail < 8: overlapping re-read of the last 4 elements so the
// tail is handled with a single vector min.
xvld VX0, X, 0
slli.d J, J, 1 // 8
sub.d I, J, I
slli.d I, I, BASE_SHIFT
xvldx VX1, X, I
xvfmin.d m0, VX0, VX1 //patial repeat read
xvpickve.d x1, m0, 0
xvpickve.d x2, m0, 1
xvpickve.d x3, m0, 2
xvpickve.d x4, m0, 3
xvfmin.d VM1, x1, x2
xvfmin.d m0, x3, x4
xvfmin.d m0, m0, VM1
xvfmin.d VM0, m0, VM0
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L13: //INCX==1 and 0<=N<=4
bge $r0, I, .L15
.align 3
// Scalar-ish tail: only lane 0 of x1/VM0 is meaningful here.
.L14:
xvld x1, X, 0
xvfmin.d VM0, VM0, x1
addi.d I, I, -1
addi.d X, X, SIZE
blt $r0, I, .L14
.align 3
.L15:
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L20: // INCX!=1
move TEMP, X // initialize the min value
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L23
// Seed all 4 lanes of VM0 with the first 4 strided elements.
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t2, 1
xvinsgr2vr.d VM0, t3, 2
xvinsgr2vr.d VM0, t4, 3
.align 3
// .L21: strided main loop — gather 8 elements, two vector mins.
.L21:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
addi.d I, I, -1
xvfmin.d VM1, VX1, VX0
xvfmin.d VM0, VM1, VM0
blt $r0, I, .L21
.align 3
// .L22: horizontal reduction of VM0 into lane 0.
.L22:
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfmin.d VM1, x1, x2
xvfmin.d VM2, x3, x4
xvfmin.d VM0, VM1, VM2
.align 3
.L23: //INCX!=1 and N<8
andi I, N, 7
bge $r0, I, .L999
.align 3
// Strided tail: only lane 0 of each load participates in the result.
.L24:
xvld x1, X, 0
xvfmin.d VM0, VM0, x1
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,143 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r14
#define t2 $r18
#define t3 $r15
#define t4 $r17
#define TEMP $r16
#define x1 $vr9
#define x2 $vr10
#define VX0 $vr20
#define VX1 $vr21
#define VM0 $vr22
#define VM1 $vr23
#define VM2 $vr18
#define VM3 $vr19
// dmin kernel (LSX, double): returns the minimum of x[0..N-1].
// Mirror image of the dmax LSX kernel with vfmin instead of vfmax.
// Requires N > 0 and INCX > 0; result returned in $f0 (lane 0 of
// VM0 == $vr22 aliases $f22).
PROLOGUE
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
// Contiguous path: 2-lane running min over 8 doubles per iteration.
vld VM0, X, 0
srai.d I, N, 3
bge $r0, I, .L12
.align 3
.L10:
vld VX0, X, 0 * SIZE
vld VX1, X, 2 * SIZE
addi.d I, I, -1
vfmin.d VM1, VX1, VX0
vld VX0, X, 4 * SIZE
vld VX1, X, 6 * SIZE
vfmin.d VM2, VX1, VX0
vfmin.d VM3, VM1, VM2
addi.d X, X, 8 * SIZE
vfmin.d VM0, VM0, VM3
blt $r0, I, .L10
.align 3
// .L11: fold lane 1 into lane 0.
.L11:
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
vfmin.d VM0, x1, x2
.align 3
.L12: //INCX==1 and N<8
andi I, N, 7
bge $r0, I, .L14
.align 3
// Scalar-ish tail: only lane 0 of each load matters for the result.
.L13:
vld x1, X, 0
addi.d I, I, -1
vfmin.d VM0, VM0, x1
addi.d X, X, SIZE
blt $r0, I, .L13
.align 3
.L14:
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L20: // INCX!=1
move TEMP, X // initialize the min value
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L23
// Seed lane 1 of VM0 with the second strided element.
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t2, 1
.align 3
// .L21: strided main loop — gather 8 elements into four 2-lane mins.
.L21:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vfmin.d VM1, VX0, VX1
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
addi.d I, I, -1
vfmin.d VM2, VX0, VX1
vfmin.d VM3, VM1, VM2
vfmin.d VM0, VM0, VM3
blt $r0, I, .L21
.align 3
// .L22: fold lane 1 into lane 0.
.L22:
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
vfmin.d VM0, x1, x2
.align 3
.L23: //INCX!=1 and N<8
andi I, N, 7
bge $r0, I, .L999
.align 3
// Strided tail: only lane 0 of each load participates.
.L24:
vld x1, X, 0
addi.d I, I, -1
vfmin.d VM0, VM0, x1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,233 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define XX $r19
#define I $r17
#define TEMP $r18
#define t1 $r12
#define t2 $r13
#define t3 $r14
#define t4 $r15
#define VX0 $xr15
#define VX1 $xr16
#define VM0 $xr17
#define VM1 $xr18
#define VM2 $xr13
#define VM3 $xr14
#define res1 $xr19
#define res2 $xr20
#define VALPHA $xr21
#define INF $f23
#define a1 $f22
#define max $f17
#define ALPHA $f12
// dnrm2 kernel (LASX, double): computes the Euclidean norm of
// x[0..N-1] with overflow/underflow protection. Pass 1 finds
// max = max(|x[i]|) (vfmaxa = max by magnitude); pass 2 sums
// (x[i]/max)^2 over a second sweep (XX keeps the original X);
// the result max * sqrt(sum) is returned in $f0.
// Note: `max` ($f17) aliases lane 0 of VM0 ($xr17), and $f19
// aliases lane 0 of res1 ($xr19).
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
xvxor.v res1, res1, res1
xvxor.v res2, res2, res2
bge $r0, N, .L999
beq $r0, INCX, .L999
move XX, X
// Init INF
addi.d TEMP, $r0, 0x7FF
slli.d TEMP, TEMP, 52
MTC INF, TEMP
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
// Pass 1, contiguous: 4-lane running max-by-magnitude.
xvld VM0, X, 0
bge $r0, I, .L97
.align 3
.L10:
xvld VX0, X, 0 * SIZE
xvld VX1, X, 4 * SIZE
xvfmaxa.d VM1, VX1, VX0
xvfmaxa.d VM0, VM0, VM1
addi.d I, I, -1
addi.d X, X, 8 * SIZE
blt $r0, I, .L10
b .L96
.align 3
.L20: // INCX!=1
move TEMP, X // initialize the maxa value
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L97
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t2, 1
.align 3
// Pass 1, strided: gather 8 elements per iteration.
.L21:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t2, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t3, 2
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t4, 3
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t2, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t3, 2
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t4, 3
xvfmaxa.d VM1, VX0, VX1
xvfmaxa.d VM0, VM0, VM1
addi.d I, I, -1
blt $r0, I, .L21
b .L96
.align 3
// .L96: horizontal magnitude-max reduction of VM0 into lane 0.
.L96:
xvpickve.d VX0, VM0, 1
xvpickve.d VX1, VM0, 2
xvpickve.d VM3, VM0, 3
xvfmaxa.d VM1, VX0, VX1
xvfmaxa.d VM2, VM3, VM0
xvfmaxa.d VM0, VM1, VM2
.align 3
// .L97/.L98: pass-1 tail (N % 8 elements); only lane 0 matters.
.L97:
andi I, N, 7
bge $r0, I, .L99
.align 3
.L98:
xvld VX1, X, 0
xvfmaxa.d VM0, VM0, VX1
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L98
.align 3
// .L99: derive scale factor ALPHA = 1/max; bail out with 0 when
// max == 0 or when 1/max overflows to +inf.
.L99:
fabs.d max, max
lu12i.w TEMP, 0x3f800 // 1
movgr2fr.d a1, $r0
movgr2fr.w ALPHA, TEMP
CMPEQ $fcc0, max, a1
fcvt.d.s ALPHA, ALPHA
bcnez $fcc0, .L999
fdiv.d ALPHA, ALPHA, max
CMPEQ $fcc0, INF, ALPHA
bcnez $fcc0, .L999
movfr2gr.d TEMP, ALPHA
xvreplgr2vr.d VALPHA, TEMP
// Pass 2: sum of squares of the scaled elements, from XX.
.L100:
li.d TEMP, SIZE
bne INCX, TEMP, .L120
srai.d I, N, 3
bge $r0, I, .L997
.align 3
.L110:
xvld VX0, XX, 0 * SIZE
xvld VX1, XX, 4 * SIZE
xvfmul.d VM0, VX0, VALPHA
xvfmul.d VM1, VX1, VALPHA
xvfmadd.d res1, VM0, VM0, res1
xvfmadd.d res2, VM1, VM1, res2
addi.d XX, XX, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L110
b .L996
.align 3
.L120:
srai.d I, N, 3
bge $r0, I, .L997
.L121:
ld.d t1, XX, 0 * SIZE
add.d XX, XX, INCX
ld.d t2, XX, 0 * SIZE
add.d XX, XX, INCX
ld.d t3, XX, 0 * SIZE
add.d XX, XX, INCX
ld.d t4, XX, 0 * SIZE
add.d XX, XX, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
ld.d t1, XX, 0 * SIZE
add.d XX, XX, INCX
ld.d t2, XX, 0 * SIZE
add.d XX, XX, INCX
ld.d t3, XX, 0 * SIZE
add.d XX, XX, INCX
ld.d t4, XX, 0 * SIZE
add.d XX, XX, INCX
// BUGFIX: the second group of four elements must fill VX1 lanes
// 0-3; lanes 0/1 were written to VX0, clobbering elements 1-2 of
// the first group and leaving VX1 lanes 0/1 stale (cf. the first
// group above and the LSX variant of this kernel).
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvfmul.d VM0, VX0, VALPHA
xvfmul.d VM1, VX1, VALPHA
xvfmadd.d res1, VM0, VM0, res1
xvfmadd.d res2, VM1, VM1, res2
addi.d I, I, -1
blt $r0, I, .L121
b .L996
.align 3
// .L996: horizontal add of the 4 lanes of res1+res2 into lane 0.
.L996:
xvfadd.d res1, res1, res2
xvpickve.d VX0, res1, 1
xvpickve.d VX1, res1, 2
xvpickve.d VM0, res1, 3
xvfadd.d res1, VX0, res1
xvfadd.d VX1, VX1, VM0
xvfadd.d res1, VX1, res1
.align 3
// .L997/.L998: pass-2 scalar tail, then final max * sqrt(sum).
.L997:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L998:
fld.d $f15, XX, 0 * SIZE
addi.d I, I, -1
fmul.d $f15, $f15, ALPHA
fmadd.d $f19, $f15, $f15, $f19
add.d XX, XX , INCX
blt $r0, I, .L998
fsqrt.d $f19, $f19
fmul.d $f0, max, $f19
jirl $r0, $r1, 0x0
.align 3
.L999:
fmov.d $f0, $f19
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -0,0 +1,242 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define XX $r19
#define I $r17
#define TEMP $r18
#define t1 $r12
#define t2 $r13
#define t3 $r14
#define t4 $r15
#define VX0 $vr15
#define VX1 $vr16
#define VM0 $vr17
#define VM1 $vr18
#define VM2 $vr13
#define VM3 $vr14
#define res1 $vr19
#define res2 $vr20
#define VALPHA $vr21
#define INF $f23
#define a1 $f22
#define max $f17
#define ALPHA $f12
// dnrm2 kernel (LSX, double): Euclidean norm of x[0..N-1] with
// overflow/underflow protection. Pass 1 finds max = max(|x[i]|)
// (vfmaxa = max by magnitude); pass 2 sums (x[i]/max)^2 over a
// second sweep (XX keeps the original X); the result
// max * sqrt(sum) is returned in $f0.
// Note: `max` ($f17) aliases lane 0 of VM0 ($vr17), and $f19
// aliases lane 0 of res1 ($vr19).
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
vxor.v res1, res1, res1
vxor.v res2, res2, res2
bge $r0, N, .L999
beq $r0, INCX, .L999
move XX, X
// Init INF
addi.d TEMP, $r0, 0x7FF
slli.d TEMP, TEMP, 52
MTC INF, TEMP
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
// Pass 1, contiguous: 2-lane running max-by-magnitude.
vld VM0, X, 0
bge $r0, I, .L97
.align 3
.L10:
vld VX0, X, 0 * SIZE
vld VX1, X, 2 * SIZE
vfmaxa.d VM1, VX1, VX0
vld VX0, X, 4 * SIZE
vld VX1, X, 6 * SIZE
vfmaxa.d VM2, VX1, VX0
vfmaxa.d VM3, VM1, VM2
vfmaxa.d VM0, VM0, VM3
addi.d I, I, -1
addi.d X, X, 8 * SIZE
blt $r0, I, .L10
b .L96
.align 3
.L20: // INCX!=1
move TEMP, X // initialize the maxa value
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L97
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t2, 1
.align 3
// Pass 1, strided: gather 8 elements per iteration.
.L21:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t2, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t4, 1
vfmaxa.d VM1, VX0, VX1
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t2, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t4, 1
vfmaxa.d VM2, VX0, VX1
vfmaxa.d VM3, VM1, VM2
vfmaxa.d VM0, VM0, VM3
addi.d I, I, -1
blt $r0, I, .L21
b .L96
.align 3
// .L96: fold lane 1 into lane 0 of VM0.
.L96:
vreplvei.d VX0, VM0, 0
vreplvei.d VX1, VM0, 1
vfmaxa.d VM0, VX0, VX1
.align 3
// .L97/.L98: pass-1 tail (N % 8 elements); only lane 0 matters.
.L97:
andi I, N, 7
bge $r0, I, .L99
.align 3
.L98:
vld VX1, X, 0
vfmaxa.d VM0, VM0, VX1
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L98
.align 3
// .L99: derive scale factor ALPHA = 1/max; bail out with 0 when
// max == 0 or when 1/max overflows to +inf.
.L99:
fabs.d max, max
lu12i.w TEMP, 0x3f800 // 1
movgr2fr.d a1, $r0
movgr2fr.w ALPHA, TEMP
CMPEQ $fcc0, max, a1
fcvt.d.s ALPHA, ALPHA
bcnez $fcc0, .L999
fdiv.d ALPHA, ALPHA, max
CMPEQ $fcc0, INF, ALPHA
bcnez $fcc0, .L999
movfr2gr.d TEMP, ALPHA
vreplgr2vr.d VALPHA, TEMP
// Pass 2: sum of squares of the scaled elements, from XX.
.L100:
li.d TEMP, SIZE
bne INCX, TEMP, .L120
srai.d I, N, 3
bge $r0, I, .L997
.align 3
.L110:
vld VX0, XX, 0 * SIZE
vld VX1, XX, 2 * SIZE
vfmul.d VM0, VX0, VALPHA
vfmul.d VM1, VX1, VALPHA
vfmadd.d res1, VM0, VM0, res1
vfmadd.d res2, VM1, VM1, res2
vld VX0, XX, 4 * SIZE
vld VX1, XX, 6 * SIZE
vfmul.d VM0, VX0, VALPHA
vfmul.d VM1, VX1, VALPHA
vfmadd.d res1, VM0, VM0, res1
vfmadd.d res2, VM1, VM1, res2
addi.d XX, XX, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L110
b .L996
.align 3
.L120:
srai.d I, N, 3
bge $r0, I, .L997
.L121:
ld.d t1, XX, 0 * SIZE
add.d XX, XX, INCX
ld.d t2, XX, 0 * SIZE
add.d XX, XX, INCX
ld.d t3, XX, 0 * SIZE
add.d XX, XX, INCX
ld.d t4, XX, 0 * SIZE
add.d XX, XX, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vfmul.d VM0, VX0, VALPHA
ld.d t1, XX, 0 * SIZE
add.d XX, XX, INCX
vfmul.d VM1, VX1, VALPHA
ld.d t2, XX, 0 * SIZE
add.d XX, XX, INCX
vfmadd.d res1, VM0, VM0, res1
vfmadd.d res2, VM1, VM1, res2
ld.d t3, XX, 0 * SIZE
add.d XX, XX, INCX
ld.d t4, XX, 0 * SIZE
add.d XX, XX, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vfmul.d VM0, VX0, VALPHA
vfmul.d VM1, VX1, VALPHA
vfmadd.d res1, VM0, VM0, res1
vfmadd.d res2, VM1, VM1, res2
addi.d I, I, -1
blt $r0, I, .L121
b .L996
.align 3
// .L996: horizontal add of res1+res2 into lane 0.
.L996:
vfadd.d res1, res1, res2
vreplvei.d VX1, res1, 1
vfadd.d res1, VX1, res1
.align 3
// .L997/.L998: pass-2 scalar tail, then final max * sqrt(sum).
.L997:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L998:
fld.d $f15, XX, 0 * SIZE
addi.d I, I, -1
fmul.d $f15, $f15, ALPHA
fmadd.d $f19, $f15, $f15, $f19
add.d XX, XX , INCX
blt $r0, I, .L998
fsqrt.d $f19, $f19
fmul.d $f0, max, $f19
jirl $r0, $r1, 0x0
.align 3
.L999:
fmov.d $f0, $f19
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -0,0 +1,368 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* dot kernel (LASX): returns sum(x[i]*y[i]) in $f0. Compiled for
   SDOT/DDOT via the FLOAT macros, or DSDOT (single inputs, double
   accumulation) when DSDOT is defined. */
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define I $r17
#define TEMP $r18
/* Don't change following FR unless you know the effects. */
#define s1 $f8
#define s2 $f9
#define a1 $f10
#define b1 $f11
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
LDINT INCY, 0(INCY)
#endif
/* init $f8 and $f9 to zero */
SUB s1, s1, s1
SUB s2, s2, s2
slli.d INCX, INCX, BASE_SHIFT
li.d TEMP, SIZE
slli.d INCY, INCY, BASE_SHIFT
bge $r0, N, .L999
bne INCX, TEMP, .L20 /* branch if inc_x != 1 */
bne INCY, TEMP, .L20 /* branch if inc_y != 1 */
/* fall-through: (inc_x == 1) && (inc_y == 1) — vectorized path */
/* init $xr8 and $xr9 to zero */
#ifdef DOUBLE
xvldrepl.d $xr0, X, 0
#else
xvldrepl.w $xr0, X, 0
#endif
#ifdef DSDOT
xvfcvtl.d.s $xr0, $xr0
xvfsub.d $xr8, $xr0, $xr0
xvfsub.d $xr9, $xr0, $xr0
#else
XVFSUB $xr8, $xr0, $xr0
XVFSUB $xr9, $xr0, $xr0
#endif
#ifdef DOUBLE
srai.d I, N, 4
#else
srai.d I, N, 5
#endif
bge $r0, I, .L12 /* FLOAT: <32 ; DOUBLE: <16 */
.align 3
.L11:
/* Main loop: 128 bytes of X and Y per iteration.
   FLOAT: 32~ ; DOUBLE: 16~ */
xvld $xr0, X, 0
xvld $xr1, X, 32
xvld $xr2, X, 64
xvld $xr3, X, 96
xvld $xr4, Y, 0
xvld $xr5, Y, 32
xvld $xr6, Y, 64
xvld $xr7, Y, 96
addi.w I, I, -1
addi.d X, X, 128
addi.d Y, Y, 128
#ifdef DSDOT
/* DSDOT: widen each single-precision half to double before the
   fused multiply-add so accumulation happens in double. */
xvfcvtl.d.s $xr10, $xr0
xvfcvtl.d.s $xr11, $xr4
xvfcvth.d.s $xr12, $xr0
xvfcvth.d.s $xr13, $xr4
xvfmadd.d $xr8, $xr10, $xr12, $xr8
xvfmadd.d $xr9, $xr11, $xr13, $xr9
xvfcvtl.d.s $xr10, $xr1
xvfcvtl.d.s $xr11, $xr5
xvfcvth.d.s $xr12, $xr1
xvfcvth.d.s $xr13, $xr5
xvfmadd.d $xr8, $xr10, $xr12, $xr8
xvfmadd.d $xr9, $xr11, $xr13, $xr9
xvfcvtl.d.s $xr10, $xr2
xvfcvtl.d.s $xr11, $xr6
xvfcvth.d.s $xr12, $xr2
xvfcvth.d.s $xr13, $xr6
xvfmadd.d $xr8, $xr10, $xr12, $xr8
xvfmadd.d $xr9, $xr11, $xr13, $xr9
xvfcvtl.d.s $xr10, $xr3
xvfcvtl.d.s $xr11, $xr7
xvfcvth.d.s $xr12, $xr3
xvfcvth.d.s $xr13, $xr7
xvfmadd.d $xr8, $xr10, $xr12, $xr8
xvfmadd.d $xr9, $xr11, $xr13, $xr9
#else
XVFMADD $xr8, $xr0, $xr4, $xr8
XVFMADD $xr9, $xr1, $xr5, $xr9
XVFMADD $xr8, $xr2, $xr6, $xr8
XVFMADD $xr9, $xr3, $xr7, $xr9
#endif
bnez I, .L11
.align 3
.L12:
/* Mid loop: one 32-byte vector of X and Y per iteration. */
#ifdef DOUBLE
andi I, N, 0xf
srai.d I, I, 2
#else
andi I, N, 0x1f
srai.d I, I, 3
#endif
bge $r0, I, .L14 /* DOUBLE: <4 ; FLOAT: <8 */
.align 3
.L13:
/* FLOAT: 8~31 ; DOUBLE: 4~15 */
xvld $xr0, X, 0
xvld $xr4, Y, 0
addi.w I, I, -1
addi.d X, X, 32
addi.d Y, Y, 32
#ifdef DSDOT
xvfcvtl.d.s $xr10, $xr0
xvfcvtl.d.s $xr11, $xr4
xvfcvth.d.s $xr12, $xr0
xvfcvth.d.s $xr13, $xr4
xvfmadd.d $xr8, $xr10, $xr12, $xr8
xvfmadd.d $xr9, $xr11, $xr13, $xr9
#else
XVFMADD $xr8, $xr0, $xr4, $xr8
#endif
bnez I, .L13
.align 3
.L14:
/* Horizontal reduction: fold the two 128-bit halves of $xr8,
   then adjacent lanes, leaving the sum in lane 0 = s1 ($f8). */
/* store dot in s1 $f8 */
#ifdef DSDOT
xvfadd.d $xr8, $xr8, $xr9
/* NOTE(review): stray trailing comma on the next line — relies on
   assembler leniency; confirm it assembles as intended. */
fsub.s s2, s2, s2, /* set s2 to 0.0 */
xvpermi.q $xr0, $xr8, 0x1
vfadd.d $vr8, $vr8, $vr0
vpackod.d $vr0, $vr8, $vr8
vfadd.d $vr8, $vr8, $vr0
#else
XVFADD $xr8, $xr8, $xr9
SUB s2, s2, s2 /* set s2 to 0.0 */
xvpermi.q $xr0, $xr8, 0x1
VFADD $vr8, $vr8, $vr0
vpackod.d $vr0, $vr8, $vr8
#ifdef DOUBLE
VFADD $vr8, $vr8, $vr0
#else
VFADD $vr8, $vr8, $vr0
vpackod.w $vr0, $vr8, $vr8
VFADD $vr8, $vr8, $vr0
#endif /* defined DOUBLE */
#endif /* defined DSDOT */
.align 3
.L15:
/* Scalar tail of the contiguous path. */
#ifdef DOUBLE
andi I, N, 0x3
#else
andi I, N, 0x7
#endif
bge $r0, I, .L999 /* =0 */
.align 3
.L16:
/* FLOAT: 1~7 ; DOUBLE: 1~3 */
LD a1, X, 0
LD b1, Y, 0
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
addi.d I, I, -1
addi.d X, X, SIZE
addi.d Y, Y, SIZE
bnez I, .L16
b .L999
.align 3
.L20:
/* Strided path: !((inc_x == 1) && (inc_y == 1)) — fully scalar,
   unrolled by 8, alternating accumulators s1/s2. */
srai.d I, N, 3
#ifdef F_INTERFACE
/* NOTE(review): mult/mflo/dsub below are MIPS mnemonics, not
   LoongArch — this negative-stride F_INTERFACE path presumably
   fails to assemble if enabled; confirm and port. */
bgez INCX, .L21
addi.d TEMP, N, -1
mult TEMP, INCX
mflo TEMP
dsub X, X, TEMP
.align 3
.L21:
bgez INCY, .L22
addi.d TEMP, N, -1
mult TEMP, INCY
mflo TEMP
dsub Y, Y, TEMP
.align 3
.L22:
#endif
bge $r0, I, .L25 /* <8 */
.align 3
.L23:
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
blt $r0, I, .L23
.align 3
.L25:
/* Strided tail: remaining N % 8 elements. */
andi I, N, 7
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
blt $r0, I, .L26
.align 3
.L999:
/* Return s1 + s2 in $f0. */
#ifdef DSDOT
fadd.d $f0, s1, s2
#else
ADD $f0, s1, s2
#endif
move $r4, $r17
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -0,0 +1,364 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* dot kernel (LSX): returns sum(x[i]*y[i]) in $f0. Compiled for
   SDOT/DDOT via the FLOAT macros, or DSDOT (single inputs, double
   accumulation) when DSDOT is defined. 128-bit sibling of the
   LASX dot kernel. */
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define I $r17
#define TEMP $r18
/* Don't change following FR unless you know the effects. */
#define s1 $f8
#define s2 $f9
#define a1 $f10
#define b1 $f11
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
LDINT INCY, 0(INCY)
#endif
/* init $f8 and $f9 to zero */
SUB s1, s1, s1
SUB s2, s2, s2
slli.d INCX, INCX, BASE_SHIFT
li.d TEMP, SIZE
slli.d INCY, INCY, BASE_SHIFT
bge $r0, N, .L999
bne INCX, TEMP, .L20 /* branch if inc_x != 1 */
bne INCY, TEMP, .L20 /* branch if inc_y != 1 */
/* fall-through: (inc_x == 1) && (inc_y == 1) — vectorized path */
/* init $vr8 and $vr9 to zero */
#ifdef DOUBLE
vldrepl.d $vr0, X, 0
#else
vldrepl.w $vr0, X, 0
#endif
#ifdef DSDOT
vfcvtl.d.s $vr0, $vr0
vfsub.d $vr8, $vr0, $vr0
vfsub.d $vr9, $vr0, $vr0
#else
VFSUB $vr8, $vr0, $vr0
VFSUB $vr9, $vr0, $vr0
#endif
#ifdef DOUBLE
srai.d I, N, 3
#else
srai.d I, N, 4
#endif
bge $r0, I, .L12 /* FLOAT: <16 ; DOUBLE: <8 */
.align 3
.L11:
/* Main loop: 64 bytes of X and Y per iteration.
   FLOAT: 16~ ; DOUBLE: 8~ */
vld $vr0, X, 0
vld $vr1, X, 16
vld $vr2, X, 32
vld $vr3, X, 48
vld $vr4, Y, 0
vld $vr5, Y, 16
vld $vr6, Y, 32
vld $vr7, Y, 48
addi.w I, I, -1
addi.d X, X, 64
addi.d Y, Y, 64
#ifdef DSDOT
/* DSDOT: widen each single-precision half to double before the
   fused multiply-add so accumulation happens in double. */
vfcvtl.d.s $vr10, $vr0
vfcvtl.d.s $vr11, $vr4
vfcvth.d.s $vr12, $vr0
vfcvth.d.s $vr13, $vr4
vfmadd.d $vr8, $vr10, $vr12, $vr8
vfmadd.d $vr9, $vr11, $vr13, $vr9
vfcvtl.d.s $vr10, $vr1
vfcvtl.d.s $vr11, $vr5
vfcvth.d.s $vr12, $vr1
vfcvth.d.s $vr13, $vr5
vfmadd.d $vr8, $vr10, $vr12, $vr8
vfmadd.d $vr9, $vr11, $vr13, $vr9
vfcvtl.d.s $vr10, $vr2
vfcvtl.d.s $vr11, $vr6
vfcvth.d.s $vr12, $vr2
vfcvth.d.s $vr13, $vr6
vfmadd.d $vr8, $vr10, $vr12, $vr8
vfmadd.d $vr9, $vr11, $vr13, $vr9
vfcvtl.d.s $vr10, $vr3
vfcvtl.d.s $vr11, $vr7
vfcvth.d.s $vr12, $vr3
vfcvth.d.s $vr13, $vr7
vfmadd.d $vr8, $vr10, $vr12, $vr8
vfmadd.d $vr9, $vr11, $vr13, $vr9
#else
VFMADD $vr8, $vr0, $vr4, $vr8
VFMADD $vr9, $vr1, $vr5, $vr9
VFMADD $vr8, $vr2, $vr6, $vr8
VFMADD $vr9, $vr3, $vr7, $vr9
#endif
bnez I, .L11
.align 3
.L12:
/* Mid loop: one 16-byte vector of X and Y per iteration. */
#ifdef DOUBLE
andi I, N, 0x7
srai.d I, I, 1
#else
andi I, N, 0xf
srai.d I, I, 2
#endif
bge $r0, I, .L14 /* DOUBLE: <2 ; FLOAT: <4 */
.align 3
.L13:
/* FLOAT: 4~15 ; DOUBLE: 2~7 */
vld $vr0, X, 0
vld $vr4, Y, 0
addi.w I, I, -1
addi.d X, X, 16
addi.d Y, Y, 16
#ifdef DSDOT
vfcvtl.d.s $vr10, $vr0
vfcvtl.d.s $vr11, $vr4
vfcvth.d.s $vr12, $vr0
vfcvth.d.s $vr13, $vr4
vfmadd.d $vr8, $vr10, $vr12, $vr8
vfmadd.d $vr9, $vr11, $vr13, $vr9
#else
VFMADD $vr8, $vr0, $vr4, $vr8
#endif
bnez I, .L13
.align 3
.L14:
/* Horizontal reduction: fold adjacent lanes of $vr8, leaving the
   sum in lane 0 = s1 ($f8). */
/* store dot in s1 $f8 */
#ifdef DSDOT
vfadd.d $vr8, $vr8, $vr9
/* NOTE(review): stray trailing comma on the next line — relies on
   assembler leniency; confirm it assembles as intended. */
fsub.s s2, s2, s2, /* set s2 to 0.0 */
vpackod.d $vr0, $vr8, $vr8
vfadd.d $vr8, $vr8, $vr0
#else
VFADD $vr8, $vr8, $vr9
SUB s2, s2, s2 /* set s2 to 0.0 */
vpackod.d $vr0, $vr8, $vr8
#ifdef DOUBLE
VFADD $vr8, $vr8, $vr0
#else
VFADD $vr8, $vr8, $vr0
vpackod.w $vr0, $vr8, $vr8
VFADD $vr8, $vr8, $vr0
#endif /* defined DOUBLE */
#endif /* defined DSDOT */
.align 3
.L15:
/* Scalar tail of the contiguous path. */
#ifdef DOUBLE
andi I, N, 0x1
#else
andi I, N, 0x3
#endif
bge $r0, I, .L999 /* =0 */
.align 3
.L16:
/* DOUBLE: 1 ; FLOAT: 1~3 */
LD a1, X, 0
LD b1, Y, 0
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
addi.d I, I, -1
addi.d X, X, SIZE
addi.d Y, Y, SIZE
bnez I, .L16
b .L999
.align 3
.L20:
/* Strided path: !((inc_x == 1) && (inc_y == 1)) — fully scalar,
   unrolled by 8, alternating accumulators s1/s2. */
srai.d I, N, 3
#ifdef F_INTERFACE
/* NOTE(review): mult/mflo/dsub below are MIPS mnemonics, not
   LoongArch — this negative-stride F_INTERFACE path presumably
   fails to assemble if enabled; confirm and port. */
bgez INCX, .L21
addi.d TEMP, N, -1
mult TEMP, INCX
mflo TEMP
dsub X, X, TEMP
.align 3
.L21:
bgez INCY, .L22
addi.d TEMP, N, -1
mult TEMP, INCY
mflo TEMP
dsub Y, Y, TEMP
.align 3
.L22:
#endif
bge $r0, I, .L25 /* <8 */
.align 3
.L23:
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
blt $r0, I, .L23
.align 3
.L25:
/* Strided tail: remaining N % 8 elements. */
andi I, N, 7
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
blt $r0, I, .L26
.align 3
.L999:
/* Return s1 + s2 in $f0. */
#ifdef DSDOT
fadd.d $f0, s1, s2
#else
ADD $f0, s1, s2
#endif
move $r4, $r17
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -0,0 +1,927 @@
/* DROT kernel for LoongArch64 LASX (double precision).
   Applies the Givens plane rotation over N elements:
       x[i] = C*x[i] + S*y[i]
       y[i] = C*y[i] - S*x[i]
   Processes 8 doubles per iteration (two 256-bit vectors).
   Dispatch: .L11 (INCX==1 && INCY==1), .L12 (INCX==1, INCY!=1),
   .L21 (INCX!=1, INCY==1), .L22 (both strided); each path further
   specializes on C==0 and/or S==0 to skip dead multiplies. */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define C $f0
#define S $f1
#define I $r12
#define TEMP $r13
#define t1 $r14
#define t2 $r16
#define t3 $r15
#define t4 $r17
#define XX $r18
#define YY $r19
#define a1 $f12
#define VX0 $xr8
#define VX1 $xr20
#define VX2 $xr21
#define VX3 $xr22
#define VT0 $xr10
#define VT1 $xr18
#define VXC $xr23
#define VXS $xr9
#define VXZ $xr19
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
// a1 = 0.0, used below to test C and S against zero
movgr2fr.d a1, $r0
ffint.d.l a1, a1
// convert element strides to byte strides
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
// broadcast C, S and 0.0 into full LASX vectors
movfr2gr.d t1, C
xvreplgr2vr.d VXC, t1
movfr2gr.d t2, S
xvreplgr2vr.d VXS, t2
movfr2gr.d t3, a1
xvreplgr2vr.d VXZ, t3
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L997
// pick specialized loop based on C==0 / S==0
fcmp.ceq.d $fcc0, C, a1
bcnez $fcc0, .L110
fcmp.ceq.d $fcc0, S, a1
bcnez $fcc0, .L112 // C!=0 S==0
b .L111 // C!=0 S!=0
.align 3
.L110:
fcmp.ceq.d $fcc0, S, a1
bcnez $fcc0, .L114 // C==0 S==0
b .L113 // C==0 S!=0
.align 3
.L111: // C!=0 S!=0
// full rotation: x' = c*x + s*y ; y' = c*y - s*x
xvld VX0, X, 0 * SIZE
xvld VX2, Y, 0 * SIZE
xvld VX1, X, 4 * SIZE
xvld VX3, Y, 4 * SIZE
xvfmul.d VT0, VX0, VXC
xvfmadd.d VT0, VX2, VXS, VT0
xvfmul.d VT1, VX0, VXS
// fmsub: VT1 = VX2*VXC - VT1 = c*y - s*x
xvfmsub.d VT1, VX2, VXC, VT1
xvst VT0, X, 0 * SIZE
xvst VT1, Y, 0 * SIZE
xvfmul.d VT0, VX1, VXC
xvfmadd.d VT0, VX3, VXS, VT0
xvfmul.d VT1, VX1, VXS
xvfmsub.d VT1, VX3, VXC, VT1
xvst VT0, X, 4 * SIZE
xvst VT1, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L111
b .L997
.align 3
.L112: // C!=0 S==0
// degenerate rotation: x' = c*x ; y' = c*y
xvld VX0, X, 0 * SIZE
xvld VX2, Y, 0 * SIZE
xvld VX1, X, 4 * SIZE
xvld VX3, Y, 4 * SIZE
xvfmul.d VT0, VX0, VXC
xvfmul.d VT1, VX2, VXC
xvst VT0, X, 0 * SIZE
xvst VT1, Y, 0 * SIZE
xvfmul.d VT0, VX1, VXC
xvfmul.d VT1, VX3, VXC
xvst VT0, X, 4 * SIZE
xvst VT1, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L112
b .L997
.align 3
.L113: // C==0 S!=0
// x' = s*y ; y' = -s*x (computed as 0 - s*x)
xvld VX0, X, 0 * SIZE
xvld VX2, Y, 0 * SIZE
xvld VX1, X, 4 * SIZE
xvld VX3, Y, 4 * SIZE
xvfmul.d VT0, VX2, VXS
xvfmul.d VT1, VX0, VXS
xvfsub.d VT1, VXZ, VT1
xvst VT0, X, 0 * SIZE
xvst VT1, Y, 0 * SIZE
xvfmul.d VT0, VX3, VXS
xvfmul.d VT1, VX1, VXS
xvfsub.d VT1, VXZ, VT1
xvst VT0, X, 4 * SIZE
xvst VT1, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L113
b .L997
.align 3
.L114: // C==0 S==0
// both x and y become zero
xvst VXZ, X, 0 * SIZE
xvst VXZ, Y, 0 * SIZE
xvst VXZ, X, 4 * SIZE
xvst VXZ, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L114
b .L997
.align 3
.L12: // INCX==1 and INCY!=1
bge $r0, I, .L997
// YY tracks the y store position; Y tracks the y load position
move YY, Y
move XX, X
fcmp.ceq.d $fcc0, C, a1
bcnez $fcc0, .L120
fcmp.ceq.d $fcc0, S, a1
bcnez $fcc0, .L122 // C!=0 S==0
b .L121 // C!=0 S!=0
.align 3
.L120:
fcmp.ceq.d $fcc0, S, a1
bcnez $fcc0, .L124 // C==0 S==0
b .L123 // C==0 S!=0
.align 3
.L121: // C!=0 S!=0
// gather 4 strided y elements into VX2
xvld VX0, X, 0 * SIZE
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvinsgr2vr.d VX2, t1, 0
xvinsgr2vr.d VX2, t2, 1
xvinsgr2vr.d VX2, t3, 2
xvinsgr2vr.d VX2, t4, 3
add.d Y, Y, INCY
xvfmul.d VT0, VX0, VXC
xvfmadd.d VT0, VX2, VXS, VT0
xvfmul.d VT1, VX0, VXS
xvfmsub.d VT1, VX2, VXC, VT1
xvld VX1, X, 4 * SIZE
xvst VT0, X, 0 * SIZE
// scatter new y values back through YY
xvstelm.d VT1, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 3
add.d YY, YY, INCY
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvinsgr2vr.d VX3, t1, 0
xvinsgr2vr.d VX3, t2, 1
xvinsgr2vr.d VX3, t3, 2
xvinsgr2vr.d VX3, t4, 3
add.d Y, Y, INCY
xvfmul.d VT0, VX1, VXC
xvfmadd.d VT0, VX3, VXS, VT0
xvfmul.d VT1, VX1, VXS
xvfmsub.d VT1, VX3, VXC, VT1
addi.d I, I, -1
xvst VT0, X, 4 * SIZE
xvstelm.d VT1, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 3
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
blt $r0, I, .L121
b .L997
.align 3
.L122: // C!=0 S==0
xvld VX0, X, 0 * SIZE
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvinsgr2vr.d VX2, t1, 0
xvinsgr2vr.d VX2, t2, 1
xvinsgr2vr.d VX2, t3, 2
xvinsgr2vr.d VX2, t4, 3
add.d Y, Y, INCY
xvfmul.d VT0, VX0, VXC
xvfmul.d VT1, VX2, VXC
xvld VX1, X, 4 * SIZE
xvst VT0, X, 0 * SIZE
xvstelm.d VT1, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 3
add.d YY, YY, INCY
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvinsgr2vr.d VX3, t1, 0
xvinsgr2vr.d VX3, t2, 1
xvinsgr2vr.d VX3, t3, 2
xvinsgr2vr.d VX3, t4, 3
add.d Y, Y, INCY
xvfmul.d VT0, VX1, VXC
xvfmul.d VT1, VX3, VXC
addi.d I, I, -1
xvst VT0, X, 4 * SIZE
xvstelm.d VT1, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 3
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
blt $r0, I, .L122
b .L997
.align 3
.L123: // C==0 S!=0
xvld VX0, X, 0 * SIZE
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvinsgr2vr.d VX2, t1, 0
xvinsgr2vr.d VX2, t2, 1
xvinsgr2vr.d VX2, t3, 2
xvinsgr2vr.d VX2, t4, 3
add.d Y, Y, INCY
xvfmul.d VT0, VX2, VXS
xvfmul.d VT1, VX0, VXS
xvfsub.d VT1, VXZ, VT1
xvst VT0, X, 0 * SIZE
xvstelm.d VT1, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 3
add.d YY, YY, INCY
xvld VX1, X, 4 * SIZE
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvinsgr2vr.d VX3, t1, 0
xvinsgr2vr.d VX3, t2, 1
xvinsgr2vr.d VX3, t3, 2
xvinsgr2vr.d VX3, t4, 3
add.d Y, Y, INCY
xvfmul.d VT0, VX3, VXS
xvfmul.d VT1, VX1, VXS
xvfsub.d VT1, VXZ, VT1
addi.d I, I, -1
xvst VT0, X, 4 * SIZE
xvstelm.d VT1, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 3
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
blt $r0, I, .L123
b .L997
.align 3
.L124: // C==0 S==0
// Zero 8 contiguous x elements and 8 strided y elements per iteration.
// BUG FIX: X is the unit-stride store base in this path but was never
// advanced, so every iteration re-zeroed the same 8 x elements; advance
// it by 8*SIZE like the other INCX==1 loops (.L114).
xvst VXZ, X, 0 * SIZE
xvst VXZ, X, 4 * SIZE
xvstelm.d VXZ, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 3
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 3
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L124
b .L997
.align 3
.L21:// INCX!=1 and INCY==1
bge $r0, I, .L997
// XX tracks the strided x store position; X tracks the x load position
move XX, X
fcmp.ceq.d $fcc0, C, a1
bcnez $fcc0, .L210
fcmp.ceq.d $fcc0, S, a1
bcnez $fcc0, .L212 // C!=0 S==0
b .L211 // C!=0 S!=0
.align 3
.L210:
fcmp.ceq.d $fcc0, S, a1
bcnez $fcc0, .L214 // C==0 S==0
b .L213 // C==0 S!=0
.align 3
.L211: // C!=0 S!=0
// gather 4 strided x elements; y is contiguous
xvld VX2, Y, 0 * SIZE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
add.d X, X, INCX
xvfmul.d VT0, VXC, VX0
xvfmadd.d VT0, VX2, VXS, VT0
xvfmul.d VT1, VXS, VX0
xvfmsub.d VT1, VX2, VXC, VT1
// scatter x' through XX, store y' contiguously
xvstelm.d VT0, XX, 0, 0
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 1
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 2
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 3
add.d XX, XX, INCX
xvst VT1, Y, 0 * SIZE
xvld VX3, Y, 4 * SIZE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
add.d X, X, INCX
xvfmul.d VT0, VX1, VXC
xvfmadd.d VT0, VX3, VXS, VT0
xvfmul.d VT1, VX1, VXS
xvfmsub.d VT1, VX3, VXC, VT1
xvstelm.d VT0, XX, 0, 0
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 1
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 2
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 3
add.d XX, XX, INCX
xvst VT1, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L211
b .L997
.align 3
.L212: // C!=0 S==0
// Degenerate rotation with S==0: x' = c*x, y' = c*y.
// BUG FIX: the second half multiplied VX3 by VXS (== 0), zeroing y[4..7]
// instead of scaling them by C; use VXC as the first half does.
xvld VX2, Y, 0 * SIZE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
add.d X, X, INCX
xvfmul.d VT0, VXC, VX0
xvstelm.d VT0, XX, 0, 0
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 1
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 2
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 3
add.d XX, XX, INCX
xvfmul.d VT1, VX2, VXC
xvst VT1, Y, 0 * SIZE
xvld VX3, Y, 4 * SIZE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
add.d X, X, INCX
xvfmul.d VT0, VX1, VXC
xvstelm.d VT0, XX, 0, 0
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 1
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 2
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 3
add.d XX, XX, INCX
xvfmul.d VT1, VX3, VXC
xvst VT1, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L212
b .L997
.align 3
.L213: // C==0 S!=0
// x' = s*y ; y' = 0 - s*x, with x strided and y contiguous
xvld VX2, Y, 0 * SIZE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
add.d X, X, INCX
xvfmul.d VT0, VXS, VX2
xvfmul.d VT1, VXS, VX0
xvfsub.d VT1, VXZ, VT1
xvstelm.d VT0, XX, 0, 0
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 1
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 2
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 3
add.d XX, XX, INCX
xvst VT1, Y, 0 * SIZE
xvld VX3, Y, 4 * SIZE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
add.d X, X, INCX
xvfmul.d VT0, VX3, VXS
xvfmul.d VT1, VX1, VXS
xvfsub.d VT1, VXZ, VT1
xvstelm.d VT0, XX, 0, 0
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 1
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 2
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 3
add.d XX, XX, INCX
xvst VT1, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L213
b .L997
.align 3
.L214: // C==0 S==0
// Both vectors become zero: scatter zeros into strided x, store zero
// vectors into contiguous y.
// BUG FIXES: (1) the y stores used VT1, which is never written in this
// path (stale/garbage data) — store the zero vector VXZ instead;
// (2) the loop branched back to .L211, re-entering the full-rotation
// loop — branch back to .L214.
xvstelm.d VXZ, XX, 0, 0
add.d XX, XX, INCX
xvstelm.d VXZ, XX, 0, 1
add.d XX, XX, INCX
xvstelm.d VXZ, XX, 0, 2
add.d XX, XX, INCX
xvstelm.d VXZ, XX, 0, 3
add.d XX, XX, INCX
xvst VXZ, Y, 0 * SIZE
xvstelm.d VXZ, XX, 0, 0
add.d XX, XX, INCX
xvstelm.d VXZ, XX, 0, 1
add.d XX, XX, INCX
xvstelm.d VXZ, XX, 0, 2
add.d XX, XX, INCX
xvstelm.d VXZ, XX, 0, 3
add.d XX, XX, INCX
xvst VXZ, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L214
b .L997
.align 3
.L22:
// INCX!=1 and INCY!=1: both vectors gathered/scattered element-wise
bge $r0, I, .L997
move YY, Y
move XX, X
fcmp.ceq.d $fcc0, C, a1
bcnez $fcc0, .L220
fcmp.ceq.d $fcc0, S, a1
bcnez $fcc0, .L222 // C!=0 S==0
b .L221 // C!=0 S!=0
.align 3
.L220:
fcmp.ceq.d $fcc0, S, a1
bcnez $fcc0, .L224 // C==0 S==0
b .L223 // C==0 S!=0
.align 3
.L221: // C!=0 S!=0
// Full rotation with both vectors strided; two groups of 4 per iteration.
// BUG FIX: the second group computed VT1 from VX0 (the FIRST group's x)
// instead of VX1, so y[4..7] were rotated against the wrong x values;
// use VX1 to mirror the first group.
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvinsgr2vr.d VX2, t1, 0
xvinsgr2vr.d VX2, t2, 1
xvinsgr2vr.d VX2, t3, 2
xvinsgr2vr.d VX2, t4, 3
add.d Y, Y, INCY
xvfmul.d VT0, VX0, VXC
xvfmadd.d VT0, VX2, VXS, VT0
xvfmul.d VT1, VX0, VXS
xvfmsub.d VT1, VX2, VXC, VT1
xvstelm.d VT0, XX, 0, 0
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 1
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 2
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 3
add.d XX, XX, INCX
xvstelm.d VT1, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 3
add.d YY, YY, INCY
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvinsgr2vr.d VX3, t1, 0
xvinsgr2vr.d VX3, t2, 1
xvinsgr2vr.d VX3, t3, 2
xvinsgr2vr.d VX3, t4, 3
add.d Y, Y, INCY
xvfmul.d VT0, VX1, VXC
xvfmadd.d VT0, VX3, VXS, VT0
xvfmul.d VT1, VX1, VXS
xvfmsub.d VT1, VX3, VXC, VT1
xvstelm.d VT0, XX, 0, 0
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 1
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 2
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 3
add.d XX, XX, INCX
xvstelm.d VT1, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 3
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L221
b .L997
.align 3
.L222: // C!=0 S==0
// x' = c*x ; y' = c*y, both vectors strided
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvinsgr2vr.d VX2, t1, 0
xvinsgr2vr.d VX2, t2, 1
xvinsgr2vr.d VX2, t3, 2
xvinsgr2vr.d VX2, t4, 3
add.d Y, Y, INCY
xvfmul.d VT0, VX0, VXC
xvfmul.d VT1, VX2, VXC
xvstelm.d VT0, XX, 0, 0
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 1
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 2
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 3
add.d XX, XX, INCX
xvstelm.d VT1, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 3
add.d YY, YY, INCY
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvinsgr2vr.d VX3, t1, 0
xvinsgr2vr.d VX3, t2, 1
xvinsgr2vr.d VX3, t3, 2
xvinsgr2vr.d VX3, t4, 3
add.d Y, Y, INCY
xvfmul.d VT0, VX1, VXC
xvfmul.d VT1, VX3, VXC
xvstelm.d VT0, XX, 0, 0
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 1
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 2
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 3
add.d XX, XX, INCX
xvstelm.d VT1, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 3
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L222
b .L997
.align 3
.L223: // C==0 S!=0
// x' = s*y ; y' = 0 - s*x, both vectors strided.
// BUG FIX: the second group computed VT1 from VX0 (the FIRST group's x)
// instead of VX1, so y[4..7] received -s*x[0..3]; use VX1 to mirror
// the first group.
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvinsgr2vr.d VX2, t1, 0
xvinsgr2vr.d VX2, t2, 1
xvinsgr2vr.d VX2, t3, 2
xvinsgr2vr.d VX2, t4, 3
add.d Y, Y, INCY
xvfmul.d VT0, VX2, VXS
xvfmul.d VT1, VX0, VXS
xvfsub.d VT1, VXZ, VT1
xvstelm.d VT0, XX, 0, 0
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 1
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 2
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 3
add.d XX, XX, INCX
xvstelm.d VT1, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 3
add.d YY, YY, INCY
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
ld.d t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvinsgr2vr.d VX3, t1, 0
xvinsgr2vr.d VX3, t2, 1
xvinsgr2vr.d VX3, t3, 2
xvinsgr2vr.d VX3, t4, 3
add.d Y, Y, INCY
xvfmul.d VT0, VX3, VXS
xvfmul.d VT1, VX1, VXS
xvfsub.d VT1, VXZ, VT1
xvstelm.d VT0, XX, 0, 0
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 1
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 2
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 3
add.d XX, XX, INCX
xvstelm.d VT1, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VT1, YY, 0, 3
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L223
b .L997
.align 3
.L224: // C==0 S==0
// scatter zeros into both strided vectors, 8 elements per iteration
xvstelm.d VXZ, XX, 0, 0
add.d XX, XX, INCX
xvstelm.d VXZ, XX, 0, 1
add.d XX, XX, INCX
xvstelm.d VXZ, XX, 0, 2
add.d XX, XX, INCX
xvstelm.d VXZ, XX, 0, 3
add.d XX, XX, INCX
xvstelm.d VXZ, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 3
add.d YY, YY, INCY
xvstelm.d VXZ, XX, 0, 0
add.d XX, XX, INCX
xvstelm.d VXZ, XX, 0, 1
add.d XX, XX, INCX
xvstelm.d VXZ, XX, 0, 2
add.d XX, XX, INCX
xvstelm.d VXZ, XX, 0, 3
add.d XX, XX, INCX
xvstelm.d VXZ, YY, 0, 0
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 1
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 2
add.d YY, YY, INCY
xvstelm.d VXZ, YY, 0, 3
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L224
b .L997
.align 3
.L997:
// scalar tail: remaining N % 8 elements, rotated one at a time
andi I, N, 7
bge $r0, I, .L999
.align 3
.L998:
fld.d $f12, X, 0 * SIZE
fld.d $f13, Y, 0 * SIZE
fmul.d $f10, $f12, C
fmadd.d $f10, $f13, S, $f10
fst.d $f10, X, 0 * SIZE
addi.d I, I, -1
fmul.d $f20, $f12, S
// y' = c*y - s*x
fmsub.d $f20, $f13, C, $f20
fst.d $f20, Y, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L998
.align 3
.L999:
// return value in $r4 (I is 0 after the loops complete)
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,194 @@
/* DSCAL kernel for LoongArch64 LASX (double precision):
   x[i] *= ALPHA over N elements, 8 doubles per vector iteration.
   Fast paths: ALPHA==0 stores zeros without loading; ALPHA==1 returns
   immediately. Returns early (no-op) when N<=0 or INCX<=0. */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define ALPHA $f0
#define X $r7
#define INCX $r8
#define I $r12
#define TEMP $r13
#define t1 $r14
#define t2 $r18
#define t3 $r15
#define t4 $r17
#define XX $r16
#define VX0 $xr12
#define VX1 $xr13
#define VT0 $xr14
#define VT1 $xr15
#define VALPHA $xr19
#define a1 $f8
#define a2 $f23
PROLOGUE
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
// a1 = 0.0, a2 = 1.0 for the ALPHA special-case tests
movgr2fr.d a1, $r0
ffint.d.l a1, a1
movgr2fr.d a2, TEMP
ffint.d.l a2, a2
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
fcmp.ceq.d $fcc0, ALPHA, a1
bcnez $fcc0, .L20 //ALPHA==0
fcmp.ceq.d $fcc0, ALPHA, a2
bcnez $fcc0, .L999 //ALPHA==1 return
srai.d I, N, 3
beq INCX, TEMP, .L30 // general ALPHA (not 0 or 1) and INCX==1
// broadcast ALPHA for the strided path
movfr2gr.d TEMP, ALPHA
xvreplgr2vr.d VALPHA, TEMP
// XX tracks the store position; X tracks the load position
move XX, X
.align 3
.L10:
// general ALPHA and INCX!=1
bge $r0, I, .L32
.align 3
.L11:
// gather 8 strided elements, scale, scatter back
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
xvfmul.d VT0, VX0, VALPHA
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvstelm.d VT0, XX, 0, 0
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 1
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 2
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 3
add.d XX, XX, INCX
xvfmul.d VT1, VX1, VALPHA
xvstelm.d VT1, XX, 0, 0
add.d XX, XX, INCX
xvstelm.d VT1, XX, 0, 1
add.d XX, XX, INCX
xvstelm.d VT1, XX, 0, 2
add.d XX, XX, INCX
xvstelm.d VT1, XX, 0, 3
add.d XX, XX, INCX
addi.d I, I, -1
blt $r0, I, .L11
b .L32
.align 3
.L20:
// ALPHA==0: store zeros, no loads needed
srai.d I, N, 3
beq INCX, TEMP, .L24
bge $r0, I, .L22
.align 3
.L21:
// strided zero-fill, 8 per iteration (a1 == 0.0)
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
addi.d I, I, -1
blt $r0, I, .L21
.align 3
.L22:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L23:
fst.d a1, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L23
jirl $r0, $r1, 0
.align 3
.L24:
bge $r0, I, .L26 /*N<8 INCX==1*/
.align 3
.L25:
// contiguous zero-fill via full-vector stores
xvxor.v VX0, VX0, VX0
xvst VX0, X, 0 * SIZE
xvst VX0, X, 4 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
blt $r0, I, .L25
.align 3
.L26:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L27:
fst.d a1, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
blt $r0, I, .L27
jirl $r0, $r1, 0
.align 3
.L30:
// general ALPHA and INCX==1
bge $r0, I, .L32/*N<8 INCX==1*/
movfr2gr.d TEMP, ALPHA
xvreplgr2vr.d VALPHA , TEMP
.align 3
.L31:
xvld VX0, X, 0 * SIZE
xvld VX1, X, 4 * SIZE
xvfmul.d VT0, VX0, VALPHA
xvfmul.d VT1, VX1, VALPHA
addi.d I, I, -1
xvst VT0, X, 0 * SIZE
xvst VT1, X, 4 * SIZE
addi.d X, X, 8 * SIZE
blt $r0, I, .L31
.align 3
.L32:
// scalar tail for the general-ALPHA paths
andi I, N, 7
bge $r0, I, .L999
.align 3
.L33:
fld.d a1, X, 0 * SIZE
addi.d I, I, -1
fmul.d a1, ALPHA, a1
fst.d a1, X, 0 * SIZE
add.d X, X, INCX
blt $r0, I, .L33
jirl $r0, $r1, 0
.align 3
.L999:
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -0,0 +1,205 @@
/* DSCAL kernel for LoongArch64 LSX (double precision):
   x[i] *= ALPHA over N elements, 8 doubles per iteration in 128-bit
   pairs. Fast paths: ALPHA==0 stores zeros without loading; ALPHA==1
   returns immediately. Returns early when N<=0 or INCX<=0. */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define ALPHA $f0
#define X $r7
#define INCX $r8
#define I $r12
#define TEMP $r13
#define t1 $r14
#define t2 $r18
#define t3 $r15
#define t4 $r17
#define XX $r16
#define VX0 $vr12
#define VX1 $vr13
#define VT0 $vr14
#define VT1 $vr15
#define VALPHA $vr19
#define a1 $f8
#define a2 $f23
PROLOGUE
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
// a1 = 0.0, a2 = 1.0 for the ALPHA special-case tests
movgr2fr.d a1, $r0
ffint.d.l a1, a1
movgr2fr.d a2, TEMP
ffint.d.l a2, a2
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
fcmp.ceq.d $fcc0, ALPHA, a1
bcnez $fcc0, .L20 //ALPHA==0
fcmp.ceq.d $fcc0, ALPHA, a2
bcnez $fcc0, .L999 //ALPHA==1 return
srai.d I, N, 3
beq INCX, TEMP, .L30 // general ALPHA (not 0 or 1) and INCX==1
movfr2gr.d TEMP, ALPHA
vreplgr2vr.d VALPHA, TEMP
// XX tracks the store position; X tracks the load position
move XX, X
.align 3
.L10: // general ALPHA (not 0 or 1) and INCX!=1
bge $r0, I, .L32
.align 3
.L11:
// gather/scale/scatter 8 strided elements in pairs
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vfmul.d VT0, VX0, VALPHA
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vstelm.d VT0, XX, 0, 0
add.d XX, XX, INCX
vstelm.d VT0, XX, 0, 1
add.d XX, XX, INCX
vfmul.d VT1, VX1, VALPHA
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vstelm.d VT1, XX, 0, 0
add.d XX, XX, INCX
vstelm.d VT1, XX, 0, 1
add.d XX, XX, INCX
vfmul.d VT0, VX0, VALPHA
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vstelm.d VT0, XX, 0, 0
add.d XX, XX, INCX
vstelm.d VT0, XX, 0, 1
add.d XX, XX, INCX
vfmul.d VT1, VX1, VALPHA
vstelm.d VT1, XX, 0, 0
add.d XX, XX, INCX
vstelm.d VT1, XX, 0, 1
add.d XX, XX, INCX
addi.d I, I, -1
blt $r0, I, .L11
b .L32
.align 3
.L20:
// ALPHA==0: store zeros, no loads needed
srai.d I, N, 3
beq INCX, TEMP, .L24
bge $r0, I, .L22
.align 3
.L21:
// strided zero-fill, 8 per iteration (a1 == 0.0)
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
addi.d I, I, -1
blt $r0, I, .L21
.align 3
.L22:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L23:
fst.d a1, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L23
jirl $r0, $r1, 0
.align 3
.L24:
bge $r0, I, .L26 /*N<8 INCX==1*/
.align 3
.L25:
// contiguous zero-fill via 128-bit stores
vxor.v VX0, VX0, VX0
vst VX0, X, 0 * SIZE
vst VX0, X, 2 * SIZE
vst VX0, X, 4 * SIZE
vst VX0, X, 6 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
blt $r0, I, .L25
.align 3
.L26:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L27:
fst.d a1, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
blt $r0, I, .L27
jirl $r0, $r1, 0
.align 3
.L30:
// general ALPHA and INCX==1
bge $r0, I, .L32/*N<8 INCX==1*/
movfr2gr.d TEMP, ALPHA
vreplgr2vr.d VALPHA , TEMP
.align 3
.L31:
vld VX0, X, 0 * SIZE
vld VX1, X, 2 * SIZE
vfmul.d VT0, VX0, VALPHA
vfmul.d VT1, VX1, VALPHA
vld VX0, X, 4 * SIZE
vst VT0, X, 0 * SIZE
vst VT1, X, 2 * SIZE
vfmul.d VT0, VX0, VALPHA
vld VX1, X, 6 * SIZE
vst VT0, X, 4 * SIZE
vfmul.d VT1, VX1, VALPHA
vst VT1, X, 6 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
blt $r0, I, .L31
.align 3
.L32:
// scalar tail for the general-ALPHA paths
andi I, N, 7
bge $r0, I, .L999
.align 3
.L33:
fld.d a1, X, 0 * SIZE
addi.d I, I, -1
fmul.d a1, ALPHA, a1
fst.d a1, X, 0 * SIZE
add.d X, X, INCX
blt $r0, I, .L33
jirl $r0, $r1, 0
.align 3
.L999:
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -0,0 +1,125 @@
/* DSUM kernel for LoongArch64 LASX (double precision):
   returns sum(x[i]) over N elements (no absolute value), 8 doubles per
   vector iteration, horizontal reduction via xvpickve.d.
   Returns 0.0 (res1 is zeroed) when N<=0 or INCX<=0. */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define t1 $r15
#define t2 $r12
#define t3 $r13
#define t4 $r14
#define VX0 $xr12
#define VX1 $xr13
#define VX2 $xr14
#define VX3 $xr15
#define res1 $xr16
#define res2 $xr17
PROLOGUE
// clear accumulators first so early exits return 0.0
xvxor.v res1, res1, res1
xvxor.v res2, res2, res2
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bge $r0, I, .L13
.align 3
.L11:
// contiguous: accumulate 8 elements per iteration
xvld VX0, X, 0 * SIZE
xvld VX1, X, 4 * SIZE
xvfadd.d res2, VX0, VX1
xvfadd.d res1, res1, res2
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L11
.align 3
.L12:
// horizontal reduction: fold lanes 1..3 into lane 0 of res1
xvpickve.d VX1, res1, 1
xvpickve.d VX2, res1, 2
xvpickve.d VX3, res1, 3
xvfadd.d res1, VX1, res1
xvfadd.d res1, VX2, res1
xvfadd.d res1, VX3, res1
.align 3
.L13:
// scalar tail ($f16 is lane 0 of res1)
andi I, N, 7
bge $r0, I, .L999
.align 3
.L14:
fld.d $f12, X, 0 * SIZE
addi.d I, I, -1
fadd.d $f16, $f12, $f16
addi.d X, X, SIZE
blt $r0, I, .L14
b .L999
.align 3
.L20:
// strided: gather 8 elements per iteration
bge $r0, I, .L23
.align 3
.L21:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvfadd.d res2, VX0, VX1
xvfadd.d res1, res1, res2
addi.d I, I, -1
blt $r0, I, .L21
.align 3
.L22:
xvpickve.d VX1, res1, 1
xvpickve.d VX2, res1, 2
xvpickve.d VX3, res1, 3
xvfadd.d res1, VX1, res1
xvfadd.d res1, VX2, res1
xvfadd.d res1, VX3, res1
.align 3
.L23:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24:
fld.d $f12, X, 0 * SIZE
fadd.d $f16, $f12, $f16
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
// move the scalar accumulator into the FP return register
fmov.d $f0, $f16
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,123 @@
/* DSUM kernel for LoongArch64 LSX (double precision):
   returns sum(x[i]) over N elements, 8 doubles per iteration in
   128-bit pairs, horizontal reduction via vreplvei.d.
   Returns 0.0 (res1 is zeroed) when N<=0 or INCX<=0. */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define t1 $r15
#define t2 $r12
#define t3 $r13
#define t4 $r14
#define VX0 $vr12
#define VX1 $vr13
#define VX2 $vr14
#define VX3 $vr15
#define res1 $vr16
#define res2 $vr17
PROLOGUE
// clear accumulators first so early exits return 0.0
vxor.v res1, res1, res1
vxor.v res2, res2, res2
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bge $r0, I, .L13
.align 3
.L11:
// contiguous: accumulate 8 elements per iteration in four pairs
vld VX0, X, 0 * SIZE
vld VX1, X, 2 * SIZE
vfadd.d res2, VX0, VX1
vfadd.d res1, res1, res2
vld VX0, X, 4 * SIZE
vld VX1, X, 6 * SIZE
vfadd.d res2, VX0, VX1
vfadd.d res1, res1, res2
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L11
.align 3
.L12:
// horizontal reduction: fold lane 1 into lane 0
vreplvei.d VX1, res1, 1
vfadd.d res1, VX1, res1
.align 3
.L13:
// scalar tail ($f16 is lane 0 of res1)
andi I, N, 7
bge $r0, I, .L999
.align 3
.L14:
fld.d $f12, X, 0 * SIZE
fadd.d $f16, $f12, $f16
addi.d I, I, -1
addi.d X, X, SIZE
blt $r0, I, .L14
b .L999
.align 3
.L20:
// strided: gather 8 elements per iteration
bge $r0, I, .L23
.align 3
.L21:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
vinsgr2vr.d VX1, t1, 0
vinsgr2vr.d VX1, t2, 1
add.d X, X, INCX
vfadd.d res2, VX0, VX1
vfadd.d res1, res1, res2
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t3, 0
vinsgr2vr.d VX0, t4, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
add.d X, X, INCX
vfadd.d res2, VX0, VX1
vfadd.d res1, res1, res2
addi.d I, I, -1
blt $r0, I, .L21
.align 3
.L22:
vreplvei.d VX1, res1, 1
vfadd.d res1, VX1, res1
.align 3
.L23:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24:
fld.d $f12, X, 0 * SIZE
fadd.d $f16, $f12, $f16
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
// move the scalar accumulator into the FP return register
fmov.d $f0, $f16
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,301 @@
/* DSWAP kernel for LoongArch64 LASX (double precision):
   exchanges x[i] <-> y[i] over N elements, 8 doubles per iteration.
   Dispatch: .L11 (both unit stride), .L12 (INCX==1, INCY!=1),
   .L21 (INCX!=1, INCY==1), .L22 (both strided). */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r7
#define INCX $r8
#define Y $r9
#define INCY $r10
#define I $r17
#define TEMP $r18
#define XX $r5
#define YY $r6
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define b1 $f16
#define b2 $f17
#define b3 $f18
#define b4 $f19
#define VX0 $xr12
#define VX1 $xr13
#define VX2 $xr14
#define VX3 $xr15
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
// convert element strides to byte strides
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L112
.align 3
.L111:
// both contiguous: swap 8 elements with full-vector loads/stores
xvld VX0, X, 0 * SIZE
xvld VX1, X, 4 * SIZE
xvld VX2, Y, 0 * SIZE
xvld VX3, Y, 4 * SIZE
addi.d I, I, -1
xvst VX2, X, 0 * SIZE
xvst VX3, X, 4 * SIZE
xvst VX0, Y, 0 * SIZE
xvst VX1, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L111
.align 3
.L112:
// scalar tail
andi I, N, 7
bge $r0, I, .L999
.align 3
.L113:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
fst.d $f14, X, 0 * SIZE
addi.d X, X, SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L113
b .L999
.align 3
.L12: // INCX==1 and INCY!=1
bge $r0, I, .L122
.align 3
.L121:
// interleave: load old y element, overwrite it with x, gather old
// y values into VX2/VX3, then store them into contiguous x
xvld VX0, X, 0 * SIZE
ld.d t1, Y, 0 * SIZE
xvstelm.d VX0, Y, 0, 0
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
xvstelm.d VX0, Y, 0, 1
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
xvstelm.d VX0, Y, 0, 2
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvstelm.d VX0, Y, 0, 3
xvinsgr2vr.d VX2, t1, 0
xvinsgr2vr.d VX2, t2, 1
xvinsgr2vr.d VX2, t3, 2
xvinsgr2vr.d VX2, t4, 3
add.d Y, Y, INCY
xvst VX2, X, 0 * SIZE
xvld VX1, X, 4 * SIZE
ld.d t1, Y, 0 * SIZE
xvstelm.d VX1, Y, 0, 0
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
xvstelm.d VX1, Y, 0, 1
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
xvstelm.d VX1, Y, 0, 2
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvstelm.d VX1, Y, 0, 3
xvinsgr2vr.d VX3, t1, 0
xvinsgr2vr.d VX3, t2, 1
xvinsgr2vr.d VX3, t3, 2
xvinsgr2vr.d VX3, t4, 3
add.d Y, Y, INCY
xvst VX3, X, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
.align 3
.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
fst.d $f14, X, 0 * SIZE
addi.d X, X, SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
.L21:
// INCX!=1 and INCY==1: mirror of .L121 with the roles of X and Y swapped.
// BUG FIXES: (1) the first X advance of the second group used INCY
// instead of INCX, corrupting the x walk whenever INCX != INCY;
// (2) the second group of old-x values (VX1) was stored to Y at offset
// 0*SIZE, overwriting the first group — it belongs at 4*SIZE.
bge $r0, I, .L212
.align 3
.L211:
xvld VX2, Y, 0 * SIZE
ld.d t1, X, 0 * SIZE
xvstelm.d VX2, X, 0, 0
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
xvstelm.d VX2, X, 0, 1
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
xvstelm.d VX2, X, 0, 2
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
xvstelm.d VX2, X, 0, 3
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
add.d X, X, INCX
xvst VX0, Y, 0 * SIZE
xvld VX3, Y, 4 * SIZE
ld.d t1, X, 0 * SIZE
xvstelm.d VX3, X, 0, 0
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
xvstelm.d VX3, X, 0, 1
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
xvstelm.d VX3, X, 0, 2
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
xvstelm.d VX3, X, 0, 3
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
add.d X, X, INCX
xvst VX1, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L211
.align 3
.L212:
// scalar tail for the INCX!=1 / INCY==1 path
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
fst.d $f14, X, 0 * SIZE
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
.L22:
// both strided; negative-INCX rebase is disabled (commented out)
bgez INCX, .L220
//addi.d TEMP, N, -1
//mul.d TEMP, TEMP, INCX
//sub.d X, X, TEMP
.align 3
.L220:
bge $r0, I, .L223
.align 3
// XX tracks the x store position; X tracks the x load position
move XX, X
.L222:
fld.d a1, X, 0 * SIZE
add.d X, X, INCX
fld.d a2, X, 0 * SIZE
add.d X, X, INCX
fld.d a3, X, 0 * SIZE
add.d X, X, INCX
fld.d a4, X, 0 * SIZE
add.d X, X, INCX
fld.d b1, Y, 0 * SIZE
fst.d a1, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d b2, Y, 0 * SIZE
fst.d a2, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d b3, Y, 0 * SIZE
fst.d a3, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d b4, Y, 0 * SIZE
fst.d a4, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d a1, X, 0 * SIZE
add.d X, X, INCX
fst.d b1, XX, 0 * SIZE
add.d XX, XX, INCX
fld.d b1, Y, 0 * SIZE
fst.d a1, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d a2, X, 0 * SIZE
add.d X, X, INCX
fst.d b2, XX, 0 * SIZE
add.d XX, XX, INCX
fld.d b2, Y, 0 * SIZE
fst.d a2, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d a3, X, 0 * SIZE
add.d X, X, INCX
fst.d b3, XX, 0 * SIZE
add.d XX, XX, INCX
fld.d b3, Y, 0 * SIZE
fst.d a3, Y, 0 * SIZE
fld.d a4, X, 0 * SIZE
add.d X, X, INCX
fst.d b4, XX, 0 * SIZE
add.d XX, XX, INCX
fld.d b4, Y, 0 * SIZE
fst.d a4, Y, 0 * SIZE
add.d Y, Y, INCY
fst.d b1, XX, 0 * SIZE
add.d XX, XX, INCX
fst.d b2, XX, 0 * SIZE
add.d XX, XX, INCX
fst.d b3, XX, 0 * SIZE
add.d XX, XX, INCX
fst.d b4, XX, 0 * SIZE
add.d XX, XX, INCX
addi.d I, I, -1
blt $r0, I, .L222
.align 3
.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
fst.d $f14, X, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,317 @@
// BLAS DSWAP kernel for LoongArch64 LSX (128-bit SIMD, double precision):
// exchanges x[i] <-> y[i] for i = 0..N-1.
// Both strides are scaled to bytes, then one of four paths is selected:
//   .L11: INCX==1 && INCY==1    .L12: INCX==1 && INCY!=1
//   .L21: INCX!=1 && INCY==1    .L22: INCX!=1 && INCY!=1
// Every path swaps 8 doubles per iteration, followed by a scalar tail of N%8.
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r7
#define INCX $r8
#define Y $r9
#define INCY $r10
#define I $r17
#define TEMP $r18
#define XX $r5
#define YY $r6
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define b1 $f16
#define b2 $f17
#define b3 $f18
#define b4 $f19
#define VX0 $vr12
#define VX1 $vr13
#define VX2 $vr14
#define VX3 $vr15
PROLOGUE
bge $r0, N, .L999 // nothing to do for N <= 0
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT // TEMP = element size in bytes
slli.d INCX, INCX, BASE_SHIFT // strides are in bytes from here on
slli.d INCY, INCY, BASE_SHIFT
srai.d I, N, 3 // I = N / 8 (vectorized iterations)
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
// Both vectors contiguous: plain vector load/store swap, 8 doubles per pass.
.L11:
bge $r0, I, .L112
.align 3
.L111:
vld VX0, X, 0 * SIZE
vld VX1, X, 2 * SIZE
vld VX2, Y, 0 * SIZE
vld VX3, Y, 2 * SIZE
vst VX2, X, 0 * SIZE
vst VX3, X, 2 * SIZE
vst VX0, Y, 0 * SIZE
vst VX1, Y, 2 * SIZE
vld VX0, X, 4 * SIZE
vld VX1, X, 6 * SIZE
vld VX2, Y, 4 * SIZE
vld VX3, Y, 6 * SIZE
addi.d I, I, -1
vst VX2, X, 4 * SIZE
vst VX3, X, 6 * SIZE
vst VX0, Y, 4 * SIZE
vst VX1, Y, 6 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L111
.align 3
.L112: // scalar tail for the contiguous/contiguous path
andi I, N, 7
bge $r0, I, .L999
.align 3
.L113:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
fst.d $f14, X, 0 * SIZE
addi.d X, X, SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L113
b .L999
.align 3
// X contiguous, Y strided: vector-load X, gather/scatter Y element-wise.
.L12: // INCX==1 and INCY!=1
bge $r0, I, .L122
.align 3
.L121:
vld VX0, X, 0 * SIZE
ld.d t1, Y, 0 * SIZE
vstelm.d VX0, Y, 0, 0
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
vstelm.d VX0, Y, 0, 1
vinsgr2vr.d VX2, t1, 0
vinsgr2vr.d VX2, t2, 1
add.d Y, Y, INCY
vst VX2, X, 0 * SIZE
vld VX1, X, 2 * SIZE
ld.d t3, Y, 0 * SIZE
vstelm.d VX1, Y, 0, 0
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
vstelm.d VX1, Y, 0, 1
vinsgr2vr.d VX3, t3, 0
vinsgr2vr.d VX3, t4, 1
add.d Y, Y, INCY
vst VX3, X, 2 * SIZE
vld VX0, X, 4 * SIZE
ld.d t1, Y, 0 * SIZE
vstelm.d VX0, Y, 0, 0
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
vstelm.d VX0, Y, 0, 1
vinsgr2vr.d VX2, t1, 0
vinsgr2vr.d VX2, t2, 1
add.d Y, Y, INCY
vst VX2, X, 4 * SIZE
vld VX1, X, 6 * SIZE
ld.d t3, Y, 0 * SIZE
vstelm.d VX1, Y, 0, 0
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
vstelm.d VX1, Y, 0, 1
vinsgr2vr.d VX3, t3, 0
vinsgr2vr.d VX3, t4, 1
add.d Y, Y, INCY
vst VX3, X, 6 * SIZE
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
.align 3
.L122: // scalar tail: X advances by SIZE, Y by its stride
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
fst.d $f14, X, 0 * SIZE
addi.d X, X, SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
// X strided, Y contiguous: vector-load Y, gather/scatter X element-wise.
.L21:
bge $r0, I, .L212
.align 3
.L211:
vld VX2, Y, 0 * SIZE
ld.d t1, X, 0 * SIZE
vstelm.d VX2, X, 0, 0
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
vstelm.d VX2, X, 0, 1
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
add.d X, X, INCX // was INCY: X must advance by its own stride here
vst VX0, Y, 0 * SIZE
vld VX3, Y, 2 * SIZE
ld.d t3, X, 0 * SIZE
vstelm.d VX3, X, 0, 0
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
vstelm.d VX3, X, 0, 1
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
add.d X, X, INCX
vst VX1, Y, 2 * SIZE
vld VX2, Y, 4 * SIZE
ld.d t1, X, 0 * SIZE
vstelm.d VX2, X, 0, 0
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
vstelm.d VX2, X, 0, 1
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
add.d X, X, INCX // was INCY: X must advance by its own stride here
vst VX0, Y, 4 * SIZE
vld VX3, Y, 6 * SIZE
ld.d t3, X, 0 * SIZE
vstelm.d VX3, X, 0, 0
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
vstelm.d VX3, X, 0, 1
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
add.d X, X, INCX
vst VX1, Y, 6 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L211
.align 3
.L212: // scalar tail: X advances by its stride, Y by SIZE
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
fst.d $f14, X, 0 * SIZE
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
// Both strided: fully scalar, software-pipelined 8-element unroll.
// XX trails X so Y's old values can be written back into X's slots.
.L22:
bgez INCX, .L220
//addi.d TEMP, N, -1
//mul.d TEMP, TEMP, INCX
//sub.d X, X, TEMP
// NOTE(review): the negative-INCX rebase above is commented out — confirm
// callers never reach this kernel with INCX < 0.
.align 3
.L220:
bge $r0, I, .L223
.align 3
move XX, X
.L222:
fld.d a1, X, 0 * SIZE
add.d X, X, INCX
fld.d a2, X, 0 * SIZE
add.d X, X, INCX
fld.d a3, X, 0 * SIZE
add.d X, X, INCX
fld.d a4, X, 0 * SIZE
add.d X, X, INCX
fld.d b1, Y, 0 * SIZE
fst.d a1, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d b2, Y, 0 * SIZE
fst.d a2, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d b3, Y, 0 * SIZE
fst.d a3, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d b4, Y, 0 * SIZE
fst.d a4, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d a1, X, 0 * SIZE
add.d X, X, INCX
fst.d b1, XX, 0 * SIZE
add.d XX, XX, INCX
fld.d b1, Y, 0 * SIZE
fst.d a1, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d a2, X, 0 * SIZE
add.d X, X, INCX
fst.d b2, XX, 0 * SIZE
add.d XX, XX, INCX
fld.d b2, Y, 0 * SIZE
fst.d a2, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d a3, X, 0 * SIZE
add.d X, X, INCX
fst.d b3, XX, 0 * SIZE
add.d XX, XX, INCX
fld.d b3, Y, 0 * SIZE
fst.d a3, Y, 0 * SIZE
fld.d a4, X, 0 * SIZE
add.d X, X, INCX
fst.d b4, XX, 0 * SIZE
add.d XX, XX, INCX
fld.d b4, Y, 0 * SIZE
fst.d a4, Y, 0 * SIZE
add.d Y, Y, INCY
fst.d b1, XX, 0 * SIZE
add.d XX, XX, INCX
fst.d b2, XX, 0 * SIZE
add.d XX, XX, INCX
fst.d b3, XX, 0 * SIZE
add.d XX, XX, INCX
fst.d b4, XX, 0 * SIZE
add.d XX, XX, INCX
addi.d I, I, -1
blt $r0, I, .L222
.align 3
.L223: // scalar tail for the strided/strided path
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
fst.d $f14, X, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3
.L999:
move $r4, $r12 // NOTE(review): returns whatever is in $r12; swap's result is presumably ignored by callers — confirm
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,275 @@
// Index-of-maximum-absolute-value (IDAMAX-style) kernel for LoongArch64
// LASX (256-bit SIMD, double precision). Returns (in $r4, via i0) the
// 1-based index of the element of X with the largest absolute value
// (xvfmaxa.d compares magnitudes). Two paths: contiguous (INCX==1,
// vector loads) and strided (.L20, element gathers); both process 8
// elements per iteration while tracking candidate indices in vector
// lanes (VI*), then reduce the 4 lanes in .L15/.L25 and finish the
// N%8 tail scalar-wise in .L22.
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
#define x1 $xr9
#define x2 $xr10
#define x3 $xr11
#define x4 $xr12
#define VX0 $xr13
#define VX1 $xr14
#define VM0 $xr15
#define VM1 $xr16
#define VINC4 $xr17
#define VINC8 $xr18
#define VI0 $xr20
#define VI1 $xr21
#define VI2 $xr22
#define VI3 $xr8
#define VI4 $xr19
#define VT0 $xr23
PROLOGUE
li.d i0, 0
bge $r0, N, .L999 // N <= 0: return 0
bge $r0, INCX, .L999 // non-positive stride: return 0
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT // stride in bytes
bne INCX, TEMP, .L20
// NOTE(review): this xvld reads 4 doubles unconditionally, even when
// N < 4 — confirm callers guarantee the memory is readable.
xvld VM0, X, 0
addi.d i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
// Build per-lane index counters: VI1 holds the rolling 4-lane index
// vector, VI0 the current best indices (seeded 1..4).
slli.d i0, i0, 2 //4
xvreplgr2vr.d VINC4, i0
slli.d i0, i0, 1 //8
xvreplgr2vr.d VINC8, i0
addi.d i0, i0, -15
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, 5
xvinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 1 //2
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 2 //3
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3 //4
.align 3
// Main contiguous loop: 8 elements/iteration; keep per-lane running
// max-abs in VM0 and its index in VI0 (mask-selected via xvbitsel).
.L10:
xvld VX0, X, 0 * SIZE
xvadd.d VI1, VI1, VINC8
xvld VX1, X, 4 * SIZE
xvadd.d VI2, VI1, VINC4
xvfmaxa.d VM1, VX0, VX1
xvfcmp.ceq.d VT0, VX0, VM1
addi.d I, I, -1
xvbitsel.v VI2, VI2, VI1, VT0
xvfmaxa.d VM1, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
addi.d X, X, 8 * SIZE
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI2, VI0, VT0
blt $r0, I, .L10
.align 3
// Horizontal reduction of the 4 lanes of VM0/VI0 down to one winner.
.L15:
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfmaxa.d VM1, x1, x2
xvfcmp.ceq.d VT0, x1, VM1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmaxa.d VM0, x4, x3
xvfcmp.ceq.d VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
xvfmaxa.d VM0, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
xvbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.d $f17, TEMP
ffint.d.l $f17, $f17
// NOTE(review): xvfcmp writes an all-ones/all-zeros lane mask; the
// fcmp below compares that mask's low lane (reinterpreted as a
// double) against the constant 1.0 — all-ones is a NaN bit pattern,
// so verify this branch condition has the intended semantics.
xvfcmp.ceq.d VT0, VM0, x1
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L26
xvfcmp.clt.d VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
b .L26
.align 3
// Strided path: seed VM0 by gathering the first 4 elements, then
// mirror the contiguous setup/loop using element-wise loads.
.L20: // INCX!=1
move TEMP, X
addi.d i0, i0, 1
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L21
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t2, 1
xvinsgr2vr.d VM0, t3, 2
xvinsgr2vr.d VM0, t4, 3
slli.d i0, i0, 2 //4
xvreplgr2vr.d VINC4, i0
slli.d i0, i0, 1 //8
xvreplgr2vr.d VINC8, i0
addi.d i0, i0, -15
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, 5
xvinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 1 //2
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 2 //3
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3 //4
.align 3
// Strided main loop: gather 8 elements into VX0/VX1, then the same
// running max-abs / index-select as .L10.
.L24:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
xvadd.d VI1, VI1, VINC8
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvadd.d VI2, VI1, VINC4
xvfmaxa.d VM1, VX0, VX1
xvfcmp.ceq.d VT0, VX0, VM1
addi.d I, I, -1
xvbitsel.v VI2, VI2, VI1, VT0
xvfmaxa.d VM1, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI2, VI0, VT0
blt $r0, I, .L24
.align 3
// Lane reduction for the strided path (same scheme as .L15); falls
// through into the shared .L26/.L27/.L28/.L29 tie-break chain, which
// prefers the smallest index among equal maxima.
.L25:
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfmaxa.d VM1, x1, x2
xvfcmp.ceq.d VT0, x1, VM1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmaxa.d VM0, x4, x3
xvfcmp.ceq.d VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
xvfmaxa.d VM0, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
xvbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.d $f17, TEMP
ffint.d.l $f17, $f17
xvfcmp.ceq.d VT0, VM0, x1
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L26
xvfcmp.clt.d VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.align 3
.L26:
xvfcmp.ceq.d VT0, VM0, x2
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L27
xvfcmp.clt.d VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.align 3
.L27:
xvfcmp.ceq.d VT0, VM0, x3
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L28
xvfcmp.clt.d VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3
.L28:
xvfcmp.ceq.d VT0, VM0, x4
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L29
xvfcmp.clt.d VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.align 3
.L29:
movfr2gr.d i0, $f20 // winning index -> i0 (lane 0 of VI0)
.align 3
// Scalar tail: compare the remaining N%8 elements one by one against
// the running best held in lane 0 of VM0/VI0 ($f15/$f20).
.L21: //N<8
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3
.L22:
fld.d $f9, X, 0
addi.d I, I, -1
xvfmaxa.d VM1, x1, VM0
xvfcmp.ceq.d VT0, VM0, VM1
add.d X, X, INCX
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
addi.d i1, i1, 1
movgr2fr.d $f21, i1
blt $r0, I, .L22
movfr2gr.d i0, $f20
.align 3
.L999:
move $r4, $r17 // return the 1-based index (0 if N or INCX invalid)
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,267 @@
// Index-of-maximum-absolute-value (IDAMAX-style) kernel for LoongArch64
// LSX (128-bit SIMD, double precision). Returns in $r4 (via i0) the
// 1-based index of the element of X with the largest magnitude
// (vfmaxa.d compares absolute values). Contiguous path (.L10) and
// strided path (.L24) both consume 8 elements per iteration, tracking
// candidate indices per lane in VI*, then reduce the two lanes and run
// the N%8 tail scalar-wise.
//
// Fix: in the strided loop .L24 the final running-max update selected
// VM0 with a STALE mask (the vbitsel preceded the vfcmp that computes
// its mask), unlike the correct ordering used in the unit-stride loop
// .L10; the sequence is reordered to fmaxa -> fcmp -> bitsel VM0 ->
// bitsel VI0.
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
#define x1 $vr9
#define x2 $vr10
#define x3 $vr11
#define x4 $vr12
#define VX0 $vr13
#define VX1 $vr14
#define VM0 $vr15
#define VM1 $vr16
#define VINC2 $vr17
#define VINC4 $vr18
#define VI0 $vr20
#define VI1 $vr21
#define VI2 $vr22
#define VI3 $vr8
#define VI4 $vr19
#define VT0 $vr23
PROLOGUE
li.d i0, 0
bge $r0, N, .L999 // N <= 0: return 0
bge $r0, INCX, .L999 // non-positive stride: return 0
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT // stride in bytes
bne INCX, TEMP, .L20
// NOTE(review): reads 2 doubles unconditionally even when N == 1 —
// confirm callers guarantee the memory is readable.
vld VM0, X, 0
addi.d i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L11
// Per-lane index counters: VI1 is the rolling 2-lane index vector,
// VI0 the current best indices (seeded 1, 2).
slli.d i0, i0, 1 //2
vreplgr2vr.d VINC2, i0
slli.d i0, i0, 1 //4
vreplgr2vr.d VINC4, i0
addi.d i0, i0, -7
vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
vinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 3
vinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
vinsgr2vr.d VI0, i0, 1 //2
.align 3
// Contiguous main loop: 8 elements/iteration; running per-lane
// max-abs in VM0, its index in VI0.
.L10:
vld VX0, X, 0 * SIZE
vadd.d VI1, VI1, VINC4
vld VX1, X, 2 * SIZE
vadd.d VI2, VI1, VINC2
vfmaxa.d x1, VX0, VX1
vfcmp.ceq.d VT0, VX0, x1
vbitsel.v x2, VI2, VI1, VT0
vld VX0, X, 4 * SIZE
vadd.d VI1, VI2, VINC2
vld VX1, X, 6 * SIZE
vadd.d VI2, VI1, VINC2
vfmaxa.d x3, VX0, VX1
vfcmp.ceq.d VT0, VX0, x3
vbitsel.v x4, VI2, VI1, VT0
vfmaxa.d x3, x1, x3
vfcmp.ceq.d VT0, x1, x3
vbitsel.v x2, x4, x2, VT0
vfmaxa.d VM1, VM0, x3
vfcmp.ceq.d VT0, VM0, VM1
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, x2, VI0, VT0
addi.d I, I, -1
addi.d X, X, 8 * SIZE
blt $r0, I, .L10
.align 3
// Reduce the two lanes of VM0/VI0 to one winner; on a tie pick the
// smaller index.
.L15:
vreplvei.d VI1, VI0, 0
vreplvei.d VI2, VI0, 1
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
li.d TEMP, 1 //
movgr2fr.d $f17, TEMP
ffint.d.l $f17, $f17
// NOTE(review): vfcmp writes an all-ones/all-zeros lane mask; the
// fcmp below compares that mask's low lane (as a double) with 1.0 —
// all-ones is a NaN bit pattern, so verify the intended semantics.
vfcmp.ceq.d VT0, x2, x1
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L16
vfcmp.clt.d VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
b .L17
.align 3
.L16:
vfmaxa.d VM0, x1, x2
vfcmp.ceq.d VT0, x1, VM0
vbitsel.v VI0, VI2, VI1, VT0
.align 3
.L17:
movfr2gr.d i0, $f20 // winning index -> i0 (lane 0 of VI0)
.align 3
// Scalar tail for the contiguous path.
.L11: //INCX==1 and N<8
andi I, N, 7
bge $r0, I, .L14
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3
.L13:
fld.d $f9, X, 0
vfmaxa.d VM1, x1, VM0
vfcmp.ceq.d VT0, VM0, VM1
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, VI1, VI0, VT0
addi.d I, I, -1
addi.d i1, i1, 1
addi.d X, X, SIZE
movgr2fr.d $f21, i1
blt $r0, I, .L13
movfr2gr.d i0, $f20
.align 3
.L14:
move $r4, $r17
jirl $r0, $r1, 0x0
.align 3
// Strided path: seed VM0 by gathering the first 2 elements, then
// mirror the contiguous setup/loop with element-wise loads.
.L20: // INCX!=1
move TEMP, X
addi.d i0, i0, 1
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L21
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t2, 1
slli.d i0, i0, 1 //2
vreplgr2vr.d VINC2, i0
slli.d i0, i0, 1 //4
vreplgr2vr.d VINC4, i0
addi.d i0, i0, -7
vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
vinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 3
vinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
vinsgr2vr.d VI0, i0, 1 //2
.align 3
// Strided main loop (same selection scheme as .L10).
.L24:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t2, 1
vadd.d VI1, VI1, VINC4
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t1, 0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t2, 1
vadd.d VI2, VI1, VINC2
vfmaxa.d x1, VX0, VX1
vfcmp.ceq.d VT0, VX0, x1
vbitsel.v x2, VI2, VI1, VT0
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t2, 1
vadd.d VI1, VI2, VINC2
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t1, 0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t2, 1
vadd.d VI2, VI1, VINC2
vfmaxa.d x3, VX0, VX1
vfcmp.ceq.d VT0, VX0, x3
vbitsel.v x4, VI2, VI1, VT0
vfmaxa.d x3, x1, x3
vfcmp.ceq.d VT0, x1, x3
vbitsel.v x2, x4, x2, VT0
// Fixed ordering: compute the equality mask BEFORE selecting VM0,
// exactly as in .L10 (the mask previously in VT0 belonged to the
// x1/x3 comparison above).
vfmaxa.d VM1, VM0, x3
vfcmp.ceq.d VT0, VM0, VM1
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, x2, VI0, VT0
addi.d I, I, -1
blt $r0, I, .L24
.align 3
// Lane reduction for the strided path.
.L25:
vreplvei.d VI1, VI0, 0
vreplvei.d VI2, VI0, 1
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
li.d TEMP, 1 //
movgr2fr.d $f17, TEMP
ffint.d.l $f17, $f17
vfcmp.ceq.d VT0, x2, x1
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L26
vfcmp.clt.d VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
b .L27
.align 3
.L26:
vfmaxa.d VM0, x1, x2
vfcmp.ceq.d VT0, x1, VM0
vbitsel.v VI0, VI2, VI1, VT0
.align 3
.L27:
movfr2gr.d i0, $f20
.align 3
// Scalar tail for the strided path.
.L21: // N<8
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3
.L22:
fld.d $f9, X, 0
vfmaxa.d VM1, x1, VM0
vfcmp.ceq.d VT0, VM0, VM1
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, VI1, VI0, VT0
addi.d I, I, -1
addi.d i1, i1, 1
add.d X, X, INCX
movgr2fr.d $f21, i1
blt $r0, I, .L22
movfr2gr.d i0, $f20
.align 3
.L999:
move $r4, $r17 // return the 1-based index (0 if N or INCX invalid)
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,275 @@
// Index-of-minimum-absolute-value (IDAMIN-style) kernel for LoongArch64
// LASX (256-bit SIMD, double precision). Returns (in $r4, via i0) the
// 1-based index of the element of X with the smallest absolute value
// (xvfmina.d compares magnitudes). Structure mirrors the LASX max-abs
// kernel: contiguous path (.L10) and strided path (.L24), 8 elements
// per iteration with per-lane candidate indices in VI*, lane reduction
// in .L15/.L25, smallest-index tie-break chain .L26-.L29, scalar tail
// .L22 for the remaining N%8 elements.
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
#define x1 $xr9
#define x2 $xr10
#define x3 $xr11
#define x4 $xr12
#define VX0 $xr13
#define VX1 $xr14
#define VM0 $xr15
#define VM1 $xr16
#define VINC4 $xr17
#define VINC8 $xr18
#define VI0 $xr20
#define VI1 $xr21
#define VI2 $xr22
#define VI3 $xr8
#define VI4 $xr19
#define VT0 $xr23
PROLOGUE
li.d i0, 0
bge $r0, N, .L999 // N <= 0: return 0
bge $r0, INCX, .L999 // non-positive stride: return 0
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT // stride in bytes
bne INCX, TEMP, .L20
// NOTE(review): this xvld reads 4 doubles unconditionally, even when
// N < 4 — confirm callers guarantee the memory is readable.
xvld VM0, X, 0
addi.d i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
// Per-lane index counters: VI1 rolls through element indices, VI0
// holds the current best indices (seeded 1..4).
slli.d i0, i0, 2 //4
xvreplgr2vr.d VINC4, i0
slli.d i0, i0, 1 //8
xvreplgr2vr.d VINC8, i0
addi.d i0, i0, -15
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, 5
xvinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 1 //2
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 2 //3
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3 //4
.align 3
// Contiguous main loop: running per-lane min-abs in VM0, index in VI0
// (mask-selected via xvbitsel).
.L10:
xvld VX0, X, 0 * SIZE
xvadd.d VI1, VI1, VINC8
xvld VX1, X, 4 * SIZE
xvadd.d VI2, VI1, VINC4
xvfmina.d VM1, VX0, VX1
xvfcmp.ceq.d VT0, VX0, VM1
addi.d I, I, -1
xvbitsel.v VI2, VI2, VI1, VT0
xvfmina.d VM1, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
addi.d X, X, 8 * SIZE
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI2, VI0, VT0
blt $r0, I, .L10
.align 3
// Horizontal reduction of the 4 lanes down to one winner.
.L15:
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfmina.d VM1, x1, x2
xvfcmp.ceq.d VT0, x1, VM1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmina.d VM0, x4, x3
xvfcmp.ceq.d VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
xvfmina.d VM0, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
xvbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.d $f17, TEMP
ffint.d.l $f17, $f17
// NOTE(review): xvfcmp writes an all-ones/all-zeros lane mask; the
// fcmp below compares that mask's low lane (reinterpreted as a
// double) against the constant 1.0 — all-ones is a NaN bit pattern,
// so verify this branch condition has the intended semantics.
xvfcmp.ceq.d VT0, VM0, x1
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L26
xvfcmp.clt.d VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
b .L26
.align 3
// Strided path: seed VM0 by gathering the first 4 elements, then
// mirror the contiguous setup/loop with element-wise loads.
.L20: // INCX!=1
move TEMP, X
addi.d i0, i0, 1
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L21
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t2, 1
xvinsgr2vr.d VM0, t3, 2
xvinsgr2vr.d VM0, t4, 3
slli.d i0, i0, 2 //4
xvreplgr2vr.d VINC4, i0
slli.d i0, i0, 1 //8
xvreplgr2vr.d VINC8, i0
addi.d i0, i0, -15
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, 5
xvinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 1 //2
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 2 //3
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3 //4
.align 3
// Strided main loop: gather 8 elements into VX0/VX1, then the same
// running min-abs / index-select as .L10.
.L24:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
xvadd.d VI1, VI1, VINC8
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvadd.d VI2, VI1, VINC4
xvfmina.d VM1, VX0, VX1
xvfcmp.ceq.d VT0, VX0, VM1
xvbitsel.v VI2, VI2, VI1, VT0
xvfmina.d VM1, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
addi.d I, I, -1
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI2, VI0, VT0
blt $r0, I, .L24
.align 3
// Lane reduction for the strided path; falls through into the shared
// .L26/.L27/.L28/.L29 tie-break chain, which prefers the smallest
// index among equal minima.
.L25:
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfmina.d VM1, x1, x2
xvfcmp.ceq.d VT0, x1, VM1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmina.d VM0, x4, x3
xvfcmp.ceq.d VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
xvfmina.d VM0, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
xvbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.d $f17, TEMP
ffint.d.l $f17, $f17
xvfcmp.ceq.d VT0, VM0, x1
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L26
xvfcmp.clt.d VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.align 3
.L26:
xvfcmp.ceq.d VT0, VM0, x2
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L27
xvfcmp.clt.d VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.align 3
.L27:
xvfcmp.ceq.d VT0, VM0, x3
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L28
xvfcmp.clt.d VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3
.L28:
xvfcmp.ceq.d VT0, VM0, x4
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L29
xvfcmp.clt.d VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.align 3
.L29:
movfr2gr.d i0, $f20 // winning index -> i0 (lane 0 of VI0)
.align 3
// Scalar tail: compare the remaining N%8 elements one by one against
// the running best held in lane 0 of VM0/VI0 ($f15/$f20).
.L21: // N<8
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3
.L22:
fld.d $f9, X, 0
addi.d I, I, -1
xvfmina.d VM1, x1, VM0
xvfcmp.ceq.d VT0, VM0, VM1
add.d X, X, INCX
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
addi.d i1, i1, 1
movgr2fr.d $f21, i1
blt $r0, I, .L22
movfr2gr.d i0, $f20
.align 3
.L999:
move $r4, $r17 // return the 1-based index (0 if N or INCX invalid)
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,228 @@
// Index-of-minimum-absolute-value (IDAMIN-style) kernel for LoongArch64
// LSX (128-bit SIMD, double precision). Returns in $r4 (via i0) the
// 1-based index of the element of X with the smallest magnitude
// (vfmina.d compares absolute values). Contiguous path (.L10) and
// strided path (.L24), 8 elements per iteration with per-lane
// candidate indices in VI*, two-lane reduction, and a scalar tail for
// the remaining N%8 elements.
//
// Fix: in the strided loop .L24 the final running-min update selected
// VM0 with a STALE mask (the vbitsel preceded the vfcmp that computes
// its mask), unlike the correct ordering used in the unit-stride loop
// .L10; the sequence is reordered to fmina -> fcmp -> bitsel VM0 ->
// bitsel VI0.
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
#define x1 $vr9
#define x2 $vr10
#define x3 $vr11
#define x4 $vr12
#define VX0 $vr13
#define VX1 $vr14
#define VM0 $vr15
#define VM1 $vr16
#define VINC2 $vr17
#define VINC4 $vr18
#define VI0 $vr20
#define VI1 $vr21
#define VI2 $vr22
#define VI3 $vr8
#define VI4 $vr19
#define VT0 $vr23
PROLOGUE
li.d i0, 0
bge $r0, N, .L999 // N <= 0: return 0
bge $r0, INCX, .L999 // non-positive stride: return 0
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT // stride in bytes
bne INCX, TEMP, .L20
// NOTE(review): reads 2 doubles unconditionally even when N == 1 —
// confirm callers guarantee the memory is readable.
vld VM0, X, 0
addi.d i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
// Per-lane index counters: VI1 is the rolling 2-lane index vector,
// VI0 the current best indices (seeded 1, 2).
slli.d i0, i0, 1 //2
vreplgr2vr.d VINC2, i0
slli.d i0, i0, 1 //4
vreplgr2vr.d VINC4, i0
addi.d i0, i0, -7
vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
vinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 3
vinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
vinsgr2vr.d VI0, i0, 1 //2
.align 3
// Contiguous main loop: running per-lane min-abs in VM0, index in VI0.
.L10:
vld VX0, X, 0 * SIZE
vadd.d VI1, VI1, VINC4
vld VX1, X, 2 * SIZE
vadd.d VI2, VI1, VINC2
vfmina.d x1, VX0, VX1
vfcmp.ceq.d VT0, VX0, x1
vbitsel.v x2, VI2, VI1, VT0
vld VX0, X, 4 * SIZE
vadd.d VI1, VI2, VINC2
vld VX1, X, 6 * SIZE
vadd.d VI2, VI1, VINC2
vfmina.d x3, VX0, VX1
vfcmp.ceq.d VT0, VX0, x3
vbitsel.v x4, VI2, VI1, VT0
vfmina.d x3, x1, x3
vfcmp.ceq.d VT0, x1, x3
addi.d I, I, -1
vbitsel.v x2, x4, x2, VT0
vfmina.d VM1, VM0, x3
vfcmp.ceq.d VT0, VM0, VM1
addi.d X, X, 8 * SIZE
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, x2, VI0, VT0
blt $r0, I, .L10
.align 3
// Reduce the two lanes of VM0/VI0 to one winner; on a tie pick the
// smaller index.
.L15:
vreplvei.d VI1, VI0, 0
vreplvei.d VI2, VI0, 1
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
li.d TEMP, 1 //
movgr2fr.d $f17, TEMP
ffint.d.l $f17, $f17
// NOTE(review): vfcmp writes an all-ones/all-zeros lane mask; the
// fcmp below compares that mask's low lane (as a double) with 1.0 —
// all-ones is a NaN bit pattern, so verify the intended semantics.
vfcmp.ceq.d VT0, x2, x1
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L26
vfcmp.clt.d VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
b .L27
.align 3
// Strided path: seed VM0 by gathering the first 2 elements, then
// mirror the contiguous setup/loop with element-wise loads.
.L20: // INCX!=1
move TEMP, X
addi.d i0, i0, 1
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L21
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t2, 1
slli.d i0, i0, 1 //2
vreplgr2vr.d VINC2, i0
slli.d i0, i0, 1 //4
vreplgr2vr.d VINC4, i0
addi.d i0, i0, -7
vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
vinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 3
vinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
vinsgr2vr.d VI0, i0, 1 //2
.align 3
// Strided main loop (same selection scheme as .L10).
.L24:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vadd.d VI1, VI1, VINC4
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t1, 0
vinsgr2vr.d VX1, t2, 1
vadd.d VI2, VI1, VINC2
vfmina.d x1, VX0, VX1
vfcmp.ceq.d VT0, VX0, x1
vbitsel.v x2, VI2, VI1, VT0
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vadd.d VI1, VI2, VINC2
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t1, 0
vinsgr2vr.d VX1, t2, 1
vadd.d VI2, VI1, VINC2
vfmina.d x3, VX0, VX1
vfcmp.ceq.d VT0, VX0, x3
vbitsel.v x4, VI2, VI1, VT0
vfmina.d x3, x1, x3
vfcmp.ceq.d VT0, x1, x3
addi.d I, I, -1
vbitsel.v x2, x4, x2, VT0
// Fixed ordering: compute the equality mask BEFORE selecting VM0,
// exactly as in .L10 (the mask previously in VT0 belonged to the
// x1/x3 comparison above).
vfmina.d VM1, VM0, x3
vfcmp.ceq.d VT0, VM0, VM1
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, x2, VI0, VT0
blt $r0, I, .L24
.align 3
// Lane reduction for the strided path.
.L25:
vreplvei.d VI1, VI0, 0
vreplvei.d VI2, VI0, 1
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
li.d TEMP, 1 //
movgr2fr.d $f17, TEMP
ffint.d.l $f17, $f17
vfcmp.ceq.d VT0, x2, x1
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L26
vfcmp.clt.d VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
b .L27
.align 3
.L26:
vfmina.d VM0, x1, x2
vfcmp.ceq.d VT0, x1, VM0
vbitsel.v VI0, VI2, VI1, VT0
.align 3
.L27:
movfr2gr.d i0, $f20 // winning index -> i0 (lane 0 of VI0)
.align 3
// Scalar tail: compare the remaining N%8 elements against the running
// best held in lane 0 of VM0/VI0 ($f15/$f20).
.L21: //N<8
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3
.L22:
fld.d $f9, X, 0
addi.d I, I, -1
vfmina.d VM1, x1, VM0
vfcmp.ceq.d VT0, VM0, VM1
add.d X, X, INCX
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, VI1, VI0, VT0
addi.d i1, i1, 1
movgr2fr.d $f21, i1
blt $r0, I, .L22
movfr2gr.d i0, $f20
.align 3
.L999:
move $r4, $r17 // return the 1-based index (0 if N or INCX invalid)
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,273 @@
// Index-of-maximum-VALUE (IDMAX-style) kernel for LoongArch64 LASX
// (256-bit SIMD, double precision). Unlike the max-abs kernels in this
// family, this one compares signed values directly (xvfcmp.clt.d, no
// absolute value). Returns (in $r4, via i0) the 1-based index of the
// largest element of X. Contiguous path (.L10) and strided path
// (.L24), 8 elements per iteration with per-lane candidate indices in
// VI*, lane reduction in .L15/.L25, smallest-index tie-break chain
// .L26-.L29, and a scalar fcmp/fsel tail in .L22.
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
#define x1 $xr9
#define x2 $xr10
#define x3 $xr11
#define x4 $xr12
#define VX0 $xr13
#define VX1 $xr14
#define VM0 $xr15
#define VM1 $xr16
#define VINC4 $xr17
#define VINC8 $xr18
#define VI0 $xr20
#define VI1 $xr21
#define VI2 $xr22
#define VI3 $xr8
#define VI4 $xr19
#define VT0 $xr23
PROLOGUE
li.d i0, 0
bge $r0, N, .L999 // N <= 0: return 0
bge $r0, INCX, .L999 // non-positive stride: return 0
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT // stride in bytes
bne INCX, TEMP, .L20
// NOTE(review): this xvld reads 4 doubles unconditionally, even when
// N < 4 — confirm callers guarantee the memory is readable.
xvld VM0, X, 0
addi.d i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
// Per-lane index counters: VI1 rolls through element indices, VI0
// holds the current best indices (seeded 1..4).
slli.d i0, i0, 2 //4
xvreplgr2vr.d VINC4, i0
slli.d i0, i0, 1 //8
xvreplgr2vr.d VINC8, i0
addi.d i0, i0, -15
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, 5
xvinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 1 //2
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 2 //3
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3 //4
.align 3
// Contiguous main loop: signed-compare (clt) select of the running
// per-lane maximum in VM0 and its index in VI0.
.L10:
xvld VX0, X, 0 * SIZE
xvadd.d VI1, VI1, VINC8
xvld VX1, X, 4 * SIZE
xvadd.d VI2, VI1, VINC4
xvfcmp.clt.d VT0, VX0, VX1
addi.d I, I, -1
xvbitsel.v VM1, VX0, VX1, VT0
xvbitsel.v VI2, VI1, VI2, VT0
xvfcmp.clt.d VT0, VM0, VM1
addi.d X, X, 8 * SIZE
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VI0, VI2, VT0
blt $r0, I, .L10
.align 3
// Horizontal reduction of the 4 lanes down to one winner.
.L15:
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfcmp.clt.d VT0, x1, x2
xvbitsel.v VM1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
xvfcmp.clt.d VT0, x3, x4
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfcmp.clt.d VT0, VM0, VM1
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.d $f17, TEMP
ffint.d.l $f17, $f17
// NOTE(review): xvfcmp writes an all-ones/all-zeros lane mask; the
// fcmp below compares that mask's low lane (reinterpreted as a
// double) against the constant 1.0 — all-ones is a NaN bit pattern,
// so verify this branch condition has the intended semantics.
xvfcmp.ceq.d VT0, VM0, x1
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L26
xvfcmp.clt.d VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
b .L26
.align 3
// Strided path: seed VM0 by gathering the first 4 elements, then
// mirror the contiguous setup/loop with element-wise loads.
.L20: // INCX!=1
move TEMP, X
addi.d i0, i0, 1
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L21
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t2, 1
xvinsgr2vr.d VM0, t3, 2
xvinsgr2vr.d VM0, t4, 3
slli.d i0, i0, 2 //4
xvreplgr2vr.d VINC4, i0
slli.d i0, i0, 1 //8
xvreplgr2vr.d VINC8, i0
addi.d i0, i0, -15
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, 5
xvinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 1 //2
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 2 //3
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3 //4
.align 3
// Strided main loop: gather 8 elements into VX0/VX1, then the same
// signed-compare select as .L10.
.L24:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
xvadd.d VI1, VI1, VINC8
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvadd.d VI2, VI1, VINC4
xvfcmp.clt.d VT0, VX0, VX1
addi.d I, I, -1
xvbitsel.v VM1, VX0, VX1, VT0
xvbitsel.v VI2, VI1, VI2, VT0
xvfcmp.clt.d VT0, VM0, VM1
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VI0, VI2, VT0
blt $r0, I, .L24
.align 3
// Lane reduction for the strided path; falls through into the shared
// .L26/.L27/.L28/.L29 tie-break chain, which prefers the smallest
// index among equal maxima.
.L25:
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfcmp.clt.d VT0, x1, x2
xvbitsel.v VM1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
xvfcmp.clt.d VT0, x3, x4
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfcmp.clt.d VT0, VM0, VM1
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.d $f17, TEMP
ffint.d.l $f17, $f17
xvfcmp.ceq.d VT0, VM0, x1
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L26
xvfcmp.clt.d VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.align 3
.L26:
xvfcmp.ceq.d VT0, VM0, x2
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L27
xvfcmp.clt.d VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.align 3
.L27:
xvfcmp.ceq.d VT0, VM0, x3
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L28
xvfcmp.clt.d VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3
.L28:
xvfcmp.ceq.d VT0, VM0, x4
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L29
xvfcmp.clt.d VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.align 3
.L29:
movfr2gr.d i0, $f20 // winning index -> i0 (lane 0 of VI0)
.align 3
// Scalar tail: plain scalar fcmp/fsel against the running best held
// in $f15 (value) and $f20 (index).
.L21: //N<8
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3
.L22:
fld.d $f9, X, 0
addi.d I, I, -1
fcmp.clt.d $fcc0, $f15, $f9
add.d X, X, INCX
fsel $f15, $f15, $f9, $fcc0
fsel $f20, $f20, $f21, $fcc0
addi.d i1, i1, 1
movgr2fr.d $f21, i1
blt $r0, I, .L22
movfr2gr.d i0, $f20
.align 3
.L999:
move $r4, $r17 // return the 1-based index (0 if N or INCX invalid)
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,225 @@
// LoongArch64 LSX (128-bit) kernel: 1-based index of the MAXIMUM value of a
// double-precision vector X[0..N-1] with stride INCX; result returned in $r4.
// Comparison is signed fcmp.clt (plain max, not |x|) -- NOTE(review): confirm
// this file is wired as the "max value" (not amax) kernel in the build.
// Indices are carried as raw integers inside FP/vector lanes; for nonnegative
// integers the IEEE bit-pattern order matches double order, so vfcmp.clt.d on
// index vectors gives integer ordering.
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
#define x1 $vr9
#define x2 $vr10
#define x3 $vr11
#define x4 $vr12
#define VX0 $vr13
#define VX1 $vr14
#define VM0 $vr15
#define VM1 $vr16
#define VINC2 $vr17
#define VINC4 $vr18
#define VI0 $vr20
#define VI1 $vr21
#define VI2 $vr22
#define VI3 $vr8
#define VI4 $vr19
#define VT0 $vr23
PROLOGUE
li.d i0, 0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
// unit-stride setup: VM0 = first two elements, VI1/VI0 seeded so that after
// the first "vadd.d VI1, VI1, VINC4" in .L10 the lanes hold 1-based indices
vld VM0, X, 0
addi.d i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
slli.d i0, i0, 1 //2
vreplgr2vr.d VINC2, i0
slli.d i0, i0, 1 //4
vreplgr2vr.d VINC4, i0
addi.d i0, i0, -7
vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
vinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 3
vinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
vinsgr2vr.d VI0, i0, 1 //2
.align 3
// .L10: unit-stride main loop, 8 doubles per iteration; running max in VM0,
// matching indices in VI0
.L10:
vld VX0, X, 0 * SIZE
vadd.d VI1, VI1, VINC4
vld VX1, X, 2 * SIZE
vadd.d VI2, VI1, VINC2
vfcmp.clt.d VT0, VX0, VX1
vbitsel.v x1, VX0, VX1, VT0
vbitsel.v x2, VI1, VI2, VT0
vld VX0, X, 4 * SIZE
vadd.d VI1, VI2, VINC2
vld VX1, X, 6 * SIZE
vadd.d VI2, VI1, VINC2
vfcmp.clt.d VT0, VX0, VX1
addi.d I, I, -1
vbitsel.v x3, VX0, VX1, VT0
vbitsel.v x4, VI1, VI2, VT0
vfcmp.clt.d VT0, x1, x3
vbitsel.v x1, x1, x3, VT0
vbitsel.v x2, x2, x4, VT0
vfcmp.clt.d VT0, VM0, x1
addi.d X, X, 8 * SIZE
vbitsel.v VM0, VM0, x1, VT0
vbitsel.v VI0, VI0, x2, VT0
blt $r0, I, .L10
.align 3
// .L15: reduce the two lanes of VM0/VI0 to a scalar winner.
// NOTE(review): vfcmp.ceq.d writes 0 or all-ones per lane; lane 0 ($f23)
// reinterpreted as a double is 0.0 or NaN, so fcmp.ceq.d against 1.0 ($f17)
// appears to be always false and bceqz always taken -- verify the intended
// equal-value (smallest-index) tie-break ever executes.
.L15:
vreplvei.d VI1, VI0, 0
vreplvei.d VI2, VI0, 1
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
li.d TEMP, 1 //
movgr2fr.d $f17, TEMP
ffint.d.l $f17, $f17
vfcmp.ceq.d VT0, x2, x1
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L26
vfcmp.clt.d VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
b .L27
.align 3
// .L20/.L24: strided path -- gather elements one ld.d at a time
.L20: // INCX!=1
move TEMP, X
addi.d i0, i0, 1
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L21
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t2, 1
slli.d i0, i0, 1 //2
vreplgr2vr.d VINC2, i0
slli.d i0, i0, 1 //4
vreplgr2vr.d VINC4, i0
addi.d i0, i0, -7
vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
vinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 3
vinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
vinsgr2vr.d VI0, i0, 1 //2
.align 3
.L24:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vadd.d VI1, VI1, VINC4
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t1, 0
vinsgr2vr.d VX1, t2, 1
vadd.d VI2, VI1, VINC2
vfcmp.clt.d VT0, VX0, VX1
vbitsel.v x1, VX0, VX1, VT0
vbitsel.v x2, VI1, VI2, VT0
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vadd.d VI1, VI2, VINC2
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t1, 0
vinsgr2vr.d VX1, t2, 1
vadd.d VI2, VI1, VINC2
vfcmp.clt.d VT0, VX0, VX1
vbitsel.v x3, VX0, VX1, VT0
vbitsel.v x4, VI1, VI2, VT0
vfcmp.clt.d VT0, x1, x3
vbitsel.v x1, x1, x3, VT0
vbitsel.v x2, x2, x4, VT0
vfcmp.clt.d VT0, VM0, x1
addi.d I, I, -1
vbitsel.v VM0, VM0, x1, VT0
vbitsel.v VI0, VI0, x2, VT0
blt $r0, I, .L24
.align 3
// .L25: lane reduction for the strided path (same caveat as .L15)
.L25:
vreplvei.d VI1, VI0, 0
vreplvei.d VI2, VI0, 1
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
li.d TEMP, 1 //
movgr2fr.d $f17, TEMP
ffint.d.l $f17, $f17
vfcmp.ceq.d VT0, x2, x1
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L26
vfcmp.clt.d VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
b .L27
.align 3
// .L26: lanes unequal -- keep the larger value and its index
.L26:
vfcmp.clt.d VT0, x1, x2
vbitsel.v VM0, x1, x2, VT0
vbitsel.v VI0, VI1, VI2, VT0
.align 3
// .L27: winning index lives in lane 0 of VI0 ($f20)
.L27:
movfr2gr.d i0, $f20
.align 3
// .L21/.L22: scalar tail for the N%8 leftovers; $f15/$f20 alias lane 0 of
// VM0/VI0, so the tail continues from the vector-phase running max/index
.L21: //N<8
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3
.L22:
fld.d $f9, X, 0
addi.d I, I, -1
fcmp.clt.d $fcc0, $f15, $f9
add.d X, X, INCX
fsel $f15, $f15, $f9, $fcc0
fsel $f20, $f20, $f21, $fcc0
addi.d i1, i1, 1
movgr2fr.d $f21, i1
blt $r0, I, .L22
movfr2gr.d i0, $f20
.align 3
// return the index in $r4 (i0 == $r17)
.L999:
move $r4, $r17
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,272 @@
// LoongArch64 LASX (256-bit) kernel: 1-based index of the MINIMUM value of a
// double-precision vector X[0..N-1] with stride INCX; result returned in $r4.
// vfcmp.clt.d with swapped operands (VX1 < VX0) selects the smaller element,
// so this is the signed-min variant -- NOTE(review): confirm against the
// kernel wiring (min value, not |x| amin).
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
#define x1 $xr9
#define x2 $xr10
#define x3 $xr11
#define x4 $xr12
#define VX0 $xr13
#define VX1 $xr14
#define VM0 $xr15
#define VM1 $xr16
#define VINC4 $xr17
#define VINC8 $xr18
#define VI0 $xr20
#define VI1 $xr21
#define VI2 $xr22
#define VI3 $xr8
#define VI4 $xr19
#define VT0 $xr23
PROLOGUE
li.d i0, 0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
// unit-stride setup: 4 doubles per YMM; index lanes seeded so the first
// xvadd.d in .L10 produces 1-based element indices
xvld VM0, X, 0
addi.d i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
slli.d i0, i0, 2 //4
xvreplgr2vr.d VINC4, i0
slli.d i0, i0, 1 //8
xvreplgr2vr.d VINC8, i0
addi.d i0, i0, -15
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, 5
xvinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 1 //2
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 2 //3
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3 //4
.align 3
// .L10: unit-stride main loop, 8 doubles (two YMM loads) per iteration;
// running min in VM0, matching indices in VI0
.L10:
xvld VX0, X, 0 * SIZE
xvadd.d VI1, VI1, VINC8
xvld VX1, X, 4 * SIZE
xvadd.d VI2, VI1, VINC4
xvfcmp.clt.d VT0, VX1, VX0
addi.d I, I, -1
xvbitsel.v VM1, VX0, VX1, VT0
xvbitsel.v VI2, VI1, VI2, VT0
xvfcmp.clt.d VT0, VM1, VM0
addi.d X, X, 8 * SIZE
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VI0, VI2, VT0
blt $r0, I, .L10
.align 3
// .L15: pairwise reduction of the 4 lanes, then fall into the .L26..L29
// tie-break chain. NOTE(review): fcmp.ceq.d compares the vfcmp mask lane
// ($f23 = 0 or all-ones/NaN) with 1.0, which looks always-false -- verify
// the smallest-index tie-break path is reachable.
.L15:
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfcmp.clt.d VT0, x2, x1
xvbitsel.v VM1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
xvfcmp.clt.d VT0, x4, x3
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfcmp.clt.d VT0, VM1, VM0
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.d $f17, TEMP
ffint.d.l $f17, $f17
xvfcmp.ceq.d VT0, VM0, x1
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L26
xvfcmp.clt.d VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
b .L26
.align 3
// .L20/.L24: strided path -- gather 8 doubles one ld.d at a time
.L20: // INCX!=1
move TEMP, X
addi.d i0, i0, 1
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L21
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t2, 1
xvinsgr2vr.d VM0, t3, 2
xvinsgr2vr.d VM0, t4, 3
slli.d i0, i0, 2 //4
xvreplgr2vr.d VINC4, i0
slli.d i0, i0, 1 //8
xvreplgr2vr.d VINC8, i0
addi.d i0, i0, -15
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, 5
xvinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 1 //2
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 2 //3
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3 //4
.align 3
.L24:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
xvadd.d VI1, VI1, VINC8
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvadd.d VI2, VI1, VINC4
xvfcmp.clt.d VT0, VX1, VX0
addi.d I, I, -1
xvbitsel.v VM1, VX0, VX1, VT0
xvbitsel.v VI2, VI1, VI2, VT0
xvfcmp.clt.d VT0, VM1, VM0
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VI0, VI2, VT0
blt $r0, I, .L24
.align 3
// .L25: lane reduction for the strided path (same structure/caveat as .L15)
.L25:
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfcmp.clt.d VT0, x2, x1
xvbitsel.v VM1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
xvfcmp.clt.d VT0, x4, x3
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfcmp.clt.d VT0, VM1, VM0
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.d $f17, TEMP
ffint.d.l $f17, $f17
xvfcmp.ceq.d VT0, VM0, x1
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L26
xvfcmp.clt.d VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.align 3
// .L26..L29: per-lane tie-break chain -- when a lane equals the winning
// value, prefer the smaller index
.L26:
xvfcmp.ceq.d VT0, VM0, x2
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L27
xvfcmp.clt.d VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.align 3
.L27:
xvfcmp.ceq.d VT0, VM0, x3
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L28
xvfcmp.clt.d VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3
.L28:
xvfcmp.ceq.d VT0, VM0, x4
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L29
xvfcmp.clt.d VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.align 3
// .L29: winning index is lane 0 of VI0 ($f20)
.L29:
movfr2gr.d i0, $f20
.align 3
// .L21/.L22: scalar tail; $f15/$f20 alias lane 0 of VM0/VI0
.L21: //N<8
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3
.L22:
fld.d $f9, X, 0
addi.d I, I, -1
fcmp.clt.d $fcc0, $f9, $f15
add.d X, X, INCX
fsel $f15, $f15, $f9, $fcc0
fsel $f20, $f20, $f21, $fcc0
addi.d i1, i1, 1
movgr2fr.d $f21, i1
blt $r0, I, .L22
movfr2gr.d i0, $f20
.align 3
// return the 1-based index in $r4 (i0 == $r17)
.L999:
move $r4, $r17
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,225 @@
// LoongArch64 LSX (128-bit) kernel: 1-based index of the MINIMUM value of a
// double-precision vector X[0..N-1] with stride INCX; result returned in $r4.
// Mirror of the LSX max kernel with all fcmp.clt operand pairs swapped
// (VX1 < VX0 keeps the smaller element).
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
#define x1 $vr9
#define x2 $vr10
#define x3 $vr11
#define x4 $vr12
#define VX0 $vr13
#define VX1 $vr14
#define VM0 $vr15
#define VM1 $vr16
#define VINC2 $vr17
#define VINC4 $vr18
#define VI0 $vr20
#define VI1 $vr21
#define VI2 $vr22
#define VI3 $vr8
#define VI4 $vr19
#define VT0 $vr23
PROLOGUE
li.d i0, 0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
// unit-stride setup: index lanes seeded so the first vadd.d in .L10
// produces 1-based element indices
vld VM0, X, 0
addi.d i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
slli.d i0, i0, 1 //2
vreplgr2vr.d VINC2, i0
slli.d i0, i0, 1 //4
vreplgr2vr.d VINC4, i0
addi.d i0, i0, -7
vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
vinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 3
vinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
vinsgr2vr.d VI0, i0, 1 //2
.align 3
// .L10: unit-stride main loop, 8 doubles per iteration; running min in VM0,
// matching indices in VI0
.L10:
vld VX0, X, 0 * SIZE
vadd.d VI1, VI1, VINC4
vld VX1, X, 2 * SIZE
vadd.d VI2, VI1, VINC2
vfcmp.clt.d VT0, VX1, VX0
vbitsel.v x1, VX0, VX1, VT0
vbitsel.v x2, VI1, VI2, VT0
vld VX0, X, 4 * SIZE
vadd.d VI1, VI2, VINC2
vld VX1, X, 6 * SIZE
vadd.d VI2, VI1, VINC2
vfcmp.clt.d VT0, VX1, VX0
addi.d I, I, -1
vbitsel.v x3, VX0, VX1, VT0
vbitsel.v x4, VI1, VI2, VT0
vfcmp.clt.d VT0, x3, x1
addi.d X, X, 8 * SIZE
vbitsel.v x1, x1, x3, VT0
vbitsel.v x2, x2, x4, VT0
vfcmp.clt.d VT0, x1, VM0
vbitsel.v VM0, VM0, x1, VT0
vbitsel.v VI0, VI0, x2, VT0
blt $r0, I, .L10
.align 3
// .L15: reduce the two lanes to a scalar winner.
// NOTE(review): fcmp.ceq.d compares the vfcmp mask lane ($f23 = 0 or
// all-ones/NaN) with 1.0, which looks always-false -- verify the
// smallest-index tie-break is reachable.
.L15:
vreplvei.d VI1, VI0, 0
vreplvei.d VI2, VI0, 1
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
li.d TEMP, 1 //
movgr2fr.d $f17, TEMP
ffint.d.l $f17, $f17
vfcmp.ceq.d VT0, x2, x1
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L26
vfcmp.clt.d VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
b .L27
.align 3
// .L20/.L24: strided path -- gather elements one ld.d at a time
.L20: // INCX!=1
move TEMP, X
addi.d i0, i0, 1
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L21
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t2, 1
slli.d i0, i0, 1 //2
vreplgr2vr.d VINC2, i0
slli.d i0, i0, 1 //4
vreplgr2vr.d VINC4, i0
addi.d i0, i0, -7
vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
vinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 3
vinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
vinsgr2vr.d VI0, i0, 1 //2
.align 3
.L24:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vadd.d VI1, VI1, VINC4
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t1, 0
vinsgr2vr.d VX1, t2, 1
vadd.d VI2, VI1, VINC2
vfcmp.clt.d VT0, VX1, VX0
vbitsel.v x1, VX0, VX1, VT0
vbitsel.v x2, VI1, VI2, VT0
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vadd.d VI1, VI2, VINC2
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t1, 0
vinsgr2vr.d VX1, t2, 1
vadd.d VI2, VI1, VINC2
vfcmp.clt.d VT0, VX1, VX0
vbitsel.v x3, VX0, VX1, VT0
vbitsel.v x4, VI1, VI2, VT0
vfcmp.clt.d VT0, x3, x1
vbitsel.v x1, x1, x3, VT0
vbitsel.v x2, x2, x4, VT0
vfcmp.clt.d VT0, x1, VM0
addi.d I, I, -1
vbitsel.v VM0, VM0, x1, VT0
vbitsel.v VI0, VI0, x2, VT0
blt $r0, I, .L24
.align 3
// .L25: lane reduction for the strided path (same caveat as .L15)
.L25:
vreplvei.d VI1, VI0, 0
vreplvei.d VI2, VI0, 1
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
li.d TEMP, 1 //
movgr2fr.d $f17, TEMP
ffint.d.l $f17, $f17
vfcmp.ceq.d VT0, x2, x1
fcmp.ceq.d $fcc0, $f23, $f17
bceqz $fcc0, .L26
vfcmp.clt.d VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
b .L27
.align 3
// .L26: lanes unequal -- keep the smaller value and its index
.L26:
vfcmp.clt.d VT0, x2, x1
vbitsel.v VM0, x1, x2, VT0
vbitsel.v VI0, VI1, VI2, VT0
.align 3
// .L27: winning index lives in lane 0 of VI0 ($f20)
.L27:
movfr2gr.d i0, $f20
.align 3
// .L21/.L22: scalar tail; $f15/$f20 alias lane 0 of VM0/VI0
.L21: //N<8
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3
.L22:
fld.d $f9, X, 0
addi.d I, I, -1
fcmp.clt.d $fcc0, $f9, $f15
add.d X, X, INCX
fsel $f15, $f15, $f9, $fcc0
fsel $f20, $f20, $f21, $fcc0
addi.d i1, i1, 1
movgr2fr.d $f21, i1
blt $r0, I, .L22
movfr2gr.d i0, $f20
.align 3
// return the 1-based index in $r4 (i0 == $r17)
.L999:
move $r4, $r17
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,378 @@
// LoongArch64 LASX (256-bit) kernel: 1-based index of the element with the
// largest MAGNITUDE (xvfmaxa.s = max by absolute value) of a single-precision
// vector X[0..N-1] with stride INCX; result returned in $r4 (isamax-style).
// 8 floats per YMM; lane reduction is done in two halves (.L15/.L25 for
// lanes 0-3, .L252 for lanes 4-7) using copies saved in VX0 (indices) and
// VX1 (values).
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
#define x1 $xr9
#define x2 $xr10
#define x3 $xr11
#define x4 $xr12
#define VX0 $xr13
#define VX1 $xr14
#define VM0 $xr15
#define VM1 $xr16
#define VINC4 $xr17
#define VINC8 $xr18
#define VI0 $xr20
#define VI1 $xr21
#define VI2 $xr22
#define VI3 $xr8
#define VI4 $xr19
#define VT0 $xr23
PROLOGUE
li.d i0, 0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
// unit-stride setup: VI1 seeded so the first xvadd.w in .L10 yields
// 1-based indices; VI0 pre-loaded with indices 1..8 for the initial VM0
xvld VM0, X, 0
addi.w i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
slli.w i0, i0, 3 //8
xvreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 5
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 3 //4
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 4 //5
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 5 //6
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 6 //7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7 //8
.align 3
// .L10: unit-stride main loop, 8 floats per iteration; per-lane running
// |max| in VM0, matching indices in VI0 (keep old value/index on ties)
.L10:
xvld VX0, X, 0 * SIZE
addi.d I, I, -1
xvadd.w VI1, VI1, VINC8
xvfmaxa.s VM1, VX0, VM0
xvfcmp.ceq.s VT0, VM0, VM1
addi.d X, X, 8 * SIZE
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
blt $r0, I, .L10
.align 3
// .L15: save index/value copies in VX0/VX1, then reduce lanes 0-3 and jump
// into the shared tie-break chain.
// NOTE(review): fcmp.ceq.s compares the vfcmp mask lane ($f23 = 0 or
// all-ones/NaN) with 1.0 -- looks always-false; verify the tie-break
// (prefer smaller index) path is reachable.
.L15:
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvfmaxa.s VM1, x1, x2
xvfcmp.ceq.s VT0, x1, VM1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmaxa.s VM0, x3, x4
xvfcmp.ceq.s VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
xvfmaxa.s VM0, VM0, VM1
xvfcmp.ceq.s VT0, VM0, VM1
xvbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.w $f17, TEMP
ffint.s.w $f17, $f17
xvfcmp.ceq.s VT0, VM0, x1
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L26
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
b .L26
.align 3
// .L20/.L24: strided path -- gather 8 floats one ld.w at a time
.L20: // INCX!=1
move TEMP, X
addi.w i0, i0, 1
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L21
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t2, 1
xvinsgr2vr.w VM0, t3, 2
xvinsgr2vr.w VM0, t4, 3
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t1, 4
xvinsgr2vr.w VM0, t2, 5
xvinsgr2vr.w VM0, t3, 6
xvinsgr2vr.w VM0, t4, 7
slli.w i0, i0, 3 //8
xvreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 5
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 3 //4
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 4 //5
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 5 //6
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 6 //7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7 //8
.align 3
.L24:
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvadd.w VI1, VI1, VINC8
xvfmaxa.s VM1, VX0, VM0
xvfcmp.ceq.s VT0, VM1, VM0
addi.d I, I, -1
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
blt $r0, I, .L24
.align 3
// .L25: strided-path reduction of lanes 0-3 (values/indices also copied to
// VX1/VX0 for the second half in .L252)
.L25:
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvfmaxa.s VM1, x1, x2
xvfcmp.ceq.s VT0, x1, VM1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmaxa.s VM0, x3, x4
xvfcmp.ceq.s VT0, x3, VM0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfmaxa.s VM0, VM0, VM1
xvfcmp.ceq.s VT0, VM0, VM1
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.w $f17, TEMP
ffint.s.w $f17, $f17
xvfcmp.ceq.s VT0, VM0, x1
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L26
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.align 3
// .L26..L29: per-lane tie-break chain for the first half (prefer smaller
// index when a lane equals the winning value)
.L26:
xvfcmp.ceq.s VT0, VM0, x2
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L27
xvfcmp.clt.s VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.align 3
.L27:
xvfcmp.ceq.s VT0, VM0, x3
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L28
xvfcmp.clt.s VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3
.L28:
xvfcmp.ceq.s VT0, VM0, x4
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L29
xvfcmp.clt.s VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.align 3
// .L29: stash first-half winning index ($f20 = VI0 lane 0) in $f16
.L29:
fmov.s $f16, $f20
.align 3
// .L252: restore saved VI0/VM0 copies and reduce lanes 4-7 the same way;
// $f13 keeps the first-half winning value (lane 0 of VM0 via $f15)
.L252:
xvxor.v VI0, VI0, VI0
xvor.v VI0, VI0, VX0
fmov.s $f13, $f15
xvxor.v VM0, VM0, VM0
xvor.v VM0, VM0, VX1
xvpickve.w VI1, VI0, 4
xvpickve.w VI2, VI0, 5
xvpickve.w VI3, VI0, 6
xvpickve.w VI4, VI0, 7
xvpickve.w x1, VM0, 4
xvpickve.w x2, VM0, 5
xvpickve.w x3, VM0, 6
xvpickve.w x4, VM0, 7
xvfmaxa.s VM1, x1, x2
xvfcmp.ceq.s VT0, x1, VM1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmaxa.s VM0, x3, x4
xvfcmp.ceq.s VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
xvfmaxa.s VM0, VM0, VM1
xvfcmp.ceq.s VT0, VM0, VM1
xvbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.w $f17, TEMP
ffint.s.w $f17, $f17
xvfcmp.ceq.s VT0, VM0, x1
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L262
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.align 3
// .L262..L292: tie-break chain for the second half, then combine the two
// half-winners
.L262:
xvfcmp.ceq.s VT0, VM0, x2
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L272
xvfcmp.clt.s VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.align 3
.L272:
xvfcmp.ceq.s VT0, VM0, x3
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L282
xvfcmp.clt.s VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3
.L282:
xvfcmp.ceq.s VT0, VM0, x4
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L292
xvfcmp.clt.s VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.align 3
// .L292: pick between first-half (VX0/$f13) and second-half winners.
// NOTE(review): the index selected on the VX0-wins side is VI1, which at
// this point holds a lane of the second half -- confirm the intended
// first-half index ($f16 from .L29) is not what should be used here.
.L292:
xvfmaxa.s VM0, VX0, VM0
xvfcmp.ceq.s VT0, VM0, VX0
xvbitsel.v VI0, VI0, VI1, VT0
movfr2gr.s i0, $f20
// .L21/.L22: scalar tail; lane 0 aliases ($f9=x1, $f15=VM0, $f20=VI0,
// $f21=VI1) let the vector selects continue the scalar scan
.L21: // N<8
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3
.L22:
fld.s $f9, X, 0
addi.d I, I, -1
xvfmaxa.s VM1, x1, VM0
xvfcmp.ceq.s VT0, VM0, VM1
add.d X, X, INCX
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
addi.d i1, i1, 1
movgr2fr.d $f21, i1
blt $r0, I, .L22
movfr2gr.s i0, $f20
.align 3
// return the 1-based index in $r4 (i0 == $r17)
.L999:
move $r4, $r17
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,275 @@
// LoongArch64 LSX (128-bit) kernel: 1-based index of the element with the
// largest MAGNITUDE (vfmaxa.s = max by absolute value) of a single-precision
// vector X[0..N-1] with stride INCX; result returned in $r4 (isamax-style).
// Two XMM loads (8 floats) per main-loop iteration.
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
#define x1 $vr9
#define x2 $vr10
#define x3 $vr11
#define x4 $vr12
#define VX0 $vr13
#define VX1 $vr14
#define VM0 $vr15
#define VM1 $vr16
#define VINC4 $vr17
#define VINC8 $vr18
#define VI0 $vr20
#define VI1 $vr21
#define VI2 $vr22
#define VI3 $vr8
#define VI4 $vr19
#define VT0 $vr23
PROLOGUE
li.d i0, 0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
// unit-stride setup: VI1 seeded so the first vadd.w in .L10 yields 1-based
// indices; VI0 pre-loaded with indices 1..4 for the initial VM0
vld VM0, X, 0
addi.w i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
slli.w i0, i0, 2 //4
vreplgr2vr.w VINC4, i0
slli.w i0, i0, 1 //8
vreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 5
vinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 3 //4
.align 3
// .L10: unit-stride main loop, 8 floats per iteration; per-lane running
// |max| in VM0, matching indices in VI0 (keep old value/index on ties)
.L10:
vld VX0, X, 0 * SIZE
vadd.w VI1, VI1, VINC8
vld VX1, X, 4 * SIZE
vadd.w VI2, VI1, VINC4
vfmaxa.s VM1, VX0, VX1
vfcmp.ceq.s VT0, VX0, VM1
addi.d I, I, -1
vbitsel.v VI2, VI2, VI1, VT0
vfmaxa.s VM1, VM0, VM1
vfcmp.ceq.s VT0, VM0, VM1
addi.d X, X, 8 * SIZE
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, VI2, VI0, VT0
blt $r0, I, .L10
.align 3
// .L15: reduce the 4 lanes pairwise, then jump into the shared tie-break
// chain. NOTE(review): fcmp.ceq.s compares the vfcmp mask lane ($f23 = 0 or
// all-ones/NaN) with 1.0 -- looks always-false; verify the smallest-index
// tie-break is reachable.
.L15:
vreplvei.w VI1, VI0, 0
vreplvei.w VI2, VI0, 1
vreplvei.w VI3, VI0, 2
vreplvei.w VI4, VI0, 3
vreplvei.w x1, VM0, 0
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfmaxa.s VM1, x1, x2
vfcmp.ceq.s VT0, VM1, x1
vbitsel.v VINC4, VI2, VI1, VT0
vfmaxa.s VM0, x3, x4
vfcmp.ceq.s VT0, x3, VM0
vbitsel.v VINC8, VI4, VI3, VT0
vfmaxa.s VM0, VM0, VM1
vfcmp.ceq.s VT0, VM0, VM1
vbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.w $f17, TEMP
ffint.s.w $f17, $f17
vfcmp.ceq.s VT0, VM0, x1
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L26
vfcmp.clt.s VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
b .L26
.align 3
// .L20/.L24: strided path -- gather 8 floats one ld.w at a time
.L20: // INCX!=1
move TEMP, X
addi.w i0, i0, 1
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.w VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L21
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.w VM0, t2, 1
vinsgr2vr.w VM0, t3, 2
vinsgr2vr.w VM0, t4, 3
slli.w i0, i0, 2 //4
vreplgr2vr.w VINC4, i0
slli.w i0, i0, 1 //8
vreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 5
vinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 3 //4
.align 3
.L24:
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
vadd.w VI1, VI1, VINC8
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vadd.w VI2, VI1, VINC4
vfmaxa.s VM1, VX0, VX1
vfcmp.ceq.s VT0, VX0, VM1
vbitsel.v VI2, VI2, VI1, VT0
vfmaxa.s VM1, VM0, VM1
vfcmp.ceq.s VT0, VM0, VM1
addi.d I, I, -1
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, VI2, VI0, VT0
blt $r0, I, .L24
.align 3
// .L25: lane reduction for the strided path (same structure/caveat as .L15)
.L25:
vreplvei.w VI1, VI0, 0
vreplvei.w VI2, VI0, 1
vreplvei.w VI3, VI0, 2
vreplvei.w VI4, VI0, 3
vreplvei.w x1, VM0, 0
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfmaxa.s VM1, x1, x2
vfcmp.ceq.s VT0, VM1, x1
vbitsel.v VINC4, VI2, VI1, VT0
vfmaxa.s VM0, x3, x4
vfcmp.ceq.s VT0, x3, VM0
vbitsel.v VINC8, VI4, VI3, VT0
vfmaxa.s VM0, VM0, VM1
vfcmp.ceq.s VT0, VM0, VM1
vbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.w $f17, TEMP
ffint.s.w $f17, $f17
vfcmp.ceq.s VT0, VM0, x1
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L26
vfcmp.clt.s VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
.align 3
// .L26..L29: per-lane tie-break chain (prefer smaller index when a lane
// equals the winning value)
.L26:
vfcmp.ceq.s VT0, VM0, x2
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L27
vfcmp.clt.s VT0, VI2, VI0
vbitsel.v VI0, VI0, VI2, VT0
.align 3
.L27:
vfcmp.ceq.s VT0, VM0, x3
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L28
vfcmp.clt.s VT0, VI3, VI0
vbitsel.v VI0, VI0, VI3, VT0
.align 3
.L28:
vfcmp.ceq.s VT0, VM0, x4
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L29
vfcmp.clt.s VT0, VI4, VI0
vbitsel.v VI0, VI0, VI4, VT0
.align 3
// .L29: winning index is lane 0 of VI0 ($f20)
.L29:
movfr2gr.s i0, $f20
.align 3
// .L21/.L22: scalar tail; lane 0 aliases ($f9=x1, $f15=VM0, $f20=VI0,
// $f21=VI1) let the vector selects continue the scalar scan
.L21: //N<8
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3
.L22:
fld.s $f9, X, 0
addi.d I, I, -1
vfmaxa.s VM1, x1, VM0
vfcmp.ceq.s VT0, VM0, VM1
add.d X, X, INCX
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, VI1, VI0, VT0
addi.d i1, i1, 1
movgr2fr.d $f21, i1
blt $r0, I, .L22
movfr2gr.s i0, $f20
.align 3
// return the 1-based index in $r4 (i0 == $r17)
.L999:
move $r4, $r17
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,378 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
#define x1 $xr9
#define x2 $xr10
#define x3 $xr11
#define x4 $xr12
#define VX0 $xr13
#define VX1 $xr14
#define VM0 $xr15
#define VM1 $xr16
#define VINC4 $xr17
#define VINC8 $xr18
#define VI0 $xr20
#define VI1 $xr21
#define VI2 $xr22
#define VI3 $xr8
#define VI4 $xr19
#define VT0 $xr23
PROLOGUE
li.d i0, 0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
xvld VM0, X, 0
addi.w i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
slli.w i0, i0, 3 //8
xvreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 5
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 3 //4
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 4 //5
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 5 //6
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 6 //7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7 //8
.align 3
.L10:
xvld VX0, X, 0 * SIZE
addi.d I, I, -1
xvadd.w VI1, VI1, VINC8
xvfmina.s VM1, VX0, VM0
xvfcmp.ceq.s VT0, VM0, VM1
addi.d X, X, 8 * SIZE
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
blt $r0, I, .L10
.align 3
.L15:
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvfmina.s VM1, x1, x2
xvfcmp.ceq.s VT0, x1, VM1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmina.s VM0, x3, x4
xvfcmp.ceq.s VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
xvfmina.s VM0, VM0, VM1
xvfcmp.ceq.s VT0, VM0, VM1
xvbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.w $f17, TEMP
ffint.s.w $f17, $f17
xvfcmp.ceq.s VT0, VM0, x1
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L26
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
b .L26
.align 3
.L20: // INCX!=1
move TEMP, X
addi.w i0, i0, 1
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L21
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t2, 1
xvinsgr2vr.w VM0, t3, 2
xvinsgr2vr.w VM0, t4, 3
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t1, 4
xvinsgr2vr.w VM0, t2, 5
xvinsgr2vr.w VM0, t3, 6
xvinsgr2vr.w VM0, t4, 7
slli.w i0, i0, 3 //8
xvreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 5
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 3 //4
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 4 //5
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 5 //6
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 6 //7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7 //8
.align 3
.L24:
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvadd.w VI1, VI1, VINC8
xvfmina.s VM1, VX0, VM0
xvfcmp.ceq.s VT0, VM1, VM0
addi.d I, I, -1
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
blt $r0, I, .L24
.align 3
.L25:
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvfmina.s VM1, x1, x2
xvfcmp.ceq.s VT0, x1, VM1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmina.s VM0, x3, x4
xvfcmp.ceq.s VT0, x3, VM0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfmina.s VM0, VM0, VM1
xvfcmp.ceq.s VT0, VM0, VM1
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.w $f17, TEMP
ffint.s.w $f17, $f17
xvfcmp.ceq.s VT0, VM0, x1
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L26
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.align 3
.L26:
xvfcmp.ceq.s VT0, VM0, x2
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L27
xvfcmp.clt.s VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.align 3
.L27:
xvfcmp.ceq.s VT0, VM0, x3
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L28
xvfcmp.clt.s VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3
.L28:
xvfcmp.ceq.s VT0, VM0, x4
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L29
xvfcmp.clt.s VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.align 3
.L29:
fmov.s $f16, $f20
.align 3
.L252:
xvxor.v VI0, VI0, VI0
xvor.v VI0, VI0, VX0
fmov.s $f13, $f15
xvxor.v VM0, VM0, VM0
xvor.v VM0, VM0, VX1
xvpickve.w VI1, VI0, 4
xvpickve.w VI2, VI0, 5
xvpickve.w VI3, VI0, 6
xvpickve.w VI4, VI0, 7
xvpickve.w x1, VM0, 4
xvpickve.w x2, VM0, 5
xvpickve.w x3, VM0, 6
xvpickve.w x4, VM0, 7
xvfmina.s VM1, x1, x2
xvfcmp.ceq.s VT0, x1, VM1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmina.s VM0, x3, x4
xvfcmp.ceq.s VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
xvfmina.s VM0, VM0, VM1
xvfcmp.ceq.s VT0, VM0, VM1
xvbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.w $f17, TEMP
ffint.s.w $f17, $f17
xvfcmp.ceq.s VT0, VM0, x1
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L262
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.align 3
.L262:
xvfcmp.ceq.s VT0, VM0, x2
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L272
xvfcmp.clt.s VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.align 3
.L272:
xvfcmp.ceq.s VT0, VM0, x3
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L282
xvfcmp.clt.s VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3
.L282:
xvfcmp.ceq.s VT0, VM0, x4
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L292
xvfcmp.clt.s VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.align 3
.L292:
xvfmina.s VM0, VX0, VM0
xvfcmp.ceq.s VT0, VM0, VX0
xvbitsel.v VI0, VI0, VI1, VT0
movfr2gr.s i0, $f20
.L21: //N<8
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3
.L22:
fld.s $f9, X, 0
addi.d I, I, -1
xvfmina.s VM1, x1, VM0
xvfcmp.ceq.s VT0, VM0, VM1
add.d X, X, INCX
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
addi.d i1, i1, 1
movgr2fr.d $f21, i1
blt $r0, I, .L22
movfr2gr.s i0, $f20
.align 3
.L999:
move $r4, $r17
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,275 @@
/* LSX kernel: returns (in $r4) the 1-based index of the element of X with the
 * smallest absolute value (vfmina.s selects by magnitude).
 * Inputs:  N = $r4 element count, X = $r5 data pointer, INCX = $r6 stride in
 *          elements.  Returns 0 when N <= 0 or INCX <= 0.
 * Strategy: unit-stride path (.L10) consumes 8 floats per iteration in two
 * 128-bit vectors, keeping a per-lane running minimum VM0 and its index VI0;
 * strided path (.L20/.L24) gathers 8 elements with scalar loads first.  The
 * four lanes are then reduced (.L15/.L25, tie-break at .L26-.L29) and the
 * N%8 tail is finished in a lane-0 scalar/vector hybrid loop (.L22). */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
/* NOTE: vector registers are aliased by their low float register, e.g.
 * x1=$vr9 <-> $f9, VM0=$vr15 <-> $f15, VI0=$vr20 <-> $f20, VT0=$vr23 <-> $f23;
 * the scalar tail exploits this aliasing. */
#define x1 $vr9
#define x2 $vr10
#define x3 $vr11
#define x4 $vr12
#define VX0 $vr13
#define VX1 $vr14
#define VM0 $vr15
#define VM1 $vr16
#define VINC4 $vr17
#define VINC8 $vr18
#define VI0 $vr20
#define VI1 $vr21
#define VI2 $vr22
#define VI3 $vr8
#define VI4 $vr19
#define VT0 $vr23
PROLOGUE
// Degenerate sizes/strides: return index 0.
li.d i0, 0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
// Unit-stride entry: seed VM0 with the first 4 elements; index base is 1.
vld VM0, X, 0
addi.w i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
slli.w i0, i0, 2 //4
vreplgr2vr.w VINC4, i0
slli.w i0, i0, 1 //8
vreplgr2vr.w VINC8, i0
// Build the starting per-lane index vectors; the loop pre-increments VI1 by 8,
// so lanes start at i0-15..  VI0 starts at indices 1..4 to match the VM0 seed.
addi.w i0, i0, -15
vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 5
vinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 3 //4
.align 3
.L10:
// Main unit-stride loop: min-abs across two loads, then merge into VM0/VI0.
vld VX0, X, 0 * SIZE
vadd.w VI1, VI1, VINC8
vld VX1, X, 4 * SIZE
vadd.w VI2, VI1, VINC4
vfmina.s VM1, VX0, VX1
vfcmp.ceq.s VT0, VX0, VM1
addi.d I, I, -1
vbitsel.v VI2, VI2, VI1, VT0
vfmina.s VM1, VM0, VM1
vfcmp.ceq.s VT0, VM0, VM1
addi.d X, X, 8 * SIZE
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, VI2, VI0, VT0
blt $r0, I, .L10
.align 3
.L15:
// Horizontal reduction of the 4 lanes (unit-stride path).
vreplvei.w VI1, VI0, 0
vreplvei.w VI2, VI0, 1
vreplvei.w VI3, VI0, 2
vreplvei.w VI4, VI0, 3
vreplvei.w x1, VM0, 0
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfmina.s VM1, x1, x2
vfcmp.ceq.s VT0, VM1, x1
vbitsel.v VINC4, VI2, VI1, VT0
vfmina.s VM0, x3, x4
vfcmp.ceq.s VT0, x3, VM0
vbitsel.v VINC8, VI4, VI3, VT0
vfmina.s VM0, VM0, VM1
vfcmp.ceq.s VT0, VM0, VM1
vbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.w $f17, TEMP
ffint.s.w $f17, $f17
// NOTE(review): VT0 after vfcmp.ceq.s holds an all-ones/all-zeros bit mask,
// yet $f23 (VT0 lane 0) is compared against the float constant 1.0f here.
// An all-ones lane is a NaN pattern, so this ceq looks like it can never be
// true -- confirm whether the lowest-index tie-break path is ever taken.
vfcmp.ceq.s VT0, VM0, x1
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L26
vfcmp.clt.s VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
b .L26
.align 3
.L20: // INCX!=1
// Strided entry: gather the first element(s) to seed VM0.
move TEMP, X
addi.w i0, i0, 1
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.w VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L21
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.w VM0, t2, 1
vinsgr2vr.w VM0, t3, 2
vinsgr2vr.w VM0, t4, 3
slli.w i0, i0, 2 //4
vreplgr2vr.w VINC4, i0
slli.w i0, i0, 1 //8
vreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 5
vinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 3 //4
.align 3
.L24:
// Strided main loop: gather 8 elements, then same min-abs/index merge as .L10.
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
vadd.w VI1, VI1, VINC8
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vadd.w VI2, VI1, VINC4
vfmina.s VM1, VX0, VX1
vfcmp.ceq.s VT0, VX0, VM1
vbitsel.v VI2, VI2, VI1, VT0
vfmina.s VM1, VM0, VM1
vfcmp.ceq.s VT0, VM0, VM1
addi.d I, I, -1
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, VI2, VI0, VT0
blt $r0, I, .L24
.align 3
.L25:
// Horizontal reduction of the 4 lanes (strided path) -- mirrors .L15.
vreplvei.w VI1, VI0, 0
vreplvei.w VI2, VI0, 1
vreplvei.w VI3, VI0, 2
vreplvei.w VI4, VI0, 3
vreplvei.w x1, VM0, 0
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfmina.s VM1, x1, x2
vfcmp.ceq.s VT0, VM1, x1
vbitsel.v VINC4, VI2, VI1, VT0
vfmina.s VM0, x3, x4
vfcmp.ceq.s VT0, x3, VM0
vbitsel.v VINC8, VI4, VI3, VT0
vfmina.s VM0, VM0, VM1
vfcmp.ceq.s VT0, VM0, VM1
vbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.w $f17, TEMP
ffint.s.w $f17, $f17
// NOTE(review): same mask-vs-1.0f comparison pattern as in .L15 -- verify.
vfcmp.ceq.s VT0, VM0, x1
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L26
vfcmp.clt.s VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
.align 3
.L26:
// .L26-.L29: for each lane value equal to the minimum, prefer the smaller index.
vfcmp.ceq.s VT0, VM0, x2
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L27
vfcmp.clt.s VT0, VI2, VI0
vbitsel.v VI0, VI0, VI2, VT0
.align 3
.L27:
vfcmp.ceq.s VT0, VM0, x3
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L28
vfcmp.clt.s VT0, VI3, VI0
vbitsel.v VI0, VI0, VI3, VT0
.align 3
.L28:
vfcmp.ceq.s VT0, VM0, x4
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L29
vfcmp.clt.s VT0, VI4, VI0
vbitsel.v VI0, VI0, VI4, VT0
.align 3
.L29:
// Winning index lives in VI0 lane 0 == $f20.
movfr2gr.s i0, $f20
.align 3
.L21: //N<8
// Scalar tail: i1 is the running 1-based index, mirrored into VI1 lane 0
// ($f21); the best value/index stay in VM0/VI0 lane 0.
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3
.L22:
// $f9 aliases x1 lane 0, so the vfmina below compares the freshly loaded
// element against the running minimum in lane 0.
fld.s $f9, X, 0
addi.d I, I, -1
vfmina.s VM1, x1, VM0
vfcmp.ceq.s VT0, VM0, VM1
add.d X, X, INCX
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, VI1, VI0, VT0
addi.d i1, i1, 1
movgr2fr.d $f21, i1
blt $r0, I, .L22
movfr2gr.s i0, $f20
.align 3
.L999:
// Return the index (i0 == $r17) in the ABI result register.
move $r4, $r17
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,375 @@
/* LASX kernel: returns (in $r4) the 1-based index of the largest element of X
 * (plain xvfcmp.clt.s ordering, no absolute value).
 * Inputs:  N = $r4 element count, X = $r5 data pointer, INCX = $r6 stride in
 *          elements.  Returns 0 when N <= 0 or INCX <= 0.
 * Strategy: 8 floats per iteration in one 256-bit vector, tracking per-lane
 * running maximum VM0 and index VI0; lanes 0-3 are reduced at .L15/.L25,
 * lanes 4-7 at .L252, halves are combined at .L292, and the N%8 tail is a
 * scalar loop (.L22).
 * FIX: the tail previously loaded each single-precision element with fld.d,
 * reading 8 bytes per 4-byte element and potentially past the end of X; it
 * now uses fld.s. */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
/* Vector registers alias their low float register: VM0=$xr15 <-> $f15,
 * VI0=$xr20 <-> $f20, VI1=$xr21 <-> $f21, VT0=$xr23 <-> $f23. */
#define x1 $xr9
#define x2 $xr10
#define x3 $xr11
#define x4 $xr12
#define VX0 $xr13
#define VX1 $xr14
#define VM0 $xr15
#define VM1 $xr16
#define VINC4 $xr17
#define VINC8 $xr18
#define VI0 $xr20
#define VI1 $xr21
#define VI2 $xr22
#define VI3 $xr8
#define VI4 $xr19
#define VT0 $xr23
PROLOGUE
// Degenerate sizes/strides: return index 0.
li.d i0, 0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
// Unit-stride entry: seed VM0 with the first 8 elements; index base is 1.
xvld VM0, X, 0
addi.w i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
slli.w i0, i0, 3 //8
xvreplgr2vr.w VINC8, i0
// Starting per-lane index vectors; the loop pre-increments VI1 by 8.
addi.w i0, i0, -15
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 5
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 3 //4
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 4 //5
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 5 //6
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 6 //7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7 //8
.align 3
.L10:
// Main unit-stride loop: keep per-lane max and its index.
xvld VX0, X, 0 * SIZE
xvadd.w VI1, VI1, VINC8
xvfcmp.clt.s VT0, VM0, VX0
addi.d I, I, -1
xvbitsel.v VM0, VM0, VX0, VT0
xvbitsel.v VI0, VI0, VI1, VT0
addi.d X, X, 8 * SIZE
blt $r0, I, .L10
.align 3
.L15:
// Save full VI0/VM0 (VX0/VX1) for the upper-half reduction at .L252,
// then reduce lanes 0-3.
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvfcmp.clt.s VT0, x1, x2
xvbitsel.v VM1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
xvfcmp.clt.s VT0, x3, x4
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfcmp.clt.s VT0, VM0, VM1
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.w $f17, TEMP
ffint.s.w $f17, $f17
// NOTE(review): $f23 is the low lane of the VT0 compare mask (all-ones or
// all-zeros), compared here against 1.0f -- confirm the lowest-index
// tie-break branches are reachable.
xvfcmp.ceq.s VT0, VM0, x1
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L26
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
b .L26
.align 3
.L20: // INCX!=1
// Strided entry: gather the first 8 elements with scalar loads.
move TEMP, X
addi.w i0, i0, 1
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
srai.d I, N, 3
bge $r0, I, .L21
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t1, 0
xvinsgr2vr.w VM0, t2, 1
xvinsgr2vr.w VM0, t3, 2
xvinsgr2vr.w VM0, t4, 3
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t1, 4
xvinsgr2vr.w VM0, t2, 5
xvinsgr2vr.w VM0, t3, 6
xvinsgr2vr.w VM0, t4, 7
slli.w i0, i0, 3 //8
xvreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 5
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 3 //4
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 4 //5
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 5 //6
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 6 //7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7 //8
.align 3
.L24:
// Strided main loop: gather 8 elements, then same max/index merge as .L10.
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvadd.w VI1, VI1, VINC8
xvfcmp.clt.s VT0, VM0, VX0
addi.d I, I, -1
xvbitsel.v VM0, VM0, VX0, VT0
xvbitsel.v VI0, VI0, VI1, VT0
blt $r0, I, .L24
.align 3
.L25:
// Strided-path reduction of lanes 0-3 -- mirrors .L15.
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvfcmp.clt.s VT0, x1, x2
xvbitsel.v VM1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
xvfcmp.clt.s VT0, x3, x4
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfcmp.clt.s VT0, VM0, VM1
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.w $f17, TEMP
ffint.s.w $f17, $f17
xvfcmp.ceq.s VT0, VM0, x1
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L26
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.align 3
.L26:
// .L26-.L29: per-lane tie-break toward the smaller index.
xvfcmp.ceq.s VT0, VM0, x2
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L27
xvfcmp.clt.s VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.align 3
.L27:
xvfcmp.ceq.s VT0, VM0, x3
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L28
xvfcmp.clt.s VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3
.L28:
xvfcmp.ceq.s VT0, VM0, x4
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L29
xvfcmp.clt.s VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.align 3
.L29:
// Preserve lower-half winner (index in $f16, value stays in $f15 -> $f13).
fmov.s $f16, $f20
.align 3
.L252:
// Restore the saved full vectors and reduce lanes 4-7 the same way.
xvxor.v VI0, VI0, VI0
xvor.v VI0, VI0, VX0
fmov.s $f13, $f15
xvxor.v VM0, VM0, VM0
xvor.v VM0, VM0, VX1
xvpickve.w VI1, VI0, 4
xvpickve.w VI2, VI0, 5
xvpickve.w VI3, VI0, 6
xvpickve.w VI4, VI0, 7
xvpickve.w x1, VM0, 4
xvpickve.w x2, VM0, 5
xvpickve.w x3, VM0, 6
xvpickve.w x4, VM0, 7
xvfcmp.clt.s VT0, x1, x2
xvbitsel.v x1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
xvfcmp.clt.s VT0, x3, x4
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfcmp.clt.s VT0, VM0, x1
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.w $f17, TEMP
ffint.s.w $f17, $f17
xvfcmp.ceq.s VT0, VM0, x1
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L262
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.align 3
.L262:
xvfcmp.ceq.s VT0, VM0, x2
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L272
xvfcmp.clt.s VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.align 3
.L272:
xvfcmp.ceq.s VT0, VM0, x3
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L282
xvfcmp.clt.s VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3
.L282:
xvfcmp.ceq.s VT0, VM0, x4
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L292
xvfcmp.clt.s VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.align 3
.L292:
// Combine the two half-vector winners: keep the larger value's index.
fcmp.clt.s $fcc0, $f15, $f13
fsel $f15, $f15, $f13, $fcc0
fsel $f20, $f20, $f16, $fcc0
movfr2gr.s i0, $f20
.L21: //N<8
// Scalar tail over the remaining N%8 elements.
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3
.L22:
// was fld.d: this is a 4-byte element; fld.d read 8 bytes and could run
// past the end of X on the last element.
fld.s $f9, X, 0
addi.d I, I, -1
fcmp.clt.s $fcc0, $f15, $f9
add.d X, X, INCX
fsel $f15, $f15, $f9, $fcc0
fsel $f20, $f20, $f21, $fcc0
addi.d i1, i1, 1
movgr2fr.d $f21, i1
blt $r0, I, .L22
movfr2gr.s i0, $f20
.align 3
.L999:
// Return the 1-based index (i0 == $r17) in the ABI result register.
move $r4, $r17
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,272 @@
/* LSX kernel: returns (in $r4) the 1-based index of the largest element of X
 * (plain vfcmp.clt.s ordering, no absolute value).
 * Inputs:  N = $r4 element count, X = $r5 data pointer, INCX = $r6 stride in
 *          elements.  Returns 0 when N <= 0 or INCX <= 0.
 * Strategy: 8 floats per iteration in two 128-bit vectors with per-lane
 * running maximum VM0 and index VI0; lanes reduced at .L15/.L25 with
 * tie-breaks at .L26-.L29; N%8 tail handled by the scalar loop at .L22.
 * FIX: the tail previously loaded each single-precision element with fld.d,
 * reading 8 bytes per 4-byte element and potentially past the end of X; it
 * now uses fld.s. */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
/* Vector registers alias their low float register: VM0=$vr15 <-> $f15,
 * VI0=$vr20 <-> $f20, VI1=$vr21 <-> $f21, x1=$vr9 <-> $f9. */
#define x1 $vr9
#define x2 $vr10
#define x3 $vr11
#define x4 $vr12
#define VX0 $vr13
#define VX1 $vr14
#define VM0 $vr15
#define VM1 $vr16
#define VINC4 $vr17
#define VINC8 $vr18
#define VI0 $vr20
#define VI1 $vr21
#define VI2 $vr22
#define VI3 $vr8
#define VI4 $vr19
#define VT0 $vr23
PROLOGUE
// Degenerate sizes/strides: return index 0.
li.d i0, 0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
// Unit-stride entry: seed VM0 with the first 4 elements; index base is 1.
vld VM0, X, 0
addi.w i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
slli.w i0, i0, 2 //4
vreplgr2vr.w VINC4, i0
slli.w i0, i0, 1 //8
vreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 5
vinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 3 //4
.align 3
.L10:
// Main unit-stride loop: max across two loads, then merge into VM0/VI0.
vld VX0, X, 0 * SIZE
vadd.w VI1, VI1, VINC8
vld VX1, X, 4 * SIZE
vadd.w VI2, VI1, VINC4
vfcmp.clt.s VT0, VX0, VX1
addi.d I, I, -1
vbitsel.v VM1, VX0, VX1, VT0
vbitsel.v VI2, VI1, VI2, VT0
vfcmp.clt.s VT0, VM0, VM1
addi.d X, X, 8 * SIZE
vbitsel.v VM0, VM0, VM1, VT0
vbitsel.v VI0, VI0, VI2, VT0
blt $r0, I, .L10
.align 3
.L15:
// Horizontal reduction of the 4 lanes (unit-stride path).
vreplvei.w VI1, VI0, 0
vreplvei.w VI2, VI0, 1
vreplvei.w VI3, VI0, 2
vreplvei.w VI4, VI0, 3
vreplvei.w x1, VM0, 0
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfcmp.clt.s VT0, x1, x2
vbitsel.v VM1, x1, x2, VT0
vbitsel.v VINC4, VI1, VI2, VT0
vfcmp.clt.s VT0, x3, x4
vbitsel.v VM0, x3, x4, VT0
vbitsel.v VINC8, VI3, VI4, VT0
vfcmp.clt.s VT0, VM0, VM1
vbitsel.v VM0, VM0, VM1, VT0
vbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.w $f17, TEMP
ffint.s.w $f17, $f17
// NOTE(review): $f23 is the low lane of the VT0 compare mask (all-ones or
// all-zeros) compared against 1.0f -- confirm the tie-break path is reachable.
vfcmp.ceq.s VT0, VM0, x1
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L26
vfcmp.clt.s VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
b .L26
.align 3
.L20: // INCX!=1
// Strided entry: gather the first element(s) to seed VM0.
move TEMP, X
addi.w i0, i0, 1
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.w VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L21
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.w VM0, t2, 1
vinsgr2vr.w VM0, t3, 2
vinsgr2vr.w VM0, t4, 3
slli.w i0, i0, 2 //4
vreplgr2vr.w VINC4, i0
slli.w i0, i0, 1 //8
vreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 5
vinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 3 //4
.align 3
.L24:
// Strided main loop: gather 8 elements, then same max/index merge as .L10.
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
vadd.w VI1, VI1, VINC8
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vadd.w VI2, VI1, VINC4
vfcmp.clt.s VT0, VX0, VX1
addi.d I, I, -1
vbitsel.v VM1, VX0, VX1, VT0
vbitsel.v VI2, VI1, VI2, VT0
vfcmp.clt.s VT0, VM0, VM1
vbitsel.v VM0, VM0, VM1, VT0
vbitsel.v VI0, VI0, VI2, VT0
blt $r0, I, .L24
.align 3
.L25:
// Strided-path lane reduction -- mirrors .L15.
vreplvei.w VI1, VI0, 0
vreplvei.w VI2, VI0, 1
vreplvei.w VI3, VI0, 2
vreplvei.w VI4, VI0, 3
vreplvei.w x1, VM0, 0
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfcmp.clt.s VT0, x1, x2
vbitsel.v VM1, x1, x2, VT0
vbitsel.v VINC4, VI1, VI2, VT0
vfcmp.clt.s VT0, x3, x4
vbitsel.v VM0, x3, x4, VT0
vbitsel.v VINC8, VI3, VI4, VT0
vfcmp.clt.s VT0, VM0, VM1
vbitsel.v VM0, VM0, VM1, VT0
vbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.w $f17, TEMP
ffint.s.w $f17, $f17
vfcmp.ceq.s VT0, VM0, x1
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L26
vfcmp.clt.s VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
.align 3
.L26:
// .L26-.L29: per-lane tie-break toward the smaller index.
vfcmp.ceq.s VT0, VM0, x2
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L27
vfcmp.clt.s VT0, VI2, VI0
vbitsel.v VI0, VI0, VI2, VT0
.align 3
.L27:
vfcmp.ceq.s VT0, VM0, x3
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L28
vfcmp.clt.s VT0, VI3, VI0
vbitsel.v VI0, VI0, VI3, VT0
.align 3
.L28:
vfcmp.ceq.s VT0, VM0, x4
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L29
vfcmp.clt.s VT0, VI4, VI0
vbitsel.v VI0, VI0, VI4, VT0
.align 3
.L29:
// Winning index lives in VI0 lane 0 == $f20.
movfr2gr.s i0, $f20
.align 3
.L21: //N<8
// Scalar tail over the remaining N%8 elements.
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3
.L22:
// was fld.d: this is a 4-byte element; fld.d read 8 bytes and could run
// past the end of X on the last element.
fld.s $f9, X, 0
addi.d I, I, -1
fcmp.clt.s $fcc0, $f15, $f9
fsel $f15, $f15, $f9, $fcc0
fsel $f20, $f20, $f21, $fcc0
addi.d i1, i1, 1
add.d X, X, INCX
movgr2fr.d $f21, i1
blt $r0, I, .L22
movfr2gr.s i0, $f20
.align 3
.L999:
// Return the 1-based index (i0 == $r17) in the ABI result register.
move $r4, $r17
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,374 @@
/* LASX kernel: returns (in $r4) the 1-based index of the smallest element of
 * X (plain xvfcmp.clt.s ordering, no absolute value).
 * Inputs:  N = $r4 element count, X = $r5 data pointer, INCX = $r6 stride in
 *          elements.  Returns 0 when N <= 0 or INCX <= 0.
 * Strategy: 8 floats per iteration in one 256-bit vector, tracking per-lane
 * running minimum VM0 and index VI0; lanes 0-3 reduced at .L15/.L25, lanes
 * 4-7 at .L252, halves combined at .L292, N%8 tail at .L22.
 * FIX: the tail previously loaded each single-precision element with fld.d,
 * reading 8 bytes per 4-byte element and potentially past the end of X; it
 * now uses fld.s. */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
/* Vector registers alias their low float register: VM0=$xr15 <-> $f15,
 * VI0=$xr20 <-> $f20, VI1=$xr21 <-> $f21, VT0=$xr23 <-> $f23. */
#define x1 $xr9
#define x2 $xr10
#define x3 $xr11
#define x4 $xr12
#define VX0 $xr13
#define VX1 $xr14
#define VM0 $xr15
#define VM1 $xr16
#define VINC4 $xr17
#define VINC8 $xr18
#define VI0 $xr20
#define VI1 $xr21
#define VI2 $xr22
#define VI3 $xr8
#define VI4 $xr19
#define VT0 $xr23
PROLOGUE
// Degenerate sizes/strides: return index 0.
li.d i0, 0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
// Unit-stride entry: seed VM0 with the first 8 elements; index base is 1.
xvld VM0, X, 0
addi.w i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
slli.w i0, i0, 3 //8
xvreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 5
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 3 //4
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 4 //5
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 5 //6
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 6 //7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7 //8
.align 3
.L10:
// Main unit-stride loop: keep per-lane min and its index.
xvld VX0, X, 0 * SIZE
xvadd.w VI1, VI1, VINC8
xvfcmp.clt.s VT0, VX0, VM0
addi.d I, I, -1
xvbitsel.v VM0, VM0, VX0, VT0
xvbitsel.v VI0, VI0, VI1, VT0
addi.d X, X, 8 * SIZE
blt $r0, I, .L10
.align 3
.L15:
// Save full VI0/VM0 (VX0/VX1) for the upper-half reduction at .L252,
// then reduce lanes 0-3.
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvfcmp.clt.s VT0, x2, x1
xvbitsel.v VM1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
xvfcmp.clt.s VT0, x4, x3
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfcmp.clt.s VT0, VM1, VM0
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.w $f17, TEMP
ffint.s.w $f17, $f17
// NOTE(review): $f23 is the low lane of the VT0 compare mask (all-ones or
// all-zeros) compared against 1.0f -- confirm the tie-break path is reachable.
xvfcmp.ceq.s VT0, x1, VM0
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L26
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
b .L26
.align 3
.L20: // INCX!=1
// Strided entry: gather the first 8 elements with scalar loads.
move TEMP, X
addi.w i0, i0, 1
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
srai.d I, N, 3
bge $r0, I, .L21
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t1, 0
xvinsgr2vr.w VM0, t2, 1
xvinsgr2vr.w VM0, t3, 2
xvinsgr2vr.w VM0, t4, 3
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t1, 4
xvinsgr2vr.w VM0, t2, 5
xvinsgr2vr.w VM0, t3, 6
xvinsgr2vr.w VM0, t4, 7
slli.w i0, i0, 3 //8
xvreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 5
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 3 //4
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 4 //5
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 5 //6
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 6 //7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7 //8
.align 3
.L24:
// Strided main loop: gather 8 elements, then same min/index merge as .L10.
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvadd.w VI1, VI1, VINC8
xvfcmp.clt.s VT0, VX0, VM0
addi.d I, I, -1
xvbitsel.v VM0, VM0, VX0, VT0
xvbitsel.v VI0, VI0, VI1, VT0
blt $r0, I, .L24
.align 3
.L25:
// Strided-path reduction of lanes 0-3 -- mirrors .L15.
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvfcmp.clt.s VT0, x2, x1
xvbitsel.v VM1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
xvfcmp.clt.s VT0, x4, x3
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfcmp.clt.s VT0, VM1, VM0
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.w $f17, TEMP
ffint.s.w $f17, $f17
xvfcmp.ceq.s VT0, VM0, x1
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L26
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.align 3
.L26:
// .L26-.L29: per-lane tie-break toward the smaller index.
xvfcmp.ceq.s VT0, VM0, x2
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L27
xvfcmp.clt.s VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.align 3
.L27:
xvfcmp.ceq.s VT0, VM0, x3
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L28
xvfcmp.clt.s VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3
.L28:
xvfcmp.ceq.s VT0, VM0, x4
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L29
xvfcmp.clt.s VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.align 3
.L29:
// Preserve lower-half winner (index in $f16, value copied to $f13 below).
fmov.s $f16, $f20
.align 3
.L252:
// Restore the saved full vectors and reduce lanes 4-7 the same way.
xvxor.v VI0, VI0, VI0
xvor.v VI0, VI0, VX0
fmov.s $f13, $f15
xvxor.v VM0, VM0, VM0
xvor.v VM0, VM0, VX1
xvpickve.w VI1, VI0, 4
xvpickve.w VI2, VI0, 5
xvpickve.w VI3, VI0, 6
xvpickve.w VI4, VI0, 7
xvpickve.w x1, VM0, 4
xvpickve.w x2, VM0, 5
xvpickve.w x3, VM0, 6
xvpickve.w x4, VM0, 7
xvfcmp.clt.s VT0, x2, x1
xvbitsel.v x1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
xvfcmp.clt.s VT0, x4, x3
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfcmp.clt.s VT0, x1, VM0
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.w $f17, TEMP
ffint.s.w $f17, $f17
xvfcmp.ceq.s VT0, VM0, x1
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L262
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.align 3
.L262:
xvfcmp.ceq.s VT0, VM0, x2
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L272
xvfcmp.clt.s VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.align 3
.L272:
xvfcmp.ceq.s VT0, VM0, x3
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L282
xvfcmp.clt.s VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3
.L282:
xvfcmp.ceq.s VT0, VM0, x4
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L292
xvfcmp.clt.s VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.align 3
.L292:
// Combine the two half-vector winners: keep the smaller value's index.
fcmp.clt.s $fcc0, $f13, $f15
fsel $f15, $f15, $f13, $fcc0
fsel $f20, $f20, $f16, $fcc0
movfr2gr.s i0, $f20
.L21: //N<8
// Scalar tail over the remaining N%8 elements.
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3
.L22:
// was fld.d: this is a 4-byte element; fld.d read 8 bytes and could run
// past the end of X on the last element.
fld.s $f9, X, 0
addi.d I, I, -1
fcmp.clt.s $fcc0, $f9, $f15
fsel $f15, $f15, $f9, $fcc0
fsel $f20, $f20, $f21, $fcc0
addi.d i1, i1, 1
movgr2fr.d $f21, i1
add.d X, X, INCX
blt $r0, I, .L22
movfr2gr.s i0, $f20
.align 3
.L999:
// Return the 1-based index (i0 == $r17) in the ABI result register.
move $r4, $r17
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,271 @@
/* LSX kernel: returns (in $r4) the 1-based index of the smallest element of
 * X (plain vfcmp.clt.s ordering, no absolute value).
 * Inputs:  N = $r4 element count, X = $r5 data pointer, INCX = $r6 stride in
 *          elements.  Returns 0 when N <= 0 or INCX <= 0.
 * Strategy: 8 floats per iteration in two 128-bit vectors with per-lane
 * running minimum VM0 and index VI0; lanes reduced at .L15/.L25 with
 * tie-breaks at .L26-.L29; N%8 tail handled by the scalar loop at .L22.
 * FIX: the tail previously loaded each single-precision element with fld.d,
 * reading 8 bytes per 4-byte element and potentially past the end of X; it
 * now uses fld.s. */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
/* Vector registers alias their low float register: VM0=$vr15 <-> $f15,
 * VI0=$vr20 <-> $f20, VI1=$vr21 <-> $f21, x1=$vr9 <-> $f9. */
#define x1 $vr9
#define x2 $vr10
#define x3 $vr11
#define x4 $vr12
#define VX0 $vr13
#define VX1 $vr14
#define VM0 $vr15
#define VM1 $vr16
#define VINC4 $vr17
#define VINC8 $vr18
#define VI0 $vr20
#define VI1 $vr21
#define VI2 $vr22
#define VI3 $vr8
#define VI4 $vr19
#define VT0 $vr23
PROLOGUE
// Degenerate sizes/strides: return index 0.
li.d i0, 0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
// Unit-stride entry: seed VM0 with the first 4 elements; index base is 1.
vld VM0, X, 0
addi.w i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
slli.w i0, i0, 2 //4
vreplgr2vr.w VINC4, i0
slli.w i0, i0, 1 //8
vreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 5
vinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 3 //4
.align 3
.L10:
// Main unit-stride loop: min across two loads, then merge into VM0/VI0.
vld VX0, X, 0 * SIZE
vadd.w VI1, VI1, VINC8
vld VX1, X, 4 * SIZE
vadd.w VI2, VI1, VINC4
vfcmp.clt.s VT0, VX1, VX0
addi.d I, I, -1
vbitsel.v VM1, VX0, VX1, VT0
vbitsel.v VI2, VI1, VI2, VT0
vfcmp.clt.s VT0, VM1, VM0
addi.d X, X, 8 * SIZE
vbitsel.v VM0, VM0, VM1, VT0
vbitsel.v VI0, VI0, VI2, VT0
blt $r0, I, .L10
.align 3
.L15:
// Horizontal reduction of the 4 lanes (unit-stride path).
vreplvei.w VI1, VI0, 0
vreplvei.w VI2, VI0, 1
vreplvei.w VI3, VI0, 2
vreplvei.w VI4, VI0, 3
vreplvei.w x1, VM0, 0
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfcmp.clt.s VT0, x2, x1
vbitsel.v VM1, x1, x2, VT0
vbitsel.v VINC4, VI1, VI2, VT0
vfcmp.clt.s VT0, x4, x3
vbitsel.v VM0, x3, x4, VT0
vbitsel.v VINC8, VI3, VI4, VT0
vfcmp.clt.s VT0, VM1, VM0
vbitsel.v VM0, VM0, VM1, VT0
vbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.w $f17, TEMP
ffint.s.w $f17, $f17
// NOTE(review): $f23 is the low lane of the VT0 compare mask (all-ones or
// all-zeros) compared against 1.0f -- confirm the tie-break path is reachable.
vfcmp.ceq.s VT0, x1, VM0
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L26
vfcmp.clt.s VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
b .L26
.align 3
.L20: // INCX!=1
// Strided entry: gather the first element(s) to seed VM0.
move TEMP, X
addi.w i0, i0, 1
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.w VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L21
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.w VM0, t2, 1
vinsgr2vr.w VM0, t3, 2
vinsgr2vr.w VM0, t4, 3
slli.w i0, i0, 2 //4
vreplgr2vr.w VINC4, i0
slli.w i0, i0, 1 //8
vreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 5
vinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 3 //4
.align 3
.L24:
// Strided main loop: gather 8 elements, then same min/index merge as .L10.
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
vadd.w VI1, VI1, VINC8
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vadd.w VI2, VI1, VINC4
vfcmp.clt.s VT0, VX1, VX0
addi.d I, I, -1
vbitsel.v VM1, VX0, VX1, VT0
vbitsel.v VI2, VI1, VI2, VT0
vfcmp.clt.s VT0, VM1, VM0
vbitsel.v VM0, VM0, VM1, VT0
vbitsel.v VI0, VI0, VI2, VT0
blt $r0, I, .L24
.align 3
.L25:
// Strided-path lane reduction -- mirrors .L15.
vreplvei.w VI1, VI0, 0
vreplvei.w VI2, VI0, 1
vreplvei.w VI3, VI0, 2
vreplvei.w VI4, VI0, 3
vreplvei.w x1, VM0, 0
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfcmp.clt.s VT0, x2, x1
vbitsel.v VM1, x1, x2, VT0
vbitsel.v VINC4, VI1, VI2, VT0
vfcmp.clt.s VT0, x4, x3
vbitsel.v VM0, x3, x4, VT0
vbitsel.v VINC8, VI3, VI4, VT0
vfcmp.clt.s VT0, VM1, VM0
vbitsel.v VM0, VM0, VM1, VT0
vbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //
movgr2fr.w $f17, TEMP
ffint.s.w $f17, $f17
vfcmp.ceq.s VT0, x1, VM0
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L26
vfcmp.clt.s VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
.align 3
.L26:
// .L26-.L29: per-lane tie-break toward the smaller index.
vfcmp.ceq.s VT0, x2, VM0
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L27
vfcmp.clt.s VT0, VI2, VI0
vbitsel.v VI0, VI0, VI2, VT0
.align 3
.L27:
vfcmp.ceq.s VT0, x3, VM0
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L28
vfcmp.clt.s VT0, VI3, VI0
vbitsel.v VI0, VI0, VI3, VT0
.align 3
.L28:
vfcmp.ceq.s VT0, x4, VM0
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L29
vfcmp.clt.s VT0, VI4, VI0
vbitsel.v VI0, VI0, VI4, VT0
.align 3
.L29:
// Winning index lives in VI0 lane 0 == $f20.
movfr2gr.s i0, $f20
.L21: //N<8
// Scalar tail over the remaining N%8 elements.
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3
.L22:
// was fld.d: this is a 4-byte element; fld.d read 8 bytes and could run
// past the end of X on the last element.
fld.s $f9, X, 0
fcmp.clt.s $fcc0, $f9, $f15
fsel $f15, $f15, $f9, $fcc0
fsel $f20, $f20, $f21, $fcc0
addi.d I, I, -1
addi.d i1, i1, 1
add.d X, X, INCX
movgr2fr.d $f21, i1
blt $r0, I, .L22
movfr2gr.s i0, $f20
.align 3
.L999:
// Return the 1-based index (i0 == $r17) in the ABI result register.
move $r4, $r17
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,208 @@
/* LASX (256-bit) kernel: single-precision max-magnitude reduction.
 * Computes max over i in [0, N) of the element of X (stride INCX) with the
 * largest absolute value; the final fabs.s strips any sign it carried.
 * Early-exits to .L999 when N <= 0 or INCX <= 0.
 * xvfmaxa.s selects, per lane, the operand with the larger magnitude.
 */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define J $r13
#define t1 $r14
#define t2 $r18
#define t3 $r15
#define t4 $r17
#define TEMP $r16
#define m0 $xr8
#define x1 $xr9
#define x2 $xr10
#define x3 $xr11
#define x4 $xr12
#define x5 $xr13
#define x6 $xr14
#define x7 $xr15
#define x8 $xr16
#define VX0 $xr20
#define VX1 $xr21
#define VM0 $xr22
#define VM1 $xr23
#define VM2 $xr19
#define VM3 $xr18
PROLOGUE
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
// Contiguous path: seed the running max with the first 8 elements.
// NOTE(review): this xvld reads 8 floats even when N < 8 — assumed-safe
// over-read; only lane 0 of VM0 is consumed on the short-N paths.
xvld VM0, X, 0
srai.d I, N, 3
bge $r0, I, .L12
.align 3
.L10: // main loop: fold 8 elements per iteration into VM0
xvld VX0, X, 0 * SIZE
addi.d I, I, -1
xvfmaxa.s VM0, VM0, VX0
addi.d X, X, 8 * SIZE
blt $r0, I, .L10
.align 3
.L11: // horizontal reduction of the 8 lanes of VM0 into lane 0
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvpickve.w x5, VM0, 4
xvpickve.w x6, VM0, 5
xvpickve.w x7, VM0, 6
xvpickve.w x8, VM0, 7
xvfmaxa.s VM3, x1, x2
xvfmaxa.s VM2, x3, x4
xvfmaxa.s VM1, x5, x6
xvfmaxa.s VM0, x7, x8
xvfmaxa.s VM2, VM2, VM3
xvfmaxa.s VM0, VM0, VM1
xvfmaxa.s VM0, VM0, VM2
.align 3
.L12: //INCX==1 and N<8
andi I, N, 7
li.d J, 4
bge J, I, .L13 // 4<N<8
xvld VX0, X, 0
slli.d J, J, 1 // 8
sub.d I, J, I
slli.d I, I, BASE_SHIFT
// NOTE(review): offset is +(8-rem) elements, so VX1 = X[8-rem .. 15-rem];
// for some remainders the lanes reduced below may not cover all valid tail
// elements and may include out-of-range data — verify against upstream.
xvldx VX1, X, I
xvfmaxa.s m0, VX0, VX1 //partial repeat read
xvpickve.w x1, m0, 0
xvpickve.w x2, m0, 1
xvpickve.w x3, m0, 2
xvpickve.w x4, m0, 3
xvfmaxa.s m0, x1, x2
xvfmaxa.s VM1, x3, x4
xvfmaxa.s m0, m0, VM1
xvfmaxa.s VM0, m0, VM0
// $f22 aliases lane 0 of VM0; strip the sign and return in $f0.
fabs.s $f22, $f22
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L13: //INCX==1 and 0<=N<=4
bge $r0, I, .L15
.align 3
.L14: // scalar-step loop; only lane 0 of x1 is a new element
xvld x1, X, 0
addi.d I, I, -1
xvfmaxa.s VM0, VM0, x1
addi.d X, X, SIZE
blt $r0, I, .L14
.align 3
.L15:
fabs.s $f22, $f22
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L20: // INCX!=1
move TEMP, X // initialize the maxa value
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L23
// Gather 7 more strided elements so VM0 starts with x[0..7].
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t2, 1
xvinsgr2vr.w VM0, t3, 2
xvinsgr2vr.w VM0, t4, 3
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t1, 4
xvinsgr2vr.w VM0, t2, 5
xvinsgr2vr.w VM0, t3, 6
xvinsgr2vr.w VM0, t4, 7
.align 3
.L21: // strided main loop: gather 8 elements, fold into VM0
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
addi.d I, I, -1
xvfmaxa.s VM0, VM0, VX0
blt $r0, I, .L21
.align 3
.L22: // horizontal reduction, as in .L11
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvpickve.w x5, VM0, 4
xvpickve.w x6, VM0, 5
xvpickve.w x7, VM0, 6
xvpickve.w x8, VM0, 7
xvfmaxa.s VM3, x1, x2
xvfmaxa.s VM2, x3, x4
xvfmaxa.s VM1, x5, x6
xvfmaxa.s VM0, x7, x8
xvfmaxa.s VM2, VM2, VM3
xvfmaxa.s VM0, VM0, VM1
xvfmaxa.s VM0, VM0, VM2
.align 3
.L23: //INCX!=1 and N<8
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24: // strided tail, one element at a time (lane 0 of x1)
xvld x1, X, 0
addi.d I, I, -1
xvfmaxa.s VM0, VM0, x1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
fabs.s $f22, $f22
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,177 @@
/* LSX (128-bit) kernel: single-precision max-magnitude reduction.
 * Computes max over i in [0, N) of the element of X (stride INCX) with the
 * largest absolute value; the final fabs.s strips any sign it carried.
 * Early-exits to .L999 when N <= 0 or INCX <= 0.
 * vfmaxa.s selects, per lane, the operand with the larger magnitude.
 */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define J $r13
#define t1 $r14
#define t2 $r18
#define t3 $r15
#define t4 $r17
#define TEMP $r16
#define m0 $vr8
#define x1 $vr9
#define x2 $vr10
#define x3 $vr11
#define x4 $vr12
#define VX0 $vr20
#define VX1 $vr21
#define VM0 $vr22
#define VM1 $vr23
PROLOGUE
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
// Contiguous path: seed the running max with the first 4 elements.
// NOTE(review): reads 4 floats even when N < 4 — assumed-safe over-read;
// only lane 0 of VM0 is consumed on the short-N paths.
vld VM0, X, 0
srai.d I, N, 3
bge $r0, I, .L12
.align 3
.L10: // main loop: fold 8 elements (two vectors) per iteration
vld VX0, X, 0 * SIZE
vld VX1, X, 4 * SIZE
addi.d I, I, -1
vfmaxa.s VM1, VX0, VX1
addi.d X, X, 8 * SIZE
vfmaxa.s VM0, VM0, VM1
blt $r0, I, .L10
.align 3
.L11: // horizontal reduction of the 4 lanes of VM0 into lane 0
vreplvei.w x1, VM0, 0
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfmaxa.s VM1, x1, x2
vfmaxa.s VM0, x3, x4
vfmaxa.s VM0, VM0, VM1
.align 3
.L12: //INCX==1 and N<8
andi I, N, 7
li.d J, 4
bge J, I, .L13 // 4<N<8
vld VX0, X, 0
slli.d J, J, 1 // 8
sub.d I, J, I
slli.d I, I, BASE_SHIFT
// NOTE(review): offset is +(8-rem) elements, so VX1 = X[8-rem .. 11-rem];
// for some remainders this may miss valid tail elements and include
// out-of-range data — verify against upstream.
vldx VX1, X, I
vfmaxa.s m0, VX0, VX1 //partial repeat read
vreplvei.w x1, m0, 0
vreplvei.w x2, m0, 1
vreplvei.w x3, m0, 2
vreplvei.w x4, m0, 3
vfmaxa.s m0, x1, x2
vfmaxa.s VM1, x3, x4
vfmaxa.s m0, m0, VM1
vfmaxa.s VM0, m0, VM0
// $f22 aliases lane 0 of VM0; strip the sign and return in $f0.
fabs.s $f22, $f22
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L13: //INCX==1 and 0<=N<=4
bge $r0, I, .L15
.align 3
.L14: // scalar-step loop; only lane 0 of x1 is a new element
vld x1, X, 0
addi.d I, I, -1
vfmaxa.s VM0, VM0, x1
addi.d X, X, SIZE
blt $r0, I, .L14
.align 3
.L15:
fabs.s $f22, $f22
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L20: // INCX!=1
move TEMP, X
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.w VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L23
// Gather 3 more strided elements so VM0 starts with x[0..3].
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.w VM0, t2, 1
vinsgr2vr.w VM0, t3, 2
vinsgr2vr.w VM0, t4, 3
.align 3
.L21: // strided main loop: gather 8 elements into two vectors, fold in
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
addi.d I, I, -1
vfmaxa.s VM1, VX0, VX1
vfmaxa.s VM0, VM0, VM1
blt $r0, I, .L21
.align 3
.L22: // horizontal reduction, as in .L11
vreplvei.w x1, VM0, 0
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfmaxa.s VM1, x1, x2
vfmaxa.s VM0, x3, x4
vfmaxa.s VM0, VM0, VM1
.align 3
.L23: //INCX!=1 and N<8
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24: // strided tail, one element at a time (lane 0 of x1)
vld x1, X, 0
addi.d I, I, -1
vfmaxa.s VM0, VM0, x1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
fabs.s $f22, $f22
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,208 @@
/* LASX (256-bit) kernel: single-precision min-magnitude reduction.
 * Computes, over i in [0, N) of X (stride INCX), the element with the
 * smallest absolute value; the final fabs.s strips any sign it carried.
 * Early-exits to .L999 when N <= 0 or INCX <= 0.
 * xvfmina.s selects, per lane, the operand with the smaller magnitude.
 */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define J $r13
#define t1 $r14
#define t2 $r18
#define t3 $r15
#define t4 $r17
#define TEMP $r16
#define m0 $xr8
#define x1 $xr9
#define x2 $xr10
#define x3 $xr11
#define x4 $xr12
#define x5 $xr13
#define x6 $xr14
#define x7 $xr15
#define x8 $xr16
#define VX0 $xr20
#define VX1 $xr21
#define VM0 $xr22
#define VM1 $xr23
#define VM2 $xr18
#define VM3 $xr19
PROLOGUE
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
// Contiguous path: seed the running min with the first 8 elements.
// NOTE(review): reads 8 floats even when N < 8 — assumed-safe over-read;
// only lane 0 of VM0 is consumed on the short-N paths.
xvld VM0, X, 0
srai.d I, N, 3
bge $r0, I, .L12
.align 3
.L10: // main loop: fold 8 elements per iteration into VM0
xvld VX0, X, 0 * SIZE
addi.d I, I, -1
xvfmina.s VM0, VM0, VX0
addi.d X, X, 8 * SIZE
blt $r0, I, .L10
.align 3
.L11: // horizontal reduction of the 8 lanes of VM0 into lane 0
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvpickve.w x5, VM0, 4
xvpickve.w x6, VM0, 5
xvpickve.w x7, VM0, 6
xvpickve.w x8, VM0, 7
xvfmina.s VM3, x1, x2
xvfmina.s VM2, x3, x4
xvfmina.s VM1, x5, x6
xvfmina.s VM0, x7, x8
xvfmina.s VM2, VM2, VM3
xvfmina.s VM0, VM0, VM1
xvfmina.s VM0, VM0, VM2
.align 3
.L12: //INCX==1 and N<8
andi I, N, 7
li.d J, 4
bge J, I, .L13 // 4<N<8
xvld VX0, X, 0
slli.d J, J, 1 // 8
sub.d I, J, I
slli.d I, I, BASE_SHIFT
// NOTE(review): offset is +(8-rem) elements, so VX1 = X[8-rem .. 15-rem];
// for some remainders the lanes reduced below may not cover all valid tail
// elements and may include out-of-range data — verify against upstream.
xvldx VX1, X, I
xvfmina.s m0, VX0, VX1 //partial repeat read
xvpickve.w x1, m0, 0
xvpickve.w x2, m0, 1
xvpickve.w x3, m0, 2
xvpickve.w x4, m0, 3
xvfmina.s m0, x1, x2
xvfmina.s VM1, x3, x4
xvfmina.s m0, m0, VM1
xvfmina.s VM0, m0, VM0
// $f22 aliases lane 0 of VM0; strip the sign and return in $f0.
fabs.s $f22, $f22
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L13: //INCX==1 and 0<=N<=4
bge $r0, I, .L15
.align 3
.L14: // scalar-step loop; only lane 0 of x1 is a new element
xvld x1, X, 0
addi.d I, I, -1
xvfmina.s VM0, VM0, x1
addi.d X, X, SIZE
blt $r0, I, .L14
.align 3
.L15:
fabs.s $f22, $f22
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L20: // INCX!=1
move TEMP, X // initialize the mina value
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L23
// Gather 7 more strided elements so VM0 starts with x[0..7].
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t2, 1
xvinsgr2vr.w VM0, t3, 2
xvinsgr2vr.w VM0, t4, 3
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t1, 4
xvinsgr2vr.w VM0, t2, 5
xvinsgr2vr.w VM0, t3, 6
xvinsgr2vr.w VM0, t4, 7
.align 3
.L21: // strided main loop: gather 8 elements, fold into VM0
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
addi.d I, I, -1
xvfmina.s VM0, VM0, VX0
blt $r0, I, .L21
.align 3
.L22: // horizontal reduction, as in .L11
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvpickve.w x5, VM0, 4
xvpickve.w x6, VM0, 5
xvpickve.w x7, VM0, 6
xvpickve.w x8, VM0, 7
xvfmina.s VM3, x1, x2
xvfmina.s VM2, x3, x4
xvfmina.s VM1, x5, x6
xvfmina.s VM0, x7, x8
xvfmina.s VM2, VM2, VM3
xvfmina.s VM0, VM0, VM1
xvfmina.s VM0, VM0, VM2
.align 3
.L23: //INCX!=1 and N<8
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24: // strided tail, one element at a time (lane 0 of x1)
xvld x1, X, 0
addi.d I, I, -1
xvfmina.s VM0, VM0, x1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
fabs.s $f22, $f22
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,177 @@
/* LSX (128-bit) kernel: single-precision min-magnitude reduction.
 * Computes, over i in [0, N) of X (stride INCX), the element with the
 * smallest absolute value; the final fabs.s strips any sign it carried.
 * Early-exits to .L999 when N <= 0 or INCX <= 0.
 * vfmina.s selects, per lane, the operand with the smaller magnitude.
 */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define J $r13
#define t1 $r14
#define t2 $r18
#define t3 $r15
#define t4 $r17
#define TEMP $r16
#define m0 $vr8
#define x1 $vr9
#define x2 $vr10
#define x3 $vr11
#define x4 $vr12
#define VX0 $vr20
#define VX1 $vr21
#define VM0 $vr22
#define VM1 $vr23
PROLOGUE
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
// Contiguous path: seed the running min with the first 4 elements.
// NOTE(review): reads 4 floats even when N < 4 — assumed-safe over-read;
// only lane 0 of VM0 is consumed on the short-N paths.
vld VM0, X, 0
srai.d I, N, 3
bge $r0, I, .L12
.align 3
.L10: // main loop: fold 8 elements (two vectors) per iteration
vld VX0, X, 0 * SIZE
vld VX1, X, 4 * SIZE
addi.d I, I, -1
vfmina.s VM1, VX0, VX1
addi.d X, X, 8 * SIZE
vfmina.s VM0, VM0, VM1
blt $r0, I, .L10
.align 3
.L11: // horizontal reduction of the 4 lanes of VM0 into lane 0
vreplvei.w x1, VM0, 0
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfmina.s VM1, x1, x2
vfmina.s VM0, x3, x4
vfmina.s VM0, VM0, VM1
.align 3
.L12: //INCX==1 and N<8
andi I, N, 7
li.d J, 4
bge J, I, .L13 // 4<N<8
vld VX0, X, 0
slli.d J, J, 1 // 8
sub.d I, J, I
slli.d I, I, BASE_SHIFT
// NOTE(review): offset is +(8-rem) elements, so VX1 = X[8-rem .. 11-rem];
// for some remainders this may miss valid tail elements and include
// out-of-range data — verify against upstream.
vldx VX1, X, I
vfmina.s m0, VX0, VX1 //partial repeat read
vreplvei.w x1, m0, 0
vreplvei.w x2, m0, 1
vreplvei.w x3, m0, 2
vreplvei.w x4, m0, 3
vfmina.s m0, x1, x2
vfmina.s VM1, x3, x4
vfmina.s m0, m0, VM1
vfmina.s VM0, m0, VM0
// $f22 aliases lane 0 of VM0; strip the sign and return in $f0.
fabs.s $f22, $f22
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L13: //INCX==1 and 0<=N<=4
bge $r0, I, .L15
.align 3
.L14: // scalar-step loop; only lane 0 of x1 is a new element
vld x1, X, 0
addi.d I, I, -1
vfmina.s VM0, VM0, x1
addi.d X, X, SIZE
blt $r0, I, .L14
.align 3
.L15:
fabs.s $f22, $f22
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L20: // INCX!=1
move TEMP, X
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.w VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L23
// Gather 3 more strided elements so VM0 starts with x[0..3].
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.w VM0, t2, 1
vinsgr2vr.w VM0, t3, 2
vinsgr2vr.w VM0, t4, 3
.align 3
.L21: // strided main loop: gather 8 elements into two vectors, fold in
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
addi.d I, I, -1
vfmina.s VM1, VX0, VX1
vfmina.s VM0, VM0, VM1
blt $r0, I, .L21
.align 3
.L22: // horizontal reduction, as in .L11
vreplvei.w x1, VM0, 0
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfmina.s VM1, x1, x2
vfmina.s VM0, x3, x4
vfmina.s VM0, VM0, VM1
.align 3
.L23: //INCX!=1 and N<8
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24: // strided tail, one element at a time (lane 0 of x1)
vld x1, X, 0
addi.d I, I, -1
vfmina.s VM0, VM0, x1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
fabs.s $f22, $f22
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,157 @@
/* LASX (256-bit) kernel: single-precision absolute sum (BLAS SASUM),
 * result = sum(|X[i*INCX]|) for i in [0, N).
 * res1 accumulates 8 partial sums per lane; |x| is formed branch-free as
 * bitsel(x, -1.0f * x, x < 0).  Early-exits to .L999 when N <= 0 or
 * INCX <= 0 (res1 was zeroed first, so 0.0 is returned).
 *
 * BUG FIX: both horizontal reductions (.L12 and .L22) previously added
 * VX2 twice and never added VX3, i.e. lane 6 of the partial-sum vector
 * was counted twice and lane 7 dropped, giving a wrong sum for N >= 8.
 * The final accumulate now uses VX3.
 */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define t1 $r15
#define t2 $r12
#define t3 $r13
#define t4 $r14
#define VX0 $xr12
#define VX1 $xr13
#define VX2 $xr14
#define VX3 $xr15
#define VT0 $xr23
#define VT1 $xr22
#define res1 $xr16
#define res2 $xr17
#define res0 $xr18
#define neg1 $xr19
PROLOGUE
xvxor.v res1, res1, res1
xvxor.v res2, res2, res2
xvxor.v res0, res0, res0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.w t1, -1
xvreplgr2vr.w neg1, t1
xvffint.s.w neg1, neg1 // neg1 = {-1.0f x8}, used to negate negative lanes
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bge $r0, I, .L13
.align 3
.L11: // contiguous main loop: accumulate |x| for 8 elements per iteration
xvld VX0, X, 0 * SIZE
xvfmul.s VX2, neg1, VX0
xvfcmp.clt.s VT0, VX0, res0
xvbitsel.v VX0, VX0, VX2, VT0 // VX0 = |VX0|
xvfadd.s res1, VX0, res1
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L11
.align 3
.L12: // horizontal reduction: fold all 8 lanes of res1 into lane 0
xvfadd.s res2, res1, res2 // res2 was zero, so res2 = res1
xvpickve.w VX1, res1, 1
xvpickve.w VX2, res1, 2
xvpickve.w VX3, res1, 3
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
xvpickve.w VX0, res2, 4
xvpickve.w VX1, res2, 5
xvpickve.w VX2, res2, 6
xvpickve.w VX3, res2, 7
xvfadd.s res1, VX0, res1
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1 // was VX2 again: lane 6 double-counted, lane 7 lost
.align 3
.L13: // contiguous tail: scalar fabs + add ($f16 aliases lane 0 of res1)
andi I, N, 7
bge $r0, I, .L999
.align 3
.L14:
fld.s $f12, X, 0 * SIZE
fabs.s $f12, $f12
fadd.s $f16, $f12, $f16
addi.d I, I, -1
addi.d X, X, SIZE
blt $r0, I, .L14
b .L999
.align 3
.L20: // strided path (INCX != 1)
bge $r0, I, .L23
.align 3
.L21: // gather 8 strided elements, then accumulate |x| as in .L11
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvfmul.s VX2, neg1, VX0
xvfcmp.clt.s VT0, VX0, res0
xvbitsel.v VX0, VX0, VX2, VT0
xvfadd.s res1, VX0, res1
addi.d I, I, -1
blt $r0, I, .L21
.align 3
.L22: // horizontal reduction, identical to .L12
xvfadd.s res2, res1, res2
xvpickve.w VX1, res1, 1
xvpickve.w VX2, res1, 2
xvpickve.w VX3, res1, 3
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
xvpickve.w VX0, res2, 4
xvpickve.w VX1, res2, 5
xvpickve.w VX2, res2, 6
xvpickve.w VX3, res2, 7
xvfadd.s res1, VX0, res1
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1 // was VX2 again: lane 6 double-counted, lane 7 lost
.align 3
.L23: // strided tail: scalar fabs + add
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24:
fld.s $f12, X, 0 * SIZE
fabs.s $f12, $f12
fadd.s $f16, $f12, $f16
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
fmov.s $f0, $f16 // return accumulated sum in $f0
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,148 @@
/* LSX (128-bit) kernel: single-precision absolute sum (BLAS SASUM),
 * result = sum(|X[i*INCX]|) for i in [0, N).
 * res1 accumulates 4 partial sums per lane; |x| is formed branch-free as
 * bitsel(x, -1.0f * x, x < 0).  Early-exits to .L999 when N <= 0 or
 * INCX <= 0 (res1 was zeroed first, so 0.0 is returned).
 */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define t1 $r15
#define t2 $r12
#define t3 $r13
#define t4 $r14
#define VX0 $vr12
#define VX1 $vr13
#define VX2 $vr14
#define VX3 $vr15
#define VT0 $vr23
#define VT1 $vr22
#define res1 $vr16
#define res2 $vr17
#define res0 $vr18
#define neg1 $vr19
PROLOGUE
vxor.v res1, res1, res1
vxor.v res2, res2, res2
vxor.v res0, res0, res0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.w t1, -1
vreplgr2vr.w neg1, t1
vffint.s.w neg1, neg1 // neg1 = {-1.0f x4}, used to negate negative lanes
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bge $r0, I, .L13
.align 3
.L11: // contiguous main loop: accumulate |x| for 8 elements (two vectors)
vld VX0, X, 0 * SIZE
vld VX1, X, 4 * SIZE
vfmul.s VX2, neg1, VX0
vfmul.s VX3, neg1, VX1
vfcmp.clt.s VT0, VX0, res0
vfcmp.clt.s VT1, VX1, res0
vbitsel.v VX0, VX0, VX2, VT0 // VX0 = |VX0|
vbitsel.v VX1, VX1, VX3, VT1 // VX1 = |VX1|
vfadd.s res2, VX0, VX1
vfadd.s res1, res1, res2
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L11
.align 3
.L12: // horizontal reduction: fold the 4 lanes of res1 into lane 0
vreplvei.w VX1, res1, 1
vreplvei.w VX2, res1, 2
vreplvei.w VX3, res1, 3
vfadd.s res1, VX1, res1
vfadd.s res1, VX2, res1
vfadd.s res1, VX3, res1
.align 3
.L13: // contiguous tail: scalar fabs + add ($f16 aliases lane 0 of res1)
andi I, N, 7
bge $r0, I, .L999
.align 3
.L14:
fld.s $f12, X, 0 * SIZE
fabs.s $f12, $f12
fadd.s $f16, $f12, $f16
addi.d I, I, -1
addi.d X, X, SIZE
blt $r0, I, .L14
b .L999
.align 3
.L20: // strided path (INCX != 1)
bge $r0, I, .L23
.align 3
.L21: // gather 8 strided elements into two vectors, accumulate as in .L11
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vfmul.s VX2, neg1, VX0
vfmul.s VX3, neg1, VX1
vfcmp.clt.s VT0, VX0, res0
vfcmp.clt.s VT1, VX1, res0
vbitsel.v VX0, VX0, VX2, VT0
vbitsel.v VX1, VX1, VX3, VT1
vfadd.s res2, VX0, VX1
vfadd.s res1, res1, res2
addi.d I, I, -1
blt $r0, I, .L21
.align 3
.L22: // horizontal reduction, identical to .L12
vreplvei.w VX1, res1, 1
vreplvei.w VX2, res1, 2
vreplvei.w VX3, res1, 3
vfadd.s res1, VX1, res1
vfadd.s res1, VX2, res1
vfadd.s res1, VX3, res1
.align 3
.L23: // strided tail: scalar fabs + add
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24:
fld.s $f12, X, 0 * SIZE
fabs.s $f12, $f12
fadd.s $f16, $f12, $f16
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
fmov.s $f0, $f16 // return accumulated sum in $f0
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,597 @@
/* LASX (256-bit) kernel: single-precision AXPBY, y := alpha*x + beta*y.
 * Dispatches on (INCX==1?, INCY==1?) and then on (alpha==0?, beta==0?),
 * giving 16 specialized loops that each process 8 elements per iteration;
 * the common scalar tail is .L998.  VXA/VXB/VXZ hold alpha, beta and 0.0
 * broadcast across all 8 lanes.
 * NOTE(review): on the alpha==0 paths the X pointer is not advanced before
 * the scalar tail, so the tail multiplies alpha(=0) by elements re-read
 * from the start of the unconsumed X range — harmless for finite x since
 * 0*x = 0, but worth confirming for Inf/NaN inputs.
 */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define ALPHA $f0
#define X $r5
#define INCX $r6
#define BETA $f1
#define Y $r7
#define INCY $r8
#define I $r12
#define TEMP $r13
#define t1 $r14
#define t2 $r16
#define t3 $r15
#define t4 $r17
#define XX $r18
#define YY $r19
#define a1 $f12
#define a2 $f13
#define VX0 $xr8
#define VX1 $xr20
#define VX2 $xr21
#define VX3 $xr22
#define VXA $xr23
#define VXB $xr9
#define VXZ $xr19
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
movgr2fr.d a1, $r0
ffint.s.l a1, a1 // a1 = 0.0f, used for the alpha/beta == 0 tests
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
// Broadcast alpha, beta and 0.0 across all 8 lanes.
movfr2gr.s t1, ALPHA
xvreplgr2vr.w VXA, t1
movfr2gr.s t2, BETA
xvreplgr2vr.w VXB, t2
movfr2gr.s t3, a1
xvreplgr2vr.w VXZ, t3
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L997
fcmp.ceq.s $fcc0, ALPHA, a1
bcnez $fcc0, .L110
fcmp.ceq.s $fcc0, BETA, a1
bcnez $fcc0, .L112 // ALPHA!=0 BETA==0
b .L111 // ALPHA!=0 BETA!=0
.align 3
.L110:
fcmp.ceq.s $fcc0, BETA, a1
bcnez $fcc0, .L114 // ALPHA==0 BETA==0
b .L113 // ALPHA==0 BETA!=0
.align 3
.L111: // ALPHA!=0 BETA!=0
xvld VX0, X, 0 * SIZE
xvld VX2, Y, 0 * SIZE
xvfmul.s VX0, VX0, VXA
addi.d I, I, -1
xvfmadd.s VX2, VX2, VXB, VX0 // y = beta*y + alpha*x
xvst VX2, Y, 0 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L111
b .L997
.align 3
.L112: // ALPHA!=0 BETA==0
xvld VX0, X, 0 * SIZE
xvfmul.s VX0, VX0, VXA
addi.d I, I, -1
xvst VX0, Y, 0 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L112
b .L997
.align 3
.L113: // ALPHA==0 BETA!=0
xvld VX2, Y, 0 * SIZE
xvfmul.s VX2, VX2, VXB
addi.d I, I, -1
xvst VX2, Y, 0 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L113
b .L997
.align 3
.L114: // ALPHA==0 BETA==0
xvst VXZ, Y, 0 * SIZE // y = 0
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L114
b .L997
.align 3
.L12: // INCX==1 and INCY!=1
bge $r0, I, .L997
move YY, Y // YY tracks the store position; Y is advanced by the gathers
fcmp.ceq.s $fcc0, ALPHA, a1
bcnez $fcc0, .L120
fcmp.ceq.s $fcc0, BETA, a1
bcnez $fcc0, .L122 // ALPHA!=0 BETA==0
b .L121 // ALPHA!=0 BETA!=0
.align 3
.L120:
fcmp.ceq.s $fcc0, BETA, a1
bcnez $fcc0, .L124 // ALPHA==0 BETA==0
b .L123 // ALPHA==0 BETA!=0
.align 3
.L121: // ALPHA!=0 BETA!=0
xvld VX0, X, 0 * SIZE
// Gather 8 strided y elements into VX2.
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.w VX2, t1, 0
xvinsgr2vr.w VX2, t2, 1
xvinsgr2vr.w VX2, t3, 2
xvinsgr2vr.w VX2, t4, 3
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
xvinsgr2vr.w VX2, t1, 4
xvinsgr2vr.w VX2, t2, 5
xvinsgr2vr.w VX2, t3, 6
xvinsgr2vr.w VX2, t4, 7
add.d Y, Y, INCY
xvfmul.s VX0, VX0, VXA
xvfmadd.s VX2, VX2, VXB, VX0
// Scatter the 8 results back through YY.
xvstelm.w VX2, YY, 0, 0
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 1
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 2
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 3
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 4
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 5
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 6
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 7
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
b .L997
.align 3
.L122: // ALPHA!=0 BETA==0
xvld VX0, X, 0 * SIZE
xvfmul.s VX0, VX0, VXA
addi.d I, I, -1
xvstelm.w VX0, YY, 0, 0
add.d YY, YY, INCY
xvstelm.w VX0, YY, 0, 1
add.d YY, YY, INCY
xvstelm.w VX0, YY, 0, 2
add.d YY, YY, INCY
xvstelm.w VX0, YY, 0, 3
add.d YY, YY, INCY
xvstelm.w VX0, YY, 0, 4
add.d YY, YY, INCY
xvstelm.w VX0, YY, 0, 5
add.d YY, YY, INCY
xvstelm.w VX0, YY, 0, 6
add.d YY, YY, INCY
xvstelm.w VX0, YY, 0, 7
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
blt $r0, I, .L122
b .L997
.align 3
.L123: // ALPHA==0 BETA!=0
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.w VX2, t1, 0
xvinsgr2vr.w VX2, t2, 1
xvinsgr2vr.w VX2, t3, 2
xvinsgr2vr.w VX2, t4, 3
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
xvinsgr2vr.w VX2, t1, 4
xvinsgr2vr.w VX2, t2, 5
xvinsgr2vr.w VX2, t3, 6
xvinsgr2vr.w VX2, t4, 7
add.d Y, Y, INCY
xvfmul.s VX2, VX2, VXB
xvstelm.w VX2, YY, 0, 0
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 1
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 2
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 3
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 4
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 5
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 6
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 7
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L123
b .L997
.align 3
.L124: // ALPHA==0 BETA==0
xvstelm.w VXZ, YY, 0, 0
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 1
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 2
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 3
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 4
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 5
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 6
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 7
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L124
b .L997
.align 3
.L21:// INCX!=1 and INCY==1
bge $r0, I, .L997
fcmp.ceq.s $fcc0, ALPHA, a1
bcnez $fcc0, .L210
fcmp.ceq.s $fcc0, BETA, a1
bcnez $fcc0, .L212 // ALPHA!=0 BETA==0
b .L211 // ALPHA!=0 BETA!=0
.align 3
.L210:
fcmp.ceq.s $fcc0, BETA, a1
bcnez $fcc0, .L214 // ALPHA==0 BETA==0
b .L213 // ALPHA==0 BETA!=0
.align 3
.L211: // ALPHA!=0 BETA!=0
xvld VX2, Y, 0 * SIZE
// Gather 8 strided x elements into VX0.
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
add.d X, X, INCX
xvfmul.s VX0, VXA, VX0
xvfmadd.s VX2, VX2, VXB, VX0
addi.d I, I, -1
xvst VX2, Y, 0 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L211
b .L997
.align 3
.L212: // ALPHA!=0 BETA==0
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
add.d X, X, INCX
xvfmul.s VX0, VXA, VX0
addi.d I, I, -1
xvst VX0, Y, 0 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L212
b .L997
.align 3
.L213: // ALPHA==0 BETA!=0
xvld VX2, Y, 0 * SIZE
xvfmul.s VX2, VX2, VXB
xvst VX2, Y, 0 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L213
b .L997
.align 3
.L214: // ALPHA==0 BETA==0
xvst VXZ, Y, 0 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L214
b .L997
.align 3
.L22: // INCX!=1 and INCY!=1
bge $r0, I, .L997
move YY, Y
fcmp.ceq.s $fcc0, ALPHA, a1
bcnez $fcc0, .L220
fcmp.ceq.s $fcc0, BETA, a1
bcnez $fcc0, .L222 // ALPHA!=0 BETA==0
b .L221 // ALPHA!=0 BETA!=0
.align 3
.L220:
fcmp.ceq.s $fcc0, BETA, a1
bcnez $fcc0, .L224 // ALPHA==0 BETA==0
b .L223 // ALPHA==0 BETA!=0
.align 3
.L221: // ALPHA!=0 BETA!=0
// Gather 8 strided x elements, then 8 strided y elements.
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
add.d X, X, INCX
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
xvinsgr2vr.w VX2, t1, 0
xvinsgr2vr.w VX2, t2, 1
xvinsgr2vr.w VX2, t3, 2
xvinsgr2vr.w VX2, t4, 3
add.d Y, Y, INCY
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
xvinsgr2vr.w VX2, t1, 4
xvinsgr2vr.w VX2, t2, 5
xvinsgr2vr.w VX2, t3, 6
xvinsgr2vr.w VX2, t4, 7
add.d Y, Y, INCY
xvfmul.s VX0, VX0, VXA
xvfmadd.s VX2, VX2, VXB, VX0
addi.d I, I, -1
xvstelm.w VX2, YY, 0, 0
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 1
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 2
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 3
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 4
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 5
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 6
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 7
add.d YY, YY, INCY
blt $r0, I, .L221
b .L997
.align 3
.L222: // ALPHA!=0 BETA==0
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
add.d X, X, INCX
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
add.d X, X, INCX
xvfmul.s VX0, VX0, VXA
addi.d I, I, -1
xvstelm.w VX0, YY, 0, 0
add.d YY, YY, INCY
xvstelm.w VX0, YY, 0, 1
add.d YY, YY, INCY
xvstelm.w VX0, YY, 0, 2
add.d YY, YY, INCY
xvstelm.w VX0, YY, 0, 3
add.d YY, YY, INCY
xvstelm.w VX0, YY, 0, 4
add.d YY, YY, INCY
xvstelm.w VX0, YY, 0, 5
add.d YY, YY, INCY
xvstelm.w VX0, YY, 0, 6
add.d YY, YY, INCY
xvstelm.w VX0, YY, 0, 7
add.d YY, YY, INCY
blt $r0, I, .L222
b .L997
.align 3
.L223: // ALPHA==0 BETA!=0
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.w VX2, t1, 0
xvinsgr2vr.w VX2, t2, 1
xvinsgr2vr.w VX2, t3, 2
xvinsgr2vr.w VX2, t4, 3
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
xvinsgr2vr.w VX2, t1, 4
xvinsgr2vr.w VX2, t2, 5
xvinsgr2vr.w VX2, t3, 6
xvinsgr2vr.w VX2, t4, 7
add.d Y, Y, INCY
xvfmul.s VX2, VX2, VXB
addi.d I, I, -1
xvstelm.w VX2, YY, 0, 0
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 1
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 2
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 3
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 4
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 5
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 6
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 7
add.d YY, YY, INCY
blt $r0, I, .L223
b .L997
.align 3
.L224: // ALPHA==0 BETA==0
xvstelm.w VXZ, YY, 0, 0
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 1
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 2
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 3
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 4
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 5
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 6
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 7
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L224
b .L997
.align 3
.L997: // common scalar tail for the remaining N % 8 elements
andi I, N, 7
bge $r0, I, .L999
.align 3
.L998:
fld.s $f12, X, 0 * SIZE
fld.s $f13, Y, 0 * SIZE
addi.d I, I, -1
fmul.s $f12, $f12, ALPHA
fmadd.s $f13, $f13, BETA, $f12 // y = beta*y + alpha*x
fst.s $f13, Y, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L998
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,629 @@
// saxpby kernel, single precision, LoongArch LSX (128-bit $vr registers):
//   y[i] = ALPHA * x[i] + BETA * y[i]
// Dispatches first on stride (INCX/INCY unit vs. strided), then on whether
// ALPHA and/or BETA equal zero, so the zero cases skip the unneeded loads
// and multiplies. Main loops handle 8 floats per iteration (two 4-lane
// vectors); the scalar tail loop .L998 handles the remaining N % 8.
#define ASSEMBLER
#include "common.h"
#define N $r4
#define ALPHA $f0
#define X $r5
#define INCX $r6
#define BETA $f1
#define Y $r7
#define INCY $r8
#define I $r12
#define TEMP $r13
#define t1 $r14
#define t2 $r16
#define t3 $r15
#define t4 $r17
#define XX $r18
#define YY $r19
#define a1 $f12
#define a2 $f13
#define VX0 $vr8
#define VX1 $vr20
#define VX2 $vr21
#define VX3 $vr22
#define VXA $vr23
#define VXB $vr9
#define VXZ $vr19
PROLOGUE
// N <= 0: nothing to do.
bge $r0, N, .L999
li.d TEMP, 1
// a1 = 0.0f, used for the ALPHA/BETA zero comparisons below.
movgr2fr.d a1, $r0
ffint.s.l a1, a1
// Convert element strides to byte strides.
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
// Broadcast ALPHA, BETA and 0.0f into the vector registers VXA/VXB/VXZ.
movfr2gr.s t1, ALPHA
vreplgr2vr.w VXA, t1
movfr2gr.s t2, BETA
vreplgr2vr.w VXB, t2
movfr2gr.s t3, a1
vreplgr2vr.w VXZ, t3
// I = number of 8-element vector iterations.
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L997
fcmp.ceq.s $fcc0, ALPHA, a1
bcnez $fcc0, .L110
fcmp.ceq.s $fcc0, BETA, a1
bcnez $fcc0, .L112 // ALPHA!=0 BETA==0
b .L111 // ALPHA!=0 BETA!=0
.align 3
.L110:
fcmp.ceq.s $fcc0, BETA, a1
bcnez $fcc0, .L114 // ALPHA==0 BETA==0
b .L113 // ALPHA==0 BETA!=0
.align 3
.L111: // ALPHA!=0 BETA!=0
// Contiguous case: y = alpha*x + beta*y, 8 floats per pass.
vld VX0, X, 0 * SIZE
vld VX2, Y, 0 * SIZE
vld VX1, X, 4 * SIZE
vld VX3, Y, 4 * SIZE
vfmul.s VX0, VX0, VXA
vfmul.s VX1, VX1, VXA
vfmadd.s VX2, VX2, VXB, VX0
vfmadd.s VX3, VX3, VXB, VX1
vst VX2, Y, 0 * SIZE
vst VX3, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L111
b .L997
.align 3
.L112: // ALPHA!=0 BETA==0
// y = alpha*x; Y is write-only here.
vld VX0, X, 0 * SIZE
vld VX1, X, 4 * SIZE
vfmul.s VX0, VX0, VXA
vfmul.s VX1, VX1, VXA
vst VX0, Y, 0 * SIZE
vst VX1, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L112
b .L997
.align 3
.L113: // ALPHA==0 BETA!=0
// y = beta*y; X is not touched.
vld VX2, Y, 0 * SIZE
vld VX3, Y, 4 * SIZE
vfmul.s VX2, VX2, VXB
vfmul.s VX3, VX3, VXB
vst VX2, Y, 0 * SIZE
vst VX3, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L113
b .L997
.align 3
.L114: // ALPHA==0 BETA==0
// y = 0: store the zero vector VXZ.
vst VXZ, Y, 0 * SIZE
vst VXZ, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L114
b .L997
.align 3
.L12: // INCX==1 and INCY!=1
bge $r0, I, .L997
// YY tracks the store position in Y; Y itself advances with the loads.
move YY, Y
fcmp.ceq.s $fcc0, ALPHA, a1
bcnez $fcc0, .L120
fcmp.ceq.s $fcc0, BETA, a1
bcnez $fcc0, .L122 // ALPHA!=0 BETA==0
b .L121 // ALPHA!=0 BETA!=0
.align 3
.L120:
fcmp.ceq.s $fcc0, BETA, a1
bcnez $fcc0, .L124 // ALPHA==0 BETA==0
b .L123 // ALPHA==0 BETA!=0
.align 3
.L121: // ALPHA!=0 BETA!=0
// Gather 4 strided y elements into VX2 lanes, then scatter the results
// element-by-element with vstelm.
vld VX0, X, 0 * SIZE
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vinsgr2vr.w VX2, t1, 0
vinsgr2vr.w VX2, t2, 1
vinsgr2vr.w VX2, t3, 2
vinsgr2vr.w VX2, t4, 3
add.d Y, Y, INCY
vfmul.s VX0, VX0, VXA
vld VX1, X, 4 * SIZE
vfmadd.s VX2, VX2, VXB, VX0
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
add.d Y, Y, INCY
vinsgr2vr.w VX3, t1, 0
vinsgr2vr.w VX3, t2, 1
vinsgr2vr.w VX3, t3, 2
vinsgr2vr.w VX3, t4, 3
vstelm.w VX2, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VX2, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VX2, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VX2, YY, 0, 3
add.d YY, YY, INCY
vfmul.s VX1, VX1, VXA
vfmadd.s VX3, VX3, VXB, VX1
addi.d I, I, -1
vstelm.w VX3, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VX3, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VX3, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VX3, YY, 0, 3
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
blt $r0, I, .L121
b .L997
.align 3
.L122: // ALPHA!=0 BETA==0
vld VX0, X, 0 * SIZE
vld VX1, X, 4 * SIZE
vfmul.s VX0, VX0, VXA
vfmul.s VX1, VX1, VXA
vstelm.w VX0, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VX0, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VX0, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VX0, YY, 0, 3
add.d YY, YY, INCY
vstelm.w VX1, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VX1, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VX1, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VX1, YY, 0, 3
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L122
b .L997
.align 3
.L123: // ALPHA==0 BETA!=0
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vinsgr2vr.w VX2, t1, 0
vinsgr2vr.w VX2, t2, 1
vinsgr2vr.w VX2, t3, 2
vinsgr2vr.w VX2, t4, 3
add.d Y, Y, INCY
vfmul.s VX2, VX2, VXB
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
add.d Y, Y, INCY
vinsgr2vr.w VX3, t1, 0
vinsgr2vr.w VX3, t2, 1
vinsgr2vr.w VX3, t3, 2
vinsgr2vr.w VX3, t4, 3
vstelm.w VX2, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VX2, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VX2, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VX2, YY, 0, 3
add.d YY, YY, INCY
vfmul.s VX3, VX3, VXB
addi.d I, I, -1
vstelm.w VX3, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VX3, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VX3, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VX3, YY, 0, 3
add.d YY, YY, INCY
blt $r0, I, .L123
b .L997
.align 3
.L124: // ALPHA==0 BETA==0
// Scatter zeros; VXZ is all-zero so repeating lanes 0-3 twice is harmless.
vstelm.w VXZ, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 3
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 3
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L124
b .L997
.align 3
.L21:// INCX!=1 and INCY==1
bge $r0, I, .L997
fcmp.ceq.s $fcc0, ALPHA, a1
bcnez $fcc0, .L210
fcmp.ceq.s $fcc0, BETA, a1
bcnez $fcc0, .L212 // ALPHA!=0 BETA==0
b .L211 // ALPHA!=0 BETA!=0
.align 3
.L210:
fcmp.ceq.s $fcc0, BETA, a1
bcnez $fcc0, .L214 // ALPHA==0 BETA==0
b .L213 // ALPHA==0 BETA!=0
.align 3
.L211: // ALPHA!=0 BETA!=0
// Gather strided x into VX0/VX1; y is contiguous so stores are vector-wide.
vld VX2, Y, 0 * SIZE
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
add.d X, X, INCX
vfmul.s VX0, VXA, VX0
vld VX3, Y, 4 * SIZE
vfmadd.s VX2, VX2, VXB, VX0
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vst VX2, Y, 0 * SIZE
vfmul.s VX1, VX1, VXA
vfmadd.s VX3, VX3, VXB, VX1
addi.d I, I, -1
vst VX3, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L211
b .L997
.align 3
.L212: // ALPHA!=0 BETA==0
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
add.d X, X, INCX
vfmul.s VX0, VXA, VX0
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vst VX0, Y, 0 * SIZE
vfmul.s VX1, VX1, VXA
addi.d I, I, -1
vst VX1, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L212
b .L997
.align 3
.L213: // ALPHA==0 BETA!=0
vld VX2, Y, 0 * SIZE
vld VX3, Y, 4 * SIZE
vfmul.s VX2, VX2, VXB
vfmul.s VX3, VX3, VXB
vst VX2, Y, 0 * SIZE
vst VX3, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L213
b .L997
.align 3
.L214: // ALPHA==0 BETA==0
vst VXZ, Y, 0 * SIZE
vst VXZ, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L214
b .L997
.align 3
.L22:
// Both strides non-unit: gather x and y, scatter the result via YY.
bge $r0, I, .L997
move YY, Y
fcmp.ceq.s $fcc0, ALPHA, a1
bcnez $fcc0, .L220
fcmp.ceq.s $fcc0, BETA, a1
bcnez $fcc0, .L222 // ALPHA!=0 BETA==0
b .L221 // ALPHA!=0 BETA!=0
.align 3
.L220:
fcmp.ceq.s $fcc0, BETA, a1
bcnez $fcc0, .L224 // ALPHA==0 BETA==0
b .L223 // ALPHA==0 BETA!=0
.align 3
.L221: // ALPHA!=0 BETA!=0
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vinsgr2vr.w VX2, t1, 0
vinsgr2vr.w VX2, t2, 1
vinsgr2vr.w VX2, t3, 2
vinsgr2vr.w VX2, t4, 3
add.d Y, Y, INCY
vfmul.s VX0, VX0, VXA
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vfmadd.s VX2, VX2, VXB, VX0
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vstelm.w VX2, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VX2, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VX2, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VX2, YY, 0, 3
add.d YY, YY, INCY
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vinsgr2vr.w VX3, t1, 0
vinsgr2vr.w VX3, t2, 1
vinsgr2vr.w VX3, t3, 2
vinsgr2vr.w VX3, t4, 3
add.d Y, Y, INCY
vfmul.s VX1, VX1, VXA
addi.d I, I, -1
vfmadd.s VX3, VX3, VXB, VX1
vstelm.w VX3, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VX3, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VX3, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VX3, YY, 0, 3
add.d YY, YY, INCY
blt $r0, I, .L221
b .L997
.align 3
.L222: // ALPHA!=0 BETA==0
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
add.d X, X, INCX
vfmul.s VX0, VX0, VXA
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vstelm.w VX0, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VX0, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VX0, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VX0, YY, 0, 3
add.d YY, YY, INCY
vfmul.s VX1, VX1, VXA
addi.d I, I, -1
vstelm.w VX1, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VX1, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VX1, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VX1, YY, 0, 3
add.d YY, YY, INCY
blt $r0, I, .L222
b .L997
.align 3
.L223: // ALPHA==0 BETA!=0
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vinsgr2vr.w VX2, t1, 0
vinsgr2vr.w VX2, t2, 1
vinsgr2vr.w VX2, t3, 2
vinsgr2vr.w VX2, t4, 3
add.d Y, Y, INCY
vfmul.s VX2, VX2, VXB
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
add.d Y, Y, INCY
vinsgr2vr.w VX3, t1, 0
vinsgr2vr.w VX3, t2, 1
vinsgr2vr.w VX3, t3, 2
vinsgr2vr.w VX3, t4, 3
vstelm.w VX2, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VX2, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VX2, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VX2, YY, 0, 3
add.d YY, YY, INCY
vfmul.s VX3, VX3, VXB
addi.d I, I, -1
vstelm.w VX3, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VX3, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VX3, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VX3, YY, 0, 3
add.d YY, YY, INCY
blt $r0, I, .L223
b .L997
.align 3
.L224: // ALPHA==0 BETA==0
// Scatter zeros; lane index repeats are harmless since VXZ is all-zero.
vstelm.w VXZ, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 3
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 3
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L224
b .L997
.align 3
.L997:
// Scalar tail: process the remaining N % 8 elements one at a time.
andi I, N, 7
bge $r0, I, .L999
.align 3
.L998:
fld.s $f12, X, 0 * SIZE
fld.s $f13, Y, 0 * SIZE
addi.d I, I, -1
fmul.s $f12, $f12, ALPHA
fmadd.s $f13, $f13, BETA, $f12
fst.s $f13, Y, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L998
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,323 @@
// saxpy kernel, single precision, LoongArch LASX (256-bit $xr registers):
//   y[i] = ALPHA * x[i] + y[i]
// Returns immediately when ALPHA == 0. The contiguous path additionally
// special-cases ALPHA == 1 (pure add, no multiply). Main loops process
// 8 floats per iteration (one 256-bit vector); scalar tails handle N % 8.
#define ASSEMBLER
#include "common.h"
#define N $r4
#define XX $r5
#define YY $r6
#define ALPHA $f0
#define X $r7
#define INCX $r8
#define Y $r9
#define INCY $r10
#define I $r12
#define TEMP $r13
#define t1 $r14
#define t2 $r16
#define t3 $r15
#define t4 $r17
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define b1 $f16
#define b2 $f17
#define b3 $f18
#define b4 $f19
#define VX0 $xr8
#define VX1 $xr20
#define VX2 $xr21
#define VX3 $xr22
#define VXA $xr23
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
// a1 = 0.0f, a2 = 1.0f — constants for the ALPHA special cases.
movgr2fr.d a1, $r0
ffint.s.l a1, a1
movgr2fr.d a2, TEMP
ffint.s.l a2, a2
// ALPHA == 0: y is unchanged, nothing to do.
fcmp.ceq.s $fcc0, ALPHA, a1
bcnez $fcc0, .L999
// Convert element strides to byte strides.
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
// Broadcast ALPHA into VXA.
movfr2gr.s t1, ALPHA
xvreplgr2vr.w VXA, t1
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L113
// ALPHA == 1: use the add-only loop .L111, else the fmadd loop .L112.
fcmp.ceq.s $fcc0, ALPHA, a2
bceqz $fcc0, .L112
.align 3
.L111:
xvld VX0, X, 0 * SIZE
xvld VX2, Y, 0 * SIZE
addi.d I, I, -1
xvfadd.s VX2, VX0, VX2
xvst VX2, Y, 0 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L111
b .L113
.align 3
.L112:
xvld VX0, X, 0 * SIZE
xvld VX2, Y, 0 * SIZE
addi.d I, I, -1
xvfmadd.s VX2, VX0, VXA, VX2
xvst VX2, Y, 0 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L112
.align 3
.L113:
// Scalar tail for the contiguous path.
andi I, N, 7
bge $r0, I, .L999
.align 3
.L114:
fld.s $f12, X, 0 * SIZE
fld.s $f14, Y, 0 * SIZE
addi.d I, I, -1
fmadd.s $f14, $f12, $f0, $f14
fst.s $f14, Y, 0 * SIZE
addi.d X, X, SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L114
b .L999
.align 3
.L12: // INCX==1 and INCY!=1
bge $r0, I, .L122
// YY tracks the store position in Y; Y itself advances with the gathers.
move YY, Y
.align 3
.L121:
xvld VX0, X, 0 * SIZE
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
xvinsgr2vr.w VX2, t1, 0
xvinsgr2vr.w VX2, t2, 1
xvinsgr2vr.w VX2, t3, 2
xvinsgr2vr.w VX2, t4, 3
add.d Y, Y, INCY
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
xvinsgr2vr.w VX2, t1, 4
xvinsgr2vr.w VX2, t2, 5
xvinsgr2vr.w VX2, t3, 6
xvinsgr2vr.w VX2, t4, 7
add.d Y, Y, INCY
xvfmadd.s VX2, VX0, VXA, VX2
addi.d I, I, -1
xvstelm.w VX2, YY, 0, 0
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 1
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 2
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 3
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 4
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 5
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 6
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 7
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
blt $r0, I, .L121
.align 3
.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
fld.s $f12, X, 0 * SIZE
fld.s $f14, Y, 0 * SIZE
addi.d I, I, -1
fmadd.s $f14, $f12, $f0, $f14
fst.s $f14, Y, 0 * SIZE
addi.d X, X, SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
.L21:// INCX!=1 and INCY==1
bge $r0, I, .L212
.align 3
.L211:
// Gather 8 strided x elements into VX0; y is contiguous.
xvld VX2, Y, 0 * SIZE
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
add.d X, X, INCX
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvfmadd.s VX2, VX0, VXA, VX2
addi.d I, I, -1
xvst VX2, Y, 0 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L211
.align 3
.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
fld.s $f12, X, 0 * SIZE
fld.s $f14, Y, 0 * SIZE
addi.d I, I, -1
fmadd.s $f14, $f12, $f0, $f14
fst.s $f14, Y, 0 * SIZE
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
.L22:
// Both strides non-unit: gather x and y, scatter the result via YY.
bge $r0, I, .L223
move YY, Y
.align 3
.L222:
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.w VX2, t1, 0
xvinsgr2vr.w VX2, t2, 1
xvinsgr2vr.w VX2, t3, 2
xvinsgr2vr.w VX2, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
xvinsgr2vr.w VX2, t1, 4
xvinsgr2vr.w VX2, t2, 5
xvinsgr2vr.w VX2, t3, 6
xvinsgr2vr.w VX2, t4, 7
add.d Y, Y, INCY
xvfmadd.s VX2, VX0, VXA, VX2
addi.d I, I, -1
xvstelm.w VX2, YY, 0, 0
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 1
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 2
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 3
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 4
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 5
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 6
add.d YY, YY, INCY
xvstelm.w VX2, YY, 0, 7
add.d YY, YY, INCY
blt $r0, I, .L222
.align 3
.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
fld.s $f12, X, 0 * SIZE
fld.s $f14, Y, 0 * SIZE
addi.d I, I, -1
fmadd.s $f14, $f12, $f0, $f14
fst.s $f14, Y, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,338 @@
// saxpy kernel, single precision, LoongArch LSX (128-bit $vr registers):
//   y[i] = ALPHA * x[i] + y[i]
// Returns immediately when ALPHA == 0. The contiguous path additionally
// special-cases ALPHA == 1 (pure add). Main loops process 8 floats per
// iteration (two 4-lane vectors); scalar tails handle N % 8.
#define ASSEMBLER
#include "common.h"
#define N $r4
#define XX $r5
#define YY $r6
#define ALPHA $f0
#define X $r7
#define INCX $r8
#define Y $r9
#define INCY $r10
#define I $r12
#define TEMP $r13
#define t1 $r14
#define t2 $r16
#define t3 $r15
#define t4 $r17
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define b1 $f16
#define b2 $f17
#define b3 $f18
#define b4 $f19
#define VX0 $vr8
#define VX1 $vr20
#define VX2 $vr21
#define VX3 $vr22
#define VXA $vr23
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
// a1 = 0.0f, a2 = 1.0f — constants for the ALPHA special cases.
movgr2fr.d a1, $r0
ffint.s.l a1, a1
movgr2fr.d a2, TEMP
ffint.s.l a2, a2
// ALPHA == 0: y is unchanged, nothing to do.
fcmp.ceq.s $fcc0, ALPHA, a1
bcnez $fcc0, .L999
// Convert element strides to byte strides.
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
// Broadcast ALPHA into VXA.
movfr2gr.s t1, ALPHA
vreplgr2vr.w VXA, t1
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L113
// ALPHA == 1: use the add-only loop .L111, else the fmadd loop .L112.
fcmp.ceq.s $fcc0, ALPHA, a2
bceqz $fcc0, .L112
.align 3
.L111:
vld VX0, X, 0 * SIZE
vld VX2, Y, 0 * SIZE
vld VX1, X, 4 * SIZE
vld VX3, Y, 4 * SIZE
vfadd.s VX2, VX0, VX2
vfadd.s VX3, VX1, VX3
vst VX2, Y, 0 * SIZE
vst VX3, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L111
b .L113
.align 3
.L112:
vld VX0, X, 0 * SIZE
vld VX2, Y, 0 * SIZE
vld VX1, X, 4 * SIZE
vld VX3, Y, 4 * SIZE
vfmadd.s VX2, VX0, VXA, VX2
vfmadd.s VX3, VX1, VXA, VX3
vst VX2, Y, 0 * SIZE
vst VX3, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L112
b .L113
.align 3
.L113:
// Scalar tail for the contiguous path.
andi I, N, 7
bge $r0, I, .L999
.align 3
.L114:
fld.s $f12, X, 0 * SIZE
fld.s $f14, Y, 0 * SIZE
addi.d I, I, -1
fmadd.s $f14, $f12, $f0, $f14
fst.s $f14, Y, 0 * SIZE
addi.d X, X, SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L114
b .L999
.align 3
.L12: // INCX==1 and INCY!=1
bge $r0, I, .L122
// YY tracks the store position in Y; Y itself advances with the gathers.
move YY, Y
.align 3
.L121:
vld VX0, X, 0 * SIZE
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vinsgr2vr.w VX2, t1, 0
vinsgr2vr.w VX2, t2, 1
vinsgr2vr.w VX2, t3, 2
vinsgr2vr.w VX2, t4, 3
add.d Y, Y, INCY
vfmadd.s VX2, VX0, VXA, VX2
vld VX1, X, 4 * SIZE
vstelm.w VX2, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VX2, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VX2, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VX2, YY, 0, 3
add.d YY, YY, INCY
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vinsgr2vr.w VX3, t1, 0
vinsgr2vr.w VX3, t2, 1
vinsgr2vr.w VX3, t3, 2
vinsgr2vr.w VX3, t4, 3
add.d Y, Y, INCY
vfmadd.s VX3, VX1, VXA, VX3
addi.d I, I, -1
vstelm.w VX3, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VX3, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VX3, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VX3, YY, 0, 3
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
blt $r0, I, .L121
.align 3
.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
fld.s $f12, X, 0 * SIZE
fld.s $f14, Y, 0 * SIZE
addi.d I, I, -1
fmadd.s $f14, $f12, $f0, $f14
fst.s $f14, Y, 0 * SIZE
addi.d X, X, SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
.L21:// INCX!=1 and INCY==1
bge $r0, I, .L212
.align 3
.L211:
// Gather strided x elements into VX0/VX1; y is contiguous.
vld VX2, Y, 0 * SIZE
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
add.d X, X, INCX
vfmadd.s VX2, VX0, VXA, VX2
vld VX3, Y, 4 * SIZE
vst VX2, Y, 0 * SIZE
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
add.d X, X, INCX
vfmadd.s VX3, VX1, VXA, VX3
addi.d I, I, -1
vst VX3, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L211
.align 3
.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
fld.s $f12, X, 0 * SIZE
fld.s $f14, Y, 0 * SIZE
addi.d I, I, -1
fmadd.s $f14, $f12, $f0, $f14
fst.s $f14, Y, 0 * SIZE
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
.L22:
// Both strides non-unit: gather x and y, scatter the result via YY.
bge $r0, I, .L223
move YY, Y
.align 3
.L222:
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
add.d X, X, INCX
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vinsgr2vr.w VX2, t1, 0
vinsgr2vr.w VX2, t2, 1
vinsgr2vr.w VX2, t3, 2
vinsgr2vr.w VX2, t4, 3
add.d Y, Y, INCY
vfmadd.s VX2, VX0, VXA, VX2
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vstelm.w VX2, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VX2, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VX2, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VX2, YY, 0, 3
add.d YY, YY, INCY
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vinsgr2vr.w VX3, t1, 0
vinsgr2vr.w VX3, t2, 1
vinsgr2vr.w VX3, t3, 2
vinsgr2vr.w VX3, t4, 3
add.d Y, Y, INCY
vfmadd.s VX3, VX1, VXA, VX3
addi.d I, I, -1
vstelm.w VX3, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VX3, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VX3, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VX3, YY, 0, 3
add.d YY, YY, INCY
blt $r0, I, .L222
.align 3
.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
fld.s $f12, X, 0 * SIZE
fld.s $f14, Y, 0 * SIZE
addi.d I, I, -1
fmadd.s $f14, $f12, $f0, $f14
fst.s $f14, Y, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,216 @@
// scopy kernel, single precision, LoongArch LASX (256-bit $xr registers):
//   y[i] = x[i]
// Dispatches on stride: vector loads/stores for unit strides, per-element
// gather/scatter otherwise. Main loops move 8 floats per iteration; the
// scalar tails handle N % 8.
//
// FIX: in the INCX!=1 && INCY!=1 loop (.L222) the original stored a3/a4
// with "fst.s a3, X, 0 * SIZE" / "fst.s a4, X, 0 * SIZE" — writing back
// into the source array X instead of the destination Y, so X was corrupted
// and two of every four Y elements were never written. The destination
// base must be Y for all four stores of each group.
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define I $r17
#define TEMP $r18
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define VX0 $xr12
#define VX1 $xr13
PROLOGUE
// N <= 0: nothing to do.
bge $r0, N, .L999
li.d TEMP, 1
// Convert element strides to byte strides.
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
// Both contiguous: straight 256-bit vector copy.
bge $r0, I, .L112
.align 3
.L111:
xvld VX0, X, 0 * SIZE
addi.d I, I, -1
xvst VX0, Y, 0 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L111
.align 3
.L112:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L113:
fld.s $f12, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
fst.s $f12, Y, 0 * SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L113
b .L999
.align 3
.L12:
// X contiguous, Y strided: vector load, element-wise scatter.
bge $r0, I, .L122
.align 3
.L121:
xvld VX0, X, 0 * SIZE
xvstelm.w VX0, Y, 0, 0
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0, 1
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0, 2
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0, 3
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0, 4
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0, 5
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0, 6
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0, 7
add.d Y, Y, INCY
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
.align 3
.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
fld.s $f12, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
fst.s $f12, Y, 0 * SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
.L21:
// X strided, Y contiguous: element-wise gather, vector store.
bge $r0, I, .L212
.align 3
.L211:
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvst VX0, Y, 0 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L211
.align 3
.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
fld.s $f12, X, 0 * SIZE
addi.d I, I, -1
fst.s $f12, Y, 0 * SIZE
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
.L22:
// Both strided: scalar element-by-element copy, 8 per iteration.
bge $r0, I, .L223
.align 3
.L222:
fld.s a1, X, 0 * SIZE
add.d X, X, INCX
fld.s a2, X, 0 * SIZE
add.d X, X, INCX
fld.s a3, X, 0 * SIZE
add.d X, X, INCX
fld.s a4, X, 0 * SIZE
add.d X, X, INCX
fst.s a1, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s a2, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s a3, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s a4, Y, 0 * SIZE
add.d Y, Y, INCY
fld.s a1, X, 0 * SIZE
add.d X, X, INCX
fld.s a2, X, 0 * SIZE
add.d X, X, INCX
fld.s a3, X, 0 * SIZE
add.d X, X, INCX
fld.s a4, X, 0 * SIZE
add.d X, X, INCX
fst.s a1, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s a2, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s a3, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s a4, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
blt $r0, I, .L222
.align 3
.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
fld.s $f12, X, 0 * SIZE
addi.d I, I, -1
fst.s $f12, Y, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,220 @@
// scopy kernel, single precision, LoongArch LSX (128-bit $vr registers):
//   y[i] = x[i]
// Dispatches on stride: vector loads/stores for unit strides, per-element
// gather/scatter otherwise. Main loops move 8 floats per iteration (two
// 4-lane vectors); the scalar tails handle N % 8.
//
// FIX: in the INCX!=1 && INCY!=1 loop (.L222) the original stored a3/a4
// with "fst.s a3, X, 0 * SIZE" / "fst.s a4, X, 0 * SIZE" — writing back
// into the source array X instead of the destination Y, so X was corrupted
// and two of every four Y elements were never written. The destination
// base must be Y for all four stores of each group.
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define I $r17
#define TEMP $r18
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define VX0 $vr12
#define VX1 $vr13
PROLOGUE
// N <= 0: nothing to do.
bge $r0, N, .L999
li.d TEMP, 1
// Convert element strides to byte strides.
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
// Both contiguous: straight 128-bit vector copy, two vectors per pass.
bge $r0, I, .L112
.align 3
.L111:
vld VX0, X, 0 * SIZE
vld VX1, X, 4 * SIZE
addi.d I, I, -1
vst VX0, Y, 0 * SIZE
vst VX1, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L111
.align 3
.L112:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L113:
fld.s $f12, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
fst.s $f12, Y, 0 * SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L113
b .L999
.align 3
.L12:
// X contiguous, Y strided: vector loads, element-wise scatter.
bge $r0, I, .L122
.align 3
.L121:
vld VX0, X, 0 * SIZE
vld VX1, X, 4 * SIZE
vstelm.w VX0, Y, 0, 0
add.d Y, Y, INCY
vstelm.w VX0, Y, 0, 1
add.d Y, Y, INCY
vstelm.w VX0, Y, 0, 2
add.d Y, Y, INCY
vstelm.w VX0, Y, 0, 3
add.d Y, Y, INCY
vstelm.w VX1, Y, 0, 0
add.d Y, Y, INCY
vstelm.w VX1, Y, 0, 1
add.d Y, Y, INCY
vstelm.w VX1, Y, 0, 2
add.d Y, Y, INCY
vstelm.w VX1, Y, 0, 3
add.d Y, Y, INCY
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
.align 3
.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
fld.s $f12, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
fst.s $f12, Y, 0 * SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
.L21:
// X strided, Y contiguous: element-wise gather, vector stores.
bge $r0, I, .L212
.align 3
.L211:
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
vst VX0, Y, 0 * SIZE
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vst VX1, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L211
.align 3
.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
fld.s $f12, X, 0 * SIZE
addi.d I, I, -1
fst.s $f12, Y, 0 * SIZE
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
.L22:
// Both strided: scalar element-by-element copy, 8 per iteration.
bge $r0, I, .L223
.align 3
.L222:
fld.s a1, X, 0 * SIZE
add.d X, X, INCX
fld.s a2, X, 0 * SIZE
add.d X, X, INCX
fld.s a3, X, 0 * SIZE
add.d X, X, INCX
fld.s a4, X, 0 * SIZE
add.d X, X, INCX
fst.s a1, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s a2, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s a3, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s a4, Y, 0 * SIZE
add.d Y, Y, INCY
fld.s a1, X, 0 * SIZE
add.d X, X, INCX
fld.s a2, X, 0 * SIZE
add.d X, X, INCX
fld.s a3, X, 0 * SIZE
add.d X, X, INCX
fld.s a4, X, 0 * SIZE
add.d X, X, INCX
fst.s a1, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s a2, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s a3, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s a4, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
blt $r0, I, .L222
.align 3
.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
fld.s $f12, X, 0 * SIZE
addi.d I, I, -1
fst.s $f12, Y, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,205 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define J $r13
#define t1 $r14
#define t2 $r18
#define t3 $r15
#define t4 $r17
#define TEMP $r16
#define m0 $xr8
#define x1 $xr9
#define x2 $xr10
#define x3 $xr11
#define x4 $xr12
#define x5 $xr13
#define x6 $xr14
#define x7 $xr15
#define x8 $xr16
#define VX0 $xr20
#define VX1 $xr21
#define VM0 $xr22
#define VM1 $xr23
#define VM2 $xr18
#define VM3 $xr17
PROLOGUE
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
xvld VM0, X, 0
srai.d I, N, 3
bge $r0, I, .L12
.align 3
.L10:
xvld VX0, X, 0 * SIZE
addi.d I, I, -1
xvfmax.s VM0, VM0, VX0
addi.d X, X, 8 * SIZE
blt $r0, I, .L10
.align 3
.L11:
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvpickve.w x5, VM0, 4
xvpickve.w x6, VM0, 5
xvpickve.w x7, VM0, 6
xvpickve.w x8, VM0, 7
xvfmax.s VM3, x1, x2
xvfmax.s VM2, x3, x4
xvfmax.s VM1, x5, x6
xvfmax.s VM0, x7, x8
xvfmax.s VM2, VM2, VM3
xvfmax.s VM0, VM0, VM1
xvfmax.s VM0, VM0, VM2
.align 3
.L12: //INCX==1 and N<8
andi I, N, 7
li.d J, 4
bge J, I, .L13 // 4<N<8
xvld VX0, X, 0
slli.d J, J, 1 // 8
sub.d I, J, I
slli.d I, I, BASE_SHIFT
xvldx VX1, X, I
xvfmax.s m0, VX0, VX1 //patial repeat read
xvpickve.w x1, m0, 0
xvpickve.w x2, m0, 1
xvpickve.w x3, m0, 2
xvpickve.w x4, m0, 3
xvfmax.s m0, x1, x2
xvfmax.s VM1, x3, x4
xvfmax.s m0, m0, VM1
xvfmax.s VM0, m0, VM0
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L13: //INCX==1 and 0<=N<=4
bge $r0, I, .L15
.align 3
.L14:
xvld x1, X, 0
addi.d I, I, -1
xvfmax.s VM0, VM0, x1
addi.d X, X, SIZE
blt $r0, I, .L14
.align 3
.L15:
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L20: // INCX!=1
move TEMP, X // initialize the max value
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L23
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t2, 1
xvinsgr2vr.w VM0, t3, 2
xvinsgr2vr.w VM0, t4, 3
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t1, 4
xvinsgr2vr.w VM0, t2, 5
xvinsgr2vr.w VM0, t3, 6
xvinsgr2vr.w VM0, t4, 7
.align 3
.L21:
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
addi.d I, I, -1
xvfmax.s VM0, VM0, VX0
blt $r0, I, .L21
.align 3
.L22:
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvpickve.w x5, VM0, 4
xvpickve.w x6, VM0, 5
xvpickve.w x7, VM0, 6
xvpickve.w x8, VM0, 7
xvfmax.s VM3, x1, x2
xvfmax.s VM2, x3, x4
xvfmax.s VM1, x5, x6
xvfmax.s VM0, x7, x8
xvfmax.s VM2, VM2, VM3
xvfmax.s VM0, VM0, VM1
xvfmax.s VM0, VM0, VM2
.align 3
.L23: //INCX!=1 and N<8
// Scalar tail for the strided case: handle the remaining N % 8 elements.
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24:
// NOTE(review): xvld reads a full 32-byte vector at X for every remaining
// element; only lane 0 holds the valid element (the final result in .L999 is
// taken from lane 0 of VM0 via $f22), so the max is correct, but the load can
// read past the end of the array near a page boundary — confirm a scalar
// fld.s/fmax.s tail is not required here.
xvld x1, X, 0
addi.d I, I, -1
xvfmax.s VM0, VM0, x1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,171 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define J $r13
#define t1 $r14
#define t2 $r18
#define t3 $r15
#define t4 $r17
#define TEMP $r16
#define m0 $vr8
#define x1 $vr9
#define x2 $vr10
#define x3 $vr11
#define x4 $vr12
#define VX0 $vr20
#define VX1 $vr21
#define VM0 $vr22
#define VM1 $vr23
PROLOGUE
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
vld VM0, X, 0
srai.d I, N, 3
bge $r0, I, .L12
.align 3
.L10:
vld VX0, X, 0 * SIZE
vld VX1, X, 4 * SIZE
addi.d I, I, -1
vfmax.s VM1, VX0, VX1
addi.d X, X, 8 * SIZE
vfmax.s VM0, VM0, VM1
blt $r0, I, .L10
.align 3
.L11:
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfmax.s VM1, x2, x3
vfmax.s VM0, x4, VM0
vfmax.s VM0, VM0, VM1
.align 3
.L12: //INCX==1 and N<8
andi I, N, 7
li.d J, 4
bge J, I, .L13 // 4<N<8
vld VX0, X, 0
slli.d J, J, 1 // 8
sub.d I, J, I
slli.d I, I, BASE_SHIFT
vldx VX1, X, I
vfmax.s m0, VX0, VX1 //patial repeat read
vreplvei.w x2, m0, 1
vreplvei.w x3, m0, 2
vreplvei.w x4, m0, 3
vfmax.s x1, x2, x3
vfmax.s VM1, x4, m0
vfmax.s m0, x1, VM1
vfmax.s VM0, m0, VM0
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L13: //INCX==1 and 0<=N<=4
bge $r0, I, .L15
.align 3
.L14:
vld x1, X, 0
addi.d I, I, -1
vfmax.s VM0, VM0, x1
addi.d X, X, SIZE
blt $r0, I, .L14
.align 3
.L15:
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L20: // INCX!=1
move TEMP, X
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.w VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L23
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.w VM0, t2, 1
vinsgr2vr.w VM0, t3, 2
vinsgr2vr.w VM0, t4, 3
.align 3
.L21:
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
addi.d I, I, -1
vfmax.s VM1, VX0, VX1
vfmax.s VM0, VM0, VM1
blt $r0, I, .L21
.align 3
.L22:
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfmax.s VM1, x2, x3
vfmax.s VM0, x4, VM0
vfmax.s VM0, VM0, VM1
.align 3
.L23: //INCX!=1 and N<8
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24:
vld x1, X, 0
addi.d I, I, -1
vfmax.s VM0, VM0, x1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,205 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define J $r13
#define t1 $r14
#define t2 $r18
#define t3 $r15
#define t4 $r17
#define TEMP $r16
#define m0 $xr8
#define x1 $xr9
#define x2 $xr10
#define x3 $xr11
#define x4 $xr12
#define x5 $xr13
#define x6 $xr14
#define x7 $xr15
#define x8 $xr16
#define VX0 $xr20
#define VX1 $xr21
#define VM0 $xr22
#define VM1 $xr23
#define VM2 $xr18
#define VM3 $xr19
PROLOGUE
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
xvld VM0, X, 0
srai.d I, N, 3
bge $r0, I, .L12
.align 3
.L10:
xvld VX0, X, 0 * SIZE
addi.d I, I, -1
xvfmin.s VM0, VM0, VX0
addi.d X, X, 8 * SIZE
blt $r0, I, .L10
.align 3
.L11:
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvpickve.w x5, VM0, 4
xvpickve.w x6, VM0, 5
xvpickve.w x7, VM0, 6
xvpickve.w x8, VM0, 7
xvfmin.s VM3, x1, x2
xvfmin.s VM2, x3, x4
xvfmin.s VM1, x5, x6
xvfmin.s VM0, x7, x8
xvfmin.s VM2, VM2, VM3
xvfmin.s VM0, VM0, VM1
xvfmin.s VM0, VM0, VM2
.align 3
.L12: //INCX==1 and N<8
andi I, N, 7
li.d J, 4
bge J, I, .L13 // 4<N<8
xvld VX0, X, 0
slli.d J, J, 1 // 8
sub.d I, J, I
slli.d I, I, BASE_SHIFT
xvldx VX1, X, I
xvfmin.s m0, VX0, VX1 //patial repeat read
xvpickve.w x1, m0, 0
xvpickve.w x2, m0, 1
xvpickve.w x3, m0, 2
xvpickve.w x4, m0, 3
xvfmin.s m0, x1, x2
xvfmin.s VM1, x3, x4
xvfmin.s m0, m0, VM1
xvfmin.s VM0, m0, VM0
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L13: //INCX==1 and 0<=N<=4
bge $r0, I, .L15
.align 3
.L14:
xvld x1, X, 0
addi.d I, I, -1
xvfmin.s VM0, VM0, x1
addi.d X, X, SIZE
blt $r0, I, .L14
.align 3
.L15:
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L20: // INCX!=1
move TEMP, X // initialize the min value
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L23
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t2, 1
xvinsgr2vr.w VM0, t3, 2
xvinsgr2vr.w VM0, t4, 3
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t1, 4
xvinsgr2vr.w VM0, t2, 5
xvinsgr2vr.w VM0, t3, 6
xvinsgr2vr.w VM0, t4, 7
.align 3
.L21:
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
addi.d I, I, -1
xvfmin.s VM0, VM0, VX0
blt $r0, I, .L21
.align 3
.L22:
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvpickve.w x5, VM0, 4
xvpickve.w x6, VM0, 5
xvpickve.w x7, VM0, 6
xvpickve.w x8, VM0, 7
xvfmin.s VM3, x1, x2
xvfmin.s VM2, x3, x4
xvfmin.s VM1, x5, x6
xvfmin.s VM0, x7, x8
xvfmin.s VM2, VM2, VM3
xvfmin.s VM0, VM0, VM1
xvfmin.s VM0, VM0, VM2
.align 3
.L23: //INCX!=1 and N<8
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24:
xvld x1, X, 0
addi.d I, I, -1
xvfmin.s VM0, VM0, x1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,174 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define J $r13
#define t1 $r14
#define t2 $r18
#define t3 $r15
#define t4 $r17
#define TEMP $r16
#define m0 $vr8
#define x1 $vr9
#define x2 $vr10
#define x3 $vr11
#define x4 $vr12
#define VX0 $vr20
#define VX1 $vr21
#define VM0 $vr22
#define VM1 $vr23
PROLOGUE
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
vld VM0, X, 0
srai.d I, N, 3
bge $r0, I, .L12
.align 3
.L10:
vld VX0, X, 0 * SIZE
vld VX1, X, 4 * SIZE
addi.d I, I, -1
vfmin.s VM1, VX0, VX1
addi.d X, X, 8 * SIZE
vfmin.s VM0, VM0, VM1
blt $r0, I, .L10
.align 3
.L11:
vreplvei.w x1, VM0, 0
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfmin.s VM1, x1, x2
vfmin.s VM0, x3, x4
vfmin.s VM0, VM0, VM1
.align 3
.L12: //INCX==1 and N<8
andi I, N, 7
li.d J, 4
bge J, I, .L13 // 4<N<8
vld VX0, X, 0
slli.d J, J, 1 // 8
sub.d I, J, I
slli.d I, I, BASE_SHIFT
vldx VX1, X, I
vfmin.s m0, VX0, VX1 //patial repeat read
vreplvei.w x1, m0, 0
vreplvei.w x2, m0, 1
vreplvei.w x3, m0, 2
vreplvei.w x4, m0, 3
vfmin.s m0, x1, x2
vfmin.s VM1, x3, x4
vfmin.s m0, m0, VM1
vfmin.s VM0, m0, VM0
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L13: //INCX==1 and 0<=N<=4
bge $r0, I, .L15
.align 3
.L14:
vld x1, X, 0
addi.d I, I, -1
vfmin.s VM0, VM0, x1
addi.d X, X, SIZE
blt $r0, I, .L14
.align 3
.L15:
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
.L20: // INCX!=1
move TEMP, X
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.w VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L23
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.w VM0, t2, 1
vinsgr2vr.w VM0, t3, 2
vinsgr2vr.w VM0, t4, 3
.align 3
.L21:
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vfmin.s VM1, VX0, VX1
addi.d I, I, -1
vfmin.s VM0, VM0, VM1
blt $r0, I, .L21
.align 3
.L22:
vreplvei.w x1, VM0, 0
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfmin.s VM1, x1, x2
vfmin.s VM0, x3, x4
vfmin.s VM0, VM0, VM1
.align 3
.L23: //INCX!=1 and N<8
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24:
vld x1, X, 0
vfmin.s VM0, VM0, x1
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
fmov.s $f0, $f22
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,143 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define t1 $r12
#define t2 $r13
#define t3 $r14
#define t4 $r15
#define VX0 $xr15
#define VX1 $xr16
#define VX2 $xr17
#define VX3 $xr18
#define res1 $xr19
#define res2 $xr20
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
xvxor.v res1, res1, res1
xvxor.v res2, res2, res2
bge $r0, N, .L999
beq $r0, INCX, .L999
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bge $r0, I, .L997
.align 3
.L10:
xvld VX0, X, 0 * SIZE
xvld VX1, X, 0 * SIZE
xvfcvtl.d.s VX0, VX0
xvfcvth.d.s VX1, VX1
xvfmadd.d res1, VX0, VX0, res1
xvfmadd.d res2, VX1, VX1, res2
addi.d I, I, -1
addi.d X, X, 8 * SIZE
blt $r0, I, .L10
.align 3
b .L996
.L20:
bge $r0, I, .L997
.align 3
.L21:
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX1, t1, 0
xvinsgr2vr.w VX1, t2, 1
xvinsgr2vr.w VX1, t3, 2
xvinsgr2vr.w VX1, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX1, t1, 4
xvinsgr2vr.w VX1, t2, 5
xvinsgr2vr.w VX1, t3, 6
xvinsgr2vr.w VX1, t4, 7
xvfcvtl.d.s VX0, VX0
xvfcvth.d.s VX1, VX1
xvfmadd.d res1, VX0, VX0, res1
xvfmadd.d res2, VX1, VX1, res2
addi.d I, I, -1
blt $r0, I, .L21
b .L996
.L996:
// Horizontal sum of the partial sums-of-squares. res1/res2 hold
// double-precision lanes (accumulated with xvfmadd.d), so the reduction must
// use 64-bit element picks and double adds.
// BUGFIX: previously reduced with xvpickve.w/xvfadd.s, which summed the
// 32-bit halves of the doubles and produced a garbage norm.
xvfadd.d res1, res1, res2
xvpickve.d VX1, res1, 1
xvpickve.d VX2, res1, 2
xvpickve.d VX3, res1, 3
xvfadd.d res1, VX1, res1
xvfadd.d res1, VX2, res1
xvfadd.d res1, VX3, res1
.align 3
.L997:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L998:
fld.s $f15, X, 0 * SIZE
addi.d I, I, -1
fcvt.d.s $f15, $f15
fmadd.d $f19, $f15, $f15, $f19
add.d X, X, INCX
blt $r0, I, .L998
.align 3
.L999:
fsqrt.d $f19, $f19
move $r4, $r17
fcvt.s.d $f0, $f19
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -0,0 +1,156 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define t1 $r12
#define t2 $r13
#define t3 $r14
#define t4 $r15
#define VX0 $vr15
#define VX1 $vr16
#define VX2 $vr17
#define VX3 $vr18
#define res1 $vr19
#define res2 $vr20
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
vxor.v res1, res1, res1
vxor.v res2, res2, res2
bge $r0, N, .L999
beq $r0, INCX, .L999
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bge $r0, I, .L997
.align 3
.L10:
vld VX0, X, 0 * SIZE
vld VX1, X, 0 * SIZE
vfcvtl.d.s VX0, VX0
vfcvth.d.s VX1, VX1
vfmadd.d res1, VX0, VX0, res1
vfmadd.d res2, VX1, VX1, res2
vld VX2, X, 4 * SIZE
vld VX3, X, 4 * SIZE
vfcvtl.d.s VX2, VX2
vfcvth.d.s VX3, VX3
vfmadd.d res1, VX2, VX2, res1
vfmadd.d res2, VX3, VX3, res2
addi.d I, I, -1
addi.d X, X, 8 * SIZE
blt $r0, I, .L10
b .L996
.align 3
.L20:
bge $r0, I, .L997
.align 3
.L21:
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vfcvtl.d.s VX0, VX0
vfcvth.d.s VX1, VX1
vfmadd.d res1, VX0, VX0, res1
vfmadd.d res2, VX1, VX1, res2
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX2, t1, 0
vinsgr2vr.w VX2, t2, 1
vinsgr2vr.w VX2, t3, 2
vinsgr2vr.w VX2, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX3, t1, 0
vinsgr2vr.w VX3, t2, 1
vinsgr2vr.w VX3, t3, 2
vinsgr2vr.w VX3, t4, 3
vfcvtl.d.s VX2, VX2
vfcvth.d.s VX3, VX3
vfmadd.d res1, VX2, VX2, res1
vfmadd.d res2, VX3, VX3, res2
addi.d I, I, -1
blt $r0, I, .L21
b .L996
.align 3
.L996:
// Horizontal sum of the partial sums-of-squares. res1/res2 are 128-bit
// vectors of TWO doubles (accumulated with vfmadd.d), so only lane 1 exists
// beyond lane 0 and the reduction must use 64-bit elements.
// BUGFIX: previously reduced with vreplvei.w/vfadd.s over word lanes 1..3,
// which summed 32-bit halves of the doubles and referenced lanes that do not
// exist for 64-bit elements.
vfadd.d res1, res1, res2
vreplvei.d VX1, res1, 1
vfadd.d res1, VX1, res1
.align 3
.L997:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L998:
fld.s $f15, X, 0 * SIZE
addi.d I, I, -1
fcvt.d.s $f15, $f15
fmadd.d $f19, $f15, $f15, $f19
add.d X, X, INCX
blt $r0, I, .L998
.align 3
.L999:
fsqrt.d $f19, $f19
move $r4, $r17
fcvt.s.d $f0, $f19
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,863 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define C $f0
#define S $f1
#define I $r12
#define TEMP $r13
#define t1 $r14
#define t2 $r16
#define t3 $r15
#define t4 $r17
#define XX $r18
#define YY $r19
#define a1 $f12
#define VX0 $xr8
#define VX1 $xr20
#define VX2 $xr21
#define VX3 $xr22
#define VT0 $xr10
#define VT1 $xr18
#define VXC $xr23
#define VXS $xr9
#define VXZ $xr19
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
movgr2fr.d a1, $r0
ffint.s.l a1, a1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
movfr2gr.s t1, C
xvreplgr2vr.w VXC, t1
movfr2gr.s t2, S
xvreplgr2vr.w VXS, t2
movfr2gr.s t3, a1
xvreplgr2vr.w VXZ, t3
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L997
fcmp.ceq.s $fcc0, C, a1
bcnez $fcc0, .L110
fcmp.ceq.s $fcc0, S, a1
bcnez $fcc0, .L112 // C!=0 S==0
b .L111 // C!=0 S!=0
.align 3
.L110:
fcmp.ceq.s $fcc0, S, a1
bcnez $fcc0, .L114 // C==0 S==0
b .L113 // C==0 S!=0
.align 3
.L111: // C!=0 S!=0
xvld VX0, X, 0 * SIZE
xvld VX2, Y, 0 * SIZE
xvfmul.s VT0, VX0, VXC
xvfmadd.s VT0, VX2, VXS, VT0
xvfmul.s VT1, VX0, VXS
xvfmsub.s VT1, VX2, VXC, VT1
xvst VT0, X, 0 * SIZE
xvst VT1, Y, 0 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L111
b .L997
.align 3
.L112: // C!=0 S==0
xvld VX0, X, 0 * SIZE
xvld VX2, Y, 0 * SIZE
xvfmul.s VT0, VX0, VXC
xvfmul.s VT1, VX2, VXC
xvst VT0, X, 0 * SIZE
xvst VT1, Y, 0 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L112
b .L997
.align 3
.L113: // C==0 S!=0
xvld VX0, X, 0 * SIZE
xvld VX2, Y, 0 * SIZE
xvfmul.s VT0, VX2, VXS
xvfmul.s VT1, VX0, VXS
xvfsub.s VT1, VXZ, VT1
xvst VT0, X, 0 * SIZE
xvst VT1, Y, 0 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L113
b .L997
.align 3
.L114: // C==0 S==0
xvst VXZ, X, 0 * SIZE
xvst VXZ, Y, 0 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L114
b .L997
.align 3
.L12: // INCX==1 and INCY!=1
bge $r0, I, .L997
move YY, Y
move XX, X
fcmp.ceq.s $fcc0, C, a1
bcnez $fcc0, .L120
fcmp.ceq.s $fcc0, S, a1
bcnez $fcc0, .L122 // C!=0 S==0
b .L121 // C!=0 S!=0
.align 3
.L120:
fcmp.ceq.s $fcc0, S, a1
bcnez $fcc0, .L124 // C==0 S==0
b .L123 // C==0 S!=0
.align 3
.L121: // C!=0 S!=0
xvld VX0, X, 0 * SIZE
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.w VX2, t1, 0
xvinsgr2vr.w VX2, t2, 1
xvinsgr2vr.w VX2, t3, 2
xvinsgr2vr.w VX2, t4, 3
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
xvinsgr2vr.w VX2, t1, 4
xvinsgr2vr.w VX2, t2, 5
xvinsgr2vr.w VX2, t3, 6
xvinsgr2vr.w VX2, t4, 7
add.d Y, Y, INCY
xvfmul.s VT0, VX0, VXC
xvfmadd.s VT0, VX2, VXS, VT0
xvfmul.s VT1, VX0, VXS
xvfmsub.s VT1, VX2, VXC, VT1
xvst VT0, X, 0 * SIZE
xvstelm.w VT1, YY, 0, 0
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 1
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 2
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 3
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 4
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 5
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 6
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 7
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
b .L997
.align 3
.L122: // C!=0 S==0
xvld VX0, X, 0 * SIZE
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.w VX2, t1, 0
xvinsgr2vr.w VX2, t2, 1
xvinsgr2vr.w VX2, t3, 2
xvinsgr2vr.w VX2, t4, 3
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
xvinsgr2vr.w VX2, t1, 4
xvinsgr2vr.w VX2, t2, 5
xvinsgr2vr.w VX2, t3, 6
xvinsgr2vr.w VX2, t4, 7
add.d Y, Y, INCY
xvfmul.s VT0, VX0, VXC
xvfmul.s VT1, VX2, VXC
xvst VT0, X, 0 * SIZE
xvstelm.w VT1, YY, 0, 0
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 1
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 2
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 3
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 4
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 5
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 6
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 7
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L122
b .L997
.align 3
.L123: // C==0 S!=0
xvld VX0, X, 0 * SIZE
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.w VX2, t1, 0
xvinsgr2vr.w VX2, t2, 1
xvinsgr2vr.w VX2, t3, 2
xvinsgr2vr.w VX2, t4, 3
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
xvinsgr2vr.w VX2, t1, 4
xvinsgr2vr.w VX2, t2, 5
xvinsgr2vr.w VX2, t3, 6
xvinsgr2vr.w VX2, t4, 7
add.d Y, Y, INCY
xvfmul.s VT0, VX2, VXS
xvfmul.s VT1, VX0, VXS
xvfsub.s VT1, VXZ, VT1
xvst VT0, X, 0 * SIZE
xvstelm.w VT1, YY, 0, 0
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 1
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 2
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 3
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 4
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 5
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 6
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 7
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L123
b .L997
.align 3
.L124: // C==0 S==0
// Both outputs become zero: X is contiguous (one vector store per iteration),
// Y is strided (element stores through YY).
// BUGFIX: X was never advanced, so every iteration re-zeroed the same 8
// floats of X instead of walking the whole array.
xvst VXZ, X, 0 * SIZE
xvstelm.w VXZ, YY, 0, 0
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 1
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 2
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 3
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 4
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 5
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 6
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 7
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE    // advance the contiguous X pointer (was missing)
addi.d I, I, -1
blt $r0, I, .L124
b .L997
.align 3
.L21:// INCX!=1 and INCY==1
bge $r0, I, .L997
move XX, X
fcmp.ceq.s $fcc0, C, a1
bcnez $fcc0, .L210
fcmp.ceq.s $fcc0, S, a1
bcnez $fcc0, .L212 // C!=0 S==0
b .L211 // C!=0 S!=0
.align 3
.L210:
fcmp.ceq.s $fcc0, S, a1
bcnez $fcc0, .L214 // C==0 S==0
b .L213 // C==0 S!=0
.align 3
.L211: // C!=0 S!=0
xvld VX2, Y, 0 * SIZE
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
add.d X, X, INCX
xvfmul.s VT0, VXC, VX0
xvfmadd.s VT0, VX2, VXS, VT0
xvfmul.s VT1, VX0, VXS
xvfmsub.s VT1, VX2, VXC, VT1
xvstelm.w VT0, XX, 0, 0
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 1
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 2
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 3
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 4
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 5
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 6
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 7
add.d XX, XX, INCX
xvst VT1, Y, 0 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L211
b .L997
.align 3
.L212: // C!=0 S==0
xvld VX2, Y, 0 * SIZE
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
add.d X, X, INCX
xvfmul.s VT0, VXC, VX0
xvfmul.s VT1, VX2, VXC
xvstelm.w VT0, XX, 0, 0
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 1
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 2
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 3
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 4
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 5
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 6
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 7
add.d XX, XX, INCX
xvst VT1, Y, 0 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L212
b .L997
.align 3
.L213: // C==0 S!=0
xvld VX2, Y, 0 * SIZE
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
add.d X, X, INCX
xvfmul.s VT0, VXS, VX2
xvfmul.s VT1, VXS, VX0
xvfsub.s VT1, VXZ, VT1
xvstelm.w VT0, XX, 0, 0
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 1
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 2
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 3
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 4
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 5
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 6
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 7
add.d XX, XX, INCX
xvst VT1, Y, 0 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L213
b .L997
.align 3
.L214: // C==0 S==0
// Both outputs become zero: X is strided (element stores through XX),
// Y is contiguous (one vector store per iteration).
// BUGFIX 1: the contiguous store wrote VT1, which is uninitialized on this
// path — it must store the zero vector VXZ.
// BUGFIX 2: the loop branched back to .L211 (the C!=0,S!=0 section) instead
// of .L214, executing the wrong kernel after the first iteration.
xvstelm.w VXZ, XX, 0, 0
add.d XX, XX, INCX
xvstelm.w VXZ, XX, 0, 1
add.d XX, XX, INCX
xvstelm.w VXZ, XX, 0, 2
add.d XX, XX, INCX
xvstelm.w VXZ, XX, 0, 3
add.d XX, XX, INCX
xvst VXZ, Y, 0 * SIZE    // was: xvst VT1, Y, 0 * SIZE
xvstelm.w VXZ, XX, 0, 4
add.d XX, XX, INCX
xvstelm.w VXZ, XX, 0, 5
add.d XX, XX, INCX
xvstelm.w VXZ, XX, 0, 6
add.d XX, XX, INCX
xvstelm.w VXZ, XX, 0, 7
add.d XX, XX, INCX
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L214        // was: blt $r0, I, .L211
b .L997
.align 3
.L22:
bge $r0, I, .L997
move YY, Y
move XX, X
fcmp.ceq.s $fcc0, C, a1
bcnez $fcc0, .L220
fcmp.ceq.s $fcc0, S, a1
bcnez $fcc0, .L222 // C!=0 S==0
b .L221 // C!=0 S!=0
.align 3
.L220:
fcmp.ceq.s $fcc0, S, a1
bcnez $fcc0, .L224 // C==0 S==0
b .L223 // C==0 S!=0
.align 3
.L221: // C!=0 S!=0
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.w VX2, t1, 0
xvinsgr2vr.w VX2, t2, 1
xvinsgr2vr.w VX2, t3, 2
xvinsgr2vr.w VX2, t4, 3
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
xvinsgr2vr.w VX2, t1, 4
xvinsgr2vr.w VX2, t2, 5
xvinsgr2vr.w VX2, t3, 6
xvinsgr2vr.w VX2, t4, 7
add.d Y, Y, INCY
xvfmul.s VT0, VX0, VXC
xvfmadd.s VT0, VX2, VXS, VT0
xvfmul.s VT1, VX0, VXS
xvfmsub.s VT1, VX2, VXC, VT1
xvstelm.w VT0, XX, 0, 0
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 1
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 2
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 3
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 4
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 5
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 6
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 7
add.d XX, XX, INCX
xvstelm.w VT1, YY, 0, 0
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 1
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 2
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 3
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 4
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 5
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 6
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 7
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L221
b .L997
.align 3
.L222: // C!=0 S==0
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.w VX2, t1, 0
xvinsgr2vr.w VX2, t2, 1
xvinsgr2vr.w VX2, t3, 2
xvinsgr2vr.w VX2, t4, 3
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
xvinsgr2vr.w VX2, t1, 4
xvinsgr2vr.w VX2, t2, 5
xvinsgr2vr.w VX2, t3, 6
xvinsgr2vr.w VX2, t4, 7
add.d Y, Y, INCY
xvfmul.s VT0, VX0, VXC
xvfmul.s VT1, VX2, VXC
xvstelm.w VT0, XX, 0, 0
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 1
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 2
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 3
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 4
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 5
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 6
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 7
add.d XX, XX, INCX
xvstelm.w VT1, YY, 0, 0
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 1
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 2
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 3
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 4
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 5
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 6
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 7
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L222
b .L997
.align 3
.L223: // C==0 S!=0
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.w VX2, t1, 0
xvinsgr2vr.w VX2, t2, 1
xvinsgr2vr.w VX2, t3, 2
xvinsgr2vr.w VX2, t4, 3
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
xvinsgr2vr.w VX2, t1, 4
xvinsgr2vr.w VX2, t2, 5
xvinsgr2vr.w VX2, t3, 6
xvinsgr2vr.w VX2, t4, 7
add.d Y, Y, INCY
xvfmul.s VT0, VX2, VXS
xvfmul.s VT1, VX0, VXS
xvfsub.s VT1, VXZ, VT1
xvstelm.w VT0, XX, 0, 0
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 1
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 2
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 3
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 4
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 5
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 6
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 7
add.d XX, XX, INCX
xvstelm.w VT1, YY, 0, 0
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 1
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 2
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 3
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 4
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 5
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 6
add.d YY, YY, INCY
xvstelm.w VT1, YY, 0, 7
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L223
b .L997
.align 3
.L224: // C==0 S==0
xvstelm.w VXZ, XX, 0, 0
add.d XX, XX, INCX
xvstelm.w VXZ, XX, 0, 1
add.d XX, XX, INCX
xvstelm.w VXZ, XX, 0, 2
add.d XX, XX, INCX
xvstelm.w VXZ, XX, 0, 3
add.d XX, XX, INCX
xvstelm.w VXZ, YY, 0, 0
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 1
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 2
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 3
add.d YY, YY, INCY
xvstelm.w VXZ, XX, 0, 4
add.d XX, XX, INCX
xvstelm.w VXZ, XX, 0, 5
add.d XX, XX, INCX
xvstelm.w VXZ, XX, 0, 6
add.d XX, XX, INCX
xvstelm.w VXZ, XX, 0, 7
add.d XX, XX, INCX
xvstelm.w VXZ, YY, 0, 4
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 5
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 6
add.d YY, YY, INCY
xvstelm.w VXZ, YY, 0, 7
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L224
b .L997
.align 3
.L997:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L998:
fld.s $f12, X, 0 * SIZE
fld.s $f13, Y, 0 * SIZE
fmul.s $f10, $f12, C
fmadd.s $f10, $f13, S, $f10
fst.s $f10, X, 0 * SIZE
addi.d I, I, -1
fmul.s $f20, $f12, S
fmsub.s $f20, $f13, C, $f20
fst.s $f20, Y, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L998
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,927 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define C $f0
#define S $f1
#define I $r12
#define TEMP $r13
#define t1 $r14
#define t2 $r16
#define t3 $r15
#define t4 $r17
#define XX $r18
#define YY $r19
#define a1 $f12
#define VX0 $vr8
#define VX1 $vr20
#define VX2 $vr21
#define VX3 $vr22
#define VT0 $vr10
#define VT1 $vr18
#define VXC $vr23
#define VXS $vr9
#define VXZ $vr19
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
movgr2fr.d a1, $r0
ffint.s.l a1, a1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
movfr2gr.s t1, C
vreplgr2vr.w VXC, t1
movfr2gr.s t2, S
vreplgr2vr.w VXS, t2
movfr2gr.s t3, a1
vreplgr2vr.w VXZ, t3
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L997
fcmp.ceq.s $fcc0, C, a1
bcnez $fcc0, .L110
fcmp.ceq.s $fcc0, S, a1
bcnez $fcc0, .L112 // C!=0 S==0
b .L111 // C!=0 S!=0
.align 3
.L110:
fcmp.ceq.s $fcc0, S, a1
bcnez $fcc0, .L114 // C==0 S==0
b .L113 // C==0 S!=0
.align 3
.L111: // C!=0 S!=0
vld VX0, X, 0 * SIZE
vld VX2, Y, 0 * SIZE
vld VX1, X, 4 * SIZE
vld VX3, Y, 4 * SIZE
vfmul.s VT0, VX0, VXC
vfmadd.s VT0, VX2, VXS, VT0
vfmul.s VT1, VX0, VXS
vfmsub.s VT1, VX2, VXC, VT1
vst VT0, X, 0 * SIZE
vst VT1, Y, 0 * SIZE
vfmul.s VT0, VX1, VXC
vfmadd.s VT0, VX3, VXS, VT0
vfmul.s VT1, VX1, VXS
vfmsub.s VT1, VX3, VXC, VT1
vst VT0, X, 4 * SIZE
vst VT1, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L111
b .L997
.align 3
.L112: // C!=0 S==0
vld VX0, X, 0 * SIZE
vld VX2, Y, 0 * SIZE
vld VX1, X, 4 * SIZE
vld VX3, Y, 4 * SIZE
vfmul.s VT0, VX0, VXC
vfmul.s VT1, VX2, VXC
vst VT0, X, 0 * SIZE
vst VT1, Y, 0 * SIZE
vfmul.s VT0, VX1, VXC
vfmul.s VT1, VX3, VXC
vst VT0, X, 4 * SIZE
vst VT1, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L112
b .L997
.align 3
.L113: // C==0 S!=0
vld VX0, X, 0 * SIZE
vld VX2, Y, 0 * SIZE
vld VX1, X, 4 * SIZE
vld VX3, Y, 4 * SIZE
vfmul.s VT0, VX2, VXS
vfmul.s VT1, VX0, VXS
vfsub.s VT1, VXZ, VT1
vst VT0, X, 0 * SIZE
vst VT1, Y, 0 * SIZE
vfmul.s VT0, VX3, VXS
vfmul.s VT1, VX1, VXS
vfsub.s VT1, VXZ, VT1
vst VT0, X, 4 * SIZE
vst VT1, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L113
b .L997
.align 3
.L114: // C==0 S==0
vst VXZ, X, 0 * SIZE
vst VXZ, Y, 0 * SIZE
vst VXZ, X, 4 * SIZE
vst VXZ, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L114
b .L997
.align 3
.L12: // INCX==1 and INCY!=1
bge $r0, I, .L997
move YY, Y
move XX, X
fcmp.ceq.s $fcc0, C, a1
bcnez $fcc0, .L120
fcmp.ceq.s $fcc0, S, a1
bcnez $fcc0, .L122 // C!=0 S==0
b .L121 // C!=0 S!=0
.align 3
.L120:
fcmp.ceq.s $fcc0, S, a1
bcnez $fcc0, .L124 // C==0 S==0
b .L123 // C==0 S!=0
.align 3
.L121: // C!=0 S!=0
vld VX0, X, 0 * SIZE
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vinsgr2vr.w VX2, t1, 0
vinsgr2vr.w VX2, t2, 1
vinsgr2vr.w VX2, t3, 2
vinsgr2vr.w VX2, t4, 3
add.d Y, Y, INCY
vfmul.s VT0, VX0, VXC
vfmadd.s VT0, VX2, VXS, VT0
vfmul.s VT1, VX0, VXS
vfmsub.s VT1, VX2, VXC, VT1
vst VT0, X, 0 * SIZE
vstelm.w VT1, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 3
add.d YY, YY, INCY
vld VX1, X, 4 * SIZE
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vinsgr2vr.w VX3, t1, 0
vinsgr2vr.w VX3, t2, 1
vinsgr2vr.w VX3, t3, 2
vinsgr2vr.w VX3, t4, 3
add.d Y, Y, INCY
vfmul.s VT0, VX1, VXC
vfmadd.s VT0, VX3, VXS, VT0
vfmul.s VT1, VX1, VXS
vfmsub.s VT1, VX3, VXC, VT1
vst VT0, X, 4 * SIZE
vstelm.w VT1, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 3
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
b .L997
.align 3
.L122: // C!=0 S==0
vld VX0, X, 0 * SIZE
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vinsgr2vr.w VX2, t1, 0
vinsgr2vr.w VX2, t2, 1
vinsgr2vr.w VX2, t3, 2
vinsgr2vr.w VX2, t4, 3
add.d Y, Y, INCY
vfmul.s VT0, VX0, VXC
vfmul.s VT1, VX2, VXC
vst VT0, X, 0 * SIZE
vstelm.w VT1, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 3
add.d YY, YY, INCY
vld VX1, X, 4 * SIZE
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vinsgr2vr.w VX3, t1, 0
vinsgr2vr.w VX3, t2, 1
vinsgr2vr.w VX3, t3, 2
vinsgr2vr.w VX3, t4, 3
add.d Y, Y, INCY
vfmul.s VT0, VX1, VXC
vfmul.s VT1, VX3, VXC
vst VT0, X, 4 * SIZE
vstelm.w VT1, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 3
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L122
b .L997
.align 3
.L123: // C==0 S!=0
vld VX0, X, 0 * SIZE
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vinsgr2vr.w VX2, t1, 0
vinsgr2vr.w VX2, t2, 1
vinsgr2vr.w VX2, t3, 2
vinsgr2vr.w VX2, t4, 3
add.d Y, Y, INCY
vfmul.s VT0, VX2, VXS
vfmul.s VT1, VX0, VXS
vfsub.s VT1, VXZ, VT1
vst VT0, X, 0 * SIZE
vstelm.w VT1, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 3
add.d YY, YY, INCY
vld VX1, X, 4 * SIZE
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vinsgr2vr.w VX3, t1, 0
vinsgr2vr.w VX3, t2, 1
vinsgr2vr.w VX3, t3, 2
vinsgr2vr.w VX3, t4, 3
add.d Y, Y, INCY
vfmul.s VT0, VX3, VXS
vfmul.s VT1, VX1, VXS
vfsub.s VT1, VXZ, VT1
vst VT0, X, 4 * SIZE
vstelm.w VT1, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 3
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L123
b .L997
.align 3
.L124: // C==0 S==0
vst VXZ, X, 0 * SIZE
vst VXZ, X, 4 * SIZE
vstelm.w VXZ, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 3
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 3
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L124
b .L997
.align 3
.L21:// INCX!=1 and INCY==1
bge $r0, I, .L997
move XX, X
fcmp.ceq.s $fcc0, C, a1
bcnez $fcc0, .L210
fcmp.ceq.s $fcc0, S, a1
bcnez $fcc0, .L212 // C!=0 S==0
b .L211 // C!=0 S!=0
.align 3
.L210:
fcmp.ceq.s $fcc0, S, a1
bcnez $fcc0, .L214 // C==0 S==0
b .L213 // C==0 S!=0
.align 3
.L211: // C!=0 S!=0
vld VX2, Y, 0 * SIZE
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
add.d X, X, INCX
vfmul.s VT0, VXC, VX0
vfmadd.s VT0, VX2, VXS, VT0
vfmul.s VT1, VXS, VX0
vfmsub.s VT1, VX2, VXC, VT1
vstelm.w VT0, XX, 0, 0
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 1
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 2
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 3
add.d XX, XX, INCX
vst VT1, Y, 0 * SIZE
vld VX3, Y, 4 * SIZE
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
add.d X, X, INCX
vfmul.s VT0, VX1, VXC
vfmadd.s VT0, VX3, VXS, VT0
vfmul.s VT1, VX1, VXS
vfmsub.s VT1, VX3, VXC, VT1
vstelm.w VT0, XX, 0, 0
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 1
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 2
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 3
add.d XX, XX, INCX
vst VT1, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L211
b .L997
.align 3
.L212: // C!=0 S==0
vld VX2, Y, 0 * SIZE
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
add.d X, X, INCX
vfmul.s VT0, VXC, VX0
vfmul.s VT1, VX2, VXC
vstelm.w VT0, XX, 0, 0
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 1
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 2
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 3
add.d XX, XX, INCX
vst VT1, Y, 0 * SIZE
vld VX3, Y, 4 * SIZE
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
add.d X, X, INCX
vfmul.s VT0, VX1, VXC
// S==0 branch: y := c*y. Was VXS, inconsistent with the first half (VT1 = VX2*VXC)
// and with the scalar tail; using S here would zero/corrupt the second half of Y.
vfmul.s VT1, VX3, VXC
vstelm.w VT0, XX, 0, 0
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 1
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 2
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 3
add.d XX, XX, INCX
vst VT1, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L212
b .L997
.align 3
.L213: // C==0 S!=0
vld VX2, Y, 0 * SIZE
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
add.d X, X, INCX
vfmul.s VT0, VXS, VX2
vfmul.s VT1, VXS, VX0
vfsub.s VT1, VXZ, VT1
vstelm.w VT0, XX, 0, 0
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 1
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 2
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 3
add.d XX, XX, INCX
vst VT1, Y, 0 * SIZE
vld VX3, Y, 4 * SIZE
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
add.d X, X, INCX
vfmul.s VT0, VX3, VXS
vfmul.s VT1, VX1, VXS
vfsub.s VT1, VXZ, VT1
vstelm.w VT0, XX, 0, 0
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 1
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 2
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 3
add.d XX, XX, INCX
vst VT1, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L213
b .L997
.align 3
.L214: // C==0 S==0: both outputs are zero; X is strided (INCX!=1), Y is contiguous
    vstelm.w VXZ, XX, 0, 0
    add.d XX, XX, INCX
    vstelm.w VXZ, XX, 0, 1
    add.d XX, XX, INCX
    vstelm.w VXZ, XX, 0, 2
    add.d XX, XX, INCX
    vstelm.w VXZ, XX, 0, 3
    add.d XX, XX, INCX
    // store the zero vector, not VT1 (VT1 is never written on this path)
    vst VXZ, Y, 0 * SIZE
    vstelm.w VXZ, XX, 0, 0
    add.d XX, XX, INCX
    vstelm.w VXZ, XX, 0, 1
    add.d XX, XX, INCX
    vstelm.w VXZ, XX, 0, 2
    add.d XX, XX, INCX
    vstelm.w VXZ, XX, 0, 3
    add.d XX, XX, INCX
    vst VXZ, Y, 4 * SIZE
    addi.d Y, Y, 8 * SIZE
    addi.d I, I, -1
    // loop back to .L214 itself (was .L211, which re-ran the C!=0 S!=0 path)
    blt $r0, I, .L214
    b .L997
.align 3
.L22:
bge $r0, I, .L997
move YY, Y
move XX, X
fcmp.ceq.s $fcc0, C, a1
bcnez $fcc0, .L220
fcmp.ceq.s $fcc0, S, a1
bcnez $fcc0, .L222 // C!=0 S==0
b .L221 // C!=0 S!=0
.align 3
.L220:
fcmp.ceq.s $fcc0, S, a1
bcnez $fcc0, .L224 // C==0 S==0
b .L223 // C==0 S!=0
.align 3
.L221: // C!=0 S!=0
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vinsgr2vr.w VX2, t1, 0
vinsgr2vr.w VX2, t2, 1
vinsgr2vr.w VX2, t3, 2
vinsgr2vr.w VX2, t4, 3
add.d Y, Y, INCY
vfmul.s VT0, VX0, VXC
vfmadd.s VT0, VX2, VXS, VT0
vfmul.s VT1, VX0, VXS
vfmsub.s VT1, VX2, VXC, VT1
vstelm.w VT0, XX, 0, 0
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 1
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 2
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 3
add.d XX, XX, INCX
vstelm.w VT1, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 3
add.d YY, YY, INCY
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
add.d X, X, INCX
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vinsgr2vr.w VX3, t1, 0
vinsgr2vr.w VX3, t2, 1
vinsgr2vr.w VX3, t3, 2
vinsgr2vr.w VX3, t4, 3
add.d Y, Y, INCY
vfmul.s VT0, VX1, VXC
vfmadd.s VT0, VX3, VXS, VT0
// second half must use VX1 (the x-values just gathered); VX0 is stale from the first half
vfmul.s VT1, VX1, VXS
vfmsub.s VT1, VX3, VXC, VT1
vstelm.w VT0, XX, 0, 0
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 1
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 2
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 3
add.d XX, XX, INCX
vstelm.w VT1, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 3
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L221
b .L997
.align 3
.L222: // C!=0 S==0
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vinsgr2vr.w VX2, t1, 0
vinsgr2vr.w VX2, t2, 1
vinsgr2vr.w VX2, t3, 2
vinsgr2vr.w VX2, t4, 3
add.d Y, Y, INCY
vfmul.s VT0, VX0, VXC
vfmul.s VT1, VX2, VXC
vstelm.w VT0, XX, 0, 0
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 1
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 2
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 3
add.d XX, XX, INCX
vstelm.w VT1, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 3
add.d YY, YY, INCY
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vinsgr2vr.w VX3, t1, 0
vinsgr2vr.w VX3, t2, 1
vinsgr2vr.w VX3, t3, 2
vinsgr2vr.w VX3, t4, 3
add.d Y, Y, INCY
vfmul.s VT0, VX1, VXC
vfmul.s VT1, VX3, VXC
vstelm.w VT0, XX, 0, 0
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 1
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 2
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 3
add.d XX, XX, INCX
vstelm.w VT1, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 3
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L222
b .L997
.align 3
.L223: // C==0 S!=0
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vinsgr2vr.w VX2, t1, 0
vinsgr2vr.w VX2, t2, 1
vinsgr2vr.w VX2, t3, 2
vinsgr2vr.w VX2, t4, 3
add.d Y, Y, INCY
vfmul.s VT0, VX2, VXS
vfmul.s VT1, VX0, VXS
vfsub.s VT1, VXZ, VT1
vstelm.w VT0, XX, 0, 0
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 1
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 2
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 3
add.d XX, XX, INCX
vstelm.w VT1, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 3
add.d YY, YY, INCY
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
ld.w t1, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vinsgr2vr.w VX3, t1, 0
vinsgr2vr.w VX3, t2, 1
vinsgr2vr.w VX3, t3, 2
vinsgr2vr.w VX3, t4, 3
add.d Y, Y, INCY
vfmul.s VT0, VX3, VXS
// second half must use VX1 (the x-values just gathered); VX0 is stale from the first half
vfmul.s VT1, VX1, VXS
vfsub.s VT1, VXZ, VT1
vstelm.w VT0, XX, 0, 0
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 1
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 2
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 3
add.d XX, XX, INCX
vstelm.w VT1, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VT1, YY, 0, 3
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L223
b .L997
.align 3
.L224: // C==0 S==0
vstelm.w VXZ, XX, 0, 0
add.d XX, XX, INCX
vstelm.w VXZ, XX, 0, 1
add.d XX, XX, INCX
vstelm.w VXZ, XX, 0, 2
add.d XX, XX, INCX
vstelm.w VXZ, XX, 0, 3
add.d XX, XX, INCX
vstelm.w VXZ, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 3
add.d YY, YY, INCY
vstelm.w VXZ, XX, 0, 0
add.d XX, XX, INCX
vstelm.w VXZ, XX, 0, 1
add.d XX, XX, INCX
vstelm.w VXZ, XX, 0, 2
add.d XX, XX, INCX
vstelm.w VXZ, XX, 0, 3
add.d XX, XX, INCX
vstelm.w VXZ, YY, 0, 0
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 1
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 2
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 3
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L224
b .L997
.align 3
.L997:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L998:
fld.s $f12, X, 0 * SIZE
fld.s $f13, Y, 0 * SIZE
fmul.s $f10, $f12, C
fmadd.s $f10, $f13, S, $f10
fst.s $f10, X, 0 * SIZE
addi.d I, I, -1
fmul.s $f20, $f12, S
fmsub.s $f20, $f13, C, $f20
fst.s $f20, Y, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L998
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,188 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define ALPHA $f0
#define X $r7
#define INCX $r8
#define I $r12
#define TEMP $r13
#define t1 $r14
#define t2 $r18
#define t3 $r15
#define t4 $r17
#define XX $r16
#define VX0 $xr12
#define VX1 $xr13
#define VT0 $xr14
#define VT1 $xr15
#define VALPHA $xr19
#define a1 $f8
#define a2 $f23
PROLOGUE
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
movgr2fr.d a1, $r0
ffint.s.l a1, a1
movgr2fr.d a2, TEMP
ffint.s.l a2, a2
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
fcmp.ceq.s $fcc0, ALPHA, a1
bcnez $fcc0, .L20 //ALPHA==0
fcmp.ceq.s $fcc0, ALPHA, a2
bcnez $fcc0, .L999 //ALPHA==1 return
srai.d I, N, 3
beq INCX, TEMP, .L30 // general ALPHA (neither 0 nor 1) and INCX==1
movfr2gr.s TEMP, ALPHA
xvreplgr2vr.w VALPHA, TEMP
move XX, X
.L10: // general ALPHA (neither 0 nor 1) and INCX!=1
bge $r0, I, .L32
.align 3
.L11:
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvfmul.s VT0, VX0, VALPHA
xvstelm.w VT0, XX, 0, 0
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 1
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 2
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 3
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 4
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 5
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 6
add.d XX, XX, INCX
xvstelm.w VT0, XX, 0, 7
add.d XX, XX, INCX
addi.d I, I, -1
blt $r0, I, .L11
b .L32
.align 3
.L20:
srai.d I, N, 3
beq INCX, TEMP, .L24
bge $r0, I, .L22
.align 3
.L21:
fst.s a1, X, 0
add.d X, X, INCX
fst.s a1, X, 0
add.d X, X, INCX
fst.s a1, X, 0
add.d X, X, INCX
fst.s a1, X, 0
add.d X, X, INCX
fst.s a1, X, 0
add.d X, X, INCX
fst.s a1, X, 0
add.d X, X, INCX
fst.s a1, X, 0
add.d X, X, INCX
fst.s a1, X, 0
add.d X, X, INCX
addi.d I, I, -1
blt $r0, I, .L21
.align 3
.L22:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L23:
fst.s a1, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L23
jirl $r0, $r1, 0
.align 3
.L24:
bge $r0, I, .L26 /*N<8 INCX==1*/
.align 3
.L25:
xvxor.v VX0, VX0, VX0
xvst VX0, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
blt $r0, I, .L25
.align 3
.L26:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L27:
fst.s a1, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
blt $r0, I, .L27
jirl $r0, $r1, 0
.align 3
.L30:
bge $r0, I, .L32/*N<8 INCX==1*/
movfr2gr.s TEMP, ALPHA
xvreplgr2vr.w VALPHA , TEMP
.align 3
.L31:
xvld VX0, X, 0 * SIZE
addi.d I, I, -1
xvfmul.s VT0, VX0, VALPHA
xvst VT0, X, 0 * SIZE
addi.d X, X, 8 * SIZE
blt $r0, I, .L31
.align 3
.L32:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L33:
fld.s a1, X, 0 * SIZE
addi.d I, I, -1
fmul.s a1, ALPHA, a1
fst.s a1, X, 0 * SIZE
add.d X, X, INCX
blt $r0, I, .L33
jirl $r0, $r1, 0
.align 3
.L999:
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -0,0 +1,194 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define ALPHA $f0
#define X $r7
#define INCX $r8
#define I $r12
#define TEMP $r13
#define t1 $r14
#define t2 $r18
#define t3 $r15
#define t4 $r17
#define XX $r16
#define VX0 $vr12
#define VX1 $vr13
#define VT0 $vr14
#define VT1 $vr15
#define VALPHA $vr19
#define a1 $f8
#define a2 $f23
PROLOGUE
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
movgr2fr.d a1, $r0
ffint.s.l a1, a1
movgr2fr.d a2, TEMP
ffint.s.l a2, a2
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
fcmp.ceq.s $fcc0, ALPHA, a1
bcnez $fcc0, .L20 //ALPHA==0
fcmp.ceq.s $fcc0, ALPHA, a2
bcnez $fcc0, .L999 //ALPHA==1 return
srai.d I, N, 3
beq INCX, TEMP, .L30 // general ALPHA (neither 0 nor 1) and INCX==1
movfr2gr.s TEMP, ALPHA
vreplgr2vr.w VALPHA, TEMP
move XX, X
.align 3
.L10: // general ALPHA (neither 0 nor 1) and INCX!=1
bge $r0, I, .L32
.align 3
.L11:
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
vfmul.s VT0, VX0, VALPHA
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vstelm.w VT0, XX, 0, 0
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 1
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 2
add.d XX, XX, INCX
vstelm.w VT0, XX, 0, 3
add.d XX, XX, INCX
vfmul.s VT1, VX1, VALPHA
vstelm.w VT1, XX, 0, 0
add.d XX, XX, INCX
vstelm.w VT1, XX, 0, 1
add.d XX, XX, INCX
vstelm.w VT1, XX, 0, 2
add.d XX, XX, INCX
vstelm.w VT1, XX, 0, 3
add.d XX, XX, INCX
addi.d I, I, -1
blt $r0, I, .L11
b .L32
.align 3
.L20:
srai.d I, N, 3
beq INCX, TEMP, .L24
bge $r0, I, .L22
.align 3
.L21:
fst.s a1, X, 0
add.d X, X, INCX
fst.s a1, X, 0
add.d X, X, INCX
fst.s a1, X, 0
add.d X, X, INCX
fst.s a1, X, 0
add.d X, X, INCX
fst.s a1, X, 0
add.d X, X, INCX
fst.s a1, X, 0
add.d X, X, INCX
fst.s a1, X, 0
add.d X, X, INCX
fst.s a1, X, 0
add.d X, X, INCX
addi.d I, I, -1
blt $r0, I, .L21
.align 3
.L22:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L23:
fst.s a1, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L23
jirl $r0, $r1, 0
.align 3
.L24:
bge $r0, I, .L26 /*N<8 INCX==1*/
.align 3
.L25:
vxor.v VX0, VX0, VX0
vst VX0, X, 0 * SIZE
vst VX0, X, 4 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
blt $r0, I, .L25
.align 3
.L26:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L27:
fst.s a1, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
blt $r0, I, .L27
jirl $r0, $r1, 0
.align 3
.L30:
bge $r0, I, .L32/*N<8 INCX==1*/
movfr2gr.s TEMP, ALPHA
vreplgr2vr.w VALPHA , TEMP
.align 3
.L31:
vld VX0, X, 0 * SIZE
vld VX1, X, 4 * SIZE
vfmul.s VT0, VX0, VALPHA
vfmul.s VT1, VX1, VALPHA
addi.d I, I, -1
vst VT0, X, 0 * SIZE
vst VT1, X, 4 * SIZE
addi.d X, X, 8 * SIZE
blt $r0, I, .L31
.align 3
.L32:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L33:
fld.s a1, X, 0 * SIZE
addi.d I, I, -1
fmul.s a1, ALPHA, a1
fst.s a1, X, 0 * SIZE
add.d X, X, INCX
blt $r0, I, .L33
jirl $r0, $r1, 0
.align 3
.L999:
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -0,0 +1,140 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define t1 $r15
#define t2 $r12
#define t3 $r13
#define t4 $r14
#define VX0 $xr12
#define VX1 $xr13
#define VX2 $xr14
#define VX3 $xr15
#define res1 $xr16
#define res2 $xr17
PROLOGUE
xvxor.v res1, res1, res1
xvxor.v res2, res2, res2
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bge $r0, I, .L13
.align 3
.L11:
xvld VX0, X, 0 * SIZE
xvfadd.s res1, VX0, res1
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L11
.align 3
.L12:
xvfadd.s res2, res1, res2
xvpickve.w VX1, res1, 1
xvpickve.w VX2, res1, 2
xvpickve.w VX3, res1, 3
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
xvpickve.w VX0, res2, 4
xvpickve.w VX1, res2, 5
xvpickve.w VX2, res2, 6
xvpickve.w VX3, res2, 7
xvfadd.s res1, VX0, res1
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
// fold lane 7 via VX3 (was VX2 again: lane 6 double-counted, lane 7 dropped)
xvfadd.s res1, VX3, res1
.align 3
.L13:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L14:
fld.s $f12, X, 0 * SIZE
fadd.s $f16, $f12, $f16
addi.d I, I, -1
addi.d X, X, SIZE
blt $r0, I, .L14
b .L999
.align 3
.L20:
bge $r0, I, .L23
.align 3
.L21:
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvfadd.s res1, VX0, res1
addi.d I, I, -1
blt $r0, I, .L21
.align 3
.L22:
xvfadd.s res2, res1, res2
xvpickve.w VX1, res1, 1
xvpickve.w VX2, res1, 2
xvpickve.w VX3, res1, 3
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
xvpickve.w VX0, res2, 4
xvpickve.w VX1, res2, 5
xvpickve.w VX2, res2, 6
xvpickve.w VX3, res2, 7
xvfadd.s res1, VX0, res1
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
// fold lane 7 via VX3 (was VX2 again: lane 6 double-counted, lane 7 dropped)
xvfadd.s res1, VX3, res1
.align 3
.L23:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24:
fld.s $f12, X, 0 * SIZE
fadd.s $f16, $f12, $f16
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
fmov.s $f0, $f16
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,125 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define t1 $r15
#define t2 $r12
#define t3 $r13
#define t4 $r14
#define VX0 $vr12
#define VX1 $vr13
#define VX2 $vr14
#define VX3 $vr15
#define res1 $vr16
#define res2 $vr17
PROLOGUE
vxor.v res1, res1, res1
vxor.v res2, res2, res2
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bge $r0, I, .L13
.align 3
.L11:
vld VX0, X, 0 * SIZE
vld VX1, X, 4 * SIZE
vfadd.s res2, VX0, VX1
vfadd.s res1, res1, res2
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L11
.align 3
.L12:
vreplvei.w VX1, res1, 1
vreplvei.w VX2, res1, 2
vreplvei.w VX3, res1, 3
vfadd.s res1, VX1, res1
vfadd.s res1, VX2, res1
vfadd.s res1, VX3, res1
.align 3
.L13:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L14:
fld.s $f12, X, 0 * SIZE
fadd.s $f16, $f12, $f16
addi.d I, I, -1
addi.d X, X, SIZE
blt $r0, I, .L14
b .L999
.align 3
.L20:
bge $r0, I, .L23
.align 3
.L21:
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vfadd.s res2, VX0, VX1
vfadd.s res1, res1, res2
addi.d I, I, -1
blt $r0, I, .L21
.align 3
.L22:
vreplvei.w VX1, res1, 1
vreplvei.w VX2, res1, 2
vreplvei.w VX3, res1, 3
vfadd.s res1, VX1, res1
vfadd.s res1, VX2, res1
vfadd.s res1, VX3, res1
.align 3
.L23:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24:
fld.s $f12, X, 0 * SIZE
fadd.s $f16, $f12, $f16
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
fmov.s $f0, $f16
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,286 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r7
#define INCX $r8
#define Y $r9
#define INCY $r10
#define I $r17
#define TEMP $r18
#define XX $r5
#define YY $r6
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define b1 $f16
#define b2 $f17
#define b3 $f18
#define b4 $f19
#define VX0 $xr12
#define VX1 $xr13
#define VX2 $xr14
#define VX3 $xr15
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L112
.align 3
.L111:
xvld VX0, X, 0 * SIZE
xvld VX2, Y, 0 * SIZE
addi.d I, I, -1
xvst VX2, X, 0 * SIZE
xvst VX0, Y, 0 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L111
.align 3
.L112:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L113:
fld.s $f12, X, 0 * SIZE
fld.s $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.s $f12, Y, 0 * SIZE
fst.s $f14, X, 0 * SIZE
addi.d X, X, SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L113
b .L999
.align 3
.L12: // INCX==1 and INCY!=1
bge $r0, I, .L122
.align 3
.L121:
xvld VX0, X, 0 * SIZE
ld.w t1, Y, 0 * SIZE
xvstelm.w VX0, Y, 0, 0
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
xvstelm.w VX0, Y, 0, 1
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
xvstelm.w VX0, Y, 0, 2
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
xvstelm.w VX0, Y, 0, 3
xvinsgr2vr.w VX2, t1, 0
xvinsgr2vr.w VX2, t2, 1
xvinsgr2vr.w VX2, t3, 2
xvinsgr2vr.w VX2, t4, 3
add.d Y, Y, INCY
ld.w t1, Y, 0 * SIZE
xvstelm.w VX0, Y, 0, 4
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
xvstelm.w VX0, Y, 0, 5
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
xvstelm.w VX0, Y, 0, 6
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
xvstelm.w VX0, Y, 0, 7
xvinsgr2vr.w VX2, t1, 4
xvinsgr2vr.w VX2, t2, 5
xvinsgr2vr.w VX2, t3, 6
xvinsgr2vr.w VX2, t4, 7
add.d Y, Y, INCY
xvst VX2, X, 0 * SIZE
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
.align 3
.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
fld.s $f12, X, 0 * SIZE
fld.s $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.s $f12, Y, 0 * SIZE
fst.s $f14, X, 0 * SIZE
addi.d X, X, SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
.L21:
bge $r0, I, .L212
.align 3
.L211: // INCX!=1, INCY==1: gather 8 strided x into VX0 while scattering y (VX2) to X
    xvld VX2, Y, 0 * SIZE
    ld.w t1, X, 0 * SIZE
    xvstelm.w VX2, X, 0, 0
    add.d X, X, INCX
    ld.w t2, X, 0 * SIZE
    xvstelm.w VX2, X, 0, 1
    // advance X by INCX (was INCY, which desynchronized the X pointer)
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    xvstelm.w VX2, X, 0, 2
    add.d X, X, INCX
    ld.w t4, X, 0 * SIZE
    xvstelm.w VX2, X, 0, 3
    xvinsgr2vr.w VX0, t1, 0
    xvinsgr2vr.w VX0, t2, 1
    xvinsgr2vr.w VX0, t3, 2
    xvinsgr2vr.w VX0, t4, 3
    add.d X, X, INCX
    ld.w t1, X, 0 * SIZE
    xvstelm.w VX2, X, 0, 4
    // advance X by INCX (was INCY)
    add.d X, X, INCX
    ld.w t2, X, 0 * SIZE
    xvstelm.w VX2, X, 0, 5
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    xvstelm.w VX2, X, 0, 6
    add.d X, X, INCX
    ld.w t4, X, 0 * SIZE
    xvstelm.w VX2, X, 0, 7
    xvinsgr2vr.w VX0, t1, 4
    xvinsgr2vr.w VX0, t2, 5
    xvinsgr2vr.w VX0, t3, 6
    xvinsgr2vr.w VX0, t4, 7
    add.d X, X, INCX
    // store the gathered x-values; VX1 is never written in this routine
    xvst VX0, Y, 0 * SIZE
    addi.d Y, Y, 8 * SIZE
    addi.d I, I, -1
    blt $r0, I, .L211
.align 3
.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
fld.s $f12, X, 0 * SIZE
fld.s $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.s $f12, Y, 0 * SIZE
fst.s $f14, X, 0 * SIZE
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
.L22:
bge $r0, I, .L223
.align 3
move XX, X
.L222:
fld.s a1, X, 0 * SIZE
add.d X, X, INCX
fld.s a2, X, 0 * SIZE
add.d X, X, INCX
fld.s a3, X, 0 * SIZE
add.d X, X, INCX
fld.s a4, X, 0 * SIZE
add.d X, X, INCX
fld.s b1, Y, 0 * SIZE
fst.s a1, Y, 0 * SIZE
add.d Y, Y, INCY
fld.s b2, Y, 0 * SIZE
fst.s a2, Y, 0 * SIZE
add.d Y, Y, INCY
fld.s b3, Y, 0 * SIZE
fst.s a3, Y, 0 * SIZE
add.d Y, Y, INCY
fld.s b4, Y, 0 * SIZE
fst.s a4, Y, 0 * SIZE
add.d Y, Y, INCY
fld.s a1, X, 0 * SIZE
add.d X, X, INCX
fst.s b1, XX, 0 * SIZE
add.d XX, XX, INCX
fld.s b1, Y, 0 * SIZE
fst.s a1, Y, 0 * SIZE
add.d Y, Y, INCY
fld.s a2, X, 0 * SIZE
add.d X, X, INCX
fst.s b2, XX, 0 * SIZE
add.d XX, XX, INCX
fld.s b2, Y, 0 * SIZE
fst.s a2, Y, 0 * SIZE
add.d Y, Y, INCY
fld.s a3, X, 0 * SIZE
add.d X, X, INCX
fst.s b3, XX, 0 * SIZE
add.d XX, XX, INCX
fld.s b3, Y, 0 * SIZE
fst.s a3, Y, 0 * SIZE
fld.s a4, X, 0 * SIZE
add.d X, X, INCX
fst.s b4, XX, 0 * SIZE
add.d XX, XX, INCX
fld.s b4, Y, 0 * SIZE
fst.s a4, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s b1, XX, 0 * SIZE
add.d XX, XX, INCX
fst.s b2, XX, 0 * SIZE
add.d XX, XX, INCX
fst.s b3, XX, 0 * SIZE
add.d XX, XX, INCX
fst.s b4, XX, 0 * SIZE
add.d XX, XX, INCX
addi.d I, I, -1
blt $r0, I, .L222
.align 3
.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
fld.s $f12, X, 0 * SIZE
fld.s $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.s $f12, Y, 0 * SIZE
fst.s $f14, X, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,294 @@
/* LoongArch64 LSX single-precision SWAP kernel.
 *
 * Exchanges the contents of vectors X and Y: for i in [0, N),
 * tmp = X[i*INCX]; X[i*INCX] = Y[i*INCY]; Y[i*INCY] = tmp.
 * Four code paths are selected by whether each stride equals 1:
 *   .L11  INCX==1, INCY==1   (full vector loads/stores both sides)
 *   .L12  INCX==1, INCY!=1   (vector on X, element-wise on Y)
 *   .L21  INCX!=1, INCY==1   (element-wise on X, vector on Y)
 *   .L22  INCX!=1, INCY!=1   (fully scalar, software-pipelined by 8)
 * Main loops process 8 floats per iteration; .L1x3/.L2x4 handle the
 * N%8 remainder one element at a time.
 */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r7
#define INCX $r8
#define Y $r9
#define INCY $r10
#define I $r17
#define TEMP $r18
#define XX $r5
#define YY $r6
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define b1 $f16
#define b2 $f17
#define b3 $f18
#define b4 $f19
#define VX0 $vr12
#define VX1 $vr13
#define VX2 $vr14
#define VX3 $vr15
    PROLOGUE
    bge $r0, N, .L999              // nothing to do for N <= 0
    li.d TEMP, 1
    slli.d TEMP, TEMP, BASE_SHIFT  // TEMP = element size in bytes
    slli.d INCX, INCX, BASE_SHIFT  // convert strides to byte offsets
    slli.d INCY, INCY, BASE_SHIFT
    srai.d I, N, 3                 // I = number of 8-element groups
    bne INCX, TEMP, .L20
    bne INCY, TEMP, .L12           // INCX==1 and INCY!=1
    b .L11                         // INCX==1 and INCY==1
.L20:
    bne INCY, TEMP, .L22           // INCX!=1 and INCY!=1
    b .L21                         // INCX!=1 and INCY==1

.L11:                              // both unit stride: swap via 2x4-float vectors
    bge $r0, I, .L112
    .align 3
.L111:
    vld VX0, X, 0 * SIZE
    vld VX1, X, 4 * SIZE
    vld VX2, Y, 0 * SIZE
    vld VX3, Y, 4 * SIZE
    addi.d I, I, -1
    vst VX2, X, 0 * SIZE
    vst VX3, X, 4 * SIZE
    vst VX0, Y, 0 * SIZE
    vst VX1, Y, 4 * SIZE
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
    blt $r0, I, .L111
    .align 3
.L112:                             // scalar remainder (N % 8 elements)
    andi I, N, 7
    bge $r0, I, .L999
    .align 3
.L113:
    fld.s $f12, X, 0 * SIZE
    fld.s $f14, Y, 0 * SIZE
    addi.d I, I, -1
    fst.s $f12, Y, 0 * SIZE
    fst.s $f14, X, 0 * SIZE
    addi.d X, X, SIZE
    addi.d Y, Y, SIZE
    blt $r0, I, .L113
    b .L999
    .align 3

.L12: // INCX==1 and INCY!=1
    bge $r0, I, .L122
    .align 3
.L121:
    // First 4 lanes: gather old Y words into VX2 while scattering VX0 to Y.
    vld VX0, X, 0 * SIZE
    ld.w t1, Y, 0 * SIZE
    vstelm.w VX0, Y, 0, 0
    add.d Y, Y, INCY
    ld.w t2, Y, 0 * SIZE
    vstelm.w VX0, Y, 0, 1
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    vstelm.w VX0, Y, 0, 2
    add.d Y, Y, INCY
    ld.w t4, Y, 0 * SIZE
    vstelm.w VX0, Y, 0, 3
    vinsgr2vr.w VX2, t1, 0
    vinsgr2vr.w VX2, t2, 1
    vinsgr2vr.w VX2, t3, 2
    vinsgr2vr.w VX2, t4, 3
    add.d Y, Y, INCY
    vst VX2, X, 0 * SIZE
    // Second 4 lanes, stored back at X+4.
    vld VX1, X, 4 * SIZE
    ld.w t1, Y, 0 * SIZE
    vstelm.w VX1, Y, 0, 0
    add.d Y, Y, INCY
    ld.w t2, Y, 0 * SIZE
    vstelm.w VX1, Y, 0, 1
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    vstelm.w VX1, Y, 0, 2
    add.d Y, Y, INCY
    ld.w t4, Y, 0 * SIZE
    vstelm.w VX1, Y, 0, 3
    vinsgr2vr.w VX3, t1, 0
    vinsgr2vr.w VX3, t2, 1
    vinsgr2vr.w VX3, t3, 2
    vinsgr2vr.w VX3, t4, 3
    add.d Y, Y, INCY
    vst VX3, X, 4 * SIZE
    addi.d X, X, 8 * SIZE
    addi.d I, I, -1
    blt $r0, I, .L121
    .align 3
.L122:                             // scalar remainder
    andi I, N, 7
    bge $r0, I, .L999
    .align 3
.L123:
    fld.s $f12, X, 0 * SIZE
    fld.s $f14, Y, 0 * SIZE
    addi.d I, I, -1
    fst.s $f12, Y, 0 * SIZE
    fst.s $f14, X, 0 * SIZE
    addi.d X, X, SIZE
    add.d Y, Y, INCY
    blt $r0, I, .L123
    b .L999
    .align 3

.L21: // INCX!=1 and INCY==1
    bge $r0, I, .L212
    .align 3
.L211:
    // First 4 lanes: gather old X words into VX0 while scattering VX2 to X.
    vld VX2, Y, 0 * SIZE
    ld.w t1, X, 0 * SIZE
    vstelm.w VX2, X, 0, 0
    add.d X, X, INCX
    ld.w t2, X, 0 * SIZE
    vstelm.w VX2, X, 0, 1
    add.d X, X, INCX               // FIX: advance X by INCX (was INCY)
    ld.w t3, X, 0 * SIZE
    vstelm.w VX2, X, 0, 2
    add.d X, X, INCX
    ld.w t4, X, 0 * SIZE
    vstelm.w VX2, X, 0, 3
    vinsgr2vr.w VX0, t1, 0
    vinsgr2vr.w VX0, t2, 1
    vinsgr2vr.w VX0, t3, 2
    vinsgr2vr.w VX0, t4, 3
    add.d X, X, INCX
    vst VX0, Y, 0 * SIZE
    // Second 4 lanes, stored back at Y+4.
    vld VX3, Y, 4 * SIZE
    ld.w t1, X, 0 * SIZE
    vstelm.w VX3, X, 0, 0
    add.d X, X, INCX               // FIX: advance X by INCX (was INCY)
    ld.w t2, X, 0 * SIZE
    vstelm.w VX3, X, 0, 1
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    vstelm.w VX3, X, 0, 2
    add.d X, X, INCX
    ld.w t4, X, 0 * SIZE
    vstelm.w VX3, X, 0, 3
    vinsgr2vr.w VX1, t1, 0
    vinsgr2vr.w VX1, t2, 1
    vinsgr2vr.w VX1, t3, 2
    vinsgr2vr.w VX1, t4, 3
    add.d X, X, INCX
    vst VX1, Y, 4 * SIZE           // FIX: second half goes to Y+4 (was Y+0,
                                   // which overwrote the first half)
    addi.d Y, Y, 8 * SIZE
    addi.d I, I, -1
    blt $r0, I, .L211
    .align 3
.L212:                             // scalar remainder
    andi I, N, 7
    bge $r0, I, .L999
    .align 3
.L213:
    fld.s $f12, X, 0 * SIZE
    fld.s $f14, Y, 0 * SIZE
    addi.d I, I, -1
    fst.s $f12, Y, 0 * SIZE
    fst.s $f14, X, 0 * SIZE
    add.d X, X, INCX
    addi.d Y, Y, SIZE
    blt $r0, I, .L213
    b .L999
    .align 3

.L22:                              // both strides != 1: fully scalar, 8 per pass
    bge $r0, I, .L223
    .align 3
    move XX, X                     // XX trails X: old-Y values written back here
.L222:
    // Stage 1: read a1..a4 from X, exchange with b1..b4 at Y.
    fld.s a1, X, 0 * SIZE
    add.d X, X, INCX
    fld.s a2, X, 0 * SIZE
    add.d X, X, INCX
    fld.s a3, X, 0 * SIZE
    add.d X, X, INCX
    fld.s a4, X, 0 * SIZE
    add.d X, X, INCX
    fld.s b1, Y, 0 * SIZE
    fst.s a1, Y, 0 * SIZE
    add.d Y, Y, INCY
    fld.s b2, Y, 0 * SIZE
    fst.s a2, Y, 0 * SIZE
    add.d Y, Y, INCY
    fld.s b3, Y, 0 * SIZE
    fst.s a3, Y, 0 * SIZE
    add.d Y, Y, INCY
    fld.s b4, Y, 0 * SIZE
    fst.s a4, Y, 0 * SIZE
    add.d Y, Y, INCY
    // Stage 2: drain b1..b4 into X (via XX) while exchanging the next four.
    fld.s a1, X, 0 * SIZE
    add.d X, X, INCX
    fst.s b1, XX, 0 * SIZE
    add.d XX, XX, INCX
    fld.s b1, Y, 0 * SIZE
    fst.s a1, Y, 0 * SIZE
    add.d Y, Y, INCY
    fld.s a2, X, 0 * SIZE
    add.d X, X, INCX
    fst.s b2, XX, 0 * SIZE
    add.d XX, XX, INCX
    fld.s b2, Y, 0 * SIZE
    fst.s a2, Y, 0 * SIZE
    add.d Y, Y, INCY
    fld.s a3, X, 0 * SIZE
    add.d X, X, INCX
    fst.s b3, XX, 0 * SIZE
    add.d XX, XX, INCX
    fld.s b3, Y, 0 * SIZE
    fst.s a3, Y, 0 * SIZE
    add.d Y, Y, INCY               // FIX: this increment was missing, so the
                                   // a4/b4 exchange below reused a3's Y slot
    fld.s a4, X, 0 * SIZE
    add.d X, X, INCX
    fst.s b4, XX, 0 * SIZE
    add.d XX, XX, INCX
    fld.s b4, Y, 0 * SIZE
    fst.s a4, Y, 0 * SIZE
    add.d Y, Y, INCY
    // Drain the last four old-Y values back into X.
    fst.s b1, XX, 0 * SIZE
    add.d XX, XX, INCX
    fst.s b2, XX, 0 * SIZE
    add.d XX, XX, INCX
    fst.s b3, XX, 0 * SIZE
    add.d XX, XX, INCX
    fst.s b4, XX, 0 * SIZE
    add.d XX, XX, INCX
    addi.d I, I, -1
    blt $r0, I, .L222
    .align 3
.L223:                             // scalar remainder
    andi I, N, 7
    bge $r0, I, .L999
    .align 3
.L224:
    fld.s $f12, X, 0 * SIZE
    fld.s $f14, Y, 0 * SIZE
    addi.d I, I, -1
    fst.s $f12, Y, 0 * SIZE
    fst.s $f14, X, 0 * SIZE
    add.d X, X, INCX
    add.d Y, Y, INCY
    blt $r0, I, .L224
    .align 3
.L999:
    move $r4, $r12
    jirl $r0, $r1, 0x0
    .align 3
    EPILOGUE

View File

@ -9,12 +9,12 @@
#endif
#ifndef HAVE_CASUM_KERNEL
static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
{
BLASLONG i=0;
BLASLONG n_8 = n & -8;
FLOAT *x = x1;
FLOAT *x1 = x;
FLOAT temp0, temp1, temp2, temp3;
FLOAT temp4, temp5, temp6, temp7;
FLOAT sum0 = 0.0;
@ -24,14 +24,14 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
FLOAT sum4 = 0.0;
while (i < n_8) {
temp0 = ABS_K(x[0]);
temp1 = ABS_K(x[1]);
temp2 = ABS_K(x[2]);
temp3 = ABS_K(x[3]);
temp4 = ABS_K(x[4]);
temp5 = ABS_K(x[5]);
temp6 = ABS_K(x[6]);
temp7 = ABS_K(x[7]);
temp0 = ABS_K(x1[0]);
temp1 = ABS_K(x1[1]);
temp2 = ABS_K(x1[2]);
temp3 = ABS_K(x1[3]);
temp4 = ABS_K(x1[4]);
temp5 = ABS_K(x1[5]);
temp6 = ABS_K(x1[6]);
temp7 = ABS_K(x1[7]);
sum0 += temp0;
sum1 += temp1;
@ -43,12 +43,12 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
sum2 += temp6;
sum3 += temp7;
x+=8;
x1+=8;
i+=4;
}
while (i < n) {
sum4 += (ABS_K(x1[0]) + ABS_K(x1[1]));
sum4 += ABS_K(x1[0]) + ABS_K(x1[1]);
x1 += 2;
i++;
}

View File

@ -2,9 +2,9 @@
#ifdef __NVCOMPILER
#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
#endif
#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && (__clang_major__ >= 9 &&__clang_major__ !=17)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))
#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309))
#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))
#define HAVE_CASUM_KERNEL 1
@ -20,15 +20,14 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
if (n2 < 64) {
__m128 accum_10, accum_11, accum_12, accum_13;
__m128 abs_mask1 = abs_mask1;
__m128 abs_mask1;
accum_10 = _mm_setzero_ps();
accum_11 = _mm_setzero_ps();
accum_12 = _mm_setzero_ps();
accum_13 = _mm_setzero_ps();
abs_mask1 = (__m128)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
abs_mask1 = (__m128)_mm_srli_epi32((__m128i) abs_mask1, 1);
abs_mask1 = (__m128)_mm_set1_epi32(0x7fffffff);
_mm_prefetch(&x1[0], _MM_HINT_T0);

View File

@ -2,9 +2,9 @@
#ifdef __NVCOMPILER
#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
#endif
#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && ( __clang_major__ >= 9 && __clang_major__ != 17)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))
#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309))
#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))
#define HAVE_ZASUM_KERNEL 1
@ -21,16 +21,14 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)
if (n2 < 32) {
__m128d accum_10, accum_11, accum_12, accum_13;
__m128d abs_mask1 = abs_mask1;
__m128d abs_mask1;
accum_10 = _mm_setzero_pd();
accum_11 = _mm_setzero_pd();
accum_12 = _mm_setzero_pd();
accum_13 = _mm_setzero_pd();
// abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff);
abs_mask1 = (__m128d)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
abs_mask1 = (__m128d)_mm_srli_epi64((__m128i) abs_mask1, 1);
abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff);
_mm_prefetch(&x1[0], _MM_HINT_T0);
if (n2 >= 16){

View File

@ -1,9 +1,9 @@
cmake_minimum_required(VERSION 3.2)
cmake_minimum_required(VERSION 3.6)
project(LAPACK Fortran C)
project(LAPACK)
set(LAPACK_MAJOR_VERSION 3)
set(LAPACK_MINOR_VERSION 11)
set(LAPACK_MINOR_VERSION 12)
set(LAPACK_PATCH_VERSION 0)
set(
LAPACK_VERSION
@ -45,6 +45,14 @@ if(_is_coverage_build)
find_package(codecov)
endif()
# Use valgrind if it is found
option( LAPACK_TESTING_USE_PYTHON "Use Python for testing. Disable it on memory checks." ON )
find_program( MEMORYCHECK_COMMAND valgrind )
if( MEMORYCHECK_COMMAND )
message( STATUS "Found valgrind: ${MEMORYCHECK_COMMAND}" )
set( MEMORYCHECK_COMMAND_OPTIONS "--leak-check=full --show-leak-kinds=all --track-origins=yes" )
endif()
# By default test Fortran compiler complex abs and complex division
option(TEST_FORTRAN_COMPILER "Test Fortran compiler complex abs and complex division" OFF)
if( TEST_FORTRAN_COMPILER )
@ -99,6 +107,8 @@ else()
set(LAPACKELIB "lapacke")
set(TMGLIB "tmglib")
endif()
# By default build standard API and extended _64 API
option(BUILD_INDEX64_EXT_API "Build Index-64 API as extended API with _64 suffix" ON)
include(GNUInstallDirs)
@ -127,90 +137,6 @@ configure_file(
include(PreventInSourceBuilds)
include(PreventInBuildInstalls)
# Check if recursive flag exists
include(CheckFortranCompilerFlag)
if(CMAKE_Fortran_COMPILER_ID STREQUAL Flang)
check_fortran_compiler_flag("-Mrecursive" _MrecursiveFlag)
elseif(CMAKE_Fortran_COMPILER_ID STREQUAL GNU)
check_fortran_compiler_flag("-frecursive" _frecursiveFlag)
elseif(CMAKE_Fortran_COMPILER_ID STREQUAL Intel)
check_fortran_compiler_flag("-recursive" _recursiveFlag)
elseif(CMAKE_Fortran_COMPILER_ID STREQUAL XL)
check_fortran_compiler_flag("-qrecur" _qrecurFlag)
elseif(CMAKE_Fortran_COMPILER_ID STREQUAL NAG)
check_fortran_compiler_flag("-recursive" _recursiveFlag)
else()
message(WARNING "Fortran local arrays should be allocated on the stack."
" Please use a compiler which guarantees that feature."
" See https://github.com/Reference-LAPACK/lapack/pull/188 and references therein.")
endif()
# Add recursive flag
if(_MrecursiveFlag)
string(REGEX MATCH "-Mrecursive" output_test <string> "${CMAKE_Fortran_FLAGS}")
if(NOT output_test)
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mrecursive"
CACHE STRING "Recursive flag must be set" FORCE)
endif()
elseif(_frecursiveFlag)
string(REGEX MATCH "-frecursive" output_test <string> "${CMAKE_Fortran_FLAGS}")
if(NOT output_test)
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -frecursive"
CACHE STRING "Recursive flag must be set" FORCE)
endif()
elseif(_recursiveFlag)
string(REGEX MATCH "-recursive" output_test <string> "${CMAKE_Fortran_FLAGS}")
if(NOT output_test)
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -recursive"
CACHE STRING "Recursive flag must be set" FORCE)
endif()
elseif(_qrecurFlag)
string(REGEX MATCH "-qrecur" output_test <string> "${CMAKE_Fortran_FLAGS}")
if(NOT output_test)
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qrecur"
CACHE STRING "Recursive flag must be set" FORCE)
endif()
endif()
if(UNIX)
if(CMAKE_Fortran_COMPILER_ID STREQUAL Intel)
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fp-model strict")
endif()
if(CMAKE_Fortran_COMPILER_ID STREQUAL XL)
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict")
endif()
# Delete libmtsk in linking sequence for Sun/Oracle Fortran Compiler.
# This library is not present in the Sun package SolarisStudio12.3-linux-x86-bin
string(REPLACE \;mtsk\; \; CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES "${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES}")
endif()
if(CMAKE_Fortran_COMPILER_ID STREQUAL Compaq)
if(WIN32)
if(CMAKE_GENERATOR STREQUAL "NMake Makefiles")
get_filename_component(CMAKE_Fortran_COMPILER_CMDNAM ${CMAKE_Fortran_COMPILER} NAME_WE)
message(STATUS "Using Compaq Fortran compiler with command name ${CMAKE_Fortran_COMPILER_CMDNAM}")
set(cmd ${CMAKE_Fortran_COMPILER_CMDNAM})
string(TOLOWER "${cmd}" cmdlc)
if(cmdlc STREQUAL "df")
message(STATUS "Assume the Compaq Visual Fortran Compiler is being used")
set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_INCLUDES 1)
#This is a workaround that is needed to avoid forward-slashes in the
#filenames listed in response files from incorrectly being interpreted as
#introducing compiler command options
if(${BUILD_SHARED_LIBS})
message(FATAL_ERROR "Making of shared libraries with CVF has not been tested.")
endif()
set(str "NMake version 9 or later should be used. NMake version 6.0 which is\n")
set(str "${str} included with the CVF distribution fails to build Lapack because\n")
set(str "${str} the number of source files exceeds the limit for NMake v6.0\n")
message(STATUS ${str})
set(CMAKE_Fortran_LINK_EXECUTABLE "LINK /out:<TARGET> <LINK_FLAGS> <LINK_LIBRARIES> <OBJECTS>")
endif()
endif()
endif()
endif()
# Add option to enable flat namespace for symbol resolution on macOS
if(APPLE)
option(USE_FLAT_NAMESPACE "Use flat namespaces for symbol resolution during build and runtime." OFF)
@ -268,26 +194,6 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LAPACK_BINARY_DIR}/bin)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LAPACK_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LAPACK_BINARY_DIR}/lib)
# --------------------------------------------------
# Check for any necessary platform specific compiler flags
include(CheckLAPACKCompilerFlags)
CheckLAPACKCompilerFlags()
# --------------------------------------------------
# Check second function
include(CheckTimeFunction)
set(TIME_FUNC NONE)
CHECK_TIME_FUNCTION(NONE TIME_FUNC)
CHECK_TIME_FUNCTION(INT_CPU_TIME TIME_FUNC)
CHECK_TIME_FUNCTION(EXT_ETIME TIME_FUNC)
CHECK_TIME_FUNCTION(EXT_ETIME_ TIME_FUNC)
CHECK_TIME_FUNCTION(INT_ETIME TIME_FUNC)
message(STATUS "--> Will use second_${TIME_FUNC}.f and dsecnd_${TIME_FUNC}.f as timing function.")
set(SECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/second_${TIME_FUNC}.f)
set(DSECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/dsecnd_${TIME_FUNC}.f)
# deprecated LAPACK and LAPACKE routines
option(BUILD_DEPRECATED "Build deprecated routines" OFF)
message(STATUS "Build deprecated routines: ${BUILD_DEPRECATED}")
@ -380,6 +286,10 @@ endif()
# Check the usage of the user provided or automatically found LAPACK libraries
if(LAPACK_LIBRARIES)
include(CheckLanguage)
check_language(Fortran)
if(CMAKE_Fortran_COMPILER)
enable_language(Fortran)
include(CheckFortranFunctionExists)
set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES})
# Check if new routine of 3.4.0 is in LAPACK_LIBRARIES
@ -393,12 +303,38 @@ if(LAPACK_LIBRARIES)
message(ERROR "--> Or Correct your LAPACK_LIBRARIES entry ")
message(ERROR "--> Or Consider checking USE_OPTIMIZED_LAPACK")
endif()
else()
message(STATUS "--> LAPACK supplied by user is ${LAPACK_LIBRARIES}.")
message(STATUS "--> CMake couldn't find a Fortran compiler, so it cannot check if the provided LAPACK library works.")
set(LATESTLAPACK_FOUND TRUE)
endif()
endif()
# Neither user specified or optimized LAPACK libraries can be used
if(NOT LATESTLAPACK_FOUND)
message(STATUS "Using supplied NETLIB LAPACK implementation")
set(LAPACK_LIBRARIES ${LAPACKLIB})
enable_language(Fortran)
# Check for any necessary platform specific compiler flags
include(CheckLAPACKCompilerFlags)
CheckLAPACKCompilerFlags()
# Check second function
include(CheckTimeFunction)
set(TIME_FUNC NONE)
CHECK_TIME_FUNCTION(NONE TIME_FUNC)
CHECK_TIME_FUNCTION(INT_CPU_TIME TIME_FUNC)
CHECK_TIME_FUNCTION(EXT_ETIME TIME_FUNC)
CHECK_TIME_FUNCTION(EXT_ETIME_ TIME_FUNC)
CHECK_TIME_FUNCTION(INT_ETIME TIME_FUNC)
# Set second function
message(STATUS "--> Will use second_${TIME_FUNC}.f and dsecnd_${TIME_FUNC}.f as timing function.")
set(SECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/second_${TIME_FUNC}.f)
set(DSECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/dsecnd_${TIME_FUNC}.f)
add_subdirectory(SRC)
else()
set(CMAKE_EXE_LINKER_FLAGS
@ -431,9 +367,11 @@ endif()
# Cache export target
set(LAPACK_INSTALL_EXPORT_NAME_CACHE ${LAPACK_INSTALL_EXPORT_NAME})
if(BUILD_TESTING OR LAPACKE_WITH_TMG)
enable_language(Fortran)
if(LATESTLAPACK_FOUND AND LAPACKE_WITH_TMG)
set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES})
# Check if dlatms (part of tmg) is found
include(CheckFortranFunctionExists)
CHECK_FORTRAN_FUNCTION_EXISTS("dlatms" LAPACK_WITH_TMGLIB_FOUND)
unset(CMAKE_REQUIRED_LIBRARIES)
if(NOT LAPACK_WITH_TMGLIB_FOUND)
@ -448,6 +386,12 @@ endif()
set(LAPACK_INSTALL_EXPORT_NAME ${LAPACK_INSTALL_EXPORT_NAME_CACHE})
unset(LAPACK_INSTALL_EXPORT_NAME_CACHE)
#-------------------------------------
# LAPACKE
# Include lapack.h and lapacke_mangling.h even if LAPACKE is not built
add_subdirectory(LAPACKE/include)
if(LAPACKE)
add_subdirectory(LAPACKE)
endif()
@ -474,8 +418,8 @@ if (BLAS++)
ExternalProject_Add(blaspp
URL https://bitbucket.org/icl/blaspp/downloads/blaspp-2020.10.02.tar.gz
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/blaspp-prefix/src/blaspp
BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make
INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install
BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" ${CMAKE_COMMAND} --build .
INSTALL_COMMAND ${CMAKE_COMMAND} -E env PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" ${CMAKE_COMMAND} --install .
)
ExternalProject_Add_StepDependencies(blaspp build ${BLAS_LIBRARIES})
endif()
@ -487,16 +431,16 @@ if (LAPACK++)
ExternalProject_Add(lapackpp
URL https://bitbucket.org/icl/lapackpp/downloads/lapackpp-2020.10.02.tar.gz
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DLAPACK_LIBRARIES=${LAPACK_LIBRARIES} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp
BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make
INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install
BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" ${CMAKE_COMMAND} --build .
INSTALL_COMMAND ${CMAKE_COMMAND} -E env PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" ${CMAKE_COMMAND} --install .
)
else ()
# FIXME this does not really work as the libraries list gets converted to a semicolon-separated list somewhere in the lapack++ build files
ExternalProject_Add(lapackpp
URL https://bitbucket.org/icl/lapackpp/downloads/lapackpp-2020.10.02.tar.gz
CONFIGURE_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DLAPACK_LIBRARIES="${PROJECT_BINARY_DIR}/lib/liblapack.a -lgfortran" -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp
BUILD_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make
INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install
BUILD_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" ${CMAKE_COMMAND} --build .
INSTALL_COMMAND ${CMAKE_COMMAND} -E env PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" ${CMAKE_COMMAND} --install .
)
endif()
ExternalProject_Add_StepDependencies(lapackpp build blaspp ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES})
@ -671,22 +615,34 @@ if(BUILD_HTML_DOCUMENTATION OR BUILD_MAN_DOCUMENTATION)
set(DOXYGEN_PROJECT_BRIEF "LAPACK: Linear Algebra PACKage")
set(DOXYGEN_PROJECT_NUMBER ${LAPACK_VERSION})
set(DOXYGEN_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/DOCS)
set(PROJECT_LOGO ${CMAKE_CURRENT_SOURCE_DIR}/DOCS/lapack.png)
set(DOXYGEN_PROJECT_LOGO ${CMAKE_CURRENT_SOURCE_DIR}/DOCS/lapack.png)
set(DOXYGEN_OPTIMIZE_FOR_FORTRAN YES)
set(DOXYGEN_SOURCE_BROWSER YES)
set(DISTRIBUTE_GROUP_DOC YES)
set(DOXYGEN_CREATE_SUBDIRS YES)
set(DOXYGEN_SEPARATE_MEMBER_PAGES YES)
set(DOXYGEN_EXTRACT_ALL YES)
set(DOXYGEN_FILE_PATTERNS "*.f;*.c;*.h")
set(DOXYGEN_FILE_PATTERNS *.f *.f90 *.c *.h )
set(DOXYGEN_RECURSIVE YES)
set(DOXYGEN_GENERATE_TREEVIEW YES)
set(DOXYGEN_DOT_IMAGE_FORMAT svg)
set(DOXYGEN_INTERACTIVE_SVG YES)
set(DOXYGEN_QUIET YES)
set(DOXYGEN_WARNINGS NO)
set(DOXYGEN_QUIET NO)
set(DOXYGEN_WARNINGS YES)
set(DOXYGEN_WARN_NO_PARAMDOC YES)
set(DOXYGEN_WARN_LOGFILE doxygen_error)
set(DOXYGEN_GENERATE_HTML NO)
set(DOXYGEN_GENERATE_MAN NO)
set(DOXYGEN_LAYOUT_FILE "DOCS/DoxygenLayout.xml")
# Exclude functions that are duplicated, creating conflicts.
set(DOXYGEN_EXCLUDE .git
.github
SRC/VARIANTS
BLAS/SRC/lsame.f
BLAS/SRC/xerbla.f
BLAS/SRC/xerbla_array.f
INSTALL/slamchf77.f
INSTALL/dlamchf77.f )
if (BUILD_HTML_DOCUMENTATION)
set(DOXYGEN_GENERATE_HTML YES)
@ -697,13 +653,21 @@ if(BUILD_HTML_DOCUMENTATION OR BUILD_MAN_DOCUMENTATION)
doxygen_add_docs(
html
${PROJECT_SOURCE_DIR}
# Doxygen INPUT =
${PROJECT_SOURCE_DIR}/README.md
${PROJECT_SOURCE_DIR}/BLAS
${PROJECT_SOURCE_DIR}/CBLAS
${PROJECT_SOURCE_DIR}/SRC
${PROJECT_SOURCE_DIR}/INSTALL
${PROJECT_SOURCE_DIR}/TESTING
${PROJECT_SOURCE_DIR}/DOCS/groups-usr.dox
COMMENT "Generating html LAPACK documentation (it will take some time... time to grab a coffee)"
)
endif()
if (BUILD_MAN_DOCUMENTATION)
set(DOXYGEN_GENERATE_MAN YES)
set(DOXYGEN_EXCLUDE SRC/VARIANTS)
set(DOXYGEN_MAN_LINKS YES)
set(DOXYGEN_INLINE_SOURCES NO)
set(DOXYGEN_CALL_GRAPH NO)
@ -711,7 +675,15 @@ if(BUILD_HTML_DOCUMENTATION OR BUILD_MAN_DOCUMENTATION)
doxygen_add_docs(
man
${PROJECT_SOURCE_DIR}
# Doxygen INPUT =
${PROJECT_SOURCE_DIR}/BLAS
${PROJECT_SOURCE_DIR}/CBLAS
${PROJECT_SOURCE_DIR}/SRC
${PROJECT_SOURCE_DIR}/INSTALL
${PROJECT_SOURCE_DIR}/TESTING
${PROJECT_SOURCE_DIR}/DOCS/groups-usr.dox
COMMENT "Generating man LAPACK documentation"
)
endif()

View File

@ -1,4 +1,4 @@
# Doxyfile 1.8.10
# Doxyfile 1.9.1
# This file describes the settings to be used by the documentation system
# doxygen (www.doxygen.org) for a project.
@ -17,11 +17,11 @@
# Project related configuration options
#---------------------------------------------------------------------------
# This tag specifies the encoding used for all characters in the config file
# that follow. The default is UTF-8 which is also the encoding used for all text
# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
# for the list of possible encodings.
# This tag specifies the encoding used for all characters in the configuration
# file that follow. The default is UTF-8 which is also the encoding used for all
# text before the first occurrence of this tag. Doxygen uses libiconv (or the
# iconv built into libc) for the transcoding. See
# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
# The default value is: UTF-8.
DOXYFILE_ENCODING = UTF-8
@ -38,7 +38,7 @@ PROJECT_NAME = LAPACK
# could be handy for archiving the generated documentation or if some version
# control system is used.
PROJECT_NUMBER = 3.9.0
PROJECT_NUMBER = 3.12.0
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
@ -93,6 +93,14 @@ ALLOW_UNICODE_NAMES = NO
OUTPUT_LANGUAGE = English
# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all
# documentation generated by doxygen is written. Doxygen will use this
# information to generate all generated output in the proper direction.
# Possible values are: None, LTR, RTL and Context.
# The default value is: None.
OUTPUT_TEXT_DIRECTION = None
# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
# descriptions after the members that are listed in the file and class
# documentation (similar to Javadoc). Set to NO to disable this.
@ -179,6 +187,16 @@ SHORT_NAMES = NO
JAVADOC_AUTOBRIEF = NO
# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
# such as
# /***************
# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
# Javadoc-style will behave just like regular comments and it will not be
# interpreted by doxygen.
# The default value is: NO.
JAVADOC_BANNER = NO
# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
# line (until the first dot) of a Qt-style comment as the brief description. If
# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
@ -199,6 +217,14 @@ QT_AUTOBRIEF = NO
MULTILINE_CPP_IS_BRIEF = NO
# By default Python docstrings are displayed as preformatted text and doxygen's
# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the
# doxygen's special commands can be used and the contents of the docstring
# documentation blocks is shown as doxygen documentation.
# The default value is: YES.
PYTHON_DOCSTRING = YES
# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
# documentation from any documented member that it re-implements.
# The default value is: YES.
@ -226,16 +252,15 @@ TAB_SIZE = 8
# will allow you to put the command \sideeffect (or @sideeffect) in the
# documentation, which will result in a user-defined paragraph with heading
# "Side Effects:". You can put \n's in the value part of an alias to insert
# newlines.
# newlines (in the resulting output). You can put ^^ in the value part of an
# alias to insert a newline as if a physical newline was in the original file.
# When you need a literal { or } or , in the value part of an alias you have to
# escape them by means of a backslash (\), this can lead to conflicts with the
# commands \{ and \} for these it is advised to use the version @{ and @} or use
# a double escape (\\{ and \\})
ALIASES =
# This tag can be used to specify a number of word-keyword mappings (TCL only).
# A mapping has the form "name=value". For example adding "class=itcl::class"
# will allow you to use the command class in the itcl::class meaning.
TCL_SUBST =
# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
# only. Doxygen will then generate output that is more tailored for C. For
# instance, some of the names that are used will be different. The list of all
@ -264,28 +289,40 @@ OPTIMIZE_FOR_FORTRAN = YES
OPTIMIZE_OUTPUT_VHDL = NO
# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
# sources only. Doxygen will then generate output that is more tailored for that
# language. For instance, namespaces will be presented as modules, types will be
# separated into more groups, etc.
# The default value is: NO.
OPTIMIZE_OUTPUT_SLICE = NO
# Doxygen selects the parser to use depending on the extension of the files it
# parses. With this tag you can assign which parser to use for a given
# extension. Doxygen has a built-in mapping, but you can override or extend it
# using this tag. The format is ext=language, where ext is a file extension, and
# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
# Fortran. In the later case the parser tries to guess whether the code is fixed
# or free formatted code, this is the default for Fortran type files), VHDL. For
# instance to make doxygen treat .inc files as Fortran files (default is PHP),
# and .f files as C (default is Fortran), use: inc=Fortran f=C.
# language is one of the parsers supported by doxygen: IDL, Java, JavaScript,
# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, VHDL,
# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser
# tries to guess whether the code is fixed or free formatted code, this is the
# default for Fortran type files). For instance to make doxygen treat .inc files
# as Fortran files (default is PHP), and .f files as C (default is Fortran),
# use: inc=Fortran f=C.
#
# Note: For files without extension you can use no_extension as a placeholder.
#
# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
# the files are not read by doxygen.
# the files are not read by doxygen. When specifying no_extension you should add
# * to the FILE_PATTERNS.
#
# Note see also the list of default file extension mappings.
EXTENSION_MAPPING =
# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
# according to the Markdown format, which allows for more readable
# documentation. See http://daringfireball.net/projects/markdown/ for details.
# documentation. See https://daringfireball.net/projects/markdown/ for details.
# The output of markdown processing is further processed by doxygen, so you can
# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
# case of backward compatibilities issues.
@ -293,6 +330,15 @@ EXTENSION_MAPPING =
MARKDOWN_SUPPORT = YES
# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
# to that level are automatically included in the table of contents, even if
# they do not have an id attribute.
# Note: This feature currently applies only to Markdown headings.
# Minimum value: 0, maximum value: 99, default value: 5.
# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
TOC_INCLUDE_HEADINGS = 5
# When enabled doxygen tries to link words that correspond to documented
# classes, or namespaces to their corresponding documentation. Such a link can
# be prevented in individual cases by putting a % sign in front of the word or
@ -318,7 +364,7 @@ BUILTIN_STL_SUPPORT = NO
CPP_CLI_SUPPORT = NO
# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
# will parse them like normal C++ but will assume all classes use public instead
# of private inheritance when no explicit protection keyword is present.
# The default value is: NO.
@ -341,7 +387,7 @@ IDL_PROPERTY_SUPPORT = YES
# all members of a group must be documented explicitly.
# The default value is: NO.
DISTRIBUTE_GROUP_DOC = YES
DISTRIBUTE_GROUP_DOC = NO
# If one adds a struct or class to a group and this option is enabled, then also
# any nested class or struct is added to the same group. By default this option
@ -404,6 +450,19 @@ TYPEDEF_HIDES_STRUCT = NO
LOOKUP_CACHE_SIZE = 0
# The NUM_PROC_THREADS specifies the number threads doxygen is allowed to use
# during processing. When set to 0 doxygen will based this on the number of
# cores available in the system. You can set it explicitly to a value larger
# than 0 to get more control over the balance between CPU load and processing
# speed. At this moment only the input processing can be done using multiple
# threads. Since this is still an experimental feature the default is set to 1,
# which effectively disables parallel processing. Please report any issues you
# encounter. Generating dot graphs in parallel is controlled by the
# DOT_NUM_THREADS setting.
# Minimum value: 0, maximum value: 32, default value: 1.
NUM_PROC_THREADS = 1
#---------------------------------------------------------------------------
# Build related configuration options
#---------------------------------------------------------------------------
@ -424,6 +483,12 @@ EXTRACT_ALL = YES
EXTRACT_PRIVATE = NO
# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
# methods of a class will be included in the documentation.
# The default value is: NO.
EXTRACT_PRIV_VIRTUAL = NO
# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
# scope will be included in the documentation.
# The default value is: NO.
@ -461,6 +526,13 @@ EXTRACT_LOCAL_METHODS = NO
EXTRACT_ANON_NSPACES = NO
# If this flag is set to YES, the name of an unnamed parameter in a declaration
# will be determined by the corresponding definition. By default unnamed
# parameters remain unnamed in the output.
# The default value is: YES.
RESOLVE_UNNAMED_PARAMS = YES
# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
# undocumented members inside documented classes or files. If set to NO these
# members will be included in the various overviews, but no documentation
@ -478,8 +550,8 @@ HIDE_UNDOC_MEMBERS = NO
HIDE_UNDOC_CLASSES = NO
# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
# (class|struct|union) declarations. If set to NO, these declarations will be
# included in the documentation.
# declarations. If set to NO, these declarations will be included in the
# documentation.
# The default value is: NO.
HIDE_FRIEND_COMPOUNDS = NO
@ -498,11 +570,18 @@ HIDE_IN_BODY_DOCS = NO
INTERNAL_DOCS = NO
# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
# names in lower-case letters. If set to YES, upper-case letters are also
# allowed. This is useful if you have classes or files whose names only differ
# in case and if your file system supports case sensitive file names. Windows
# and Mac users are advised to set this option to NO.
# With the correct setting of option CASE_SENSE_NAMES doxygen will better be
# able to match the capabilities of the underlying filesystem. In case the
# filesystem is case sensitive (i.e. it supports files in the same directory
# whose names only differ in casing), the option must be set to YES to properly
# deal with such files in case they appear in the input. For filesystems that
# are not case sensitive the option should be set to NO to properly deal with
# output files written for symbols that only differ in casing, such as for two
# classes, one named CLASS and the other named Class, and to also support
# references to files without having to specify the exact matching casing. On
# Windows (including Cygwin) and MacOS, users should typically set this option
# to NO, whereas on Linux or other Unix flavors it should typically be set to
# YES.
# The default value is: system dependent.
CASE_SENSE_NAMES = NO
@ -684,12 +763,12 @@ FILE_VERSION_FILTER =
# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
# tag is left empty.
LAYOUT_FILE =
LAYOUT_FILE = DOCS/DoxygenLayout.xml
# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
# the reference definitions. This must be a list of .bib files. The .bib
# extension is automatically appended if omitted. This requires the bibtex tool
# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
# For LaTeX the style of the bibliography can be controlled using
# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
# search path. See also \cite for info how to create references.
@ -705,7 +784,7 @@ CITE_BIB_FILES =
# messages are off.
# The default value is: NO.
QUIET = YES
QUIET = NO
# The WARNINGS tag can be used to turn on/off the warning messages that are
# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
@ -734,10 +813,20 @@ WARN_IF_DOC_ERROR = YES
# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
# are documented, but have no documentation for their parameters or return
# value. If set to NO, doxygen will only warn about wrong or incomplete
# parameter documentation, but not about the absence of documentation.
# parameter documentation, but not about the absence of documentation. If
# EXTRACT_ALL is set to YES then this flag will automatically be disabled.
# The default value is: NO.
WARN_NO_PARAMDOC = NO
WARN_NO_PARAMDOC = YES
# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS
# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but
# at the end of the doxygen process doxygen will return with a non-zero status.
# Possible values are: NO, YES and FAIL_ON_WARNINGS.
# The default value is: NO.
WARN_AS_ERROR = NO
# The WARN_FORMAT tag determines the format of the warning messages that doxygen
# can produce. The string should contain the $file, $line, and $text tags, which
@ -753,7 +842,7 @@ WARN_FORMAT = "$file:$line: $text"
# messages should be written. If left blank the output is written to standard
# error (stderr).
WARN_LOGFILE = output_err
WARN_LOGFILE = doxygen_error
#---------------------------------------------------------------------------
# Configuration options related to the input files
@ -762,17 +851,18 @@ WARN_LOGFILE = output_err
# The INPUT tag is used to specify the files and/or directories that contain
# documented source files. You may enter file names like myfile.cpp or
# directories like /usr/src/myproject. Separate the files or directories with
# spaces.
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
# Note: If this tag is empty the current directory is searched.
INPUT = . \
DOCS/groups-usr.dox
INPUT = BLAS CBLAS SRC INSTALL TESTING \
DOCS/groups-usr.dox \
README.md
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
# documentation (see: http://www.gnu.org/software/libiconv) for the list of
# possible encodings.
# documentation (see:
# https://www.gnu.org/software/libiconv/) for the list of possible encodings.
# The default value is: UTF-8.
INPUT_ENCODING = UTF-8
@ -785,14 +875,19 @@ INPUT_ENCODING = UTF-8
# need to set EXTENSION_MAPPING for the extension otherwise the files are not
# read by doxygen.
#
# Note the list of default checked file patterns might differ from the list of
# default file extension mappings.
#
# If left blank the following patterns are tested: *.c, *.cc, *.cxx, *.cpp,
# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd,
# *.vhdl, *.ucf, *.qsf, *.as and *.js.
# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment),
# *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, *.vhdl,
# *.ucf, *.qsf and *.ice.
FILE_PATTERNS = *.c \
*.f \
*.f90 \
*.h
# The RECURSIVE tag can be used to specify whether or not subdirectories should
@ -808,34 +903,15 @@ RECURSIVE = YES
# Note that relative paths are relative to the directory from which doxygen is
# run.
EXCLUDE = CMAKE \
DOCS \
.svn \
CBLAS/.svn \
CBLAS/src/.svn \
CBLAS/testing/.svn \
CBLAS/example/.svn \
CBLAS/include/.svn \
BLAS/.svn \
BLAS/SRC/.svn \
BLAS/TESTING/.svn \
SRC/.svn \
SRC/VARIANTS/.svn \
SRC/VARIANTS/LIB/.svn \
SRC/VARIANTS/cholesky/.svn \
SRC/VARIANTS/cholesky/RL/.svn \
SRC/VARIANTS/cholesky/TOP/.svn \
SRC/VARIANTS/lu/.svn \
SRC/VARIANTS/lu/CR/.svn \
SRC/VARIANTS/lu/LL/.svn \
SRC/VARIANTS/lu/REC/.svn \
SRC/VARIANTS/qr/.svn \
SRC/VARIANTS/qr/LL/.svn \
INSTALL/.svn \
TESTING/.svn \
TESTING/EIG/.svn \
TESTING/MATGEN/.svn \
TESTING/LIN/.svn
# Exclude functions that are duplicated, creating conflicts.
EXCLUDE = .git \
.github \
SRC/VARIANTS \
BLAS/SRC/lsame.f \
BLAS/SRC/xerbla.f \
BLAS/SRC/xerbla_array.f \
INSTALL/slamchf77.f \
INSTALL/dlamchf77.f
# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
# directories that are symbolic links (a Unix file system feature) are excluded
@ -908,6 +984,10 @@ IMAGE_PATH =
# Note that the filter must not add or remove lines; it is applied before the
# code is scanned, but not when the output code is generated. If lines are added
# or removed, the anchors will not be placed correctly.
#
# Note that for custom extensions or not directly supported extensions you also
# need to set EXTENSION_MAPPING for the extension otherwise the files are not
# properly processed by doxygen.
INPUT_FILTER =
@ -917,6 +997,10 @@ INPUT_FILTER =
# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
# patterns match the file name, INPUT_FILTER is applied.
#
# Note that for custom extensions or not directly supported extensions you also
# need to set EXTENSION_MAPPING for the extension otherwise the files are not
# properly processed by doxygen.
FILTER_PATTERNS =
@ -969,7 +1053,7 @@ INLINE_SOURCES = YES
STRIP_CODE_COMMENTS = YES
# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
# function all documented functions referencing it will be listed.
# entity all documented functions referencing it will be listed.
# The default value is: NO.
REFERENCED_BY_RELATION = NO
@ -1001,12 +1085,12 @@ SOURCE_TOOLTIPS = YES
# If the USE_HTAGS tag is set to YES then the references to source code will
# point to the HTML generated by the htags(1) tool instead of doxygen built-in
# source browser. The htags tool is part of GNU's global source tagging system
# (see http://www.gnu.org/software/global/global.html). You will need version
# (see https://www.gnu.org/software/global/global.html). You will need version
# 4.8.6 or higher.
#
# To use it do the following:
# - Install the latest version of global
# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
# - Make sure the INPUT points to the root of the source tree
# - Run doxygen as normal
#
@ -1028,25 +1112,6 @@ USE_HTAGS = NO
VERBATIM_HEADERS = YES
# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
# cost of reduced performance. This can be particularly helpful with template
# rich C++ code for which doxygen's built-in parser lacks the necessary type
# information.
# Note: The availability of this option depends on whether or not doxygen was
# compiled with the --with-libclang option.
# The default value is: NO.
CLANG_ASSISTED_PARSING = NO
# If clang assisted parsing is enabled you can provide the compiler with command
# line options that you would normally use when invoking the compiler. Note that
# the include paths will already be set by doxygen for the files and directories
# specified with INPUT and INCLUDE_PATH.
# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
CLANG_OPTIONS =
#---------------------------------------------------------------------------
# Configuration options related to the alphabetical class index
#---------------------------------------------------------------------------
@ -1058,13 +1123,6 @@ CLANG_OPTIONS =
ALPHABETICAL_INDEX = YES
# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
# which the alphabetical index list will be split.
# Minimum value: 1, maximum value: 20, default value: 5.
# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
COLS_IN_ALPHA_INDEX = 5
# In case all classes in a project start with a common prefix, all classes will
# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
# can be used to specify a prefix (or a list of prefixes) that should be ignored
@ -1165,7 +1223,7 @@ HTML_EXTRA_FILES =
# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
# will adjust the colors in the style sheet and background images according to
# this color. Hue is specified as an angle on a colorwheel, see
# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
# purple, and 360 is red again.
# Minimum value: 0, maximum value: 359, default value: 220.
@ -1201,6 +1259,17 @@ HTML_COLORSTYLE_GAMMA = 80
HTML_TIMESTAMP = YES
# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
# documentation will contain a main index with vertical navigation menus that
# are dynamically created via JavaScript. If disabled, the navigation index will
# consists of multiple levels of tabs that are statically embedded in every HTML
# page. Disable this option to support browsers that do not have JavaScript,
# like the Qt help browser.
# The default value is: YES.
# This tag requires that the tag GENERATE_HTML is set to YES.
HTML_DYNAMIC_MENUS = YES
# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
# documentation will contain sections that can be hidden and shown after the
# page has loaded.
@ -1224,13 +1293,14 @@ HTML_INDEX_NUM_ENTRIES = 100
# If the GENERATE_DOCSET tag is set to YES, additional index files will be
# generated that can be used as input for Apple's Xcode 3 integrated development
# environment (see: http://developer.apple.com/tools/xcode/), introduced with
# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
# Makefile in the HTML output directory. Running make will produce the docset in
# that directory and running make install will install the docset in
# environment (see:
# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
# create a documentation set, doxygen will generate a Makefile in the HTML
# output directory. Running make will produce the docset in that directory and
# running make install will install the docset in
# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
# for more information.
# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
# genXcode/_index.html for more information.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.
@ -1269,8 +1339,8 @@ DOCSET_PUBLISHER_NAME = Publisher
# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
# Windows.
# (see:
# https://www.microsoft.com/en-us/download/details.aspx?id=21138) on Windows.
#
# The HTML Help Workshop contains a compiler that can convert all HTML output
# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
@ -1300,7 +1370,7 @@ CHM_FILE =
HHC_LOCATION =
# The GENERATE_CHI flag controls if a separate .chi index file is generated
# (YES) or that it should be included in the master .chm file (NO).
# (YES) or that it should be included in the main .chm file (NO).
# The default value is: NO.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
@ -1345,7 +1415,8 @@ QCH_FILE =
# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
# Project output. For more information please see Qt Help Project / Namespace
# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
# (see:
# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
# The default value is: org.doxygen.Project.
# This tag requires that the tag GENERATE_QHP is set to YES.
@ -1353,8 +1424,8 @@ QHP_NAMESPACE = org.doxygen.Project
# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
# Help Project output. For more information please see Qt Help Project / Virtual
# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
# folders).
# Folders (see:
# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
# The default value is: doc.
# This tag requires that the tag GENERATE_QHP is set to YES.
@ -1362,30 +1433,30 @@ QHP_VIRTUAL_FOLDER = doc
# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
# filter to add. For more information please see Qt Help Project / Custom
# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
# filters).
# Filters (see:
# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
# This tag requires that the tag GENERATE_QHP is set to YES.
QHP_CUST_FILTER_NAME =
# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
# custom filter to add. For more information please see Qt Help Project / Custom
# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
# filters).
# Filters (see:
# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
# This tag requires that the tag GENERATE_QHP is set to YES.
QHP_CUST_FILTER_ATTRS =
# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
# project's filter section matches. Qt Help Project / Filter Attributes (see:
# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
# This tag requires that the tag GENERATE_QHP is set to YES.
QHP_SECT_FILTER_ATTRS =
# The QHG_LOCATION tag can be used to specify the location of Qt's
# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
# generated .qhp file.
# The QHG_LOCATION tag can be used to specify the location (absolute path
# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to
# run qhelpgenerator on the generated .qhp file.
# This tag requires that the tag GENERATE_QHP is set to YES.
QHG_LOCATION =
@ -1462,6 +1533,17 @@ TREEVIEW_WIDTH = 250
EXT_LINKS_IN_WINDOW = NO
# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg
# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
# https://inkscape.org) to generate formulas as SVG images instead of PNGs for
# the HTML output. These images will generally look nicer at scaled resolutions.
# Possible values are: png (the default) and svg (looks nicer but requires the
# pdf2svg or inkscape tool).
# The default value is: png.
# This tag requires that the tag GENERATE_HTML is set to YES.
HTML_FORMULA_FORMAT = png
# Use this tag to change the font size of LaTeX formulas included as images in
# the HTML documentation. When you change the font size after a successful
# doxygen run you need to manually remove any form_*.png images from the HTML
@ -1471,7 +1553,7 @@ EXT_LINKS_IN_WINDOW = NO
FORMULA_FONTSIZE = 10
# Use the FORMULA_TRANPARENT tag to determine whether or not the images
# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
# generated for formulas are transparent PNGs. Transparent PNGs are not
# supported properly for IE 6.0, but are supported on all modern browsers.
#
@ -1482,8 +1564,14 @@ FORMULA_FONTSIZE = 10
FORMULA_TRANSPARENT = YES
# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
# to create new LaTeX commands to be used in formulas as building blocks. See
# the section "Including formulas" for details.
FORMULA_MACROFILE =
# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
# http://www.mathjax.org) which uses client side Javascript for the rendering
# https://www.mathjax.org) which uses client side JavaScript for the rendering
# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
# installed or if you want formulas to look prettier in the HTML output. When
# enabled you may also need to install MathJax separately and configure the path
@ -1495,7 +1583,7 @@ USE_MATHJAX = NO
# When MathJax is enabled you can set the default output format to be used for
# the MathJax output. See the MathJax site (see:
# http://docs.mathjax.org/en/latest/output.html) for more details.
# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details.
# Possible values are: HTML-CSS (which is slower, but has the best
# compatibility), NativeMML (i.e. MathML) and SVG.
# The default value is: HTML-CSS.
@ -1510,8 +1598,8 @@ MATHJAX_FORMAT = HTML-CSS
# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
# Content Delivery Network so you can quickly see the result without installing
# MathJax. However, it is strongly recommended to install a local copy of
# MathJax from http://www.mathjax.org before deployment.
# The default value is: http://cdn.mathjax.org/mathjax/latest.
# MathJax from https://www.mathjax.org before deployment.
# The default value is: https://cdn.jsdelivr.net/npm/mathjax@2.
# This tag requires that the tag USE_MATHJAX is set to YES.
MATHJAX_RELPATH = http://www.mathjax.org/mathjax
@ -1525,7 +1613,8 @@ MATHJAX_EXTENSIONS =
# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
# of code that will be used on startup of the MathJax code. See the MathJax site
# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
# (see:
# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an
# example see the documentation.
# This tag requires that the tag USE_MATHJAX is set to YES.
@ -1553,7 +1642,7 @@ MATHJAX_CODEFILE =
SEARCHENGINE = YES
# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
# implemented using a web server instead of a web client using Javascript. There
# implemented using a web server instead of a web client using JavaScript. There
# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
# setting. When disabled, doxygen will generate a PHP script for searching and
# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
@ -1572,7 +1661,8 @@ SERVER_BASED_SEARCH = NO
#
# Doxygen ships with an example indexer (doxyindexer) and search engine
# (doxysearch.cgi) which are based on the open source search engine library
# Xapian (see: http://xapian.org/).
# Xapian (see:
# https://xapian.org/).
#
# See the section "External Indexing and Searching" for details.
# The default value is: NO.
@ -1585,8 +1675,9 @@ EXTERNAL_SEARCH = NO
#
# Doxygen ships with an example indexer (doxyindexer) and search engine
# (doxysearch.cgi) which are based on the open source search engine library
# Xapian (see: http://xapian.org/). See the section "External Indexing and
# Searching" for details.
# Xapian (see:
# https://xapian.org/). See the section "External Indexing and Searching" for
# details.
# This tag requires that the tag SEARCHENGINE is set to YES.
SEARCHENGINE_URL =
@ -1637,21 +1728,35 @@ LATEX_OUTPUT = latex
# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
# invoked.
#
# Note that when enabling USE_PDFLATEX this option is only used for generating
# bitmaps for formulas in the HTML output, but not in the Makefile that is
# written to the output directory.
# The default file is: latex.
# Note that when not enabling USE_PDFLATEX the default is latex when enabling
# USE_PDFLATEX the default is pdflatex and when in the later case latex is
# chosen this is overwritten by pdflatex. For specific output languages the
# default can have been set differently, this depends on the implementation of
# the output language.
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_CMD_NAME = latex
# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
# index for LaTeX.
# Note: This tag is used in the Makefile / make.bat.
# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
# (.tex).
# The default file is: makeindex.
# This tag requires that the tag GENERATE_LATEX is set to YES.
MAKEINDEX_CMD_NAME = makeindex
# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
# generate index for LaTeX. In case there is no backslash (\) as first character
# it will be automatically added in the LaTeX code.
# Note: This tag is used in the generated output file (.tex).
# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
# The default value is: makeindex.
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_MAKEINDEX_CMD = makeindex
# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
# documents. This may be useful for small projects and may help to save some
# trees in general.
@ -1736,9 +1841,11 @@ LATEX_EXTRA_FILES =
PDF_HYPERLINKS = YES
# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
# the PDF file directly from the LaTeX files. Set this option to YES, to get a
# higher quality PDF documentation.
# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as
# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX
# files. Set this option to YES, to get a higher quality PDF documentation.
#
# See also section LATEX_CMD_NAME for selecting the engine.
# The default value is: YES.
# This tag requires that the tag GENERATE_LATEX is set to YES.
@ -1772,12 +1879,28 @@ LATEX_SOURCE_CODE = NO
# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
# bibliography, e.g. plainnat, or ieeetr. See
# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
# The default value is: plain.
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_BIB_STYLE = plain
# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
# page will contain the date and time when the page was generated. Setting this
# to NO can help when comparing the output of multiple runs.
# The default value is: NO.
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_TIMESTAMP = NO
# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
# path from which the emoji images will be read. If a relative path is entered,
# it will be relative to the LATEX_OUTPUT directory. If left blank the
# LATEX_OUTPUT directory will be used.
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_EMOJI_DIRECTORY =
#---------------------------------------------------------------------------
# Configuration options related to the RTF output
#---------------------------------------------------------------------------
@ -1817,9 +1940,9 @@ COMPACT_RTF = NO
RTF_HYPERLINKS = YES
# Load stylesheet definitions from file. Syntax is similar to doxygen's config
# file, i.e. a series of assignments. You only have to provide replacements,
# missing definitions are set to their default value.
# Load stylesheet definitions from file. Syntax is similar to doxygen's
# configuration file, i.e. a series of assignments. You only have to provide
# replacements, missing definitions are set to their default value.
#
# See also section "Doxygen usage" for information on how to generate the
# default style sheet that doxygen normally uses.
@ -1828,8 +1951,8 @@ RTF_HYPERLINKS = YES
RTF_STYLESHEET_FILE =
# Set optional variables used in the generation of an RTF document. Syntax is
# similar to doxygen's config file. A template extensions file can be generated
# using doxygen -e rtf extensionFile.
# similar to doxygen's configuration file. A template extensions file can be
# generated using doxygen -e rtf extensionFile.
# This tag requires that the tag GENERATE_RTF is set to YES.
RTF_EXTENSIONS_FILE =
@ -1915,6 +2038,13 @@ XML_OUTPUT = xml
XML_PROGRAMLISTING = YES
# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
# namespace members in file scope as well, matching the HTML output.
# The default value is: NO.
# This tag requires that the tag GENERATE_XML is set to YES.
XML_NS_MEMB_FILE_SCOPE = NO
#---------------------------------------------------------------------------
# Configuration options related to the DOCBOOK output
#---------------------------------------------------------------------------
@ -1947,9 +2077,9 @@ DOCBOOK_PROGRAMLISTING = NO
#---------------------------------------------------------------------------
# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
# AutoGen Definitions (see http://autogen.sf.net) file that captures the
# structure of the code including all documentation. Note that this feature is
# still experimental and incomplete at the moment.
# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
# the structure of the code including all documentation. Note that this feature
# is still experimental and incomplete at the moment.
# The default value is: NO.
GENERATE_AUTOGEN_DEF = NO
@ -2116,12 +2246,6 @@ EXTERNAL_GROUPS = YES
EXTERNAL_PAGES = YES
# The PERL_PATH should be the absolute path and name of the perl script
# interpreter (i.e. the result of 'which perl').
# The default file (with absolute path) is: /usr/bin/perl.
PERL_PATH = /sw/bin/perl
#---------------------------------------------------------------------------
# Configuration options related to the dot tool
#---------------------------------------------------------------------------
@ -2135,15 +2259,6 @@ PERL_PATH = /sw/bin/perl
CLASS_DIAGRAMS = YES
# You can define message sequence charts within doxygen comments using the \msc
# command. Doxygen will then run the mscgen tool (see:
# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
# documentation. The MSCGEN_PATH tag allows you to specify the directory where
# the mscgen tool resides. If left empty the tool is assumed to be found in the
# default search path.
MSCGEN_PATH =
# You can include diagrams made with dia in doxygen documentation. Doxygen will
# then run dia to produce the diagram and insert it in the documentation. The
# DIA_PATH tag allows you to specify the directory where the dia binary resides.
@ -2241,10 +2356,32 @@ UML_LOOK = NO
# but if the number exceeds 15, the total amount of fields shown is limited to
# 10.
# Minimum value: 0, maximum value: 100, default value: 10.
# This tag requires that the tag HAVE_DOT is set to YES.
# This tag requires that the tag UML_LOOK is set to YES.
UML_LIMIT_NUM_FIELDS = 10
# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and
# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS
# tag is set to YES, doxygen will add type and arguments for attributes and
# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen
# will not generate fields with class member information in the UML graphs. The
# class diagrams will look similar to the default class diagrams but using UML
# notation for the relationships.
# Possible values are: NO, YES and NONE.
# The default value is: NO.
# This tag requires that the tag UML_LOOK is set to YES.
DOT_UML_DETAILS = NO
# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters
# to display on a single line. If the actual line length exceeds this threshold
# significantly it will be wrapped across multiple lines. Some heuristics are
# applied to avoid ugly line breaks.
# Minimum value: 0, maximum value: 1000, default value: 17.
# This tag requires that the tag HAVE_DOT is set to YES.
DOT_WRAP_THRESHOLD = 17
# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
# collaboration graphs will show the relations between templates and their
# instances.
@ -2371,6 +2508,11 @@ DIAFILE_DIRS =
PLANTUML_JAR_PATH =
# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
# configuration file for plantuml.
PLANTUML_CFG_FILE =
# When using plantuml, the specified paths are searched for files specified by
# the !include statement in a plantuml block.
@ -2429,9 +2571,11 @@ DOT_MULTI_TARGETS = NO
GENERATE_LEGEND = YES
# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate
# files that are used to generate the various graphs.
#
# Note: This setting is not only used for dot files but also for msc and
# plantuml temporary files.
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.
DOT_CLEANUP = YES

View File

@ -0,0 +1,197 @@
<doxygenlayout version="1.0">
<!-- Generated by doxygen 1.8.13 -->
<!-- Navigation index tabs for HTML output -->
<navindex>
<tab type="mainpage" visible="yes" title=""/>
<tab type="pages" visible="yes" title="" intro=""/>
<!-- LAPACK: change "Modules" to "Routines" -->
<tab type="modules" visible="yes" title="Routines" intro=""/>
<tab type="namespaces" visible="yes" title="">
<tab type="namespacelist" visible="yes" title="" intro=""/>
<tab type="namespacemembers" visible="yes" title="" intro=""/>
</tab>
<tab type="classes" visible="yes" title="">
<tab type="classlist" visible="yes" title="" intro=""/>
<tab type="classindex" visible="$ALPHABETICAL_INDEX" title=""/>
<tab type="hierarchy" visible="yes" title="" intro=""/>
<tab type="classmembers" visible="yes" title="" intro=""/>
</tab>
<tab type="files" visible="yes" title="">
<tab type="filelist" visible="yes" title="" intro=""/>
<tab type="globals" visible="yes" title="" intro=""/>
</tab>
<tab type="examples" visible="yes" title="" intro=""/>
</navindex>
<!-- Layout definition for a class page -->
<class>
<briefdescription visible="yes"/>
<includes visible="$SHOW_INCLUDE_FILES"/>
<inheritancegraph visible="$CLASS_GRAPH"/>
<collaborationgraph visible="$COLLABORATION_GRAPH"/>
<memberdecl>
<nestedclasses visible="yes" title=""/>
<publictypes title=""/>
<services title=""/>
<interfaces title=""/>
<publicslots title=""/>
<signals title=""/>
<publicmethods title=""/>
<publicstaticmethods title=""/>
<publicattributes title=""/>
<publicstaticattributes title=""/>
<protectedtypes title=""/>
<protectedslots title=""/>
<protectedmethods title=""/>
<protectedstaticmethods title=""/>
<protectedattributes title=""/>
<protectedstaticattributes title=""/>
<packagetypes title=""/>
<packagemethods title=""/>
<packagestaticmethods title=""/>
<packageattributes title=""/>
<packagestaticattributes title=""/>
<properties title=""/>
<events title=""/>
<privatetypes title=""/>
<privateslots title=""/>
<privatemethods title=""/>
<privatestaticmethods title=""/>
<privateattributes title=""/>
<privatestaticattributes title=""/>
<friends title=""/>
<related title="" subtitle=""/>
<membergroups visible="yes"/>
</memberdecl>
<detaileddescription title=""/>
<memberdef>
<inlineclasses title=""/>
<typedefs title=""/>
<enums title=""/>
<services title=""/>
<interfaces title=""/>
<constructors title=""/>
<functions title=""/>
<related title=""/>
<variables title=""/>
<properties title=""/>
<events title=""/>
</memberdef>
<allmemberslink visible="yes"/>
<usedfiles visible="$SHOW_USED_FILES"/>
<authorsection visible="yes"/>
</class>
<!-- Layout definition for a namespace page -->
<namespace>
<briefdescription visible="yes"/>
<memberdecl>
<nestednamespaces visible="yes" title=""/>
<constantgroups visible="yes" title=""/>
<classes visible="yes" title=""/>
<typedefs title=""/>
<enums title=""/>
<functions title=""/>
<variables title=""/>
<membergroups visible="yes"/>
</memberdecl>
<detaileddescription title=""/>
<memberdef>
<inlineclasses title=""/>
<typedefs title=""/>
<enums title=""/>
<functions title=""/>
<variables title=""/>
</memberdef>
<authorsection visible="yes"/>
</namespace>
<!-- Layout definition for a file page -->
<file>
<briefdescription visible="yes"/>
<includes visible="$SHOW_INCLUDE_FILES"/>
<includegraph visible="$INCLUDE_GRAPH"/>
<includedbygraph visible="$INCLUDED_BY_GRAPH"/>
<sourcelink visible="yes"/>
<memberdecl>
<classes visible="yes" title=""/>
<namespaces visible="yes" title=""/>
<constantgroups visible="yes" title=""/>
<defines title=""/>
<typedefs title=""/>
<enums title=""/>
<functions title=""/>
<variables title=""/>
<membergroups visible="yes"/>
</memberdecl>
<detaileddescription title=""/>
<memberdef>
<inlineclasses title=""/>
<defines title=""/>
<typedefs title=""/>
<enums title=""/>
<functions title=""/>
<variables title=""/>
</memberdef>
<authorsection/>
</file>
<!-- Layout definition for a group page -->
<group>
<briefdescription visible="yes"/>
<groupgraph visible="$GROUP_GRAPHS"/>
<memberdecl>
<nestedgroups visible="yes" title=""/>
<dirs visible="yes" title=""/>
<files visible="yes" title=""/>
<namespaces visible="yes" title=""/>
<classes visible="yes" title=""/>
<defines title=""/>
<typedefs title=""/>
<enums title=""/>
<enumvalues title=""/>
<functions title=""/>
<variables title=""/>
<signals title=""/>
<publicslots title=""/>
<protectedslots title=""/>
<privateslots title=""/>
<events title=""/>
<properties title=""/>
<friends title=""/>
<membergroups visible="yes"/>
</memberdecl>
<detaileddescription title=""/>
<memberdef>
<pagedocs/>
<inlineclasses title=""/>
<defines title=""/>
<typedefs title=""/>
<enums title=""/>
<enumvalues title=""/>
<functions title=""/>
<variables title=""/>
<signals title=""/>
<publicslots title=""/>
<protectedslots title=""/>
<privateslots title=""/>
<events title=""/>
<properties title=""/>
<friends title=""/>
</memberdef>
<authorsection visible="yes"/>
</group>
<!-- Layout definition for a directory page -->
<directory>
<briefdescription visible="yes"/>
<directorygraph visible="yes"/>
<memberdecl>
<dirs visible="yes"/>
<files visible="yes"/>
</memberdecl>
<detaileddescription title=""/>
</directory>
</doxygenlayout>

File diff suppressed because it is too large Load Diff

View File

@ -315,7 +315,6 @@ typedef struct Namelist Namelist;
/* > \author Univ. of Colorado Denver */
/* > \author NAG Ltd. */
/* > \date November 2019 */
/* > \ingroup auxOTHERauxiliary */
@ -332,7 +331,7 @@ typedef struct Namelist Namelist;
/* ===================================================================== */
*vers_major__ = 3;
*vers_minor__ = 11;
*vers_minor__ = 12;
*vers_patch__ = 0;
/* ===================================================================== */

View File

@ -44,7 +44,6 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \date November 2019
*
*> \ingroup auxOTHERauxiliary
*
@ -60,7 +59,7 @@
INTEGER VERS_MAJOR, VERS_MINOR, VERS_PATCH
* =====================================================================
VERS_MAJOR = 3
VERS_MINOR = 11
VERS_MINOR = 12
VERS_PATCH = 0
* =====================================================================
*

Some files were not shown because too many files have changed in this diff Show More